Commit 057dbb84 by fangxingjun

Merge branch 'developer' of 47.106.101.75:abel_cjy/Amazon-Selection-Data into developer

parents 0feb4e27 e92a6ecf
@@ -40,7 +40,7 @@ class DwtStThemeAgg(object):
         self.u_theme_pattern = F.udf(udf_ele_mattch, StringType())
         self.u_theme_contain_judge = F.udf(self.udf_theme_contain_judge, IntegerType())
         self.u_judge_twin_words = F.udf(self.udf_judge_twin_words, IntegerType())
-        self.u_filter_sec_pattern_words = F.udf(self.udf_filter_sec_pattern_words, IntegerType())
+        self.u_filter_pattern_words = F.udf(self.udf_filter_pattern_words, IntegerType())
         # 全局df初始化
         self.df_st_base = self.spark.sql(f"select 1+1;")
@@ -180,8 +180,7 @@ class DwtStThemeAgg(object):
         return F.udf(udf_filter_blacklist, IntegerType())

     @staticmethod
-    def udf_filter_sec_pattern_words(st_word, pattern_list):
-        # 标记一些特殊情况指定的二级词,方便后期过滤
+    def udf_filter_pattern_words(st_word, pattern_list):
         filter_flag = 0
         theme_list = ['combination', 'size']
         if pattern_list:
@@ -191,7 +190,7 @@ class DwtStThemeAgg(object):
         # 进行单项 数字+month/months的所有二级词 和 数字连接t+ boys/girls的二级词特殊匹配
         date_pattern = re.compile(r"(\d+(?:\.\d+)?) +(month|months)\b", flags=re.IGNORECASE)
         numt_pattern = re.compile(r"((?:\d+)t)(?: +)(boys|girls|boy|girl)\b", flags=re.IGNORECASE)
-        other_pattern = re.compile(r"\b(women|men|man|woman|for|cute|fashion|kids?|adults?|girls?|boys?)\b", flags=re.IGNORECASE)
+        other_pattern = re.compile(r"\b(womens?|mens?|mans?|womans?|fors?|cutes?|fashions?|kids?|adults?|girls?|boys?)\b", flags=re.IGNORECASE)
         if re.search(date_pattern, st_word):
             return 1
         if re.search(numt_pattern, st_word):
@@ -350,8 +349,6 @@ class DwtStThemeAgg(object):
         self.read_data()
         # 模板词归一化处理
         self.handle_base_pattern_data()
-        # 二级词单独处理
-        self.handle_sec_st()
         # 将一级二级模板词和搜索词进行匹配,做中间存储
         self.handle_st_filter_table()
         # 统计各模板词的指标 pattern_type=0
@@ -399,23 +396,18 @@ class DwtStThemeAgg(object):
             'st_blacklist_flag', self.filter_blacklist_words(pd_match_blacklist)("search_term")
         ).filter('st_blacklist_flag != 1').cache()

-    # 处理二级词
-    def handle_sec_st(self):
-        self.df_sec_words = self.df_base_filter_date.filter('st_word_num = 2')
-        self.df_sec_words = self.df_sec_words.join(
-            self.df_theme, on=['search_term'], how='left'
-        )
-        self.df_sec_words = self.df_sec_words.withColumn(
-            "filter_flag", self.u_filter_sec_pattern_words(F.col("search_term"), F.col("pattern_list"))
-        )
-        # 过滤掉被标记为1的数据
-        self.df_sec_words = self.df_sec_words.filter("filter_flag != 1")
-        self.df_sec_words = self.df_sec_words.select(
-            'search_term', 'st_word_num', 'st_bsr_cate_1_id_new', 'st_bsr_cate_current_id_new',
-            'rank', 'rank_change_rate', 'rank_rate_of_change'
-        )
-
     def handle_st_filter_table(self):
+        # 过滤特殊词
+        self.df_base_filter_date = self.df_base_filter_date.join(
+            self.df_theme, on=['search_term'], how='left'
+        ).withColumn(
+            "filter_flag", self.u_filter_pattern_words(F.col("search_term"), F.col("pattern_list"))
+        ).filter(
+            "filter_flag != 1"
+        ).select(
+            'search_term', 'st_word_num', 'st_bsr_cate_1_id_new', 'st_bsr_cate_current_id_new', 'rank', 'rank_change_rate', 'rank_rate_of_change'
+        ).cache()
         df_st_filter_base = self.df_st_base.select(
             F.col('st_key'),
             F.col('search_term'),
@@ -425,12 +417,6 @@ class DwtStThemeAgg(object):
             F.lit(self.date_info).alias('date_info')
         ).cache()
-        # 将处理后的二级词和一级词合并
-        df_one_word = self.df_base_filter_date.filter('st_word_num = 1').select(
-            'search_term', 'st_word_num', 'st_bsr_cate_1_id_new', 'st_bsr_cate_current_id_new',
-            'rank', 'rank_change_rate', 'rank_rate_of_change'
-        )
-        self.df_base_filter_date = self.df_sec_words.unionByName(df_one_word).cache()
         pattern_words = self.df_base_filter_date.select('search_term')
         # 将数据转换成pandas_df
         dict_df = pattern_words.toPandas()
@@ -461,7 +447,6 @@ class DwtStThemeAgg(object):
             df_list.append(df_union_filter)
         for i in range(0, len(df_list), batch_size):
             print(f"当前是word_batches的轮回:f{word_batches.index(word_batch)},当前写入表的df索引位置:{i + 1}")
-            tmp_df = []
             tmp_df = df_list[i:i + batch_size]
             result_df = self.udf_unionAll(*tmp_df)
             result_df = result_df.repartition(1)
......
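The renamed udf_filter_pattern_words keeps the two special-case regexes shown in the hunk above. A minimal, self-contained sketch of just those two early-return branches follows; the rest of the UDF (including how other_pattern and theme_list are used) is outside this diff, so the wrapper function here is only illustrative:

import re

date_pattern = re.compile(r"(\d+(?:\.\d+)?) +(month|months)\b", flags=re.IGNORECASE)
numt_pattern = re.compile(r"((?:\d+)t)(?: +)(boys|girls|boy|girl)\b", flags=re.IGNORECASE)

def special_case_flag(st_word: str) -> int:
    # number + month/months, e.g. "12 months baby toys"
    if re.search(date_pattern, st_word):
        return 1
    # number joined to "t" followed by boys/girls, e.g. "4t girls dress"
    if re.search(numt_pattern, st_word):
        return 1
    return 0

print(special_case_flag("12 months baby toys"))  # 1
print(special_case_flag("4t girls dress"))       # 1
print(special_case_flag("wooden puzzle"))        # 0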
@@ -48,7 +48,11 @@ class EsStDetail(TemplatesMysql):
         self.record_table_name_field = f'{self.site_name}_flow_asin_last_month' if self.date_type == 'month' else f'{self.site_name}_flow_asin_last30day'
         # elasticsearch相关配置
         self.client = EsUtils.get_es_client()
-        self.es_options = EsUtils.get_es_options(self.es_index_name)
+        # 富集策略相关配置,用于更新 usr_mask_type 字段
+        self.policy_name1 = "user_mask_asin_policy"
+        self.policy_name2 = "user_mask_category_policy"
+        self.pipeline_id = "user_asin_mask_enrich_pipeline"
+        self.es_options = EsUtils.get_es_options(self.es_index_name, self.pipeline_id)
         self.es_body = EsUtils.get_es_body()
         # 正式导出需入导出记录表
@@ -105,6 +109,10 @@ class EsStDetail(TemplatesMysql):
     def es_prepare(self):
         print("当前链接的es节点信息为:" + str(EsUtils.__es_ip__))
         EsUtils.create_index(self.es_index_name, self.client, self.es_body)
+        # 执行富集策略
+        EsUtils.user_enrich_pipeline(self.client, self.pipeline_id, self.policy_name1, self.policy_name2)
+        self.client.enrich.execute_policy(name=self.policy_name1)
+        self.client.enrich.execute_policy(name=self.policy_name2)
         if self.date_type != 'month':
             if not EsUtils.exist_index_alias(self.alias_name, self.client):
                 EsUtils.create_index_alias(self.es_index_name, self.alias_name, self.client)
......
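EsUtils.user_enrich_pipeline is called here but its implementation is not part of this diff. As a hypothetical sketch of the underlying technique (an Elasticsearch ingest pipeline built from enrich processors, registered through elasticsearch-py), it might look roughly like the following; every field name except usr_mask_type, which the comment above names as the field being updated, is a placeholder:

def user_enrich_pipeline(client, pipeline_id, policy_name1, policy_name2):
    # each enrich processor looks up the match field against its policy's
    # enrich index and writes the looked-up data into target_field
    processors = [
        {"enrich": {"policy_name": policy_name1, "field": "asin", "target_field": "usr_mask_type", "ignore_missing": True}},
        {"enrich": {"policy_name": policy_name2, "field": "category_id", "target_field": "usr_mask_type", "ignore_missing": True}},
    ]
    client.ingest.put_pipeline(id=pipeline_id, body={"processors": processors})

execute_policy(name=...) still has to run, as the diff does, so the enrich indexes behind both policies are (re)built before documents are written.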
@@ -50,7 +50,11 @@ class KafkaFlowAsinDetail(Templates):
         self.es_index_name = f"{self.topic_name}_test" if self.test_flag == 'test' else f"{self.topic_name}"
         self.es_index_alias_name = f"{self.site_name}_st_detail_last_4_week_test" if self.test_flag == 'test' else f"{self.site_name}_st_detail_last_4_week"
         self.es_index_body = EsUtils.get_es_body()
-        self.es_options = EsUtils.get_es_options(self.es_index_name)
+        # 富集策略相关配置,用于更新 usr_mask_type 字段
+        self.policy_name1 = "user_mask_asin_policy"
+        self.policy_name2 = "user_mask_category_policy"
+        self.pipeline_id = "user_asin_mask_enrich_pipeline"
+        self.es_options = EsUtils.get_es_options(self.es_index_name, self.pipeline_id)
         self.db_save = 'kafka_flow_asin_detail'
         self.app_name = self.get_app_name()
         print(f"任务名称:{self.app_name}")
@@ -235,7 +239,7 @@ class KafkaFlowAsinDetail(Templates):
         cate_1_pattern = self.pattern1_dict[self.site_name]
         df = df.withColumn("asin_bs_sellers_rank_lower", F.lower("best_sellers_rank"))
         df = df.withColumn("asin_bs", self.u_parse_bs_category(
-            "asin_bs_sellers_rank_lower", "best_sellers_herf", "all_best_sellers_herf", F.lit(cate_current_pattern), F.lit(cate_1_pattern)))
+            "asin_bs_sellers_rank_lower", "best_sellers_herf", "all_best_sellers_herf", F.lit(cate_current_pattern), F.lit(cate_1_pattern), "node_id"))
         df = df.withColumn("asin_bs_cate_1_id", df.asin_bs.getField("asin_bs_cate_1_id")) \
             .withColumn("asin_bs_cate_current_id", df.asin_bs.getField("asin_bs_cate_current_id")) \
             .withColumn("asin_bs_cate_1_rank", df.asin_bs.getField("asin_bs_cate_1_rank")) \
@@ -828,6 +832,10 @@ class KafkaFlowAsinDetail(Templates):
         # 创建对应es索引
         EsUtils.create_index(self.es_index_name, self.client, self.es_index_body)
         print("索引名称为:", self.es_index_name)
+        # 执行富集策略
+        self.client.enrich.execute_policy(name=self.policy_name1)
+        self.client.enrich.execute_policy(name=self.policy_name2)
+        EsUtils.user_enrich_pipeline(self.client, self.pipeline_id, self.policy_name1, self.policy_name2)
         if not EsUtils.exist_index_alias(self.es_index_alias_name, self.client):
             EsUtils.create_index_alias(self.es_index_name, self.es_index_alias_name, self.client)
         else:
......
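Both classes now pass pipeline_id into EsUtils.get_es_options, presumably so the Spark-to-ES writer routes documents through the enrich pipeline. The helper is not in this diff; a hypothetical sketch of what such an options dict could contain for the elasticsearch-hadoop connector (host, port and id field below are placeholders):

def get_es_options(index_name, pipeline_id):
    return {
        "es.nodes": "<es-host>",            # placeholder
        "es.port": "9200",                  # placeholder
        "es.resource": index_name,
        "es.ingest.pipeline": pipeline_id,  # run bulk-indexed docs through the ingest pipeline
        "es.mapping.id": "asin",            # placeholder document id field
    }

# typical usage with the connector:
# df.write.format("org.elasticsearch.spark.sql").options(**get_es_options(name, pid)).mode("append").save()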
-"""
-   @Author : HuangJian
-   @Description : asin详情表-周表
-   @SourceTable : us_asin_detail_2023_18
-   @SinkTable : ods_asin_detail
-   @CreateTime : 2022/05/18 14:55
-   @UpdateTime : 2022/05/18 14:55
-"""
 import os
 import sys

 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
 from utils.common_util import CommonUtil
-from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
@@ -24,151 +14,40 @@ if __name__ == '__main__':
     assert date_type is not None, "date_type 不能为空!"
     assert date_info is not None, "date_info 不能为空!"

-    hive_table = f"ods_asin_detail"
+    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
+    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
+    db_type = 'postgresql_14'
+    import_table = f"{site_name}_asin_detail_month_{d1}_{d2}"
+    hive_table = "ods_asin_detail"
     partition_dict = {
         "site_name": site_name,
         "date_type": date_type,
         "date_info": date_info
     }
-    # 落表路径校验
     hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
-
-    # 日期拆分
-    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-
-    if date_type == DateTypes.week.name:
-        # pg的分区周单位数是带0,如01、02、03
-        d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-        # 这里主要是区分db链接
-        if site_name == 'us' and date_info >= '2023-26':
-            db_type = 'postgresql'
-            if date_info >= '2023-34':
-                db_type = 'postgresql_14'
-            date_col = "launch_time,created_time as created_at,updated_time as updated_at"
-            new_col = ',describe'
-        else:
-            db_type = 'postgresql_14'
-            date_col = "launch_time,created_time as created_at,updated_time as updated_at"
-            new_col = ',describe'
-        print(f"同步连接的db_type:{db_type}")
-        # 这里主要是区分新增字段
-        # 18周新增字段weight_str
-        if date_info >= '2023-18':
-            new_col += ',weight_str'
-        # 21周新增字段package_quantity、pattern_name
-        if date_info >= '2023-21':
-            new_col += ',package_quantity,pattern_name'
-        # 49周新增字段follow_sellers
-        if date_info >= '2023-49':
-            new_col += ',follow_sellers'
-        # 51周新增字段product_description,buy_sales
-        if date_info >= '2023-51':
-            new_col += ',product_description,buy_sales'
-        # 2024-02周新增字段image_view
-        if date_info >= '2024-02':
-            new_col += ',image_view'
-        # # 2024-05周新增字段product_json,product_detail_json,review_ai_text,review_label_json
-        # if date_info >= '2024-05':
-        #     new_col += ',product_json,product_detail_json,review_ai_text,review_label_json'
-        import_table = f"{site_name}_asin_detail_{d1}_{d2}"
-
-    if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
-        db_type = 'postgresql_14'
-        date_col = "launch_time, created_time as created_at, updated_time as updated_at"
-        new_col = "describe, weight_str, package_quantity, pattern_name, follow_sellers, product_description, buy_sales, image_view, spider_int, " \
-                  "lob_asin_json, seller_json, customer_reviews_json, product_json, product_detail_json, review_ai_text, review_label_json, sp_initial_seen_asins_json, " \
-                  "sp_4stars_initial_seen_asins_json, sp_delivery_initial_seen_asins_json, compare_similar_asin_json, together_asin_json, min_match_asin_json, " \
-                  "variat_num, current_asin, img_list, variat_list, parent_asin, bundles_this_asins_json, video_m3u8_url, result_list_json, bundle_asin_component_json"
-        d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-        import_table = f"{site_name}_asin_detail_month_{d1}_{d2}"
-
-    sql_query = f"""
-        select
-            id, asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory,
-            category, volume, weight, rank, {date_col}, category_state, img_num, img_type, activity_type,
-            one_two_val, three_four_val, five_six_val, eight_val, qa_num, one_star, two_star, three_star, four_star,
-            five_star, low_star, together_asin, brand, ac_name, material, node_id, data_type, sp_num, {new_col}
-        from {import_table}
-        where 1=1
-        and \$CONDITIONS
-    """
-
-    # 进行schema和数据校验
-    CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query,
-                                          hive_tb_name=hive_table, msg_usr=['chenyuanjie'],
-                                          partition_dict=partition_dict)
-    # 生成导出脚本
-    import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query,
-                                           hdfs_path=hdfs_path, map_num=50, key='id')
-    # 导入前先删除原始hdfs数据
-    HdfsUtils.delete_hdfs_file(hdfs_path)
-    # 创建ssh Client对象--用于执行cmd命令
-    client = SSHUtil.get_ssh_client()
-    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-    # 创建lzo索引和修复元数据
-    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-    # 关闭链接
-    client.close()
-    # 导入后检测--检测数据一致性
-    if date_type != 'month_week':
-        CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict,
-                                         import_query=sql_query, hive_tb_name=hive_table,
-                                         msg_usr=['chenyuanjie'])
-    # 导入后验证--重点字段阈值预警
-    CommonUtil.check_fields_and_warning(hive_tb_name=hive_table, partition_dict=partition_dict)
+    cols = "id, asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory, " \
+           "category, volume, weight, rank, launch_time, created_time as created_at, updated_time as updated_at, " \
+           "category_state, img_num, img_type, activity_type, one_two_val, three_four_val, five_six_val, eight_val, " \
+           "qa_num, one_star, two_star, three_star, four_star, five_star, low_star, together_asin, brand, ac_name, " \
+           "material, node_id, data_type, sp_num, describe, weight_str, package_quantity, pattern_name, follow_sellers, " \
+           "product_description, buy_sales, image_view, spider_int, lob_asin_json, seller_json, customer_reviews_json, " \
+           "product_json, product_detail_json, review_ai_text, review_label_json, sp_initial_seen_asins_json, " \
+           "sp_4stars_initial_seen_asins_json, sp_delivery_initial_seen_asins_json, compare_similar_asin_json, " \
+           "together_asin_json, min_match_asin_json, variat_num, current_asin, img_list, variat_list, parent_asin, " \
+           "bundles_this_asins_json, video_m3u8_url, result_list_json, bundle_asin_component_json, review_json_list"
+
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+
+    engine.sqoop_raw_import(
+        query=f"SELECT {cols} FROM {import_table} WHERE 1=1 and $CONDITIONS",
+        hive_table=hive_table,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict,
+        m=50,
+        split_by='id'
+    )
+
+    pass
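Every import script below follows the same migration: the check_schema_before_import / build_import_sh / SSHUtil / HdfsUtils sequence is replaced by a remote engine wrapper. A condensed usage sketch of the new pattern as it appears in these scripts (the column list and dates are trimmed for brevity; m and split_by presumably map to sqoop's --num-mappers and --split-by, and the wrapper implementation itself is not part of this diff):

from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

hive_table = "ods_asin_detail"
partition_dict = {"site_name": "us", "date_type": "month", "date_info": "2024-03"}
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

engine = get_remote_engine(site_name="us", db_type="postgresql_14")
engine.sqoop_raw_import(
    query="SELECT id, asin FROM us_asin_detail_month_2024_03 WHERE 1=1 and $CONDITIONS",
    hive_table=hive_table,
    hdfs_path=hdfs_path,
    partitions=partition_dict,
    m=50,           # parallel mappers for the large detail table
    split_by='id'   # split column used when m > 1
)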
@@ -2,16 +2,17 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DbTypes from utils.db_util import DbTypes
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
import_tb = f"{site_name}_all_syn_st_asin"
db_type = DbTypes.postgresql.name
import_tb = f"{site_name}_all_syn_st_asin"
query = f""" query = f"""
select asin, select asin,
state, state,
@@ -20,32 +21,22 @@ if __name__ == '__main__':
where state = 4 where state = 4
and \$CONDITIONS and \$CONDITIONS
""" """
hive_tb = "ods_asin_err_state" hive_tb = "ods_asin_err_state"
partition_dict = { partition_dict = {
"site_name": site_name "site_name": site_name
} }
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict) hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
db_type = DbTypes.postgresql.name engine = get_remote_engine(
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name,
site_name=site_name, db_type=db_type
query=query, )
hive_tb_name=hive_tb,
msg_usr=['wujicang'] engine.sqoop_raw_import(
) query=query,
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!" hive_table=hive_tb,
hdfs_path=hdfs_path,
partitions=partition_dict
)
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
pass
@@ -2,75 +2,59 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
hive_tb = "ods_bs_category"
db_type = "mysql" db_type = "mysql"
import_tb = f"{site_name}_bs_category" import_tb = f"{site_name}_bs_category"
partition_dict = {
"site_name": site_name,
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
query = f""" query = f"""
select select
id, id,
p_id, p_id,
ch_name, ch_name,
en_name, en_name,
nodes_num, nodes_num,
path, path,
is_show, is_show,
one_category_id, one_category_id,
and_en_name, and_en_name,
leaf_node, leaf_node,
delete_time, delete_time,
full_name, full_name,
category_id, category_id,
category_parent_id, category_parent_id,
category_first_id, category_first_id,
category_state, category_state,
redirect_flag, redirect_flag,
redirect_first_id, redirect_first_id,
created_at, created_at,
updated_at updated_at
from {import_tb} from {import_tb}
where 1 = 1 where 1 = 1
and \$CONDITIONS and \$CONDITIONS
""" """
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, hive_tb = "ods_bs_category"
site_name=site_name, partition_dict = {
query=query, "site_name": site_name,
hive_tb_name=hive_tb, }
msg_usr=['chenyuanjie'] hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag: engine = get_remote_engine(
sh = CommonUtil.build_import_sh(site_name=site_name, site_name=site_name,
db_type=db_type, db_type=db_type
query=query, )
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
# 导入后检测--检测数据一致性 engine.sqoop_raw_import(
CommonUtil.check_import_sync_num(db_type=db_type, query=query,
partition_dict=partition_dict, hive_table=hive_tb,
import_query=query, hdfs_path=hdfs_path,
hive_tb_name=hive_tb, partitions=partition_dict
msg_usr=['chenyuanjie'] )
)
pass pass
@@ -2,9 +2,9 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil,DateTypes from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
@@ -13,7 +13,19 @@ if __name__ == '__main__':
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
assert date_type is not None, "date_type 不能为空!" assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!" assert date_info is not None, "date_info 不能为空!"
year, week = date_info.split("-")
d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
db_type = 'postgresql_14'
import_tb = f"{site_name}_bs_category_asin_detail_month_{d1}_{d2}"
query = f"""
select
id, asin, null as week, best_sellers_rank, created_time as created_at, updated_time as updated_at, last_herf, all_best_sellers_href
from {import_tb}
where 1=1
and \$CONDITIONS
"""
hive_tb = "ods_bs_category_asin_detail" hive_tb = "ods_bs_category_asin_detail"
partition_dict = { partition_dict = {
"site_name": site_name, "site_name": site_name,
@@ -21,77 +33,19 @@ if __name__ == '__main__':
"date_info": date_info, "date_info": date_info,
} }
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict) hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
if date_type == DateTypes.week.name:
if site_name == "us":
if date_info >= '2023-18':
db_type = "postgresql"
if date_info >= '2023-34':
db_type = 'postgresql_14'
import_tb = f"{site_name}_bs_category_asin_detail_{year}_{week}"
cols = f"id,asin,{week} as week,best_sellers_rank,created_time as created_at,updated_time as updated_at,last_herf,all_best_sellers_href"
params = "1 = 1"
else:
db_type = "mysql"
import_tb = f"{site_name}_bs_category_asin_detail"
cols = "id,asin,week,best_sellers_rank,created_at,updated_at,last_herf"
params = f"week = {int(week)} and DATE_FORMAT(created_at,'%Y') = {year}"
else:
db_type = "postgresql_14"
import_tb = f"{site_name}_bs_category_asin_detail_{year}_{week}"
cols = f"id,asin,{week} as week,best_sellers_rank,created_time as created_at,updated_time as updated_at,last_herf,all_best_sellers_href"
params = "1 = 1"
if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
# 日期拆分
d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
if site_name in ['us', 'uk', 'de']:
db_type = 'postgresql_14'
# pg的分区单位数是带0,如01、02、03
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
cols = f"id,asin,null as week,best_sellers_rank,created_time as created_at,updated_time as updated_at,last_herf,all_best_sellers_href"
import_tb = f"{site_name}_bs_category_asin_detail_month_{d1}_{d2}"
params = f" 1=1 "
else:
print(f"其他站点{date_type}数据暂未明确,请检查是否dateType传输有误")
exit()
query = f"""
select
{cols}
from {import_tb}
where {params}
and \$CONDITIONS
"""
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
site_name=site_name,
query=query,
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie']
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
# 导入后检测--检测数据一致性 engine = get_remote_engine(
CommonUtil.check_import_sync_num(db_type=db_type, site_name=site_name,
partition_dict=partition_dict, db_type=db_type
import_query=query, )
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie'] engine.sqoop_raw_import(
) query=query,
hive_table=hive_tb,
hdfs_path=hdfs_path,
partitions=partition_dict,
m=50,
split_by='id'
)
pass
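A small worked example of how these scripts derive the partitioned source table name (assuming CommonUtil.split_month_week_date('month', '2024-03') returns ('2024', '3'); the helper itself is not shown in this diff):

date_type, date_info = 'month', '2024-03'
d1, d2 = '2024', '3'   # assumed return value of CommonUtil.split_month_week_date(date_type, date_info)
# pg partition suffixes keep a leading zero, e.g. 01, 02, 03
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
import_tb = f"us_bs_category_asin_detail_month_{d1}_{d2}"
print(import_tb)       # us_bs_category_asin_detail_month_2024_03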
@@ -2,72 +2,69 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
date_info = CommonUtil.get_sys_arg(2, None) date_info = CommonUtil.get_sys_arg(2, None)
assert site_name is not None, "sitename 不能为空!" assert site_name is not None, "sitename 不能为空!"
assert date_info is not None, "date_info 不能为空!" assert date_info is not None, "date_info 不能为空!"
hive_tb = "ods_bs_category_top100_asin"
partition_dict = {
"site_name": site_name
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
db_type = "mysql" db_type = "mysql"
if date_info == 'all': if date_info == 'all':
query = f""" query = f"""
select id, select
asin, id,
cate_1_id, asin,
cate_current_id, cate_1_id,
bsr_rank, cate_current_id,
rating, bsr_rank,
total_comments, rating,
created_at as updated_at, total_comments,
date_info, created_at as updated_at,
category_id date_info,
category_id
from {site_name}_bs_category_top100_asin from {site_name}_bs_category_top100_asin
where 1 = 1 where 1 = 1
and \$CONDITIONS and \$CONDITIONS
""" """
pass
else: else:
query = f""" query = f"""
select id, select
asin, id,
cate_1_id, asin,
cate_current_id, cate_1_id,
bsr_rank, cate_current_id,
rating, bsr_rank,
total_comments, rating,
created_at as updated_at, total_comments,
date_info, created_at as updated_at,
category_id date_info,
category_id
from {site_name}_bs_category_top100_asin from {site_name}_bs_category_top100_asin
where 1 = 1 where 1 = 1
and date_info = '{date_info}' and date_info = '{date_info}'
and \$CONDITIONS and \$CONDITIONS
""" """
pass
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, hive_tb = "ods_bs_category_top100_asin"
site_name=site_name, partition_dict = {
query=query, "site_name": site_name
hive_tb_name=hive_tb, }
msg_usr=['wujicang']) hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
engine = get_remote_engine(
site_name=site_name,
db_type=db_type
)
engine.sqoop_raw_import(
query=query,
hive_table=hive_tb,
hdfs_path=hdfs_path,
partitions=partition_dict
)
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
pass
@@ -2,61 +2,45 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
hive_tb = "ods_bsr_end"
db_type = "mysql" db_type = "mysql"
import_tb = f"{site_name}_bsr_end" import_tb = f"{site_name}_bsr_end"
query = f"""
select
id,
rank,
bsr_name,
created_at,
updated_at,
category_id
from {import_tb}
where 1 = 1
and \$CONDITIONS
"""
hive_tb = "ods_bsr_end"
partition_dict = { partition_dict = {
"site_name": site_name, "site_name": site_name,
} }
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict) hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
query = f""" engine = get_remote_engine(
select site_name=site_name,
id, db_type=db_type
rank, )
bsr_name,
created_at,
updated_at,
category_id
from {import_tb}
where 1 = 1
and \$CONDITIONS
"""
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, engine.sqoop_raw_import(
site_name=site_name, query=query,
query=query, hive_table=hive_tb,
hive_tb_name=hive_tb, hdfs_path=hdfs_path,
msg_usr=['chenyuanjie'] partitions=partition_dict
) )
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
# 导入后检测--检测数据一致性
CommonUtil.check_import_sync_num(db_type=db_type,
partition_dict=partition_dict,
import_query=query,
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie']
)
pass
@@ -2,76 +2,69 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
date_info = CommonUtil.get_sys_arg(2, None) date_info = CommonUtil.get_sys_arg(2, None)
assert site_name is not None, "sitename 不能为空!" assert site_name is not None, "sitename 不能为空!"
assert date_info is not None, "date_info 不能为空!" assert date_info is not None, "date_info 不能为空!"
hive_tb = "ods_new_releases_top100_asin"
partition_dict = { db_type = "mysql"
"site_name": site_name
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
if date_info == 'all': if date_info == 'all':
query = f""" query = f"""
select id, select
asin, id,
cate_1_id, asin,
cate_current_id, cate_1_id,
bsr_rank, cate_current_id,
rating, bsr_rank,
total_comments, rating,
created_at as updated_at, total_comments,
date_info, created_at as updated_at,
category_id date_info,
category_id
from {site_name}_new_releases_top100_asin from {site_name}_new_releases_top100_asin
where 1 = 1 where 1 = 1
and \$CONDITIONS and \$CONDITIONS
""" """
pass
else: else:
query = f""" query = f"""
select id, select
asin, id,
cate_1_id, asin,
cate_current_id, cate_1_id,
bsr_rank, cate_current_id,
rating, bsr_rank,
total_comments, rating,
created_at as updated_at, total_comments,
date_info, created_at as updated_at,
category_id date_info,
category_id
from {site_name}_new_releases_top100_asin from {site_name}_new_releases_top100_asin
where 1 = 1 where 1 = 1
and date_info = '{date_info}' and date_info = '{date_info}'
and \$CONDITIONS and \$CONDITIONS
""" """
pass
print("================================sql====================================") hive_tb = "ods_new_releases_top100_asin"
print(query) partition_dict = {
db_type = "mysql" "site_name": site_name
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, }
site_name=site_name, hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
query=query,
hive_tb_name=hive_tb, engine = get_remote_engine(
msg_usr=['wujicang'] site_name=site_name,
) db_type=db_type
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!" )
engine.sqoop_raw_import(
query=query,
hive_table=hive_tb,
hdfs_path=hdfs_path,
partitions=partition_dict
)
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
pass
"""
@Author : HuangJian
@Description : 各站点店铺asin详情表-- 月抓取
@SourceTable : us_other_search_term_data_2023_18
@SinkTable : ods_other_search_term_data
@CreateTime : 2022/05/23 09:55
@UpdateTime : 2022/05/23 09:55
"""
import os import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
@@ -25,96 +14,51 @@ if __name__ == '__main__':
assert date_type is not None, "date_type 不能为空!" assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!" assert date_info is not None, "date_info 不能为空!"
hive_table = f"ods_other_search_term_data" db_type = 'postgresql_14'
d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
import_table = f"{site_name}_other_search_term_month_{d1}_{d2}"
sql_query = f"""
select
id,
search_term,
asin,
page,
buy_data,
label,
created_time,
updated_time,
asin_brand
from {import_table}
where 1=1
and \$CONDITIONS
"""
if site_name == 'us':
map_num = 20
else:
map_num = 5
hive_table = "ods_other_search_term_data"
partition_dict = { partition_dict = {
"site_name": site_name, "site_name": site_name,
"date_type": date_type, "date_type": date_type,
"date_info": date_info "date_info": date_info
} }
# 落表路径校验
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict) hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
# 日期拆分 engine = get_remote_engine(
d1, d2 = CommonUtil.split_month_week_date(date_type, date_info) site_name=site_name,
db_type = '' db_type=db_type
)
if date_type == DateTypes.week.name:
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
if site_name == 'us' and date_info >= '2023-18':
db_type = 'postgresql'
if date_info >= '2023-34':
db_type = 'postgresql_14'
# pg的分区周单位数是带0,如01、02、03
import_table = f"{site_name}_other_search_term_{d1}_{d2}"
else:
db_type = 'postgresql_14'
import_table = f"{site_name}_other_search_term_{d1}_{d2}"
if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
if site_name in ['us', 'uk', 'de']:
db_type = 'postgresql_14'
# pg的分区单位数是带0,如01、02、03
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
import_table = f"{site_name}_other_search_term_month_{d1}_{d2}"
else :
print(f"其他站点{date_type}数据暂未明确,请检查是否dateType传输有误")
exit()
assert db_type != '', "未获取到db_type,请检查!"
sql_query = f"""
select
id,
search_term,
asin,
page,
buy_data,
label,
created_time,
updated_time,
asin_brand
from {import_table}
where 1=1
and \$CONDITIONS
"""
# 进行schema和数据校验
if site_name not in ('fr', 'it', 'es'):
CommonUtil.check_schema_before_import(db_type=db_type,
site_name=site_name,
query=sql_query,
hive_tb_name=hive_table,
msg_usr=['fangxingjun','pengyanbing','chenyuanjie']
,partition_dict = partition_dict)
if site_name == 'us': engine.sqoop_raw_import(
map_num = 20 query=sql_query,
else: hive_table=hive_table,
map_num = 5 hdfs_path=hdfs_path,
# 生成导出脚本 partitions=partition_dict,
import_sh = CommonUtil.build_import_sh(site_name=site_name, m=map_num,
db_type=db_type, split_by='id'
query=sql_query, )
hdfs_path=hdfs_path,
map_num=map_num,
key='id'
)
# 导入前先删除原始hdfs数据
HdfsUtils.delete_hdfs_file(hdfs_path)
# 创建ssh Client对象--用于执行cmd命令
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
# 创建lzo索引和修复元数据
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
# 关闭链接
client.close()
# 导入后检测--检测同步数据数据量的一致性 pass
CommonUtil.check_import_sync_num(db_type=db_type,
partition_dict=partition_dict,
import_query=sql_query,
hive_tb_name=hive_table,
msg_usr=['fangxingjun','pengyanbing','chenyuanjie'])
\ No newline at end of file
@@ -2,10 +2,9 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.common_util import DateTypes from utils.secure_db_client import get_remote_engine
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
@@ -21,8 +20,6 @@ if __name__ == '__main__':
print("uk站点已无ac类型词,退出执行!") print("uk站点已无ac类型词,退出执行!")
sys.exit(0) sys.exit(0)
hive_tb = f"ods_search_term_{st_type}"
if st_type in ["zr", "sp"]: if st_type in ["zr", "sp"]:
cols = "search_term,asin,page,page_row,created_time,updated_time,id" cols = "search_term,asin,page,page_row,created_time,updated_time,id"
elif st_type in ["sb", "tr"]: elif st_type in ["sb", "tr"]:
@@ -30,92 +27,48 @@ if __name__ == '__main__':
else: else:
cols = "search_term,asin,page,created_time,updated_time,id" cols = "search_term,asin,page,created_time,updated_time,id"
db_type = 'postgresql_14'
# 日期拆分
d1, d2 = CommonUtil.split_month_week_date(date_type, date_info) d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
if date_type == DateTypes.week.name: import_tb = f"{site_name}_search_term_rank_{st_type}_month_{d1}_{d2}"
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
if site_name == 'us' and date_info >= '2023-18':
db_type = 'postgresql'
# pg的分区周单位数是带0,如01、02、03
if date_info >= '2023-34':
db_type = 'postgresql_14'
import_tb = f"{site_name}_search_term_rank_{st_type}_{d1}_{d2}"
else:
db_type = 'postgresql_14'
import_tb = f"{site_name}_search_term_rank_{st_type}_{d1}_{d2}"
if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
if site_name in ['us', 'uk', 'de']:
db_type = 'postgresql_14'
# pg的分区单位数是带0,如01、02、03
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
import_tb = f"{site_name}_search_term_rank_{st_type}_month_{d1}_{d2}"
else :
print(f"其他站点{date_type}数据暂未明确,请检查是否dateType传输有误")
exit()
query = f""" query = f"""
select {cols} select {cols} from {import_tb} where 1 = 1 and \$CONDITIONS
from {import_tb}
where 1 = 1
and \$CONDITIONS
""" """
print(f"当前链接的数据库为:{db_type},同步的表为:{import_tb}") print(f"当前链接的数据库为:{db_type},同步的表为:{import_tb}")
hive_tb = f"ods_search_term_{st_type}"
partition_dict = { partition_dict = {
"site_name": site_name, "site_name": site_name,
"date_type": date_type, "date_type": date_type,
"date_info": date_info, "date_info": date_info,
} }
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict) hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
if st_type in ['er', 'tr']:
empty_flag = False
print(f"st_type类型为{st_type},符合不检测类型跳过检测!")
else:
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
site_name=site_name,
query=query,
hive_tb_name=hive_tb,
msg_usr=['fangxingjun','pengyanbing','chenyuanjie'],
partition_dict=partition_dict
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag: if st_type == "zr":
# zr的数据量较大,同步时进行多进程同步 if site_name == "us":
if st_type in ['zr']: map_num = 40
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path,
map_num=10,
key='id')
else: else:
sh = CommonUtil.build_import_sh(site_name=site_name, map_num = 15
db_type=db_type, elif st_type in ["sb", "sp"]:
query=query, if site_name == "us":
hdfs_path=hdfs_path) map_num = 6
else:
# 导入前先删除 map_num = 2
HdfsUtils.delete_hdfs_file(hdfs_path) else:
client = SSHUtil.get_ssh_client() map_num = 1
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb) engine = get_remote_engine(
client.close() site_name=site_name,
db_type=db_type
# 导入后检测--检测数据一致性 )
if date_type != 'month_week':
CommonUtil.check_import_sync_num(db_type=db_type, engine.sqoop_raw_import(
partition_dict=partition_dict, query=query,
import_query=query, hive_table=hive_tb,
hive_tb_name=hive_tb, hdfs_path=hdfs_path,
msg_usr=['fangxingjun','pengyanbing','chenyuanjie']) partitions=partition_dict,
m=map_num,
split_by='id'
)
pass
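The mapper count above is now chosen per search-term type and per site. The same selection, restated as a small helper purely for illustration (values copied from the new code):

def pick_map_num(st_type: str, site_name: str) -> int:
    # zr tables are the largest, so they get the most parallel mappers
    if st_type == "zr":
        return 40 if site_name == "us" else 15
    # sb / sp are mid-sized
    if st_type in ("sb", "sp"):
        return 6 if site_name == "us" else 2
    # remaining types (ac, er, tr, ...) fit in a single mapper
    return 1

print(pick_map_num("zr", "us"))   # 40
print(pick_map_num("sb", "de"))   # 2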
@@ -2,61 +2,42 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
hive_tb = "ods_self_asin" db_type = "mysql"
partition_dict = {
"site_name": site_name,
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
query = f""" query = f"""
select select
id, id,
asin, asin,
created_at as created_time, created_at as created_time,
updated_at as updated_time updated_at as updated_time
from {site_name}_self_asin from {site_name}_self_asin
where 1 = 1 where 1 = 1
and \$CONDITIONS and \$CONDITIONS
""" """
db_type = "mysql" hive_tb = "ods_self_asin"
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, partition_dict = {
site_name=site_name, "site_name": site_name,
query=query, }
hive_tb_name=hive_tb, hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
msg_usr=['chenyuanjie']
) engine = get_remote_engine(
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!" site_name=site_name,
db_type=db_type
if not empty_flag: )
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type, engine.sqoop_raw_import(
query=query, query=query,
hdfs_path=hdfs_path) hive_table=hive_tb,
# 导入前先删除 hdfs_path=hdfs_path,
HdfsUtils.delete_hdfs_file(hdfs_path) partitions=partition_dict
client = SSHUtil.get_ssh_client() )
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
# 导入后检测--检测数据一致性
CommonUtil.check_import_sync_num(db_type=db_type,
partition_dict=partition_dict,
import_query=query,
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie'])
pass
@@ -2,96 +2,89 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DbTypes from utils.db_util import DbTypes
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None) date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None) date_info = CommonUtil.get_sys_arg(3, None)
assert site_name is not None, "sitename 不能为空!" assert site_name is not None, "site_name 不能为空!"
assert date_info is not None, "date_info 不能为空!" assert date_info is not None, "date_info 不能为空!"
year = CommonUtil.reformat_date(date_info, "%Y-%m-%d", "%Y", )
hive_tb = "ods_self_asin_detail"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info,
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
db_type = DbTypes.postgresql.name
year = CommonUtil.reformat_date(date_info, "%Y-%m-%d", "%Y", )
query = f""" query = f"""
select select
asin, asin,
img_url, img_url,
title, title,
title_len, title_len,
price, price,
rating, rating,
total_comments, total_comments,
buy_box_seller_type, buy_box_seller_type,
page_inventory, page_inventory,
category, category,
volume, volume,
weight, weight,
rank, rank,
launch_time, launch_time,
video_url, video_url,
add_url, add_url,
material, material,
created_at, created_at,
img_num, img_num,
img_type, img_type,
qa_num, qa_num,
brand, brand,
ac_name, ac_name,
node_id, node_id,
sp_num, sp_num,
mpn, mpn,
online_time, online_time,
describe, describe,
one_star, one_star,
two_star, two_star,
three_star, three_star,
four_star, four_star,
five_star, five_star,
low_star, low_star,
asin_type, asin_type,
is_coupon, is_coupon,
search_category, search_category,
weight_str, weight_str,
account_name, account_name,
other_seller_name, other_seller_name,
account_id account_id
from {site_name}_self_asin_detail_{year} from {site_name}_self_asin_detail_{year}
where 1 = 1 where 1 = 1
and site = '{site_name}' and site = '{site_name}'
and bsr_date_info = '{date_info}' and bsr_date_info = '{date_info}'
and date_info >= '{date_info}' and date_info >= '{date_info}'
and \$CONDITIONS and \$CONDITIONS
""" """
print("sql ======================================================")
print(query) hive_tb = "ods_self_asin_detail"
db_type = DbTypes.postgresql.name partition_dict = {
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, "site_name": site_name,
site_name=site_name, "date_type": date_type,
query=query, "date_info": date_info,
hive_tb_name=hive_tb, }
msg_usr=['wujicang'] hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!" engine = get_remote_engine(
site_name=site_name,
db_type=db_type
)
engine.sqoop_raw_import(
query=query,
hive_table=hive_tb,
hdfs_path=hdfs_path,
partitions=partition_dict
)
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
# 导入前先删除
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
pass
@@ -3,22 +3,15 @@ import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
hive_table = "ods_self_asin_related_traffic"
partition_dict = {"site_name": site_name}
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
db_type = 'mysql' db_type = 'mysql'
import_table = f"{site_name}_self_asin_detail" import_table = f"{site_name}_self_asin_detail"
sql_query = f""" sql_query = f"""
select select
id, id,
@@ -37,17 +30,29 @@ if __name__ == '__main__':
and \$CONDITIONS and \$CONDITIONS
""" """
# 生成导出脚本 hive_table = "ods_self_asin_related_traffic"
import_sh = CommonUtil.build_import_sh( partition_dict = {
site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path, map_num=25, key='id' "site_name": site_name
}
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
if site_name == 'us':
map_num = 25
else:
map_num = 1
engine = get_remote_engine(
site_name=site_name,
db_type=db_type
)
engine.sqoop_raw_import(
query=sql_query,
hive_table=hive_table,
hdfs_path=hdfs_path,
partitions=partition_dict,
m=map_num,
split_by='id'
) )
# 导入前先删除原始hdfs数据 pass
HdfsUtils.delete_hdfs_file(hdfs_path)
# 创建ssh Client对象--用于执行cmd命令
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
# 创建lzo索引和修复元数据
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
# 关闭链接
client.close()
@@ -2,11 +2,10 @@ import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from utils.common_util import DateTypes from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
@@ -16,17 +15,6 @@ if __name__ == '__main__':
assert date_type is not None, "date_type 不能为空!" assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!" assert date_info is not None, "date_info 不能为空!"
hive_table = f"ods_seller_account_feedback"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
# 落表路径校验
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
suffix = str(date_info).replace("-", "_") suffix = str(date_info).replace("-", "_")
import_table = f"{site_name}_seller_account_feedback_{suffix}" import_table = f"{site_name}_seller_account_feedback_{suffix}"
if date_type == DateTypes.month.name and date_info >= '2023-08': if date_type == DateTypes.month.name and date_info >= '2023-08':
@@ -51,24 +39,24 @@ if __name__ == '__main__':
and \$CONDITIONS and \$CONDITIONS
""" """
# 进行schema和数据校验 hive_table = "ods_seller_account_feedback"
CommonUtil.check_schema_before_import(db_type=db_type, partition_dict = {
site_name=site_name, "site_name": site_name,
query=sql_query, "date_type": date_type,
hive_tb_name=hive_table, "date_info": date_info
msg_usr=['chenyuanjie']) }
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
engine = get_remote_engine(
site_name=site_name,
db_type=db_type
)
engine.sqoop_raw_import(
query=sql_query,
hive_table=hive_table,
hdfs_path=hdfs_path,
partitions=partition_dict
)
# 生成导出脚本 pass
import_sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=sql_query,
hdfs_path=hdfs_path)
# 导入前先删除原始hdfs数据
HdfsUtils.delete_hdfs_file(hdfs_path)
# 创建ssh Client对象--用于执行cmd命令
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
# 创建lzo索引和修复元数据
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
# 关闭链接
client.close()
"""
@Author : HuangJian
@Description : 各站点店铺名称与店铺id关系全量表--传参为单站点
@SourceTable : us_seller_account_feedback
@SinkTable : ods_seller_account_feedback
@CreateTime : 2022/05/19 14:55
@UpdateTime : 2022/05/19 14:55
"""
import os import os
import sys import sys
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__': if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None) site_name = CommonUtil.get_sys_arg(1, None)
assert site_name is not None, "site_name 不能为空!" assert site_name is not None, "site_name 不能为空!"
db_type = 'mysql'
hive_table = f"ods_seller_account_syn" import_table = f"{site_name}_seller_account_syn_distinct"
sql_query = f"""
select
id,
account_name,
url,
state,
created_at,
updated_at,
seller_id
from {import_table}
where 1=1
and \$CONDITIONS
"""
hive_table = "ods_seller_account_syn"
partition_dict = { partition_dict = {
"site_name": site_name "site_name": site_name
} }
# 落表路径校验
hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict) hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
print(f"hdfs_path is {hdfs_path}")
import_table = f"{site_name}_seller_account_syn_distinct" engine = get_remote_engine(
db_type = 'mysql' site_name=site_name,
db_type=db_type
)
sql_query = f"""
select
id,
account_name,
url,
state,
created_at,
updated_at,
seller_id
from {import_table}
where 1=1
and \$CONDITIONS
"""
# 进行schema和数据校验 engine.sqoop_raw_import(
CommonUtil.check_schema_before_import(db_type=db_type, query=sql_query,
site_name=site_name, hive_table=hive_table,
query=sql_query, hdfs_path=hdfs_path,
hive_tb_name=hive_table, partitions=partition_dict
msg_usr=['chenyuanjie']) )
# 生成导出脚本 pass
import_sh = CommonUtil.build_import_sh(site_name=site_name,
db_type=db_type,
query=sql_query,
hdfs_path=hdfs_path)
# 导入前先删除原始hdfs数据
HdfsUtils.delete_hdfs_file(hdfs_path)
# 创建ssh Client对象--用于执行cmd命令
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
# 创建lzo索引和修复元数据
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
# 关闭链接
client.close()
"""
@Author : HuangJian
@Description : 各站点店铺名称与asin关系全量表--传参为单站点
@SourceTable : us_seller_account_feedback
@SinkTable : ods_seller_account_feedback
@CreateTime : 2022/05/19 14:55
@UpdateTime : 2022/05/19 14:55
"""
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    db_type = 'mysql'
    import_table = f"{site_name}_seller_asin_account"
    sql_query = f"""
        select
            id,
            account_name,
            asin,
            created_at,
            updated_at,
            seller_id
        from {import_table}
        where 1=1
        and \$CONDITIONS
    """

    hive_table = "ods_seller_asin_account"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    if site_name == 'us':
        map_num = 100
    else:
        map_num = 40

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=sql_query,
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=map_num,
        split_by='id'
    )
    pass
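For reference, a partition dict like the one above is conventionally rendered as key=value path segments under the table's warehouse directory. The sketch below assumes that convention only; the warehouse root and helper name are placeholders, not the actual behaviour of CommonUtil.build_hdfs_path.

# Hypothetical helper showing the usual Hive partition-path layout; the warehouse root is a placeholder.
def render_partition_path(warehouse_root, hive_table, partition_dict):
    parts = "/".join(f"{k}={v}" for k, v in partition_dict.items())
    return f"{warehouse_root}/{hive_table}/{parts}"

print(render_partition_path("/user/hive/warehouse/selection.db", "ods_seller_asin_account", {"site_name": "us"}))
# -> /user/hive/warehouse/selection.db/ods_seller_asin_account/site_name=us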
"""
@Author : HuangJian
@Description  : Seller asin detail table for each site -- crawled monthly
@SourceTable : us_asin_detail_product_2023
@SinkTable : ods_asin_detail_product
@CreateTime : 2022/05/19 14:55
@UpdateTime : 2022/05/19 14:55
"""
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
@@ -29,21 +17,8 @@ if __name__ == '__main__':
    # the table is now synced monthly, so validate that the date type is month
    assert date_type == DateTypes.month.name, "date_type类型不对,应为month"

    suffix = str(date_info).replace("-", "_")
    import_table = f"{site_name}_seller_asin_product_{suffix}"
    if date_type == DateTypes.month.name and date_info >= '2023-08':
        db_type = 'postgresql_14'
    else:
@@ -51,44 +26,51 @@ if __name__ == '__main__':
    print("当前链接的数据库为:", db_type)
    sql_query = f"""
        select
            id,
            null as account_id,
            asin,
            title,
            img_url,
            price,
            rating,
            total_comments,
            null as week,
            row_num,
            created_at,
            updated_at,
            null as month,
            seller_id
        from {import_table}
        where 1=1
        and \$CONDITIONS
    """
    hive_table = "ods_asin_detail_product"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    if site_name == 'us':
        map_num = 8
    else:
        map_num = 3

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=sql_query,
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=map_num,
        split_by='id'
    )
    pass
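A quick worked example of how the monthly source table name and database type are derived above. The date value is illustrative, and the pre-2023-08 branch (elided in this diff) is assumed to fall back to the older PostgreSQL instance.

date_info = "2023-08"                                  # example value
suffix = str(date_info).replace("-", "_")              # "2023_08"
import_table = f"us_seller_asin_product_{suffix}"      # "us_seller_asin_product_2023_08"
# string comparison works because date_info is zero-padded YYYY-MM
db_type = 'postgresql_14' if date_info >= '2023-08' else 'postgresql'  # else-branch value assumed
print(import_table, db_type)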
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine
if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
@@ -17,75 +14,51 @@ if __name__ == '__main__':
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    db_type = 'postgresql_14'
    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
    import_table = f"{site_name}_brand_analytics_month_{d1}_{d2}"
    sql_query = f"""
        select
            id,
            search_term,
            quantity_being_sold,
            date_info as date_flag,
            created_time,
            updated_time,
            quantity_being_sold_str,
            result_count,
            departments
        from {import_table}
        where 1=1
        and \$CONDITIONS
    """

    hive_table = "ods_st_quantity_being_sold"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    if site_name == 'us':
        map_num = 4
    else:
        map_num = 1

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=sql_query,
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=map_num,
        split_by='id'
    )
    pass
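The month part has to be zero-padded because the PostgreSQL partition tables use two-digit months (01, 02, 03, ...). A small example, assuming CommonUtil.split_month_week_date returns the year and month as separate values; the numbers are illustrative.

d1, d2 = 2024, 3                                     # assumed return shape of split_month_week_date
d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'           # pad single-digit months: 3 -> "03"
import_table = f"us_brand_analytics_month_{d1}_{d2}"
print(import_table)                                  # us_brand_analytics_month_2024_03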
@@ -2,62 +2,41 @@ import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    db_type = "mysql"
    import_tb = f"{site_name}_theme"
    cols = "id, theme_type_en, theme_type_ch, theme_en, theme_ch, created_at, updated_at"
    query = f"""
        select
            {cols}
        from {import_tb}
        where 1 = 1
        and \$CONDITIONS
    """

    hive_tb = "ods_theme"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
# author : wangrui
# date : 2023/3/9 15:50
from elasticsearch import Elasticsearch
@@ -27,7 +25,7 @@ class EsUtils(object):
# get elasticsearch-related configuration
@staticmethod
def get_es_options(es_index_name, pipeline_id):
    return {
        "es.nodes": EsUtils.__es_ip__,
        "es.port": EsUtils.__es_port__,
@@ -40,7 +38,8 @@ class EsUtils(object):
        "es.batch.size.entries": "5000",
        "es.nodes.wan.only": "false",
        "es.batch.write.concurrency": "30",
        "es.write.operation": "index",
        "es.ingest.pipeline": f"{pipeline_id}"
    }
# get index configuration info from elasticsearch
@@ -487,7 +486,6 @@ class EsUtils(object):
    index_name_list = list(alias_info.keys())
    return index_name_list

# delete an index alias
@staticmethod
def delete_index_alias(alias_name, client):
@@ -500,7 +498,79 @@ class EsUtils(object):
    else:
        print("索引别名不存在!")
@staticmethod
def user_enrich_pipeline(client, pipeline_id, policy_name1, policy_name2):
pipeline_body = {
"description": "asin flow user mask pipeline",
"processors": [
{
"enrich": {
"policy_name": f"{policy_name1}",
"field": "asin",
"target_field": "policy_add_1",
"max_matches": 1,
"ignore_missing": True
},
},
{
"enrich": {
"policy_name": f"{policy_name2}",
"field": "category_id",
"target_field": "policy_add_2",
"max_matches": 1,
"ignore_missing": True
},
},
{
"set": {
"field": "usr_mask_type",
"value": "{{policy_add_1.usr_mask_type}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "usr_mask_progress",
"value": "{{policy_add_1.usr_mask_progress}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "package_quantity",
"value": "{{policy_add_1.package_quantity}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "usr_mask_type",
"value": "{{policy_add_2.usr_mask_type}}",
"ignore_empty_value": True
}
},
{
"remove": {
"field": "policy_add_1",
"ignore_missing": True
}
},
{
"remove": {
"field": "policy_add_2",
"ignore_missing": True
}
},
{
"convert": {
"field": "package_quantity",
"type": "integer",
"ignore_missing": True
}
}
]
}
client.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)
if __name__ == '__main__':
    pass
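A hedged usage sketch of the two changed pieces above: register the enrich pipeline once, then pass its id into get_es_options so every document written through the elasticsearch-hadoop connector is routed through it. The Elasticsearch address, pipeline id, enrich policy names, index name and the module path of EsUtils are placeholders; the connector format string and option keys are the standard elasticsearch-hadoop ones.

from pyspark.sql import SparkSession
from elasticsearch import Elasticsearch
from utils.es_util import EsUtils  # module path assumed

spark = SparkSession.builder.appName("es_pipeline_demo").getOrCreate()
df = spark.createDataFrame([("B00EXAMPLE", "12345")], ["asin", "category_id"])

client = Elasticsearch("http://127.0.0.1:9200")        # placeholder address
pipeline_id = "asin_flow_user_mask_pipeline"           # placeholder id
EsUtils.user_enrich_pipeline(client, pipeline_id,
                             "asin_user_mask_policy",       # placeholder enrich policy names
                             "category_user_mask_policy")

es_options = EsUtils.get_es_options("us_asin_flow_demo_index", pipeline_id)
df.write.format("org.elasticsearch.spark.sql") \
    .options(**es_options) \
    .mode("append") \
    .save("us_asin_flow_demo_index")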
@@ -680,13 +680,14 @@ def udf_extract_weight_format(weight_str: str):
# category extraction - returns: level-1/current category id + level-1/current category rank
# see dim_asin_bs_info.py for usage
def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern,
                          cate_1_pattern, node_id):
    """
    asin_bs_sellers_rank_lower: category string from the bottom of the page
    last_herf: link of the last category level
    all_best_sellers_href: all category links
    cate_current_pattern: match pattern for the current-category rank
    cate_1_pattern: match pattern for the level-1 category rank
    node_id: category id scraped from the page header
    """
    # if (site_name == 'us' and date_type in ['month', 'month_week'] and date_info >= '2023-11') or (site_name != 'us' and date_type in ['week'] and date_info >= '2023-41'):
@@ -711,7 +712,43 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern,
                break

    # 2. parse level-1 and current category + rank
    # 2.1 first check whether node_id appears in href_list
    cate_1_id, cate_current_id, cate_1_rank, cate_current_rank = None, None, None, None
    if node_id and len(href_list) > 1:
        node_id_str = str(node_id)
        matched_idx = None
        for i, href in enumerate(href_list):
            if node_id_str in href:  # check whether node_id appears in the url
                matched_idx = i
                break
        if matched_idx is not None:
            # extract the corresponding category id
            cate_current_id = re.findall('bestsellers/(.*)/ref', href_list[matched_idx])
            cate_current_id = cate_current_id[0].split("/")[-1] if cate_current_id else None
            # the level-1 category is still taken from the first link
            cate_1_id = re.findall('bestsellers/(.*)/ref', href_list[0])
            cate_1_id = cate_1_id[0].split("/")[0] if cate_1_id else None
            # parse the ranks
            if asin_bs_sellers_rank_lower is not None:
                asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
            else:
                asin_bs_sellers_rank_lower2 = ''
            rank_list = re.findall(cate_current_pattern, asin_bs_sellers_rank_lower2)
            rank_list = [int(rank) for rank in rank_list]
            # if rank_list aligns with href_list, take the rank at the matched position
            if matched_idx < len(rank_list):
                cate_current_rank = rank_list[matched_idx]
            # level-1 category rank
            if rank_list and cate_1_pattern in asin_bs_sellers_rank_lower:
                cate_1_rank = rank_list[0]
            return cate_1_id, cate_current_id, cate_1_rank, cate_current_rank
    # 2.2 extract the categories
    if href_list:
        if len(href_list) == 1:
            cate_list = re.findall('bestsellers/(.*)/ref', href_list[0])
@@ -735,7 +772,7 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern,
        else:
            cate_1_id, cate_current_id = None, None

    # 2.3 extract the ranks
    if asin_bs_sellers_rank_lower is not None:
        asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
    else:
...
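A small worked example of the node_id short-circuit added above, using an illustrative href list and node id; the regex and split logic are exactly those used in the function.

import re

href_list = [
    "https://www.amazon.com/bestsellers/toys-and-games/ref=zg_bs_nav_0",
    "https://www.amazon.com/bestsellers/toys-and-games/2552430011/ref=zg_bs_nav_1",
]
node_id = "2552430011"                                  # illustrative page-header category id

matched_idx = next((i for i, href in enumerate(href_list) if node_id in href), None)  # -> 1
cate_current_id = re.findall('bestsellers/(.*)/ref', href_list[matched_idx])[0].split("/")[-1]
cate_1_id = re.findall('bestsellers/(.*)/ref', href_list[0])[0].split("/")[0]
print(cate_1_id, cate_current_id)                       # toys-and-games 2552430011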