Commit 281b9289 by chenyuanjie

asin信息库流程调整

parent d32c0830
......@@ -7,7 +7,7 @@ from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from utils.es_util import EsUtils
from utils.db_util import DBUtil
from datetime import datetime, timedelta
from datetime import datetime
from pyspark.sql import functions as F
......@@ -19,15 +19,8 @@ class EsAiAsinAdd(object):
self.date_info = date_info
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
if self.site_name == 'us':
self.pg_tb = "ai_asin_analyze_detail"
else:
self.pg_tb = f"{self.site_name}_ai_asin_analyze_detail"
launch_time_base_date = self.spark.sql(
f"""SELECT max(`date`) AS last_day FROM dim_date_20_to_30 WHERE year_month = '{self.date_info}'"""
).collect()[0]['last_day']
self.launch_time_interval_dict = self.get_launch_time_interval_dict(launch_time_base_date)
self.pg_conn = DBUtil.get_connection_info("postgresql", "us")
self.export_pg_tb = f"{self.site_name}_ai_asin_detail_month_{self.date_info.replace('-', '_')}"
self.es_client = EsUtils.get_es_client()
self.es_index = f"{self.site_name}_ai_asin_analyze_detail_{self.date_info.replace('-', '_')}"
......@@ -36,19 +29,8 @@ class EsAiAsinAdd(object):
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_analyze = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
@staticmethod
def get_launch_time_interval_dict(base_date):
base_date = datetime.strptime(base_date, '%Y-%m-%d')
return {
"one_month": (base_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (base_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (base_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (base_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (base_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (base_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
}
self.df_save_pg = self.spark.sql(f"select 1+1;")
self.df_save_es = self.spark.sql(f"select 1+1;")
@staticmethod
def get_es_options(index_name, pipeline_id):
......@@ -105,7 +87,12 @@ class EsAiAsinAdd(object):
bought_month_mom,
bought_month_yoy,
is_new_flag,
is_ascending_flag
is_ascending_flag,
review_json_list,
launch_time_type,
describe,
product_json,
product_detail_json
from dwt_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
......@@ -140,14 +127,13 @@ class EsAiAsinAdd(object):
array_to_string(package_quantity_arr, ',') as package_quantity_arr,
package_quantity_flag,
label_content
from {self.pg_tb}
from {self.site_name}_ai_asin_analyze_detail
"""
conn_info = DBUtil.get_connection_info("postgresql", "us")
self.df_ai_asin_analyze = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
url=self.pg_conn["url"],
pwd=self.pg_conn["pwd"],
username=self.pg_conn["username"],
query=sql2
).withColumn(
'package_quantity_arr', F.split(F.col('package_quantity_arr'), ',')
......@@ -158,83 +144,48 @@ class EsAiAsinAdd(object):
self.df_ai_asin_analyze.show(10, True)
def handle_data(self):
# Backfill the launch_time_type field: bucket each ASIN by how long before the
# month-end base date it was launched (1 = within 30 days ... 7 = older than
# 36 months). Cutoffs were precomputed in __init__ via get_launch_time_interval_dict.
one_month = self.launch_time_interval_dict['one_month']
three_month = self.launch_time_interval_dict['three_month']
six_month = self.launch_time_interval_dict['six_month']
twelve_month = self.launch_time_interval_dict['twelve_month']
twenty_four_month = self.launch_time_interval_dict['twenty_four_month']
thirty_six_month = self.launch_time_interval_dict['thirty_six_month']
# Spark SQL CASE expression. The cutoffs are ISO 'YYYY-MM-DD' strings, so
# lexicographic comparison is chronological — assumes launch_time is also
# stored as 'YYYY-MM-DD' (TODO confirm against the dwt table schema).
# A NULL launch_time fails every WHEN and falls through to ELSE 0.
expr_str = f"""
CASE WHEN launch_time >= '{one_month}' THEN 1
WHEN launch_time >= '{three_month}' AND launch_time < '{one_month}' THEN 2
WHEN launch_time >= '{six_month}' AND launch_time < '{three_month}' THEN 3
WHEN launch_time >= '{twelve_month}' AND launch_time < '{six_month}' THEN 4
WHEN launch_time >= '{twenty_four_month}' AND launch_time < '{twelve_month}' THEN 5
WHEN launch_time >= '{thirty_six_month}' AND launch_time < '{twenty_four_month}' THEN 6
WHEN launch_time < '{thirty_six_month}' THEN 7
ELSE 0 END
"""
self.df_ai_asin_detail = self.df_ai_asin_detail.withColumn('launch_time_type', F.expr(expr_str))
# NOTE(review): this span is diff residue — both the pre-change and post-change
# variants of the same statements are interleaved below. Only one variant of
# each (the df_save_pg / compact-select side, per the commit) should survive.
def save_data(self):
# NOTE(review): duplicate assignment heads — old name df_save vs new df_save_pg;
# only one should remain after the merge.
self.df_save = self.df_ai_asin_detail.join(
self.df_save_pg = self.df_ai_asin_detail.join(
# left_anti keeps only ASINs absent from the analyze table — i.e. the increment
# that still needs to be exported to PostgreSQL.
self.df_ai_asin_analyze, 'asin', 'left_anti'
).select(
'site_name', 'asin', 'weight', 'bought_month', 'category', 'img', 'title', 'brand', 'account_name',
'account_addr', 'buy_box_seller_type', 'launch_time', 'img_num', 'variation_flag', 'variation_num',
'ao_val', 'category_id', 'category_current_id', 'parent_asin', 'bsr_rank', 'price', 'rating',
'total_comments', 'seller_id', 'fb_country_name', 'review_json_list', 'launch_time_type', 'describe',
'product_json', 'product_detail_json', 'bought_month_mom', 'bought_month_yoy', 'is_new_flag',
'is_ascending_flag'
)
# inner join keeps ASINs that DO have analyze rows — the enriched set bound for ES.
self.df_save_es = self.df_ai_asin_detail.join(
self.df_ai_asin_analyze, 'asin', 'inner'
).select(
# NOTE(review): the one-column-per-line list below (ending in .cache()) and the
# compact list after it are two variants of the same select from the diff;
# the column sets appear identical — keep exactly one.
'account_addr',
'account_name',
'analyze_id',
'ao_val',
'appearance',
'asin',
'bought_month',
'bought_month_mom',
'bought_month_yoy',
'brand',
'bsr_rank',
'buy_box_seller_type',
'category',
'category_current_id',
'category_id',
'color',
'crowd',
'fb_country_name',
'function',
'img',
'img_num',
'is_ascending_flag',
'is_new_flag',
'label_content',
'launch_time',
'launch_time_type',
'material',
'package_quantity',
'package_quantity_arr',
'package_quantity_flag',
'parent_asin',
'price',
'rating',
'scene_comment',
'scene_title',
'seller_id',
'shape',
'short_desc',
'site_name',
'size',
'theme',
'title',
'title_pic_content',
'title_pic_flag',
'title_word_content',
'title_word_flag',
'total_comments',
'uses',
'variation_flag',
'variation_num',
'weight'
).cache()
'account_addr', 'account_name', 'analyze_id', 'ao_val', 'appearance', 'asin', 'bought_month',
'bought_month_mom', 'bought_month_yoy', 'brand', 'bsr_rank', 'buy_box_seller_type', 'category',
'category_current_id', 'category_id', 'color', 'crowd', 'fb_country_name', 'function', 'img',
'img_num', 'is_ascending_flag', 'is_new_flag', 'label_content', 'launch_time', 'launch_time_type',
'material', 'package_quantity', 'package_quantity_arr', 'package_quantity_flag', 'parent_asin',
'price', 'rating', 'scene_comment', 'scene_title', 'seller_id', 'shape', 'short_desc', 'site_name',
'size', 'theme', 'title', 'title_pic_content', 'title_pic_flag', 'title_word_content',
'title_word_flag', 'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight'
)
# Exports the two result frames: the new-ASIN increment to PostgreSQL, then the
# enriched set to Elasticsearch. Each export is wrapped in a best-effort try with
# a WeCom (wx) alert on success/failure instead of aborting the job.
def save_data(self):
# Export the newly added ASINs to Jicang (济苍)
try:
self.df_save_pg.write.format("jdbc") \
.option("url", self.pg_conn["url"]) \
.option("dbtable", f"{self.export_pg_tb}") \
.option("user", self.pg_conn["username"]) \
.option("password", self.pg_conn["pwd"]) \
.mode("append") \
.save()
CommonUtil.send_wx_msg(['wujicang', 'chenyuanjie'], 'ASIN信息库增量数据导出', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}')
except Exception as e:
# NOTE(review): misleading message — this handler guards the PostgreSQL/JDBC
# export, not Elasticsearch. The wx alert below does report it correctly.
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ASIN信息库增量数据导出失败', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}')
# Export the incremental ASINs to ES
try:
# NOTE(review): duplicate write lines — old name df_save vs new df_save_es;
# only one should remain after the merge. The matching except clause for this
# try lies beyond this excerpt — verify it exists in the full file.
self.df_save.write.format("org.elasticsearch.spark.sql") \
self.df_save_es.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment