Commit 281b9289 by chenyuanjie

asin信息库流程调整

parent d32c0830
......@@ -7,7 +7,7 @@ from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from utils.es_util import EsUtils
from utils.db_util import DBUtil
from datetime import datetime, timedelta
from datetime import datetime
from pyspark.sql import functions as F
......@@ -19,15 +19,8 @@ class EsAiAsinAdd(object):
self.date_info = date_info
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
if self.site_name == 'us':
self.pg_tb = "ai_asin_analyze_detail"
else:
self.pg_tb = f"{self.site_name}_ai_asin_analyze_detail"
launch_time_base_date = self.spark.sql(
f"""SELECT max(`date`) AS last_day FROM dim_date_20_to_30 WHERE year_month = '{self.date_info}'"""
).collect()[0]['last_day']
self.launch_time_interval_dict = self.get_launch_time_interval_dict(launch_time_base_date)
self.pg_conn = DBUtil.get_connection_info("postgresql", "us")
self.export_pg_tb = f"{self.site_name}_ai_asin_detail_month_{self.date_info.replace('-', '_')}"
self.es_client = EsUtils.get_es_client()
self.es_index = f"{self.site_name}_ai_asin_analyze_detail_{self.date_info.replace('-', '_')}"
......@@ -36,19 +29,8 @@ class EsAiAsinAdd(object):
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_analyze = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
@staticmethod
def get_launch_time_interval_dict(base_date):
base_date = datetime.strptime(base_date, '%Y-%m-%d')
return {
"one_month": (base_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (base_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (base_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (base_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (base_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (base_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
}
self.df_save_pg = self.spark.sql(f"select 1+1;")
self.df_save_es = self.spark.sql(f"select 1+1;")
@staticmethod
def get_es_options(index_name, pipeline_id):
......@@ -105,7 +87,12 @@ class EsAiAsinAdd(object):
bought_month_mom,
bought_month_yoy,
is_new_flag,
is_ascending_flag
is_ascending_flag,
review_json_list,
launch_time_type,
describe,
product_json,
product_detail_json
from dwt_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
......@@ -140,14 +127,13 @@ class EsAiAsinAdd(object):
array_to_string(package_quantity_arr, ',') as package_quantity_arr,
package_quantity_flag,
label_content
from {self.pg_tb}
from {self.site_name}_ai_asin_analyze_detail
"""
conn_info = DBUtil.get_connection_info("postgresql", "us")
self.df_ai_asin_analyze = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
url=self.pg_conn["url"],
pwd=self.pg_conn["pwd"],
username=self.pg_conn["username"],
query=sql2
).withColumn(
'package_quantity_arr', F.split(F.col('package_quantity_arr'), ',')
......@@ -158,83 +144,48 @@ class EsAiAsinAdd(object):
self.df_ai_asin_analyze.show(10, True)
def handle_data(self):
# Backfill the launch_time_type field: bucket each ASIN by how long before the
# month-end base date it was launched (1 = within 30 days ... 7 = older than
# 36 months). Cutoffs were precomputed in __init__ via get_launch_time_interval_dict.
one_month = self.launch_time_interval_dict['one_month']
three_month = self.launch_time_interval_dict['three_month']
six_month = self.launch_time_interval_dict['six_month']
twelve_month = self.launch_time_interval_dict['twelve_month']
twenty_four_month = self.launch_time_interval_dict['twenty_four_month']
thirty_six_month = self.launch_time_interval_dict['thirty_six_month']
# Spark SQL CASE expression. The cutoffs are ISO 'YYYY-MM-DD' strings, so
# lexicographic comparison is chronological — assumes launch_time is also
# stored as 'YYYY-MM-DD' (TODO confirm against the dwt table schema).
# A NULL launch_time fails every WHEN and falls through to ELSE 0.
expr_str = f"""
CASE WHEN launch_time >= '{one_month}' THEN 1
WHEN launch_time >= '{three_month}' AND launch_time < '{one_month}' THEN 2
WHEN launch_time >= '{six_month}' AND launch_time < '{three_month}' THEN 3
WHEN launch_time >= '{twelve_month}' AND launch_time < '{six_month}' THEN 4
WHEN launch_time >= '{twenty_four_month}' AND launch_time < '{twelve_month}' THEN 5
WHEN launch_time >= '{thirty_six_month}' AND launch_time < '{twenty_four_month}' THEN 6
WHEN launch_time < '{thirty_six_month}' THEN 7
ELSE 0 END
"""
self.df_ai_asin_detail = self.df_ai_asin_detail.withColumn('launch_time_type', F.expr(expr_str))
# NOTE(review): this span is diff residue — both the pre-change and post-change
# variants of the same statements are interleaved below. Only one variant of
# each (the df_save_pg / compact-select side, per the commit) should survive.
def save_data(self):
# NOTE(review): duplicate assignment heads — old name df_save vs new df_save_pg;
# only one should remain after the merge.
self.df_save = self.df_ai_asin_detail.join(
self.df_save_pg = self.df_ai_asin_detail.join(
# left_anti keeps only ASINs absent from the analyze table — i.e. the increment
# that still needs to be exported to PostgreSQL.
self.df_ai_asin_analyze, 'asin', 'left_anti'
).select(
'site_name', 'asin', 'weight', 'bought_month', 'category', 'img', 'title', 'brand', 'account_name',
'account_addr', 'buy_box_seller_type', 'launch_time', 'img_num', 'variation_flag', 'variation_num',
'ao_val', 'category_id', 'category_current_id', 'parent_asin', 'bsr_rank', 'price', 'rating',
'total_comments', 'seller_id', 'fb_country_name', 'review_json_list', 'launch_time_type', 'describe',
'product_json', 'product_detail_json', 'bought_month_mom', 'bought_month_yoy', 'is_new_flag',
'is_ascending_flag'
)
# inner join keeps ASINs that DO have analyze rows — the enriched set bound for ES.
self.df_save_es = self.df_ai_asin_detail.join(
self.df_ai_asin_analyze, 'asin', 'inner'
).select(
# NOTE(review): the one-column-per-line list below (ending in .cache()) and the
# compact list after it are two variants of the same select from the diff;
# the column sets appear identical — keep exactly one.
'account_addr',
'account_name',
'analyze_id',
'ao_val',
'appearance',
'asin',
'bought_month',
'bought_month_mom',
'bought_month_yoy',
'brand',
'bsr_rank',
'buy_box_seller_type',
'category',
'category_current_id',
'category_id',
'color',
'crowd',
'fb_country_name',
'function',
'img',
'img_num',
'is_ascending_flag',
'is_new_flag',
'label_content',
'launch_time',
'launch_time_type',
'material',
'package_quantity',
'package_quantity_arr',
'package_quantity_flag',
'parent_asin',
'price',
'rating',
'scene_comment',
'scene_title',
'seller_id',
'shape',
'short_desc',
'site_name',
'size',
'theme',
'title',
'title_pic_content',
'title_pic_flag',
'title_word_content',
'title_word_flag',
'total_comments',
'uses',
'variation_flag',
'variation_num',
'weight'
).cache()
'account_addr', 'account_name', 'analyze_id', 'ao_val', 'appearance', 'asin', 'bought_month',
'bought_month_mom', 'bought_month_yoy', 'brand', 'bsr_rank', 'buy_box_seller_type', 'category',
'category_current_id', 'category_id', 'color', 'crowd', 'fb_country_name', 'function', 'img',
'img_num', 'is_ascending_flag', 'is_new_flag', 'label_content', 'launch_time', 'launch_time_type',
'material', 'package_quantity', 'package_quantity_arr', 'package_quantity_flag', 'parent_asin',
'price', 'rating', 'scene_comment', 'scene_title', 'seller_id', 'shape', 'short_desc', 'site_name',
'size', 'theme', 'title', 'title_pic_content', 'title_pic_flag', 'title_word_content',
'title_word_flag', 'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight'
)
# Exports the two result frames: the new-ASIN increment to PostgreSQL, then the
# enriched set to Elasticsearch. Each export is wrapped in a best-effort try with
# a WeCom (wx) alert on success/failure instead of aborting the job.
def save_data(self):
# Export the newly added ASINs to Jicang (济苍)
try:
self.df_save_pg.write.format("jdbc") \
.option("url", self.pg_conn["url"]) \
.option("dbtable", f"{self.export_pg_tb}") \
.option("user", self.pg_conn["username"]) \
.option("password", self.pg_conn["pwd"]) \
.mode("append") \
.save()
CommonUtil.send_wx_msg(['wujicang', 'chenyuanjie'], 'ASIN信息库增量数据导出', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}')
except Exception as e:
# NOTE(review): misleading message — this handler guards the PostgreSQL/JDBC
# export, not Elasticsearch. The wx alert below does report it correctly.
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ASIN信息库增量数据导出失败', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}')
# Export the incremental ASINs to ES
try:
# NOTE(review): duplicate write lines — old name df_save vs new df_save_es;
# only one should remain after the merge. The matching except clause for this
# try lies beyond this excerpt — verify it exists in the full file.
self.df_save.write.format("org.elasticsearch.spark.sql") \
self.df_save_es.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment