Commit 7ad606c5 by wangjing

Merge branch 'developer' of http://47.106.101.75/abel_cjy/Amazon-Selection-Data into developer

parents 14af7a9b 0f05969c
...@@ -159,7 +159,8 @@ class DimAsinDetail(object): ...@@ -159,7 +159,8 @@ class DimAsinDetail(object):
REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type, REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type,
customer_reviews_json, parent_asin, img_list, created_at as created_time, updated_at as updated_time, customer_reviews_json, parent_asin, img_list, created_at as created_time, updated_at as updated_time,
updated_at as dt, variat_num as variation_num, fbm_delivery_price as asin_fbm_price, updated_at as dt, variat_num as variation_num, fbm_delivery_price as asin_fbm_price,
get_json_object(product_json, '$.Color') as product_json_color get_json_object(product_json, '$.Color') as product_json_color,
get_json_object(product_json, '$.Number of Items') as product_json_number_of_items
from ods_asin_detail where site_name='{self.site_name}' {self.date_sql}""" from ods_asin_detail where site_name='{self.site_name}' {self.date_sql}"""
print(sql) print(sql)
self.df_asin_detail = self.spark.sql(sqlQuery=sql) self.df_asin_detail = self.spark.sql(sqlQuery=sql)
...@@ -348,15 +349,23 @@ class DimAsinDetail(object): ...@@ -348,15 +349,23 @@ class DimAsinDetail(object):
).withColumn( ).withColumn(
"variat_package_quantity_is_abnormal", self.df_asin_detail.variat_parse.getField("is_package_quantity_abnormal") "variat_package_quantity_is_abnormal", self.df_asin_detail.variat_parse.getField("is_package_quantity_abnormal")
).drop("title_parse", "variat_parse", "variat_attribute") ).drop("title_parse", "variat_parse", "variat_attribute")
# Number of Items:直接从 product_json 提取,cast 失败(脏数据)自动为 null
self.df_asin_detail = self.df_asin_detail.withColumn( self.df_asin_detail = self.df_asin_detail.withColumn(
"package_quantity", F.expr(""" CASE "number_of_items", F.col("product_json_number_of_items").cast("int")
WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity ).drop("product_json_number_of_items")
WHEN title_package_quantity is not null THEN title_package_quantity # 优先级:Number of Items > 属性字段 > 标题解析 > 默认1
self.df_asin_detail = self.df_asin_detail.withColumn(
"package_quantity", F.expr("""CASE
WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN number_of_items
WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity
WHEN title_package_quantity IS NOT NULL THEN title_package_quantity
ELSE 1 END""")).withColumn( ELSE 1 END""")).withColumn(
"is_package_quantity_abnormal", F.expr("""CASE "is_package_quantity_abnormal", F.expr("""CASE
WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity_is_abnormal WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN 0
WHEN title_package_quantity is not null THEN title_package_quantity_is_abnormal WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity_is_abnormal
ELSE 2 END""")).drop("title_package_quantity", "variat_package_quantity", "title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal") WHEN title_package_quantity IS NOT NULL THEN title_package_quantity_is_abnormal
ELSE 2 END""")).drop("number_of_items", "title_package_quantity", "variat_package_quantity",
"title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal")
self.df_asin_detail = self.df_asin_detail.join(self.df_user_package_num, on=['asin', 'asin_title'], how='left') self.df_asin_detail = self.df_asin_detail.join(self.df_user_package_num, on=['asin', 'asin_title'], how='left')
self.df_asin_detail = self.df_asin_detail.withColumn( self.df_asin_detail = self.df_asin_detail.withColumn(
"package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))).withColumn( "package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))).withColumn(
......
...@@ -99,7 +99,8 @@ class DwtFbBaseReport(object): ...@@ -99,7 +99,8 @@ class DwtFbBaseReport(object):
cur_fd.fb_crawl_date, cur_fd.fb_crawl_date,
round((count_30_day_num - last_30_day_num) / last_30_day_num, 4) as count_30_day_rate, round((count_30_day_num - last_30_day_num) / last_30_day_num, 4) as count_30_day_rate,
round((count_1_year_num - last_1_year_num) / last_1_year_num, 4) as count_1_year_rate, round((count_1_year_num - last_1_year_num) / last_1_year_num, 4) as count_1_year_rate,
round((count_lifetime_num - last_lifetime_num) / last_lifetime_num, 4) as count_life_time_rate round((count_lifetime_num - last_lifetime_num) / last_lifetime_num, 4) as count_life_time_rate,
seller_rating
from from
( (
select select
......
...@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates): ...@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates):
CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1 CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1
WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1 WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1
ELSE 0 END""")).drop("hide_flag") ELSE 0 END""")).drop("hide_flag")
self.df_asin_detail = self.df_asin_detail.withColumn("asin_is_need", F.expr(""" # 解析 asin_category_desc 取 › 分隔的第一个元素作为补充分类名称
CASE WHEN category_first_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1 self.df_asin_detail = self.df_asin_detail.withColumn(
WHEN asin NOT LIKE 'B0%' THEN 1 "desc_category_first_name",
F.lower(F.trim(F.split(F.col("asin_category_desc"), "›").getItem(0)))
)
# 读取 Hive 分类维表,获取分类名称与ID的对应关系
sql_dim = f"""
select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
"""
df_bsr_category = F.broadcast(self.spark.sql(sqlQuery=sql_dim))
# join 补充分类ID
self.df_asin_detail = self.df_asin_detail.join(df_bsr_category, on=['desc_category_first_name'], how='left')
# 两个分类ID均在过滤列表中才标记为1
need_categories = "('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
self.df_asin_detail = self.df_asin_detail.withColumn("asin_is_need", F.expr(f"""
CASE WHEN category_first_id in {need_categories}
AND desc_category_first_id in {need_categories} THEN 1
WHEN asin NOT LIKE 'B0%' THEN 1
ELSE 0 END""")) ELSE 0 END"""))
self.df_asin_detail = self.df_asin_detail.drop("desc_category_first_name", "desc_category_first_id")
self.df_asin_detail = self.df_asin_detail.withColumn("asin_type", F.expr(""" self.df_asin_detail = self.df_asin_detail.withColumn("asin_type", F.expr("""
CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END""" CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END"""
)).drop("asin_is_self", "asin_is_need", "asin_is_hide") )).drop("asin_is_self", "asin_is_need", "asin_is_hide")
......
...@@ -154,7 +154,7 @@ class DwtNsrAsinDetail(object): ...@@ -154,7 +154,7 @@ class DwtNsrAsinDetail(object):
select asin, select asin,
asin_ao_val as ao_val, asin_ao_val as ao_val,
bsr_orders as bsr_orders, bsr_orders as bsr_orders,
asin_bsr_orders_change as bsr_orders_change, asin_bsr_orders_mom as bsr_orders_change,
asin_air_freight_gross_margin as asin_air_freight_gross_margin, asin_air_freight_gross_margin as asin_air_freight_gross_margin,
asin_ocean_freight_gross_margin as asin_ocean_freight_gross_margin, asin_ocean_freight_gross_margin as asin_ocean_freight_gross_margin,
cast(asin_bought_month as int ) as asin_bought_month cast(asin_bought_month as int ) as asin_bought_month
......
...@@ -29,6 +29,7 @@ class EsAiAsinAdd(object): ...@@ -29,6 +29,7 @@ class EsAiAsinAdd(object):
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;") self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_analyze = self.spark.sql(f"select 1+1;") self.df_ai_asin_analyze = self.spark.sql(f"select 1+1;")
self.df_profit_rate = self.spark.sql(f"select 1+1;")
self.df_save_pg = self.spark.sql(f"select 1+1;") self.df_save_pg = self.spark.sql(f"select 1+1;")
self.df_save_es = self.spark.sql(f"select 1+1;") self.df_save_es = self.spark.sql(f"select 1+1;")
...@@ -126,7 +127,10 @@ class EsAiAsinAdd(object): ...@@ -126,7 +127,10 @@ class EsAiAsinAdd(object):
title_word_content, title_word_content,
array_to_string(package_quantity_arr, ',') as package_quantity_arr, array_to_string(package_quantity_arr, ',') as package_quantity_arr,
package_quantity_flag, package_quantity_flag,
label_content label_content,
festival,
multi_color_flag,
multi_color_content
from {self.site_name}_ai_asin_analyze_detail from {self.site_name}_ai_asin_analyze_detail
""" """
self.df_ai_asin_analyze = SparkUtil.read_jdbc_query( self.df_ai_asin_analyze = SparkUtil.read_jdbc_query(
...@@ -143,6 +147,16 @@ class EsAiAsinAdd(object): ...@@ -143,6 +147,16 @@ class EsAiAsinAdd(object):
print("AI分析数据如下:") print("AI分析数据如下:")
self.df_ai_asin_analyze.show(10, True) self.df_ai_asin_analyze.show(10, True)
# 读取利润率数据
sql3 = f"""
select asin, price, ocean_profit, air_profit
from dim_asin_profit_rate_info
where site_name = '{self.site_name}'
"""
self.df_profit_rate = self.spark.sql(sql3).repartition(40, 'asin').cache()
print("利润率数据如下:")
self.df_profit_rate.show(10, True)
def handle_data(self): def handle_data(self):
self.df_save_pg = self.df_ai_asin_detail.join( self.df_save_pg = self.df_ai_asin_detail.join(
self.df_ai_asin_analyze, 'asin', 'left_anti' self.df_ai_asin_analyze, 'asin', 'left_anti'
...@@ -155,20 +169,63 @@ class EsAiAsinAdd(object): ...@@ -155,20 +169,63 @@ class EsAiAsinAdd(object):
'is_ascending_flag' 'is_ascending_flag'
) )
df_profit = self.df_profit_rate.withColumn(
"profit_rate_extra",
F.when(
F.col("ocean_profit").isNull() & F.col("air_profit").isNull(),
F.lit(None)
).otherwise(
F.struct(
F.col("ocean_profit").alias("ocean_profit"),
F.col("air_profit").alias("air_profit")
)
)
).drop("ocean_profit", "air_profit")
self.df_save_es = self.df_ai_asin_detail.join( self.df_save_es = self.df_ai_asin_detail.join(
self.df_ai_asin_analyze, 'asin', 'inner' self.df_ai_asin_analyze, 'asin', 'inner'
).withColumn(
'profit_key', F.concat_ws("_", F.col("asin"), F.col("price"))
).join(
df_profit, on=["asin", "price"], how="left"
).select( ).select(
'account_addr', 'account_name', 'analyze_id', 'ao_val', 'appearance', 'asin', 'bought_month', 'account_addr', 'account_name', 'analyze_id', 'ao_val', 'appearance', 'asin', 'bought_month',
'bought_month_mom', 'bought_month_yoy', 'brand', 'bsr_rank', 'buy_box_seller_type', 'category', 'bought_month_mom', 'bought_month_yoy', 'brand', 'bsr_rank', 'buy_box_seller_type', 'category',
'category_current_id', 'category_id', 'color', 'crowd', 'fb_country_name', 'function', 'img', 'category_current_id', 'category_id', 'color', 'crowd', 'fb_country_name', 'festival', 'function',
'img_num', 'is_ascending_flag', 'is_new_flag', 'label_content', 'launch_time', 'launch_time_type', 'img', 'img_num', 'is_ascending_flag', 'is_new_flag', 'label_content', 'launch_time', 'launch_time_type',
'material', 'package_quantity', 'package_quantity_arr', 'package_quantity_flag', 'parent_asin', 'material', 'multi_color_content', 'multi_color_flag', 'package_quantity', 'package_quantity_arr',
'price', 'rating', 'scene_comment', 'scene_title', 'seller_id', 'shape', 'short_desc', 'site_name', 'package_quantity_flag', 'parent_asin', 'price', 'profit_key', 'profit_rate_extra', 'rating',
'size', 'theme', 'title', 'title_pic_content', 'title_pic_flag', 'title_word_content', 'scene_comment', 'scene_title', 'seller_id', 'shape', 'short_desc', 'site_name', 'size', 'theme',
'title_word_flag', 'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight' 'title', 'title_pic_content', 'title_pic_flag', 'title_word_content', 'title_word_flag',
'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight'
) )
def create_pg_table(self):
"""
根据模板表创建月度 PG 表:
1. LIKE INCLUDING ALL:复制所有字段类型、其他列默认值、约束、索引
2. 重建 id 列独立序列,避免与模板表共享同一序列
"""
template_tb = "us_ai_asin_detail_month_2026_01"
engine = DBUtil.get_db_engine("postgresql", "us")
# 表已存在则跳过
result = DBUtil.engine_exec_sql(engine, f"SELECT to_regclass('{self.export_pg_tb}')")
if list(result)[0][0] is not None:
print(f"PostgreSQL 表 {self.export_pg_tb} 已存在,跳过建表")
return
# 建表 + 为 id 列创建独立序列
sql = f"""
CREATE TABLE {self.export_pg_tb} (LIKE {template_tb} INCLUDING ALL);
ALTER TABLE {self.export_pg_tb} ALTER COLUMN id DROP DEFAULT;
CREATE SEQUENCE {self.export_pg_tb}_id_seq OWNED BY {self.export_pg_tb}.id;
ALTER TABLE {self.export_pg_tb} ALTER COLUMN id SET DEFAULT nextval('{self.export_pg_tb}_id_seq')
"""
DBUtil.exec_sql("postgresql", "us", sql)
print(f"PostgreSQL 表 {self.export_pg_tb} 创建完成(独立自增序列)")
def save_data(self): def save_data(self):
# 创建月度 PG 表
self.create_pg_table()
# 将新增asin导出给济苍 # 将新增asin导出给济苍
try: try:
self.df_save_pg.write.format("jdbc") \ self.df_save_pg.write.format("jdbc") \
...@@ -180,11 +237,12 @@ class EsAiAsinAdd(object): ...@@ -180,11 +237,12 @@ class EsAiAsinAdd(object):
.save() .save()
CommonUtil.send_wx_msg(['wujicang', 'chenyuanjie'], 'ASIN信息库增量数据导出', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}') CommonUtil.send_wx_msg(['wujicang', 'chenyuanjie'], 'ASIN信息库增量数据导出', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}')
except Exception as e: except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e)) print("An error occurred while writing to PostgreSQL:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ASIN信息库增量数据导出失败', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}') CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ASIN信息库增量数据导出失败', f'详情:{self.export_pg_tb} {self.site_name} {self.date_type} {self.date_info}')
# 将增量asin导出到es # 将增量asin导出到es
try: try:
EsUtils.create_index(self.es_index, self.es_client, EsUtils.get_es_ai_body())
self.df_save_es.write.format("org.elasticsearch.spark.sql") \ self.df_save_es.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \ .options(**self.es_options) \
.mode("append") \ .mode("append") \
......
...@@ -21,7 +21,7 @@ class ImportStToPg14(object): ...@@ -21,7 +21,7 @@ class ImportStToPg14(object):
self.df_st_month = pd.DataFrame() self.df_st_month = pd.DataFrame()
self.df_st_month_state = pd.DataFrame() self.df_st_month_state = pd.DataFrame()
self.df_save = pd.DataFrame() self.df_save = pd.DataFrame()
self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值 # self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值
self.year, self.month = self.date_info.split("-")[0], int(self.date_info.split("-")[1]) self.year, self.month = self.date_info.split("-")[0], int(self.date_info.split("-")[1])
def fetch_year_month_by_week(self): def fetch_year_month_by_week(self):
...@@ -31,6 +31,7 @@ class ImportStToPg14(object): ...@@ -31,6 +31,7 @@ class ImportStToPg14(object):
self.date_info = list(df.year_month)[0] self.date_info = list(df.year_month)[0]
def read_data(self): def read_data(self):
self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值
# 1. 读取date_20_to_30表获取月份对应的周 # 1. 读取date_20_to_30表获取月份对应的周
sql_get_week = f"select year_week, year, week from selection.date_20_to_30 WHERE `year_month`='{self.date_info}' and week_day=1" sql_get_week = f"select year_week, year, week from selection.date_20_to_30 WHERE `year_month`='{self.date_info}' and week_day=1"
df_week = pd.read_sql(sql_get_week, con=self.engine_mysql) df_week = pd.read_sql(sql_get_week, con=self.engine_mysql)
......
...@@ -99,6 +99,7 @@ class KafkaFlowAsinDetail(Templates): ...@@ -99,6 +99,7 @@ class KafkaFlowAsinDetail(Templates):
self.df_asin_keep_date = self.spark.sql("select 1+1;") self.df_asin_keep_date = self.spark.sql("select 1+1;")
self.df_asin_bsr_end = self.spark.sql("select 1+1;") self.df_asin_bsr_end = self.spark.sql("select 1+1;")
self.df_hide_category = self.spark.sql("select 1+1;") self.df_hide_category = self.spark.sql("select 1+1;")
self.df_bsr_category = self.spark.sql("select 1+1;")
self.df_asin_new_cate = self.spark.sql("select 1+ 1;") self.df_asin_new_cate = self.spark.sql("select 1+ 1;")
self.df_user_package_num = self.spark.sql("select 1+1;") self.df_user_package_num = self.spark.sql("select 1+1;")
self.df_asin_category = self.spark.sql("select 1+1;") self.df_asin_category = self.spark.sql("select 1+1;")
...@@ -190,7 +191,8 @@ class KafkaFlowAsinDetail(Templates): ...@@ -190,7 +191,8 @@ class KafkaFlowAsinDetail(Templates):
StructField("customer_reviews_json", StringType(), True), StructField("customer_reviews_json", StringType(), True),
StructField("img_list", StringType(), True), StructField("img_list", StringType(), True),
StructField("follow_sellers", IntegerType(), True), StructField("follow_sellers", IntegerType(), True),
StructField("fbm_delivery_price", FloatType(), True) StructField("fbm_delivery_price", FloatType(), True),
StructField("product_json", StringType(), True)
]) ])
return schema return schema
...@@ -448,15 +450,26 @@ class KafkaFlowAsinDetail(Templates): ...@@ -448,15 +450,26 @@ class KafkaFlowAsinDetail(Templates):
withColumn("title_package_quantity_is_abnormal", df.title_parse.getField("is_package_quantity_abnormal")). \ withColumn("title_package_quantity_is_abnormal", df.title_parse.getField("is_package_quantity_abnormal")). \
withColumn("variat_package_quantity_is_abnormal", df.variat_parse.getField("is_package_quantity_abnormal")). \ withColumn("variat_package_quantity_is_abnormal", df.variat_parse.getField("is_package_quantity_abnormal")). \
drop("title_parse", "variat_parse", "variat_attribute") drop("title_parse", "variat_parse", "variat_attribute")
# Number of Items:从 product_json 提取,cast 失败(脏数据)自动为 null,提取后立即 drop
df = df.withColumn(
"number_of_items",
F.get_json_object(F.col("product_json"), "$.Number of Items").cast("int")
).drop("product_json")
# 优先级:Number of Items > 属性字段 > 标题解析 > 默认1
df = df.withColumn( df = df.withColumn(
"package_quantity", F.expr(""" "package_quantity", F.expr("""
CASE WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity CASE WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN number_of_items
WHEN title_package_quantity is not null THEN title_package_quantity ELSE 1 END""") WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity
WHEN title_package_quantity IS NOT NULL THEN title_package_quantity
ELSE 1 END""")
).withColumn( ).withColumn(
"is_package_quantity_abnormal", F.expr(""" "is_package_quantity_abnormal", F.expr("""
CASE WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity_is_abnormal CASE WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN 0
WHEN title_package_quantity is not null THEN title_package_quantity_is_abnormal ELSE 2 END""") WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity_is_abnormal
).drop("title_package_quantity", "variat_package_quantity", "title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal") WHEN title_package_quantity IS NOT NULL THEN title_package_quantity_is_abnormal
ELSE 2 END""")
).drop("number_of_items", "title_package_quantity", "variat_package_quantity",
"title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal")
df = df.withColumn("title", F.lower(F.col("title"))) df = df.withColumn("title", F.lower(F.col("title")))
df = df.join(self.df_user_package_num, on=['asin', 'title'], how='left') df = df.join(self.df_user_package_num, on=['asin', 'title'], how='left')
df = df.withColumn("package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))). \ df = df.withColumn("package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))). \
...@@ -567,11 +580,18 @@ class KafkaFlowAsinDetail(Templates): ...@@ -567,11 +580,18 @@ class KafkaFlowAsinDetail(Templates):
df = df.withColumn("bsr_type", F.expr(""" df = df.withColumn("bsr_type", F.expr("""
CASE WHEN limit_rank is null and asin_bs_cate_1_rank <= 500000 THEN 1 WHEN limit_rank is not null and asin_bs_cate_1_rank <= limit_rank THEN 1 ELSE 0 END""" CASE WHEN limit_rank is null and asin_bs_cate_1_rank <= 500000 THEN 1 WHEN limit_rank is not null and asin_bs_cate_1_rank <= limit_rank THEN 1 ELSE 0 END"""
)).drop("limit_rank") )).drop("limit_rank")
# 5. 是否必需ASIN # 5. 是否必需ASIN(双重确认:BSR分类 + 页面描述分类,两者均在排除列表才标记为1)
df = df.withColumn("is_need_asin", F.expr(""" df = df.withColumn(
CASE WHEN asin_bs_cate_1_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1 "desc_category_first_name",
WHEN asin NOT LIKE 'B0%' THEN 1 F.lower(F.trim(F.split(F.col("category"), "›").getItem(0)))
).join(self.df_bsr_category, on=['desc_category_first_name'], how='left')
need_categories = "('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
df = df.withColumn("is_need_asin", F.expr(f"""
CASE WHEN asin_bs_cate_1_id in {need_categories}
AND desc_category_first_id in {need_categories} THEN 1
WHEN asin NOT LIKE 'B0%' THEN 1
ELSE 0 END""")) ELSE 0 END"""))
df = df.drop("desc_category_first_name", "desc_category_first_id")
# 6. asin_type # 6. asin_type
df = df.withColumn("asin_type", F.expr(""" df = df.withColumn("asin_type", F.expr("""
CASE WHEN is_self_asin=1 THEN 1 WHEN is_need_asin=1 THEN 2 WHEN is_hide_asin=1 THEN 3 ELSE 0 END""" CASE WHEN is_self_asin=1 THEN 1 WHEN is_need_asin=1 THEN 2 WHEN is_hide_asin=1 THEN 3 ELSE 0 END"""
...@@ -805,6 +825,13 @@ class KafkaFlowAsinDetail(Templates): ...@@ -805,6 +825,13 @@ class KafkaFlowAsinDetail(Templates):
username=us_mysql_con_info['username'], query=sql) username=us_mysql_con_info['username'], query=sql)
self.df_hide_category = F.broadcast(df_hide_category) self.df_hide_category = F.broadcast(df_hide_category)
self.df_hide_category.show(10, truncate=False) self.df_hide_category.show(10, truncate=False)
print("5.1 读取BSR分类树(用于双重确认is_need_asin)")
sql = f"""
select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
"""
self.df_bsr_category = F.broadcast(self.spark.sql(sqlQuery=sql))
self.df_bsr_category.show(10, truncate=False)
print("6. 读取asin_label信息") print("6. 读取asin_label信息")
sql = f""" sql = f"""
select asin, label from select asin, label from
...@@ -986,6 +1013,7 @@ class KafkaFlowAsinDetail(Templates): ...@@ -986,6 +1013,7 @@ class KafkaFlowAsinDetail(Templates):
F.col("describe_len").alias("asin_describe_len") F.col("describe_len").alias("asin_describe_len")
) )
df = df.drop("category", "seller_json") df = df.drop("category", "seller_json")
df = df.withColumn("date_info_del", F.lit(self.date_info))
df.write.format("org.elasticsearch.spark.sql").options(**self.es_options).mode("append").save() df.write.format("org.elasticsearch.spark.sql").options(**self.es_options).mode("append").save()
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
......
...@@ -98,6 +98,7 @@ class KafkaRankAsinDetail(Templates): ...@@ -98,6 +98,7 @@ class KafkaRankAsinDetail(Templates):
self.df_asin_keep_date = self.spark.sql("select 1+1;") self.df_asin_keep_date = self.spark.sql("select 1+1;")
self.df_asin_bsr_end = self.spark.sql("select 1+1;") self.df_asin_bsr_end = self.spark.sql("select 1+1;")
self.df_hide_category = self.spark.sql("select 1+1;") self.df_hide_category = self.spark.sql("select 1+1;")
self.df_bsr_category = self.spark.sql("select 1+1;")
self.df_asin_new_cate = self.spark.sql("select 1+ 1;") self.df_asin_new_cate = self.spark.sql("select 1+ 1;")
self.df_user_package_num = self.spark.sql("select 1+1;") self.df_user_package_num = self.spark.sql("select 1+1;")
self.df_asin_category = self.spark.sql("select 1+1;") self.df_asin_category = self.spark.sql("select 1+1;")
...@@ -189,7 +190,8 @@ class KafkaRankAsinDetail(Templates): ...@@ -189,7 +190,8 @@ class KafkaRankAsinDetail(Templates):
StructField("customer_reviews_json", StringType(), True), StructField("customer_reviews_json", StringType(), True),
StructField("img_list", StringType(), True), StructField("img_list", StringType(), True),
StructField("follow_sellers", IntegerType(), True), StructField("follow_sellers", IntegerType(), True),
StructField("fbm_delivery_price", FloatType(), True) StructField("fbm_delivery_price", FloatType(), True),
StructField("product_json", StringType(), True)
]) ])
return schema return schema
...@@ -447,15 +449,26 @@ class KafkaRankAsinDetail(Templates): ...@@ -447,15 +449,26 @@ class KafkaRankAsinDetail(Templates):
withColumn("title_package_quantity_is_abnormal", df.title_parse.getField("is_package_quantity_abnormal")). \ withColumn("title_package_quantity_is_abnormal", df.title_parse.getField("is_package_quantity_abnormal")). \
withColumn("variat_package_quantity_is_abnormal", df.variat_parse.getField("is_package_quantity_abnormal")). \ withColumn("variat_package_quantity_is_abnormal", df.variat_parse.getField("is_package_quantity_abnormal")). \
drop("title_parse", "variat_parse", "variat_attribute") drop("title_parse", "variat_parse", "variat_attribute")
# Number of Items:从 product_json 提取,cast 失败(脏数据)自动为 null,提取后立即 drop
df = df.withColumn(
"number_of_items",
F.get_json_object(F.col("product_json"), "$.Number of Items").cast("int")
).drop("product_json")
# 优先级:Number of Items > 属性字段 > 标题解析 > 默认1
df = df.withColumn( df = df.withColumn(
"package_quantity", F.expr(""" "package_quantity", F.expr("""
CASE WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity CASE WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN number_of_items
WHEN title_package_quantity is not null THEN title_package_quantity ELSE 1 END""") WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity
WHEN title_package_quantity IS NOT NULL THEN title_package_quantity
ELSE 1 END""")
).withColumn( ).withColumn(
"is_package_quantity_abnormal", F.expr(""" "is_package_quantity_abnormal", F.expr("""
CASE WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity_is_abnormal CASE WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN 0
WHEN title_package_quantity is not null THEN title_package_quantity_is_abnormal ELSE 2 END""") WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity_is_abnormal
).drop("title_package_quantity", "variat_package_quantity", "title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal") WHEN title_package_quantity IS NOT NULL THEN title_package_quantity_is_abnormal
ELSE 2 END""")
).drop("number_of_items", "title_package_quantity", "variat_package_quantity",
"title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal")
df = df.withColumn("title", F.lower(F.col("title"))) df = df.withColumn("title", F.lower(F.col("title")))
df = df.join(self.df_user_package_num, on=['asin', 'title'], how='left') df = df.join(self.df_user_package_num, on=['asin', 'title'], how='left')
df = df.withColumn("package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))). \ df = df.withColumn("package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))). \
...@@ -566,11 +579,18 @@ class KafkaRankAsinDetail(Templates): ...@@ -566,11 +579,18 @@ class KafkaRankAsinDetail(Templates):
df = df.withColumn("bsr_type", F.expr(""" df = df.withColumn("bsr_type", F.expr("""
CASE WHEN limit_rank is null and asin_bs_cate_1_rank <= 500000 THEN 1 WHEN limit_rank is not null and asin_bs_cate_1_rank <= limit_rank THEN 1 ELSE 0 END""" CASE WHEN limit_rank is null and asin_bs_cate_1_rank <= 500000 THEN 1 WHEN limit_rank is not null and asin_bs_cate_1_rank <= limit_rank THEN 1 ELSE 0 END"""
)).drop("limit_rank") )).drop("limit_rank")
# 5. 是否必需ASIN # 5. 是否必需ASIN(双重确认:BSR分类 + 页面描述分类,两者均在排除列表才标记为1)
df = df.withColumn("is_need_asin", F.expr(""" df = df.withColumn(
CASE WHEN asin_bs_cate_1_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1 "desc_category_first_name",
WHEN asin NOT LIKE 'B0%' THEN 1 F.lower(F.trim(F.split(F.col("category"), "›").getItem(0)))
).join(self.df_bsr_category, on=['desc_category_first_name'], how='left')
need_categories = "('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
df = df.withColumn("is_need_asin", F.expr(f"""
CASE WHEN asin_bs_cate_1_id in {need_categories}
AND desc_category_first_id in {need_categories} THEN 1
WHEN asin NOT LIKE 'B0%' THEN 1
ELSE 0 END""")) ELSE 0 END"""))
df = df.drop("desc_category_first_name", "desc_category_first_id")
# 6. asin_type # 6. asin_type
df = df.withColumn("asin_type", F.expr(""" df = df.withColumn("asin_type", F.expr("""
CASE WHEN is_self_asin=1 THEN 1 WHEN is_need_asin=1 THEN 2 WHEN is_hide_asin=1 THEN 3 ELSE 0 END""" CASE WHEN is_self_asin=1 THEN 1 WHEN is_need_asin=1 THEN 2 WHEN is_hide_asin=1 THEN 3 ELSE 0 END"""
...@@ -804,6 +824,13 @@ class KafkaRankAsinDetail(Templates): ...@@ -804,6 +824,13 @@ class KafkaRankAsinDetail(Templates):
username=us_mysql_con_info['username'], query=sql) username=us_mysql_con_info['username'], query=sql)
self.df_hide_category = F.broadcast(df_hide_category) self.df_hide_category = F.broadcast(df_hide_category)
self.df_hide_category.show(10, truncate=False) self.df_hide_category.show(10, truncate=False)
print("5.1 读取BSR分类树(用于双重确认is_need_asin)")
sql = f"""
select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
"""
self.df_bsr_category = F.broadcast(self.spark.sql(sqlQuery=sql))
self.df_bsr_category.show(10, truncate=False)
print("6. 读取asin_label信息") print("6. 读取asin_label信息")
sql = f""" sql = f"""
select asin, label from select asin, label from
...@@ -982,6 +1009,7 @@ class KafkaRankAsinDetail(Templates): ...@@ -982,6 +1009,7 @@ class KafkaRankAsinDetail(Templates):
F.col("describe_len").alias("asin_describe_len") F.col("describe_len").alias("asin_describe_len")
) )
df = df.drop("category", "seller_json") df = df.drop("category", "seller_json")
df = df.withColumn("date_info_del", F.lit("1970-01"))
df.write.format("org.elasticsearch.spark.sql").options(**self.es_options).mode("append").save() df.write.format("org.elasticsearch.spark.sql").options(**self.es_options).mode("append").save()
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
......
import os
import sys
import json
from datetime import datetime

sys.path.append(os.path.dirname(sys.path[0]))  # add parent directory so the `utils` package resolves
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.redis_utils import RedisUtils
from utils.common_util import CommonUtil

if __name__ == '__main__':
    # Monitor Dolphinscheduler tasks that failed since the last check.
    # The last-checked timestamp (watermark) is persisted in Redis so each
    # run only reports tasks that failed inside the new window.
    client = RedisUtils.getClient()
    redis_key = "dolphinscheduler_task_monitor:lateTime"
    late_time = client.get(redis_key)
    # First run (key missing/empty): fall back to a fixed initial watermark.
    # NOTE(review): if the Redis client is not created with
    # decode_responses=True this value may be bytes, not str — confirm.
    late_time = late_time or "2026-03-20 00:00:00"
    print(late_time)
    req_params = {
        "pageNo": 1,
        "pageSize": 100,
        "stateType": "FAILURE",
        "startDate": late_time
    }
    project_name = "big_data_selection"
    err_list = DolphinschedulerHelper.list_projects_task(project_name=project_name, req_params=req_params)
    err_msg = []
    for it in err_list:
        # Task instance names look like "<task_name>-<seq>-<retry>";
        # strip the trailing two segments to recover the task name.
        task_name = "-".join(it['name'].split("-")[:-2])
        param_map = DolphinschedulerHelper.view_process_instance_variables(project_name, it["id"])
        err_msg.append(f"""任务[{task_name}]执行失败,启动参数为:{json.dumps(param_map)}""")
    now = datetime.now()
    formatted_time = now.strftime("%Y-%m-%d %H:%M:%S")
    if len(err_msg) > 0:
        all_msg = "\n".join(err_msg)
        msg = f"""截止到日期{late_time}到{formatted_time},海豚任务报错详情如下\n{all_msg}"""
        CommonUtil.send_msg_robot(msg)
    # Advance the watermark even when nothing failed, so the next run
    # does not re-scan the same time window.
    client.set(redis_key, formatted_time)
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    # Positional CLI args: site / date granularity / concrete date partition.
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    print(f"执行参数为{sys.argv}")
    # 获取数据库引擎 — always obtained via the 'us' connection, regardless of site_name.
    engine = get_remote_engine(
        site_name='us',
        db_type="postgresql_15"
    )
    # Target table name: the us site has no site prefix.
    tb_suffix = date_info.replace('-', '_')
    if site_name == 'us':
        export_tb = f"ai_asin_detail_month_{tb_suffix}"
    else:
        export_tb = f"{site_name}_ai_asin_detail_month_{tb_suffix}"
    # 导出数据: push the Hive partition into PG with 30 parallel mappers.
    engine.sqoop_raw_export(
        hive_table="dwt_ai_asin_add",
        import_table=export_tb,
        partitions={
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info
        },
        m=30,
        cols="site_name,asin,weight,bought_month,category,img,title,brand,account_name,account_addr,buy_box_seller_type,"
             "launch_time,img_num,variation_flag,variation_num,ao_val,category_id,category_current_id,parent_asin,bsr_rank,"
             "price,rating,total_comments,seller_id,fb_country_name,review_json_list,launch_time_type,describe,product_json,"
             "product_detail_json,bought_month_mom,bought_month_yoy,is_new_flag,is_ascending_flag"
    )
    print("success")
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils

if __name__ == '__main__':
    # 同步 merchantwords 品牌分析数据: PG16 -> HDFS -> Hive ODS
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    assert site_name is not None, "site_name 不能为空!"
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"
    hive_table = "ods_merchantwords_brand_analytics"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    # 落表路径校验
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")
    year, month, day = date_info.split("-")
    db_type = 'postgresql_16'
    import_table = f"{site_name}_merchantwords_brand_analytics_{year}_{month}_{day}"
    # NOTE: "\$CONDITIONS" is sqoop's split placeholder; the backslash keeps the
    # shell from expanding it when the generated import script runs.
    sql_query = f"""
    select
    id,
    search_term,
    quantity_being_sold,
    created_time,
    updated_time,
    quantity_being_sold_str,
    result_count,
    departments
    from {import_table}
    where 1=1
    and \$CONDITIONS
    """
    # 进行schema和数据校验 — capture the result and abort on schema drift,
    # consistent with the other merchantwords import scripts.
    empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
                                                                   site_name=site_name,
                                                                   query=sql_query,
                                                                   hive_tb_name=hive_table,
                                                                   msg_usr=['chenyuanjie'],
                                                                   partition_dict=partition_dict)
    assert check_flag, f"导入hive表{hive_table}表结构检查失败!请检查query是否异常!!"
    # 生成导出脚本
    import_sh = CommonUtil.build_import_sh(site_name=site_name,
                                           db_type=db_type,
                                           query=sql_query,
                                           hdfs_path=hdfs_path,
                                           map_num=1,
                                           key='id'
                                           )
    HdfsUtils.delete_hdfs_file(hdfs_path)
    client = SSHUtil.get_ssh_client()
    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
    client.close()
    # 导入后行数校验
    CommonUtil.check_import_sync_num(db_type=db_type,
                                     partition_dict=partition_dict,
                                     import_query=sql_query,
                                     hive_tb_name=hive_table,
                                     msg_usr=['chenyuanjie'])
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils

if __name__ == '__main__':
    # Positional CLI args: site / date granularity / partition date.
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    for arg_label, arg_value in (("site_name", site_name),
                                 ("date_type", date_type),
                                 ("date_info", date_info)):
        assert arg_value is not None, f"{arg_label} 不能为空!"
    hive_table = "ods_merchantwords_other_search_term_data"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    # 落表路径校验
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")
    year, month, day = date_info.split("-")
    db_type = 'postgresql_16'
    import_table = f"{site_name}_merchantwords_other_search_term_{year}_{month}_{day}"
    sql_query = f"""
    select
    id,
    search_term,
    asin,
    page,
    buy_data,
    label,
    created_time,
    updated_time
    from {import_table}
    where 1=1
    and \$CONDITIONS
    """
    # 进行schema和数据校验
    CommonUtil.check_schema_before_import(db_type=db_type,
                                          site_name=site_name,
                                          query=sql_query,
                                          hive_tb_name=hive_table,
                                          msg_usr=['chenyuanjie'],
                                          partition_dict=partition_dict)
    # Build the sqoop import script (35 parallel mappers split on id).
    import_sh = CommonUtil.build_import_sh(site_name=site_name,
                                           db_type=db_type,
                                           query=sql_query,
                                           hdfs_path=hdfs_path,
                                           map_num=35,
                                           key='id'
                                           )
    HdfsUtils.delete_hdfs_file(hdfs_path)
    ssh_client = SSHUtil.get_ssh_client()
    SSHUtil.exec_command_async(ssh_client, import_sh, ignore_err=False)
    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
    ssh_client.close()
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils

if __name__ == '__main__':
    # Positional CLI args: site / search-term type / date granularity / partition date.
    site_name = CommonUtil.get_sys_arg(1, None)
    st_type = CommonUtil.get_sys_arg(2, None)
    date_type = CommonUtil.get_sys_arg(3, None)
    date_info = CommonUtil.get_sys_arg(4, None)
    assert site_name is not None, "site_name 不能为空!"
    assert st_type is not None, "st_type 不能为空!"
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"
    # Known-bad us-site crawl batches are skipped outright.
    if site_name == 'us':
        skip_batch = (date_info == '2024-05-05' and st_type == "bs") or \
                     (st_type == "hr" and date_info in ['2024-06-06', '2024-06-07', '2024-06-08'])
        if skip_batch:
            quit()
    hive_tb = f"ods_merchantwords_search_term_{st_type}"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    # Column list depends on the rank type: zr/sp carry page_row, sb carries data_type.
    if st_type in ("zr", "sp"):
        cols = "search_term,asin,page,page_row,id,created_time,updated_time"
    elif st_type == "sb":
        cols = "search_term,asin,page,data_type,id,created_time,updated_time"
    else:
        cols = "search_term,asin,page,created_time,updated_time"
    db_type = 'postgresql_16'
    year, month, day = date_info.split("-")
    import_tb = f"{site_name}_merchantwords_search_term_rank_{st_type}_{year}_{month}_{day}"
    query = f"""
    select
    {cols}
    from
    {import_tb}
    where 1 = 1
    and \$CONDITIONS
    """
    print(f"当前同步的表为:{import_tb}")
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")
    empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
                                                                  site_name=site_name,
                                                                  query=query,
                                                                  hive_tb_name=hive_tb,
                                                                  msg_usr=['chenyuanjie'],
                                                                  partition_dict=partition_dict
                                                                  )
    assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
    if not empty_flag:
        # Heavier tables get more parallel mappers, keyed on id for splitting;
        # other types fall back to a single-mapper import without a split key.
        map_num_by_type = {"zr": 50, "sp": 25, "sb": 15}
        if st_type in map_num_by_type:
            sh = CommonUtil.build_import_sh(site_name=site_name,
                                            db_type=db_type,
                                            query=query,
                                            hdfs_path=hdfs_path,
                                            map_num=map_num_by_type[st_type],
                                            key="id")
        else:
            sh = CommonUtil.build_import_sh(site_name=site_name,
                                            db_type=db_type,
                                            query=query,
                                            hdfs_path=hdfs_path)
        HdfsUtils.delete_hdfs_file(hdfs_path)
        client = SSHUtil.get_ssh_client()
        SSHUtil.exec_command_async(client, sh, ignore_err=False)
        CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
        client.close()
    pass
...@@ -498,3 +498,26 @@ class DolphinschedulerHelper(object): ...@@ -498,3 +498,26 @@ class DolphinschedulerHelper(object):
return resp_json['msg'] return resp_json['msg']
else: else:
raise Exception(f"任务停止失败") raise Exception(f"任务停止失败")
@classmethod
def list_projects_task(cls, project_name: str = _def_project_name, req_params=None):
    """
    Query DolphinScheduler for the process instances of a project.

    :param project_name: project to query; defaults to big_data_selection
    :param req_params: extra query parameters forwarded verbatim to the API
        (e.g. pageNo / pageSize / stateType / startDate); defaults to {}
    :return: the instance list (``data.totalList``) on success, or None when
        the API reports failure — callers must handle a None result
    """
    if req_params is None:
        req_params = {}
    # Resolve the numeric project code required by the REST path.
    project_map = cls.get_project_map()
    project_code = project_map.get(project_name)
    url = f"{cls._ip_port}/dolphinscheduler/projects/{project_code}/process-instances"
    resp = requests.get(
        url,
        headers=cls.get_http_header(),
        params=req_params
    )
    resp_json = json.loads(resp.content.decode("utf-8"))
    if bool(resp_json['success']):
        return resp_json['data']['totalList']
    else:
        # Error payload is discarded; only the None signals failure.
        return None
...@@ -1884,17 +1884,24 @@ outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ...@@ -1884,17 +1884,24 @@ outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
df_latest_asin_detail_with_parent = df_latest_asin_detail_with_parent.withColumnRenamed(f"new_{column}", f"{column}") df_latest_asin_detail_with_parent = df_latest_asin_detail_with_parent.withColumnRenamed(f"new_{column}", f"{column}")
return df_asin_detail, df_latest_asin_detail_with_parent return df_asin_detail, df_latest_asin_detail_with_parent
@staticmethod
def send_msg_robot(msg: str):
    """
    Push a plain-text message to the WeCom (企业微信) group robot webhook.

    Best-effort: failures are printed, never raised, so a monitoring
    caller cannot crash on a notification problem.

    :param msg: message body to send
    """
    # NOTE(review): webhook key is hardcoded in source; consider moving to config.
    webhook_url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=c519c702-6164-45c8-98b5-a87e52150f19"
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "msgtype": "text",
        "text": {
            "content": msg
        }
    }
    try:
        # timeout added: without it a hung endpoint blocks the caller forever
        resp = requests.post(webhook_url, json=data, headers=headers, timeout=10)
    except requests.RequestException as e:
        print("发送失败:", e)
        return
    if resp.status_code == 200:
        result = resp.json()
        if result.get("errcode") == 0:
            print("发送成功")
        else:
            print("发送失败:", result)
    else:
        print("HTTP错误:", resp.status_code, resp.text)
...@@ -1044,6 +1044,141 @@ class EsUtils(object): ...@@ -1044,6 +1044,141 @@ class EsUtils(object):
}, },
"img_type_arr": { "img_type_arr": {
"type": "integer" "type": "integer"
},
"date_info_del": {
"type": "keyword"
}
}
}
}
@staticmethod
def get_es_ai_body():
return {
"settings": {
"number_of_shards": "3",
"number_of_replicas": "1",
"analysis": {
"filter": {
"en_snowball": {
"type": "snowball",
"language": "English"
},
"en_synonym": {
"type": "synonym_graph",
"synonyms_path": "analysis/synonyms_en.txt",
"updateable": "true"
}
},
"analyzer": {
"en_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"en_snowball"
]
},
"en_search_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"en_synonym",
"en_snowball"
]
}
},
"normalizer": {
"lowercase_normalizer": {
"type": "custom",
"char_filter": [],
"filter": [
"lowercase"
]
}
}
}
},
"mappings": {
"properties": {
"asin": {"type": "keyword"},
"parent_asin": {"type": "keyword"},
"site_name": {"type": "keyword"},
"analyze_id": {"type": "integer"},
"seller_id": {"type": "keyword"},
"title": {
"type": "text",
"analyzer": "en_analyzer",
"search_analyzer": "en_search_analyzer"
},
"img": {"type": "keyword"},
"img_num": {"type": "integer"},
"launch_time": {"type": "keyword"},
"launch_time_type": {"type": "integer"},
"price": {"type": "scaled_float", "scaling_factor": 100},
"rating": {"type": "scaled_float", "scaling_factor": 100},
"total_comments": {"type": "integer"},
"bought_month": {"type": "integer"},
"bought_month_mom": {"type": "scaled_float", "scaling_factor": 100},
"bought_month_yoy": {"type": "scaled_float", "scaling_factor": 100},
"bsr_rank": {"type": "integer"},
"bsr_rank_str": {"type": "keyword"},
"ao_val": {"type": "scaled_float", "scaling_factor": 100},
"variation_flag": {"type": "integer"},
"variation_num": {"type": "integer"},
"is_ascending_flag": {"type": "integer"},
"is_new_flag": {"type": "integer"},
"buy_box_seller_type": {"type": "keyword"},
"category": {"type": "keyword"},
"category_id": {"type": "keyword"},
"category_current_id": {"type": "keyword"},
"festival": {"type": "keyword"},
"fb_country_name": {"type": "keyword"},
"profit_key": {"type": "keyword"},
"multi_color_flag": {"type": "keyword"},
"package_quantity_flag": {"type": "keyword"},
"title_pic_flag": {"type": "keyword"},
"title_word_flag": {"type": "keyword"},
"account_addr": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"account_name": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"appearance": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"brand": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"color": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"crowd": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"function": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"material": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"multi_color_content": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"package_quantity": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"package_quantity_arr": {"type": "integer"},
"scene_comment": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"scene_title": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"shape": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"short_desc": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"size": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"theme": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"title_pic_content": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"title_word_content": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"uses": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"weight": {"type": "keyword", "normalizer": "lowercase_normalizer"},
"label_content": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"normalizer": "lowercase_normalizer"
}
}
},
"profit_rate_extra": {
"type": "object",
"properties": {
"ocean_profit": {
"type": "float"
},
"air_profit": {
"type": "float"
}
}
} }
} }
} }
......
...@@ -93,6 +93,8 @@ class Templates(object): ...@@ -93,6 +93,8 @@ class Templates(object):
# 测试标识 # 测试标识
self.test_flag = 'normal' self.test_flag = 'normal'
self.beginning_offsets_dict = {} # history消费时, 初始的偏移量 self.beginning_offsets_dict = {} # history消费时, 初始的偏移量
# 记录最后一次收到非空批次的时间,用于无新数据超时检测
self.last_data_time = time.time()
# redis连接对象--用来锁定--解决并发 # redis连接对象--用来锁定--解决并发
self.client = get_redis_h14() self.client = get_redis_h14()
...@@ -453,11 +455,25 @@ class Templates(object): ...@@ -453,11 +455,25 @@ class Templates(object):
self.query.awaitTermination() self.query.awaitTermination()
def handle_kafka_stream_templates(self, kafka_df, epoch_id): def handle_kafka_stream_templates(self, kafka_df, epoch_id):
if self.spider_type == 'asin详情' and kafka_df.count() > 0: has_data = self.spider_type == 'asin详情' and kafka_df.count() > 0
if has_data:
kafka_df = self.deduplication_kafka_data(kafka_df, "asin", "asinUpdateTime") kafka_df = self.deduplication_kafka_data(kafka_df, "asin", "asinUpdateTime")
self.handle_kafka_stream(kafka_df, epoch_id) self.handle_kafka_stream(kafka_df, epoch_id)
if has_data:
# 处理完成后更新时间戳,避免长批次处理耗时误触发超时
self.last_data_time = time.time()
if self.test_flag == 'normal': if self.test_flag == 'normal':
self.kafka_consumption_is_finished() self.kafka_consumption_is_finished()
# 仅当前批次无新数据时才做超时检测
# 若距上次有效数据已超过 30 分钟,说明爬虫可能已完成但状态表尚未更新
# 进入轮询,每 2 分钟重新检查一次,直到状态更新后 kafka_consumption_is_finished() 内部 exit(0)
if not has_data:
elapsed = time.time() - self.last_data_time
if elapsed > 30 * 60:
print(f"[超时检测] 已 {elapsed / 60:.1f} 分钟无新数据,进入状态轮询(每2分钟检查一次),等待爬虫状态更新")
while True:
time.sleep(120)
self.kafka_consumption_is_finished()
def handle_kafka_stream(self, kafka_df, epoch_id): def handle_kafka_stream(self, kafka_df, epoch_id):
pass pass
...@@ -657,14 +673,19 @@ class Templates(object): ...@@ -657,14 +673,19 @@ class Templates(object):
wx_msg = f"站点: {self.site_name}, {self.date_type}, {self.date_info} asin详情实时消费数据到redis准备工作已完成,可以开启详情爬取!" wx_msg = f"站点: {self.site_name}, {self.date_type}, {self.date_info} asin详情实时消费数据到redis准备工作已完成,可以开启详情爬取!"
else: else:
pass pass
try: sql = f"UPDATE selection.workflow_progress SET {kafka_field}=3, updated_at=CURRENT_TIMESTAMP where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}' and page='asin详情'"
sql = f"UPDATE selection.workflow_progress SET {kafka_field}=3, updated_at=CURRENT_TIMESTAMP where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}' and page='asin详情'" for retry in range(5):
DBUtil.exec_sql('mysql', 'us', sql) try:
CommonUtil.send_wx_msg(wx_users, f"asin详情kafka消费", wx_msg) DBUtil.exec_sql('mysql', 'us', sql)
except Exception as e: CommonUtil.send_wx_msg(wx_users, f"asin详情kafka消费", wx_msg)
print(e, traceback.format_exc()) break
CommonUtil.send_wx_msg(wx_users, f"\u26A0asin详情kafka实时消费\u26A0", except Exception as e:
f"站点: {self.site_name} asin详情实时消费准备失败,请等待处理!") print(f"UPDATE workflow_progress 失败(第{retry + 1}次),等待10s重试", e, traceback.format_exc())
if retry == 4:
CommonUtil.send_wx_msg(wx_users, f"\u26A0asin详情kafka实时消费\u26A0",
f"站点: {self.site_name} asin详情实时消费准备失败,请等待处理!")
else:
time.sleep(10)
else: else:
pass pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment