导出keepa_asin补充分类过滤逻辑

aa8471e3 · chenyuanjie · d9aa0ce9 · aa8471e3
Commit aa8471e3 authored Mar 26, 2026 by chenyuanjie
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 33 additions and 11 deletions

export_asin_without_keepa.py Pyspark_job/script/export_asin_without_keepa.py +33 -11

No files found.
--- a/Pyspark_job/script/export_asin_without_keepa.py
+++ b/Pyspark_job/script/export_asin_without_keepa.py
@@ -101,7 +101,7 @@ class ExportAsinWithoutKeepa(object):
        print("1. [month_week] 读取 dim_asin_detail")
        sql = f"""
            select asin, asin_price, asin_bought_month,
-                   asin_is_self,
+                   asin_is_self, asin_category_desc,
                   category_id       as top_category_id,
                   category_first_id as top_category_first_id
            from dim_asin_detail
@@ -137,8 +137,20 @@ class ExportAsinWithoutKeepa(object):
        """
        df_measure = self.spark.sql(sql).repartition(40, 'asin')

-        # ④ us_bs_category_hide → 隐藏分类（用于 asin_type 计算）
-        print("4. 读取 us_bs_category_hide (隐藏分类)")
+        # ④ dim_bsr_category_tree → desc_category_first_id（用于 asin_is_need 双重校验）
+        print("4. 读取 dim_bsr_category_tree (分类名称→ID 映射)")
+        sql = f"""
+            select lower(trim(en_name)) as desc_category_first_name,
+                   category_first_id    as desc_category_first_id
+            from dim_bsr_category_tree
+            where site_name = '{self.site_name}'
+              and category_parent_id = 0
+              and leaf_node = 2
+        """
+        df_bsr_category = F.broadcast(self.spark.sql(sqlQuery=sql))
+
+        # ⑤ us_bs_category_hide → 隐藏分类（用于 asin_type 计算）
+        print("5. 读取 us_bs_category_hide (隐藏分类)")
        mysql_con = DBUtil.get_connection_info("mysql", self.site_name)
        sql = "select category_id_base as category_id, 1 as hide_flag from us_bs_category_hide group by category_id_base"
        df_hide = SparkUtil.read_jdbc_query(
@@ -146,8 +158,8 @@ class ExportAsinWithoutKeepa(object):
            pwd=mysql_con['pwd'], username=mysql_con['username'], query=sql
        )

-        # ⑤ 组装
-        print("5. 组装主DataFrame")
+        # ⑥ 组装
+        print("6. 组装主DataFrame")
        df = df_dim \
            .join(df_bsr,     on='asin', how='left') \
            .join(df_measure, on='asin', how='left')
@@ -169,7 +181,15 @@ class ExportAsinWithoutKeepa(object):
        ).drop("asin_amazon_orders")

        # asin_type 计算（对齐 dwt.handle_asin_is_hide）
+        # desc_category_first_name：解析 asin_category_desc，取 › 分隔的第一段
+        df = df.withColumn(
+            "desc_category_first_name",
+            F.lower(F.trim(F.split(F.col("asin_category_desc"), "›").getItem(0)))
+        ).join(df_bsr_category, on='desc_category_first_name', how='left') \
+         .drop("desc_category_first_name", "asin_category_desc")
+
        df = df.join(F.broadcast(df_hide), on='category_id', how='left')
+        need_categories = NEED_FILTER_CATEGORIES
        df = df.withColumn(
            "asin_is_hide",
            F.expr(f"""
@@ -181,11 +201,13 @@ class ExportAsinWithoutKeepa(object):
        ).withColumn(
            "asin_is_need",
            F.expr(f"""
-                CASE WHEN category_first_id IN {NEED_FILTER_CATEGORIES} THEN 1
+                CASE WHEN category_first_id IN {need_categories}
+                          AND desc_category_first_id IN {need_categories} THEN 1
                     WHEN asin NOT LIKE 'B0%' THEN 1
                     ELSE 0 END
            """)
-        ).withColumn(
+        ).drop("desc_category_first_id") \
+         .withColumn(
            "asin_type",
            F.expr("""
                CASE WHEN asin_is_self = 1 THEN 1
@@ -207,7 +229,7 @@ class ExportAsinWithoutKeepa(object):
        # month_week：字段在 Python 中计算，需在此处做条件过滤
        # month：SQL 中已完成过滤，直接跳过此步
        if self.date_type == 'month_week':
-            print("6. [month_week] 筛选目标ASIN")
+            print("7. [month_week] 筛选目标ASIN")
            df = df.filter(
                F.col("asin_type").isin(0, 1, 3)
            ).filter(
@@ -223,7 +245,7 @@ class ExportAsinWithoutKeepa(object):

        # 排除 dim_keepa_asin_info 中已有有效keepa数据的ASIN
        # 若 package_length/width/height/weight 任意一个 < 0，视为数据异常，不排除（需重新抓取）
-        print("7. 排除已有keepa数据的ASIN (dim_keepa_asin_info)")
+        print("8. 排除已有keepa数据的ASIN (dim_keepa_asin_info)")
        df_keepa = self.spark.sql(f"""
            select asin from dim_keepa_asin_info
            where site_name      = '{self.site_name}'
@@ -236,7 +258,7 @@ class ExportAsinWithoutKeepa(object):
        print(f"排除keepa后数据量: {df.count()}")

        # 排除 {pg_table} 中已导出的ASIN
-        print(f"8. 排除已导出的ASIN ({self.pg_table})")
+        print(f"9. 排除已导出的ASIN ({self.pg_table})")
        pg_con_info = DBUtil.get_connection_info("postgresql_cluster", self.site_name)
        df_exported = SparkUtil.read_jdbc_query(
            session=self.spark,
@@ -258,7 +280,7 @@ class ExportAsinWithoutKeepa(object):
    # ------------------------------------------------------------------ #
    def save_data(self):
        total = self.df_save.count()
-        print(f"9. 写入 PostgreSQL 表 {self.pg_table}，共 {total} 条")
+        print(f"10. 写入 PostgreSQL 表 {self.pg_table}，共 {total} 条")
        con_info = DBUtil.get_connection_info('postgresql_cluster', self.site_name)
        self.df_save.write.format("jdbc") \
            .option("url",      con_info["url"]) \