Merge remote-tracking branch 'origin/developer' into developer

5bf88f84 · 吴济苍 · 67c72130 · 5ead1c9f · 5bf88f84 · 5bf88f84
Commit 5bf88f84 authored Mar 20, 2026 by 吴济苍
7 changed files
--- a/Pyspark_job/dwt/dwt_flow_asin.py
+++ b/Pyspark_job/dwt/dwt_flow_asin.py
@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates):
      CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1 
      WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1
      ELSE 0 END""")).drop("hide_flag")
-        self.df_asin_detail = self.df_asin_detail.withColumn("asin_is_need", F.expr("""
-        CASE WHEN category_first_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1 
-        WHEN asin NOT LIKE 'B0%' THEN 1 
+        # 解析 asin_category_desc 取 › 分隔的第一个元素作为补充分类名称
+        self.df_asin_detail = self.df_asin_detail.withColumn(
+            "desc_category_first_name",
+            F.lower(F.trim(F.split(F.col("asin_category_desc"), "›").getItem(0)))
+        )
+        # 读取 Hive 分类维表，获取分类名称与ID的对应关系
+        sql_dim = f"""
+            select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
+            from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
+        """
+        df_bsr_category = F.broadcast(self.spark.sql(sqlQuery=sql_dim))
+        # join 补充分类ID
+        self.df_asin_detail = self.df_asin_detail.join(df_bsr_category, on=['desc_category_first_name'], how='left')
+        # 两个分类ID均在过滤列表中才标记为1
+        need_categories = "('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
+        self.df_asin_detail = self.df_asin_detail.withColumn("asin_is_need", F.expr(f"""
+        CASE WHEN category_first_id in {need_categories}
+                  AND desc_category_first_id in {need_categories} THEN 1
+             WHEN asin NOT LIKE 'B0%' THEN 1
        ELSE 0 END"""))
+        self.df_asin_detail = self.df_asin_detail.drop("desc_category_first_name", "desc_category_first_id")
        self.df_asin_detail = self.df_asin_detail.withColumn("asin_type", F.expr("""
            CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END"""
            )).drop("asin_is_self", "asin_is_need", "asin_is_hide")

--- a/Pyspark_job/dwt/dwt_nsr_asin_detail.py
+++ b/Pyspark_job/dwt/dwt_nsr_asin_detail.py
@@ -154,7 +154,7 @@ class DwtNsrAsinDetail(object):
            select asin,
                   asin_ao_val                     as ao_val,
                   bsr_orders                      as bsr_orders,
-                   asin_bsr_orders_change          as bsr_orders_change,
+                   asin_bsr_orders_mom          as bsr_orders_change,
                   asin_air_freight_gross_margin   as asin_air_freight_gross_margin,
                   asin_ocean_freight_gross_margin as asin_ocean_freight_gross_margin,
                   cast(asin_bought_month as int ) as asin_bought_month

--- a/Pyspark_job/export_es/es_ai_asin_add.py
+++ b/Pyspark_job/export_es/es_ai_asin_add.py
@@ -200,7 +200,32 @@ class EsAiAsinAdd(object):
            'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight'
        )

+    def create_pg_table(self):
+        """
+        Create the monthly PG table (self.export_pg_tb) from a template table:
+          1. LIKE INCLUDING ALL: copies all column types, other columns' defaults, constraints and indexes
+          2. Rebuild a dedicated sequence for the id column so it is not shared with the template table
+        """
+        template_tb = "us_ai_asin_detail_month_2026_01"
+        engine = DBUtil.get_db_engine("postgresql", "us")
+        # Skip creation when the table already exists (a non-NULL to_regclass result means it was found)
+        result = DBUtil.engine_exec_sql(engine, f"SELECT to_regclass('{self.export_pg_tb}')")
+        if list(result)[0][0] is not None:
+            print(f"PostgreSQL 表 {self.export_pg_tb} 已存在，跳过建表")
+            return
+        # Create the table, then give the id column its own sequence as the default
+        sql = f"""
+            CREATE TABLE {self.export_pg_tb} (LIKE {template_tb} INCLUDING ALL);
+            ALTER TABLE {self.export_pg_tb} ALTER COLUMN id DROP DEFAULT;
+            CREATE SEQUENCE {self.export_pg_tb}_id_seq OWNED BY {self.export_pg_tb}.id;
+            ALTER TABLE {self.export_pg_tb} ALTER COLUMN id SET DEFAULT nextval('{self.export_pg_tb}_id_seq')
+        """
+        DBUtil.exec_sql("postgresql", "us", sql)
+        print(f"PostgreSQL 表 {self.export_pg_tb} 创建完成（独立自增序列）")
+
    def save_data(self):
+        # 创建月度 PG 表
+        self.create_pg_table()
        # 将新增asin导出给济苍
        try:
            self.df_save_pg.write.format("jdbc") \

--- a/Pyspark_job/export_es/es_asin_profit_rate.py
+++ b/Pyspark_job/export_es/es_asin_profit_rate.py
--- a/Pyspark_job/listen_program/import_st_to_pg14.py
+++ b/Pyspark_job/listen_program/import_st_to_pg14.py
@@ -21,7 +21,7 @@ class ImportStToPg14(object):
        self.df_st_month = pd.DataFrame()
        self.df_st_month_state = pd.DataFrame()
        self.df_save = pd.DataFrame()
-        self.fetch_year_month_by_week()  # 如果传的date_type='week', 将date_info转换成月的值
+        # self.fetch_year_month_by_week()  # 如果传的date_type='week', 将date_info转换成月的值
        self.year, self.month = self.date_info.split("-")[0], int(self.date_info.split("-")[1])

    def fetch_year_month_by_week(self):
@@ -31,6 +31,7 @@ class ImportStToPg14(object):
            self.date_info = list(df.year_month)[0]

    def read_data(self):
+        self.fetch_year_month_by_week()  # 如果传的date_type='week', 将date_info转换成月的值
        # 1. 读取date_20_to_30表获取月份对应的周
        sql_get_week = f"select year_week, year, week from selection.date_20_to_30 WHERE `year_month`='{self.date_info}' and week_day=1"
        df_week = pd.read_sql(sql_get_week, con=self.engine_mysql)

--- a/Pyspark_job/script/export_asin_without_keepa.py
+++ b/Pyspark_job/script/export_asin_without_keepa.py
@@ -221,11 +221,17 @@ class ExportAsinWithoutKeepa(object):
            df = df.cache()
            print(f"筛选后数据量: {df.count()}")

-        # 排除 dim_keepa_asin_info 中已有 package_length 的ASIN
+        # 排除 dim_keepa_asin_info 中已有有效keepa数据的ASIN
+        # 若 package_length/width/height/weight 任意一个 < 0，视为数据异常，不排除（需重新抓取）
        print("7. 排除已有keepa数据的ASIN (dim_keepa_asin_info)")
-        df_keepa = self.spark.sql(
-            "select asin from dim_keepa_asin_info where package_length is not null"
-        ).repartition(40, 'asin')
+        df_keepa = self.spark.sql(f"""
+            select asin from dim_keepa_asin_info
+            where site_name      = '{self.site_name}'
+              and package_length >= 0
+              and package_width  >= 0
+              and package_height >= 0
+              and weight         >= 0
+        """).repartition(40, 'asin')
        df = df.join(df_keepa, on='asin', how='left_anti').cache()
        print(f"排除keepa后数据量: {df.count()}")


--- a/Pyspark_job/sqoop_export/export_dwt_ai_asin_add.py
+++ b/Pyspark_job/sqoop_export/export_dwt_ai_asin_add.py
-import os
-import sys
-
-sys.path.append(os.path.dirname(sys.path[0]))
-
-from utils.common_util import CommonUtil
-from utils.secure_db_client import get_remote_engine
-
-if __name__ == '__main__':
-    site_name = CommonUtil.get_sys_arg(1, None)
-    date_type = CommonUtil.get_sys_arg(2, None)
-    date_info = CommonUtil.get_sys_arg(3, None)
-    print(f"执行参数为{sys.argv}")
-
-    # 获取数据库引擎
-    db_type = "postgresql_15"
-    engine = get_remote_engine(
-        site_name='us',
-        db_type=db_type
-    )
-    if site_name == 'us':
-        export_tb = f"ai_asin_detail_month_{date_info.replace('-', '_')}"
-    else:
-        export_tb = f"{site_name}_ai_asin_detail_month_{date_info.replace('-', '_')}"
-
-    # 导出数据
-    engine.sqoop_raw_export(
-        hive_table="dwt_ai_asin_add",
-        import_table=export_tb,
-        partitions={
-            "site_name": site_name,
-            "date_type": date_type,
-            "date_info": date_info
-        },
-        m=30,
-        cols="site_name,asin,weight,bought_month,category,img,title,brand,account_name,account_addr,buy_box_seller_type,"
-             "launch_time,img_num,variation_flag,variation_num,ao_val,category_id,category_current_id,parent_asin,bsr_rank,"
-             "price,rating,total_comments,seller_id,fb_country_name,review_json_list,launch_time_type,describe,product_json,"
-             "product_detail_json,bought_month_mom,bought_month_yoy,is_new_flag,is_ascending_flag"
-    )
-
-    print("success")