Commit 5bf88f84 by 吴济苍

Merge remote-tracking branch 'origin/developer' into developer

parents 67c72130 5ead1c9f
...@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates): ...@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates):
CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1 CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1
WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1 WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1
ELSE 0 END""")).drop("hide_flag") ELSE 0 END""")).drop("hide_flag")
self.df_asin_detail = self.df_asin_detail.withColumn("asin_is_need", F.expr(""" # 解析 asin_category_desc 取 › 分隔的第一个元素作为补充分类名称
CASE WHEN category_first_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1 self.df_asin_detail = self.df_asin_detail.withColumn(
"desc_category_first_name",
F.lower(F.trim(F.split(F.col("asin_category_desc"), "›").getItem(0)))
)
# 读取 Hive 分类维表,获取分类名称与ID的对应关系
sql_dim = f"""
select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
"""
df_bsr_category = F.broadcast(self.spark.sql(sqlQuery=sql_dim))
# join 补充分类ID
self.df_asin_detail = self.df_asin_detail.join(df_bsr_category, on=['desc_category_first_name'], how='left')
# 两个分类ID均在过滤列表中才标记为1
need_categories = "('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
self.df_asin_detail = self.df_asin_detail.withColumn("asin_is_need", F.expr(f"""
CASE WHEN category_first_id in {need_categories}
AND desc_category_first_id in {need_categories} THEN 1
WHEN asin NOT LIKE 'B0%' THEN 1 WHEN asin NOT LIKE 'B0%' THEN 1
ELSE 0 END""")) ELSE 0 END"""))
self.df_asin_detail = self.df_asin_detail.drop("desc_category_first_name", "desc_category_first_id")
self.df_asin_detail = self.df_asin_detail.withColumn("asin_type", F.expr(""" self.df_asin_detail = self.df_asin_detail.withColumn("asin_type", F.expr("""
CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END""" CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END"""
)).drop("asin_is_self", "asin_is_need", "asin_is_hide") )).drop("asin_is_self", "asin_is_need", "asin_is_hide")
......
...@@ -154,7 +154,7 @@ class DwtNsrAsinDetail(object): ...@@ -154,7 +154,7 @@ class DwtNsrAsinDetail(object):
select asin, select asin,
asin_ao_val as ao_val, asin_ao_val as ao_val,
bsr_orders as bsr_orders, bsr_orders as bsr_orders,
asin_bsr_orders_change as bsr_orders_change, asin_bsr_orders_mom as bsr_orders_change,
asin_air_freight_gross_margin as asin_air_freight_gross_margin, asin_air_freight_gross_margin as asin_air_freight_gross_margin,
asin_ocean_freight_gross_margin as asin_ocean_freight_gross_margin, asin_ocean_freight_gross_margin as asin_ocean_freight_gross_margin,
cast(asin_bought_month as int ) as asin_bought_month cast(asin_bought_month as int ) as asin_bought_month
......
...@@ -200,7 +200,32 @@ class EsAiAsinAdd(object): ...@@ -200,7 +200,32 @@ class EsAiAsinAdd(object):
'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight' 'total_comments', 'uses', 'variation_flag', 'variation_num', 'weight'
) )
def create_pg_table(self):
    """
    Create the monthly PostgreSQL table from a fixed template table.

    1. ``LIKE ... INCLUDING ALL`` clones column types, defaults,
       constraints and indexes from the template.
    2. The ``id`` column is rebound to a freshly created sequence so
       the new table does not share the template table's sequence.

    No-op (with a log line) when the target table already exists.
    """
    tpl_table = "us_ai_asin_detail_month_2026_01"
    pg_engine = DBUtil.get_db_engine("postgresql", "us")
    # Probe the catalog first: to_regclass returns NULL when the table is absent.
    probe = DBUtil.engine_exec_sql(pg_engine, f"SELECT to_regclass('{self.export_pg_tb}')")
    existing = list(probe)[0][0]
    if existing is not None:
        print(f"PostgreSQL 表 {self.export_pg_tb} 已存在,跳过建表")
        return
    # Clone the template, drop the inherited id default, then attach a
    # dedicated sequence owned by the new table's id column.
    ddl = f"""
    CREATE TABLE {self.export_pg_tb} (LIKE {tpl_table} INCLUDING ALL);
    ALTER TABLE {self.export_pg_tb} ALTER COLUMN id DROP DEFAULT;
    CREATE SEQUENCE {self.export_pg_tb}_id_seq OWNED BY {self.export_pg_tb}.id;
    ALTER TABLE {self.export_pg_tb} ALTER COLUMN id SET DEFAULT nextval('{self.export_pg_tb}_id_seq')
    """
    DBUtil.exec_sql("postgresql", "us", ddl)
    print(f"PostgreSQL 表 {self.export_pg_tb} 创建完成(独立自增序列)")
def save_data(self): def save_data(self):
# 创建月度 PG 表
self.create_pg_table()
# 将新增asin导出给济苍 # 将新增asin导出给济苍
try: try:
self.df_save_pg.write.format("jdbc") \ self.df_save_pg.write.format("jdbc") \
......
...@@ -21,7 +21,7 @@ class ImportStToPg14(object): ...@@ -21,7 +21,7 @@ class ImportStToPg14(object):
self.df_st_month = pd.DataFrame() self.df_st_month = pd.DataFrame()
self.df_st_month_state = pd.DataFrame() self.df_st_month_state = pd.DataFrame()
self.df_save = pd.DataFrame() self.df_save = pd.DataFrame()
self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值 # self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值
self.year, self.month = self.date_info.split("-")[0], int(self.date_info.split("-")[1]) self.year, self.month = self.date_info.split("-")[0], int(self.date_info.split("-")[1])
def fetch_year_month_by_week(self): def fetch_year_month_by_week(self):
...@@ -31,6 +31,7 @@ class ImportStToPg14(object): ...@@ -31,6 +31,7 @@ class ImportStToPg14(object):
self.date_info = list(df.year_month)[0] self.date_info = list(df.year_month)[0]
def read_data(self): def read_data(self):
self.fetch_year_month_by_week() # 如果传的date_type='week', 将date_info转换成月的值
# 1. 读取date_20_to_30表获取月份对应的周 # 1. 读取date_20_to_30表获取月份对应的周
sql_get_week = f"select year_week, year, week from selection.date_20_to_30 WHERE `year_month`='{self.date_info}' and week_day=1" sql_get_week = f"select year_week, year, week from selection.date_20_to_30 WHERE `year_month`='{self.date_info}' and week_day=1"
df_week = pd.read_sql(sql_get_week, con=self.engine_mysql) df_week = pd.read_sql(sql_get_week, con=self.engine_mysql)
......
...@@ -221,11 +221,17 @@ class ExportAsinWithoutKeepa(object): ...@@ -221,11 +221,17 @@ class ExportAsinWithoutKeepa(object):
df = df.cache() df = df.cache()
print(f"筛选后数据量: {df.count()}") print(f"筛选后数据量: {df.count()}")
# 排除 dim_keepa_asin_info 中已有 package_length 的ASIN # 排除 dim_keepa_asin_info 中已有有效keepa数据的ASIN
# 若 package_length/width/height/weight 任意一个 < 0,视为数据异常,不排除(需重新抓取)
print("7. 排除已有keepa数据的ASIN (dim_keepa_asin_info)") print("7. 排除已有keepa数据的ASIN (dim_keepa_asin_info)")
df_keepa = self.spark.sql( df_keepa = self.spark.sql(f"""
"select asin from dim_keepa_asin_info where package_length is not null" select asin from dim_keepa_asin_info
).repartition(40, 'asin') where site_name = '{self.site_name}'
and package_length >= 0
and package_width >= 0
and package_height >= 0
and weight >= 0
""").repartition(40, 'asin')
df = df.join(df_keepa, on='asin', how='left_anti').cache() df = df.join(df_keepa, on='asin', how='left_anti').cache()
print(f"排除keepa后数据量: {df.count()}") print(f"排除keepa后数据量: {df.count()}")
......
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    # Positional CLI arguments: site, date granularity, concrete date value.
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    print(f"执行参数为{sys.argv}")
    # The export always goes through the 'us' remote PG15 endpoint,
    # regardless of which site's partition is being shipped.
    db_type = "postgresql_15"
    engine = get_remote_engine(
        site_name='us',
        db_type=db_type
    )
    # Target table name: the us site carries no prefix, every other site does.
    base_tb = f"ai_asin_detail_month_{date_info.replace('-', '_')}"
    export_tb = base_tb if site_name == 'us' else f"{site_name}_{base_tb}"
    # Ship the selected Hive partition of dwt_ai_asin_add into PostgreSQL.
    engine.sqoop_raw_export(
        hive_table="dwt_ai_asin_add",
        import_table=export_tb,
        partitions={
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info,
        },
        m=30,
        cols="site_name,asin,weight,bought_month,category,img,title,brand,account_name,account_addr,buy_box_seller_type,"
             "launch_time,img_num,variation_flag,variation_num,ao_val,category_id,category_current_id,parent_asin,bsr_rank,"
             "price,rating,total_comments,seller_id,fb_country_name,review_json_list,launch_time_type,describe,product_json,"
             "product_detail_json,bought_month_mom,bought_month_yoy,is_new_flag,is_ascending_flag"
    )
    print("success")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.