Commit dac2671d by hejiangming

Monthly search terms: add new fields; add intermediate table dim_st_detail_history after the dim_st_detail flow to record each search term's first and last appearance …

Monthly search terms: add new fields. Add the intermediate table dim_st_detail_history, populated after the dim_st_detail flow, to record each search term's first and last appearance. Exports gain a Hive check: abort if the partition to be exported is empty.
parent 75ae57fd
"""
@Description : ABA搜索词全历史出现中间表 - 每月增量更新脚本
@业务背景 :
本中间表 dim_st_detail_history 服务于 dwt_aba_st_analytics.is_first_ever_text 字段判断
(全历史首次出现)。每月 dim_st_detail 跑完后,通过本脚本把当月数据合并进中间表,
保持中间表始终是"截止当月"的最新状态。
@数据流向 :
dim_st_detail_history (M-1 状态) + dim_st_detail (当月分区)
↓ UNION + groupby search_term + MIN(first) / MAX(last)
dim_st_detail_history (M 状态)
@调度位置 : 每月 dim_st_detail 跑完之后、dwt_aba_st_analytics 跑之前
@幂等性 :
- 当月 dim 词集合不变 → 重跑结果完全一致
- 当月 dim 词集合缩小(某词被剔除)→ 该词在中间表里被自动清除
- 当月 dim 词集合扩大(多了新词)→ 新词被正确追加
@SourceTable : dim_st_detail (date_type='month', date_info=当月)
+ dim_st_detail_history (本表自身,读取历史状态)
@SinkTable : dim_st_detail_history
@CreateTime : 2026-05-07
@Param : site_name (us / uk / de) + date_info (YYYY-MM, 当月)
"""
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F


class DimStDetailHistory(object):

    def __init__(self, site_name, date_info):
        self.site_name = site_name
        self.date_info = date_info
        self.hive_tb = "dim_st_detail_history"
        self.hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dim/{self.hive_tb}/site_name={self.site_name}"
        self.partitions_by = ['site_name']
        app_name = f"{self.hive_tb}: {site_name}, {date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)
        # placeholder DataFrames
        self.df_history = self.spark.sql("select 1+1;")
        self.df_current_month = self.spark.sql("select 1+1;")
        self.df_save = self.spark.sql("select 1+1;")
    def run(self):
        # startup check: prevent an accidentally passed historical month from corrupting the table
        self.check_rollback()
        # read the table's historical state + the current-month dim data
        self.read_data()
        # merge via UNION + GROUP BY
        self.handle_data()
        # write back to the table (overwrite the site_name partition)
        self.save_data()
    def check_rollback(self):
        # max(date_info_last) in the table should be ≤ the date_info passed in.
        # If it is greater, the table already contains a later month → this run is a
        # backfill of a historical month. Running the incremental merge directly would
        # wrongly reset the last field of the few terms whose first equals that month,
        # so a full rebuild via the init script is the safer path.
        check_sql = f"""
            SELECT MAX(date_info_last) AS max_last
            FROM {self.hive_tb}
            WHERE site_name = '{self.site_name}'
        """
        max_last = self.spark.sql(check_sql).collect()[0]['max_last']
        if max_last is not None and max_last > self.date_info:
            raise Exception(
                f"Intermediate table max(date_info_last)={max_last} > current {self.date_info}; "
                f"this looks like a historical-month backfill.\n"
                f"Run dim_st_detail_history_init.py instead to rebuild the table."
            )
        print(f"[check passed] table max(date_info_last)={max_last} ≤ current {self.date_info}")
    def read_data(self):
        # Source for branch 1: every term in the table whose first ≠ current month.
        # Use != rather than <: if a historical month is passed by mistake, the data
        # loss is minimal (e.g. with current month 2026-05 mistakenly run as 2026-04,
        # != only excludes the few terms with first=2026-04, whereas < would exclude
        # the whole batch with first=2026-04 or first=2026-05).
        history_sql = f"""
            SELECT
                search_term,
                date_info_first,
                date_info_last
            FROM {self.hive_tb}
            WHERE site_name = '{self.site_name}'
              AND date_info_first != '{self.date_info}'
        """
        print(f"\n[history SQL]\n{history_sql}")
        self.df_history = self.spark.sql(history_sql)

        # Source for branch 2: every term in the current-month dim partition. Each
        # term should already be a single row there, but DISTINCT defends against
        # dirty data.
        current_sql = f"""
            SELECT DISTINCT search_term
            FROM dim_st_detail
            WHERE site_name = '{self.site_name}'
              AND date_type = 'month'
              AND date_info = '{self.date_info}'
        """
        print(f"\n[current-month dim SQL]\n{current_sql}")
        self.df_current_month = self.spark.sql(current_sql)
    def handle_data(self):
        # Branch 2: current-month terms with placeholder first=last=current month.
        # After UNION + GROUP BY MIN/MAX:
        # - existing terms: branch 1 (first=old month) UNION branch 2 (first=current month)
        #     → MIN keeps the old first ✓
        #   branch 1 (last=earlier month) UNION branch 2 (last=current month)
        #     → MAX updates last to the current month ✓
        # - new terms: only in branch 2 → first=last=current month ✓
        # - terms dropped from dim this month: in neither branch 1 (filtered by !=)
        #   nor branch 2 (gone from dim) → disappear automatically ✓
        df_branch2 = self.df_current_month.select(
            'search_term',
            F.lit(self.date_info).alias('date_info_first'),
            F.lit(self.date_info).alias('date_info_last')
        )
        # UNION + GROUP BY aggregation
        df_union = self.df_history.unionByName(df_branch2)
        self.df_save = df_union.groupBy('search_term').agg(
            F.min('date_info_first').alias('date_info_first'),
            F.max('date_info_last').alias('date_info_last')
        )
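    # Worked example of the merge semantics (illustrative rows):
    #   branch 1 (history): ('phone case', first='2026-01', last='2026-04')
    #   branch 2 (current): ('phone case', first='2026-05', last='2026-05')
    #   after UNION + groupBy + MIN/MAX: ('phone case', first='2026-01', last='2026-05')
    #   a brand-new term exists only in branch 2 and keeps first = last = '2026-05'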
    def save_data(self):
        # add the partition column (saveAsTable + partitionBy requires it on the DataFrame)
        self.df_save = self.df_save.withColumn('site_name', F.lit(self.site_name))
        # repartition to control the output file count (us: 15 files, uk/de: 10)
        target_partitions = 15 if self.site_name == 'us' else 10
        self.df_save = self.df_save.repartition(target_partitions)
        print(f"\n[repartition] target file count: {target_partitions}")

        # Critical: cache + count first to force the data to materialize.
        # This script reads and writes the same table (dim_st_detail_history →
        # dim_st_detail_history), so the source must be fully read before
        # delete_file_in_folder runs. Otherwise Spark's lazy execution would only read
        # the accumulating table at saveAsTable time, after delete has already emptied
        # it, and the history would be lost.
        self.df_save = self.df_save.cache()
        row_count = self.df_save.count()
        print(f"\n[materialized] rows to write: {row_count}, target file count: {target_partitions}")

        # delete_file_in_folder + saveAsTable(format='hive', mode='append'): the project's standard pattern
        # delete_file_in_folder removes the files but keeps the directory, avoiding the Hive missing-directory WARN
        # format='hive' makes Spark write via the table's SerDe (LZO text)
        # mode='append' combined with the preceding delete_file_in_folder yields
        # "overwrite the whole site_name partition" semantics
        print(f"\nclearing partition files: {self.hdfs_path}")
        HdfsUtils.delete_file_in_folder(self.hdfs_path)
        print(f"writing table: {self.hive_tb}, partitions: {self.partitions_by}")
        self.df_save.write.saveAsTable(
            name=self.hive_tb,
            format='hive',
            mode='append',
            partitionBy=self.partitions_by
        )
        self.df_save.unpersist()
        print("[write complete]")

        # verification
        verify_sql = f"""
            SELECT
                COUNT(*) AS row_count,
                COUNT(DISTINCT search_term) AS distinct_terms,
                MIN(date_info_first) AS earliest_first,
                MAX(date_info_last) AS latest_last
            FROM {self.hive_tb}
            WHERE site_name = '{self.site_name}'
        """
        print(f"\n[verify SQL]\n{verify_sql}")
        self.spark.sql(verify_sql).show(truncate=False)
        print(f"\n[done] incremental update of dim_st_detail_history site_name={self.site_name}, date_info={self.date_info}")
if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_info = CommonUtil.get_sys_arg(2, None)
    if not site_name or not date_info:
        print("usage:   spark-submit dim_st_detail_history.py <site_name> <date_info>")
        print("example: spark-submit dim_st_detail_history.py us 2026-05")
        sys.exit(1)
    print(f"{'=' * 60}")
    print(f"starting incremental update of intermediate table dim_st_detail_history")
    print(f"site_name = {site_name}, date_info = {date_info}")
    print(f"{'=' * 60}")
    obj = DimStDetailHistory(site_name=site_name, date_info=date_info)
    obj.run()
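# Scheduling sketch (illustrative only; the real orchestration lives outside these scripts,
# and the neighbouring jobs' exact spark-submit arguments are assumptions):
#   spark-submit dim_st_detail.py us 2026-05           # 1. upstream dim job finishes
#   spark-submit dim_st_detail_history.py us 2026-05   # 2. this script merges the month in
#   spark-submit dwt_aba_st_analytics.py us 2026-05    # 3. consumer reads is_first_ever_text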
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F


class DimStDetailHistoryInit(object):

    def __init__(self, site_name):
        self.site_name = site_name
        self.hive_tb = "dim_st_detail_history"
        # HDFS path of the accumulating table, used by delete_file_in_folder to clear the target partition's files
        self.hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dim/{self.hive_tb}/site_name={self.site_name}"
        self.partitions_by = ['site_name']
        app_name = f"{self.hive_tb}_init: {site_name}"
        self.spark = SparkUtil.get_spark_session(app_name)

    def run(self):
        # step 1: aggregate (GROUP BY with the default 200 partitions for aggregation performance)
        aggregate_sql = f"""
            SELECT
                search_term,
                MIN(date_info) AS date_info_first,
                MAX(date_info) AS date_info_last
            FROM dim_st_detail
            WHERE site_name = '{self.site_name}'
              AND date_type = 'month'
            GROUP BY search_term
        """
        print(f"\n[aggregate SQL]\n{aggregate_sql}")
        df = self.spark.sql(aggregate_sql)

        # step 2: add the partition column (saveAsTable + partitionBy requires it on the DataFrame)
        df = df.withColumn('site_name', F.lit(self.site_name))

        # step 3: repartition to control the final output file count and avoid small files
        # us is large (~11.83M rows) → 5 files; uk/de are smaller (~3.5M rows) → 3 files
        target_partitions = 5 if self.site_name == 'us' else 3
        df = df.repartition(target_partitions)
        print(f"\n[repartition] target file count: {target_partitions}")

        # step 4: clear the target partition's files + saveAsTable
        # delete_file_in_folder: removes the files but keeps the directory, avoiding a later Hive missing-directory WARN
        # saveAsTable(format='hive', mode='append'): the project's standard pattern
        # format='hive' writes via the table's SerDe definition (LZO text here) instead of forcing ORC
        # mode='append' combined with delete_file_in_folder yields "overwrite" semantics (project-wide convention)
        print(f"\nclearing partition files: {self.hdfs_path}")
        HdfsUtils.delete_file_in_folder(self.hdfs_path)
        print(f"writing table: {self.hive_tb}, partitions: {self.partitions_by}")
        df.write.saveAsTable(
            name=self.hive_tb,
            format='hive',
            mode='append',
            partitionBy=self.partitions_by
        )
        print(f"[write complete]")

        # step 5: verify
        verify_sql = f"""
            SELECT
                COUNT(*) AS row_count,
                COUNT(DISTINCT search_term) AS distinct_terms,
                MIN(date_info_first) AS earliest_first,
                MAX(date_info_last) AS latest_last
            FROM {self.hive_tb}
            WHERE site_name = '{self.site_name}'
        """
        print(f"\n[verify SQL]\n{verify_sql}")
        self.spark.sql(verify_sql).show(truncate=False)
        print(f"\n[done] init dim_st_detail_history site_name={self.site_name}")


if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    if not site_name:
        print("usage:   spark-submit dim_st_detail_history_init.py <site_name>")
        print("example: spark-submit dim_st_detail_history_init.py us")
        sys.exit(1)
    print(f"{'=' * 60}")
    print(f"starting init of accumulating table dim_st_detail_history")
    print(f"site_name = {site_name}")
    print(f"{'=' * 60}")
    obj = DimStDetailHistoryInit(site_name=site_name)
    obj.run()
......@@ -28,7 +28,7 @@ class DwtAbaLastChangeRate(object):
        self.hive_tb = "dwt_aba_last_change_rate"
        app_name = f"{self.hive_tb}:{site_name}:{date_type}:{date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)
-        self.partitions_num = CommonUtil.reset_partitions(site_name, 1)
+        self.partitions_num = CommonUtil.reset_partitions(site_name, 2)
        hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.hive_tb}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
        print(f"clearing data under hdfs dir: {hdfs_path}")
        HdfsUtils.delete_hdfs_file(hdfs_path)
......@@ -46,6 +46,8 @@ class DwtAbaLastChangeRate(object):
        self.df_st_last_data = self.spark.sql(f"select 1+1;")
        self.df_st_last_year_data = self.spark.sql(f"select 1+1;")
        self.df_save = self.spark.sql(f"select 1+1;")
+        # requirement 1: last-6-month rank change rate/amount (month date_type only)
+        self.df_hist_rank = self.spark.sql(f"select 1+1;")  # historical ranks for M-1 ~ M-6
    def handle_date_offset(self, handle_type: int):
        # handle_type = 0 computes the chain (previous-period) date; 1 computes the year-over-year date
......@@ -82,6 +84,13 @@ class DwtAbaLastChangeRate(object):
        self.read_data()
        self.handle_base()
        self.handle_year_ratio()
+        # requirement 1: only the month date_type computes the last-6-month rank changes (12 fields);
+        # other date_types pad them with null so the select in save_data does not fail
+        if self.date_type == DateTypes.month.name:
+            self.handle_rank_rate_history()
+        else:
+            self.handle_rank_rate_padding()
        self.save_data()

    def read_data(self):
......@@ -153,25 +162,49 @@ class DwtAbaLastChangeRate(object):
        self.df_st_last_year_data = self.spark.sql(sql).repartition(40, 'search_term').cache()
        self.df_st_last_year_data.show(10, truncate=True)

+        # requirement 1: month date_type only — read history for the last-6-month rank changes
+        if self.date_type == DateTypes.month.name:
+            # month strings for M-1 ~ M-6 (e.g. 2026-04 → ['2026-03', '2026-02', '2026-01', '2025-12', '2025-11', '2025-10'])
+            month_list = [CommonUtil.get_month_offset(self.date_info, -i) for i in range(1, 7)]
+            print(f"last-6-month history list: {month_list}")
+            # read the M-1 ~ M-6 partitions of dwt_aba_st_analytics in one pass, keeping only
+            # search_term + rank; feeds the 6 change-amount fields plus rank_rate_last_1_month
+            sql_hist_rank = f"""
+                select
+                    search_term,
+                    cast(rank as int) as rank,
+                    date_info
+                from dwt_aba_st_analytics
+                where site_name = '{self.site_name}'
+                  and date_type = '{self.date_type}'
+                  and date_info in ({CommonUtil.list_to_insql(month_list)})
+                  and rank > 0
+            """
+            self.df_hist_rank = self.spark.sql(sql_hist_rank).repartition(40, 'search_term').cache()
+            print("self.df_hist_rank:")
+            self.df_hist_rank.show(10, truncate=True)
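        # For reference, a minimal sketch of the semantics CommonUtil.get_month_offset is assumed
        # to have (the real helper lives in utils.common_util and is not shown in this diff; the
        # body below is an illustration, not the actual implementation):
        #     def get_month_offset(date_info: str, offset: int) -> str:
        #         y, m = map(int, date_info.split('-'))         # 'YYYY-MM' -> (year, month)
        #         total = y * 12 + (m - 1) + offset             # months on an absolute scale
        #         return f"{total // 12}-{total % 12 + 1:02d}"  # back to 'YYYY-MM'
        #     get_month_offset('2026-04', -1)  # -> '2026-03'
        #     get_month_offset('2026-04', -6)  # -> '2025-10'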
    def handle_base(self):
        self.df_st_base_data = self.df_aba_analytics.join(
            self.df_aba_analytics_old, on='id', how='left'
        )
        self.df_st_base_data = self.df_st_base_data.withColumn(
            'rank_rate_of_change',
-            F.round((F.col('rank') - F.col('last_rank')) / F.col('last_rank'), 3)
+            F.round((F.col('rank') - F.col('last_rank')) / F.col('last_rank'), 4)
        ).withColumn(
            'bsr_orders_rate_of_change',
-            F.round((F.col('bsr_orders') - F.col('last_bsr_orders')) / F.col('last_bsr_orders'), 3)
+            F.round((F.col('bsr_orders') - F.col('last_bsr_orders')) / F.col('last_bsr_orders'), 4)
        ).withColumn(
            'cn_seller_rate_of_change',
-            F.round((F.col('asin_cn_count') - F.col('last_asin_cn_count')) / F.col('last_asin_cn_count'), 3)
+            F.round((F.col('asin_cn_count') - F.col('last_asin_cn_count')) / F.col('last_asin_cn_count'), 4)
        ).withColumn(
            'fbm_rate_of_change',
-            F.round((F.col('asin_fbm_count') - F.col('last_asin_fbm_count')) / F.col('last_asin_fbm_count'), 3)
+            F.round((F.col('asin_fbm_count') - F.col('last_asin_fbm_count')) / F.col('last_asin_fbm_count'), 4)
        ).withColumn(
            'amazon_rate_of_change',
-            F.round((F.col('asin_amazon_count') - F.col('last_asin_amazon_count')) / F.col('last_asin_amazon_count'), 3)
+            F.round((F.col('asin_amazon_count') - F.col('last_asin_amazon_count')) / F.col('last_asin_amazon_count'), 4)
        ).select(
            'id', 'search_term', 'rank', 'bsr_orders', 'asin_cn_count', 'asin_fbm_count', 'asin_amazon_count',
            'rank_rate_of_change', 'bsr_orders_rate_of_change', 'cn_seller_rate_of_change',
......@@ -187,19 +220,19 @@ class DwtAbaLastChangeRate(object):
        )
        df_year_ratio = df_year_ratio.withColumn(
            "rank_change_rate",
-            F.round(F.expr("(rank - last_year_rank) / last_year_rank"), 3)
+            F.round(F.expr("(rank - last_year_rank) / last_year_rank"), 4)
        ).withColumn(
            "bsr_orders_change_rate",
-            F.round(F.expr("(bsr_orders - last_year_bsr_orders) / last_year_bsr_orders"), 3)
+            F.round(F.expr("(bsr_orders - last_year_bsr_orders) / last_year_bsr_orders"), 4)
        ).withColumn(
            "cn_seller_change_rate",
-            F.round(F.expr("(asin_cn_count - last_year_asin_cn_count) / last_year_asin_cn_count"), 3)
+            F.round(F.expr("(asin_cn_count - last_year_asin_cn_count) / last_year_asin_cn_count"), 4)
        ).withColumn(
            "fbm_change_rate",
-            F.round(F.expr("(asin_fbm_count - last_year_asin_fbm_count) / last_year_asin_fbm_count"), 3)
+            F.round(F.expr("(asin_fbm_count - last_year_asin_fbm_count) / last_year_asin_fbm_count"), 4)
        ).withColumn(
            "amazon_change_rate",
-            F.round(F.expr("(asin_amazon_count - last_year_asin_amazon_count) / last_year_asin_amazon_count"), 3)
+            F.round(F.expr("(asin_amazon_count - last_year_asin_amazon_count) / last_year_asin_amazon_count"), 4)
        ).withColumn(
            "search_volume_change_rate",
            F.round(F.expr("(search_volume - last_year_search_volume) / last_year_search_volume"), 3)
......@@ -219,6 +252,86 @@ class DwtAbaLastChangeRate(object):
        })
        self.df_save = df_year_ratio

+    # requirement 1: last-6-month rank change rate/amount (called for the month date_type only)
+    def handle_rank_rate_history(self):
+        """
+        Based on self.df_save (which already carries all YoY/MoM fields) plus the
+        M-1 ~ M-6 historical ranks, compute 12 new fields and join them onto self.df_save.
+        Field logic:
+        - rank_change_last_1_month: current-month rank - rank 1 month ago
+        - rank_change_1~5_month_ago: rank N months ago - rank N+1 months ago
+        - rank_rate_last_1_month: current-month rank vs rank 1 month ago
+        - rank_rate_1~5_month_ago: rank N months ago vs rank N+1 months ago (computed from
+          adjacent months in df_hist_rank; no longer reads historical change_rate partitions)
+        Off-chart / newly-charted convention (monthly: negative = improved / rank moved up,
+        positive = worsened / rank moved down):
+        - current rank null (dropped off the chart): change +10000000, rate +1000.0
+        - previous rank null (newly charted):        change -10000000, rate -1000.0
+        - both null:                                 change 0, rate 0.0
+        """
+        month_list = [CommonUtil.get_month_offset(self.date_info, -i) for i in range(1, 7)]
+        # split df_hist_rank by date_info into 6 DFs, one per rank_last_1_month ~ rank_last_6_month
+        df_rank_dfs = {}
+        for i in range(1, 7):
+            df_rank_dfs[i] = self.df_hist_rank.filter(
+                F.col('date_info') == month_list[i - 1]
+            ).drop('date_info').withColumnRenamed('rank', f'rank_last_{i}_month')
+        # left join the 6 historical rank DFs onto self.df_save in turn
+        df = self.df_save
+        for i in range(1, 7):
+            df = df.join(df_rank_dfs[i], on='search_term', how='left')
+        # change amount: dropped-off (+10000000) / newly-charted (-10000000) / both missing (0) / normal difference
+        def _rank_change(cur, prev):
+            return (
+                F.when(F.col(cur).isNull() & F.col(prev).isNull(), F.lit(0))
+                .when(F.col(cur).isNull(), F.lit(10000000))
+                .when(F.col(prev).isNull(), F.lit(-10000000))
+                .otherwise(F.col(cur) - F.col(prev))
+            )
+        # change rate: monthly convention, negative = improved (a smaller rank number means a higher
+        # position); dropped-off (+1000) / newly-charted (-1000) / both missing (0) / normal ratio
+        def _rank_rate(cur, prev):
+            return (
+                F.when(F.col(cur).isNull() & F.col(prev).isNull(), F.lit(0.0))
+                .when(F.col(cur).isNull(), F.lit(1000.0))
+                .when(F.col(prev).isNull(), F.lit(-1000.0))
+                .otherwise(F.round((F.col(cur) - F.col(prev)) / F.col(prev), 4))
+            )
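+        # Worked example of the sentinel convention (illustrative values):
+        #   cur=120,  prev=150  -> change = -30 (rank moved up), rate = round(-30/150, 4) = -0.2
+        #   cur=null, prev=150  -> dropped off the chart: change = +10000000, rate = +1000.0
+        #   cur=120,  prev=null -> newly charted:         change = -10000000, rate = -1000.0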
+        # the 6 change-amount fields
+        df = df.withColumn('rank_change_last_1_month', _rank_change('rank', 'rank_last_1_month')) \
+            .withColumn('rank_change_1_month_ago', _rank_change('rank_last_1_month', 'rank_last_2_month')) \
+            .withColumn('rank_change_2_month_ago', _rank_change('rank_last_2_month', 'rank_last_3_month')) \
+            .withColumn('rank_change_3_month_ago', _rank_change('rank_last_3_month', 'rank_last_4_month')) \
+            .withColumn('rank_change_4_month_ago', _rank_change('rank_last_4_month', 'rank_last_5_month')) \
+            .withColumn('rank_change_5_month_ago', _rank_change('rank_last_5_month', 'rank_last_6_month'))
+        # the 6 change-rate fields (all via _rank_rate, covering dropped-off / newly-charted / normal)
+        df = df.withColumn('rank_rate_last_1_month', _rank_rate('rank', 'rank_last_1_month')) \
+            .withColumn('rank_rate_1_month_ago', _rank_rate('rank_last_1_month', 'rank_last_2_month')) \
+            .withColumn('rank_rate_2_month_ago', _rank_rate('rank_last_2_month', 'rank_last_3_month')) \
+            .withColumn('rank_rate_3_month_ago', _rank_rate('rank_last_3_month', 'rank_last_4_month')) \
+            .withColumn('rank_rate_4_month_ago', _rank_rate('rank_last_4_month', 'rank_last_5_month')) \
+            .withColumn('rank_rate_5_month_ago', _rank_rate('rank_last_5_month', 'rank_last_6_month'))
+        self.df_save = df
+        self.df_hist_rank.unpersist()
+
+    # requirement 1: non-month date_types (day/week/last30day/last365day) pad the 12 fields with null
+    # reason: once the Hive table carries these 12 columns, every partition's schema must include
+    # them, or the select in save_data fails
+    def handle_rank_rate_padding(self):
+        new_cols = [
+            'rank_rate_last_1_month', 'rank_rate_1_month_ago', 'rank_rate_2_month_ago',
+            'rank_rate_3_month_ago', 'rank_rate_4_month_ago', 'rank_rate_5_month_ago',
+            'rank_change_last_1_month', 'rank_change_1_month_ago', 'rank_change_2_month_ago',
+            'rank_change_3_month_ago', 'rank_change_4_month_ago', 'rank_change_5_month_ago',
+        ]
+        for col_name in new_cols:
+            self.df_save = self.df_save.withColumn(col_name, F.lit(None))
    def handle_365_data(self):
        sql = f"""
            with base_data as (
......@@ -258,10 +371,10 @@ class DwtAbaLastChangeRate(object):
            base.search_term,
            base.rank,
            base.bsr_orders,
-            round((base.rank - chain.last_rank)/chain.last_rank,3) as rank_rate_of_change,
-            round((base.bsr_orders - chain.last_bsr_orders)/chain.last_bsr_orders,3) as bsr_orders_rate_of_change,
-            round((base.rank - year.last_year_rank)/year.last_year_rank,3) as rank_change_rate,
-            round((base.bsr_orders - year.last_year_bsr_orders)/year.last_year_bsr_orders,3) as bsr_orders_change_rate
+            round((base.rank - chain.last_rank)/chain.last_rank,4) as rank_rate_of_change,
+            round((base.bsr_orders - chain.last_bsr_orders)/chain.last_bsr_orders,4) as bsr_orders_rate_of_change,
+            round((base.rank - year.last_year_rank)/year.last_year_rank,4) as rank_change_rate,
+            round((base.bsr_orders - year.last_year_bsr_orders)/year.last_year_bsr_orders,4) as bsr_orders_change_rate
        from base_data base left join chain_ratio_data chain
        on base.id = chain.id
        left join year_ratio_data year
......@@ -303,7 +416,20 @@ class DwtAbaLastChangeRate(object):
F.col("search_volume_change_rate"),
F.lit(self.site_name).alias("site_name"),
F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info")
F.lit(self.date_info).alias("date_info"),
# 需求1:12 个新字段,顺序与 Hive ALTER ADD COLUMNS、PG ADD COLUMN、sqoop_export 严格一致
F.col("rank_rate_last_1_month"),
F.col("rank_rate_1_month_ago"),
F.col("rank_rate_2_month_ago"),
F.col("rank_rate_3_month_ago"),
F.col("rank_rate_4_month_ago"),
F.col("rank_rate_5_month_ago"),
F.col("rank_change_last_1_month"),
F.col("rank_change_1_month_ago"),
F.col("rank_change_2_month_ago"),
F.col("rank_change_3_month_ago"),
F.col("rank_change_4_month_ago"),
F.col("rank_change_5_month_ago")
)
        # type conversion
......
......@@ -53,6 +53,7 @@ class DwtAbaStAnalytics(Templates):
        self.df_asin_label = self.spark.sql(f"select 1+1;")
        self.df_is_hidden_cate = self.spark.sql(f"select 1+1;")
        self.df_asin_profit_rate = self.spark.sql(f"select 1+1;")
+        self.df_history_st = self.spark.sql(f"select 1+1;")  # historical search terms from the accumulating table dim_st_detail_history, feeding the is_first_ever_text flag
        # register the custom udf functions
        self.u_contains = self.spark.udf.register('u_contains', self.udf_contains, IntegerType())
......@@ -469,6 +470,32 @@ class DwtAbaStAnalytics(Templates):
print("self.df_asin_profit_rate:")
self.df_asin_profit_rate.show(10, truncate=True)
if self.date_type == 'month':
# 读累加表 dim_st_detail_history,用于判断 is_first_ever_text(全历史首次出现)
# 调度上累加表 dim_st_detail_history 的当月增量必须先于本脚本完成(包含当月数据)
# 用 date_info_first < self.date_info 过滤:first 早于当月 → 该词历史出现过
# date_info_first 字段永不被覆盖(MIN 聚合天然幂等),任何重跑场景都安全
# 安全检查:累加表必须已初始化(防止新站点忘跑 init)
# 用 take(1) 而不是 count():take 读一行即返回,避免在 LZO 文本格式下扫全表
has_data = self.spark.sql(f"""
SELECT 1 FROM dim_st_detail_history WHERE site_name = '{self.site_name}' LIMIT 1
""").take(1)
assert len(has_data) > 0, f"累加表 dim_st_detail_history site_name={self.site_name} 分区为空,请先跑 dim_st_detail_history_init.py"
sql = f"""
select
search_term
from dim_st_detail_history
where site_name = '{self.site_name}'
and date_info_first < '{self.date_info}'
"""
# date_info_first < 当月 → 该词在当月之前已出现过(历史词)
# 后续 join 上 → is_first_ever_text=0;join 不上 → is_first_ever_text=1(全历史首次)
self.df_history_st = self.spark.sql(sqlQuery=sql).repartition(80, 'search_term').cache()
print("self.df_history_st:")
self.df_history_st.show(10, truncate=True)
    def handle_data(self):
        # join the base calculation tables
        self.handle_base_join()
......@@ -485,6 +512,12 @@ class DwtAbaStAnalytics(Templates):
        # language handling
        self.handle_calc_lang()

+        if self.date_type == 'month':
+            self.handle_first_ever_flag()  # flag first-ever appearance (based on accumulating table dim_st_detail_history); non-month flows pad with -1
+        else:
+            self.df_save = self.df_save.withColumn('is_first_ever_text', F.lit(-1))

        # handle the output columns
        self.handle_column()
......@@ -580,7 +613,8 @@ class DwtAbaStAnalytics(Templates):
        self.df_st_brand_cal = self.df_st_brand_cal.filter("asin_brand_name not in('null','None')")
        self.df_st_brand_cal = self.df_st_brand_cal.groupby(['search_term', 'asin_brand_name']).agg(
            F.sum("asin_amazon_orders").alias("asin_brand_bsr_orders_total"),
-            F.sum("asin_zr_orders").alias("asin_brand_zr_orders_total")
+            F.sum("asin_zr_orders").alias("asin_brand_zr_orders_total"),
+            F.count("asin").alias("brand_asin_count")  # requirement 3: per-brand ASIN count under the search term (feeds the later share calculation)
        )
        self.df_top3_st_brand_cal = self.df_st_brand_cal
......@@ -611,8 +645,10 @@ class DwtAbaStAnalytics(Templates):
        )

        # total brand sales
+        # requirement 3: fold max(brand_asin_count) into this groupby to avoid an extra shuffle
        self.df_st_brand_cal = self.df_st_brand_cal.groupby(['search_term']).agg(
-            F.count_distinct("asin_brand_name").alias("page3_brand_num")
+            F.count_distinct("asin_brand_name").alias("page3_brand_num"),
+            F.max("brand_asin_count").alias("max_brand_asin_count")  # requirement 3: ASIN count of the brand with the most ASINs under the search term
        ).repartition(80, 'search_term')

        # aggregate to st_brand
......@@ -623,14 +659,15 @@ class DwtAbaStAnalytics(Templates):
        )
        self.df_st_brand_cal = self.df_st_brand_cal.select(
-            "search_term", "page3_brand_num", "top3_brand_bsr_orders", "top3_brand_orders"
+            "search_term", "page3_brand_num", "top3_brand_bsr_orders", "top3_brand_orders", "max_brand_asin_count"
        ).cache()
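        # Worked example for requirement 3 (illustrative values): under the search term
        # 'desk lamp', per-brand ASIN counts {BrandA: 12, BrandB: 5, BrandC: 3} give
        # max_brand_asin_count = 12; with total_asin_num = 60, the later
        # brand_asin_proportion = round(12 / 60, 4) = 0.2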
        # compute seller top3 sales and total sales
        self.df_st_seller_cal = self.df_st_seller_cal.filter("account_id is not null")
        self.df_st_seller_cal = self.df_st_seller_cal.groupby(['search_term', 'account_id']).agg(
            F.sum("asin_amazon_orders").alias("asin_seller_bsr_orders_total"),
-            F.sum("asin_zr_orders").alias("asin_seller_zr_orders_total")
+            F.sum("asin_zr_orders").alias("asin_seller_zr_orders_total"),
+            F.count("asin").alias("seller_asin_count")  # requirement 3: per-seller ASIN count under the search term
        )
        self.df_top3_st_seller_cal = self.df_st_seller_cal
......@@ -662,8 +699,10 @@ class DwtAbaStAnalytics(Templates):
        )

        # total seller count
+        # requirement 3: fold max(seller_asin_count) into this groupby to avoid an extra shuffle
        self.df_st_seller_cal = self.df_st_seller_cal.groupby(['search_term']).agg(
-            F.countDistinct("account_id").alias("page3_seller_num")
+            F.countDistinct("account_id").alias("page3_seller_num"),
+            F.max("seller_asin_count").alias("max_seller_asin_count")  # requirement 3: ASIN count of the seller with the most ASINs under the search term
        ).repartition(80, 'search_term')

        # aggregate to st_seller
......@@ -674,7 +713,7 @@ class DwtAbaStAnalytics(Templates):
        )
        self.df_st_seller_cal = self.df_st_seller_cal.select(
-            "search_term", "page3_seller_num", "top3_seller_bsr_orders", "top3_seller_orders"
+            "search_term", "page3_seller_num", "top3_seller_bsr_orders", "top3_seller_orders", "max_seller_asin_count"
        ).cache()

        # compute the final metrics
......@@ -696,39 +735,39 @@ class DwtAbaStAnalytics(Templates):
        df_st_agg = df_st_agg.withColumn(
            # new-product count / total products on the first three pages
            "new_asin_proportion",
-            F.round(F.col("asin_is_new_total") / F.col("asin_count"), 3)
+            F.round(F.col("asin_is_new_total") / F.col("asin_count"), 4)
        ).withColumn(
            # share of A+ products that day
            "aadd_proportion",
-            F.round(F.col("asin_aadd_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_aadd_count") / F.col("asin_count"), 4)
        ).withColumn(
            # share of video products that day
            "sp_proportion",
-            F.round(F.col("asin_video_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_video_count") / F.col("asin_count"), 4)
        ).withColumn(
            # share of FBM products that day
            "fbm_proportion",
-            F.round(F.col("asin_fbm_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_fbm_count") / F.col("asin_count"), 4)
        ).withColumn(
            # share of Chinese sellers
            "cn_proportion",
-            F.round(F.col("asin_cn_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_cn_count") / F.col("asin_count"), 4)
        ).withColumn(
            # share of Amazon first-party listings
            "amzon_proportion",
-            F.round(F.col("asin_amazon_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_amazon_count") / F.col("asin_count"), 4)
        ).withColumn(
            # color share = keyword ASINs that have a color / keyword ASINs
            "color_proportion",
-            F.round(F.col("asin_color_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_color_count") / F.col("asin_count"), 4)
        ).withColumn(
            # multi-color share = products on the first three pages whose titles contain colorful/assorted color/multi color / product count on the first three pages
            "multi_color_proportion",
-            F.round(F.col("asin_multi_color_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_multi_color_count") / F.col("asin_count"), 4)
        ).withColumn(
            # multi-size share
            "multi_size_proportion",
-            F.round(F.col("asin_multi_size_count") / F.col("asin_count"), 3)
+            F.round(F.col("asin_multi_size_count") / F.col("asin_count"), 4)
        ).withColumnRenamed(
            # total new products
            "asin_is_new_total",
......@@ -744,15 +783,25 @@ class DwtAbaStAnalytics(Templates):
        ).withColumn(
            # sales share: new-product sales share
            "new_bsr_orders_proportion",
-            F.round(F.col("new_asin_bsr_orders") / F.col("amazon_monthly_sales"), 3)
+            F.round(F.col("new_asin_bsr_orders") / F.col("amazon_monthly_sales"), 4)
        ).withColumn(
-            # brand monopoly coefficient
+            # brand monopoly coefficient (now rounded to 4 decimals; previously ceil to 3 decimals so monopoly would not be understated)
            "brand_monopoly",
-            F.ceil((F.col("top3_brand_bsr_orders") / F.col("amazon_monthly_sales")) * 1000) / 1000
+            F.round(F.col("top3_brand_bsr_orders") / F.col("amazon_monthly_sales"), 4)
        ).withColumn(
-            # seller monopoly coefficient
+            # seller monopoly coefficient (now rounded to 4 decimals; previously ceil to 3 decimals so monopoly would not be understated)
            "seller_monopoly",
-            F.ceil((F.col("top3_seller_bsr_orders") / F.col("amazon_monthly_sales")) * 1000) / 1000
+            F.round(F.col("top3_seller_bsr_orders") / F.col("amazon_monthly_sales"), 4)
+        ).withColumn(
+            # requirement 3: brand ASIN share = ASIN count of the brand with the most ASINs
+            # under the search term / all products on the first three pages
+            # numerator comes from handle_brand_seller_agg's max_brand_asin_count (null/None brands already filtered)
+            # denominator total_asin_num covers all ASINs (including brandless ones)
+            "brand_asin_proportion",
+            F.round(F.col("max_brand_asin_count") / F.col("total_asin_num"), 4)
+        ).withColumn(
+            # requirement 3: seller ASIN share = ASIN count of the seller with the most ASINs
+            # under the search term / all products on the first three pages
+            "seller_asin_proportion",
+            F.round(F.col("max_seller_asin_count") / F.col("total_asin_num"), 4)
        ).withColumn(
            # number of words the ABA search term splits into
            "st_word_num",
......@@ -810,47 +859,59 @@ class DwtAbaStAnalytics(Templates):
            F.coalesce(udf_detect_phrase_reg(lang_word_map)(F.col("search_term")).getField("lang"), F.lit("other"))
        )

+    # flag first-ever appearance (based on the historical term set in the accumulating table dim_st_detail_history)
+    def handle_first_ever_flag(self):
+        # self.df_history_st was already filtered in read_data to "first < current month",
+        # i.e. terms that appeared before the current month; mark them is_first_ever_text=0
+        # left join from the current month's terms:
+        # - joined     → the table holds a first-appearance record earlier than the current
+        #                month → seen before → is_first_ever_text=0
+        # - not joined → no record earlier than the current month → first-ever appearance → fillna(1)
+        df_flag = self.df_history_st.withColumn('is_first_ever_text', F.lit(0))
+        self.df_save = self.df_save.join(df_flag, on='search_term', how='left') \
+            .fillna({'is_first_ever_text': 1})
+        self.df_history_st.unpersist()
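+        # Worked example (illustrative values): history terms = {'usb hub'}; current-month
+        # terms = {'usb hub', 'solar fan'} -> after the left join + fillna:
+        #   'usb hub'   -> is_first_ever_text = 0  (appeared before this month)
+        #   'solar fan' -> is_first_ever_text = 1  (first appearance in all history)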
    def handle_column(self):
-        # pre-load column handling
+        # pre-load column handling; rounding precision follows the PG cluster column constraints
        self.df_save = self.df_save.select(
            "id",
            "search_term",
            "rank",
            "category_id",
-            "orders",
-            F.col("amazon_monthly_sales").alias("bsr_orders"),
+            F.round("orders", 0).alias("orders"),  # PG int4; the source is double, round to an integer
+            F.col("amazon_monthly_sales").alias("bsr_orders"),  # PG int4; the source is an int sum, no rounding needed
            "search_volume",
            "quantity_being_sold",
-            F.round("st_ao_avg", 3).alias("st_ao_avg"),
-            "st_ao_val_rate",
+            F.round("st_ao_avg", 4).alias("st_ao_avg"),  # PG numeric(25,4)
+            F.round("st_ao_val_rate", 4).alias("st_ao_val_rate"),  # PG numeric(25,4)
            "new_bsr_orders_proportion",
            "new_asin_proportion",
-            F.round("page1_title_proportion", 3).alias("page1_title_proportion"),
-            F.round("price_avg", 3).alias("price_avg"),
-            F.round("total_comments_avg", 0).alias("total_comments_avg"),
-            F.round("rating_avg", 3).alias("rating_avg"),
-            F.round("weight_avg", 3).alias("weight_avg"),
-            F.round("volume_avg", 3).alias("volume_avg"),
-            F.round("title_length_avg", 0).alias("title_length_avg"),
+            F.round("page1_title_proportion", 4).alias("page1_title_proportion"),  # PG numeric(25,4)
+            F.round("price_avg", 2).alias("price_avg"),  # PG numeric(25,2)
+            F.round("total_comments_avg", 0).alias("total_comments_avg"),  # PG int4
+            F.round("rating_avg", 1).alias("rating_avg"),  # PG numeric(25,1)
+            F.round("weight_avg", 4).alias("weight_avg"),  # PG numeric(25,4)
+            F.round("volume_avg", 4).alias("volume_avg"),  # PG numeric(35,4)
+            F.round("title_length_avg", 0).alias("title_length_avg"),  # PG int4
            "st_num",
            "aadd_proportion",
            "sp_proportion",
            "fbm_proportion",
            "cn_proportion",
            "amzon_proportion",
-            "most_proportion",
-            "max_num",
+            F.round("most_proportion", 4).alias("most_proportion"),  # PG numeric(25,4)
+            F.round("max_num", 4).alias("max_num"),  # PG numeric(25,4)
            "asin1",
            "asin2",
            "asin3",
-            F.round("click_share1", 3).alias("click_share1"),
-            F.round("click_share2", 3).alias("click_share2"),
-            F.round("click_share3", 3).alias("click_share3"),
-            F.round("total_click_share", 3).alias("total_click_share"),
-            F.round("conversion_share1", 3).alias("conversion_share1"),
-            F.round("conversion_share2", 3).alias("conversion_share2"),
-            F.round("conversion_share3", 3).alias("conversion_share3"),
-            F.round("total_conversion_share", 3).alias("total_conversion_share"),
+            F.round("click_share1", 4).alias("click_share1"),  # PG numeric(20,4)
+            F.round("click_share2", 4).alias("click_share2"),  # PG numeric(20,4)
+            F.round("click_share3", 4).alias("click_share3"),  # PG numeric(20,4)
+            F.round("total_click_share", 4).alias("total_click_share"),  # PG numeric(20,4)
+            F.round("conversion_share1", 4).alias("conversion_share1"),  # PG numeric(20,4)
+            F.round("conversion_share2", 4).alias("conversion_share2"),  # PG numeric(20,4)
+            F.round("conversion_share3", 4).alias("conversion_share3"),  # PG numeric(20,4)
+            F.round("total_conversion_share", 4).alias("total_conversion_share"),  # PG numeric(20,4)
            "new_asin_num",
            "total_asin_num",
            "new_asin_orders",
......@@ -871,7 +932,7 @@ class DwtAbaStAnalytics(Templates):
"is_new_market_segment",
F.when(F.col('category_current_id').isNull(), F.col('category_id'))
.otherwise(F.col('category_current_id')).alias('category_current_id'),
"supply_demand",
F.round("supply_demand", 4).alias("supply_demand"), # PG numeric(25,4)
"market_cycle_type",
"color_proportion",
"multi_color_proportion",
......@@ -899,25 +960,28 @@ class DwtAbaStAnalytics(Templates):
"st_bsr_cate_current_id_new",
"st_crawl_date",
"is_high_return_text",
F.round("st_zr_page123_title_appear_rate", 3).alias("st_zr_page123_title_appear_rate"),
F.round("st_sp_page123_title_appear_rate", 3).alias("st_sp_page123_title_appear_rate"),
F.round("st_zr_page123_title_appear_rate", 4).alias("st_zr_page123_title_appear_rate"), # PG numeric(20,4)
F.round("st_sp_page123_title_appear_rate", 4).alias("st_sp_page123_title_appear_rate"), # PG numeric(20,4)
"st_competition_level",
"amazon_monthly_sales",
"st_zr_flow_proportion",
"st_ao_val_matrix",
"st_flow_proportion_matrix",
F.round("st_zr_flow_proportion", 4).alias("st_zr_flow_proportion"), # PG numeric(25,4)
F.round("st_ao_val_matrix", 4).alias("st_ao_val_matrix"), # PG numeric(25,4)
F.round("st_flow_proportion_matrix", 4).alias("st_flow_proportion_matrix"), # PG numeric(25,4)
"st_zr_counts",
"st_sp_counts",
"st_self_asin_counts",
"st_self_asin_proportion",
F.round("st_self_asin_proportion", 4).alias("st_self_asin_proportion"), # PG numeric(25,4)
"lang",
"asin_movie_type_count",
"is_hidden_cate",
"st_dd50_proportion",
"st_dd100_proportion",
"st_dd200_proportion",
F.round("ocean_profit_avg", 4).alias("gross_profit_fee_sea"),
F.round("air_profit_avg", 4).alias("gross_profit_fee_air")
F.round("st_dd50_proportion", 4).alias("st_dd50_proportion"), # PG numeric(25,4)
F.round("st_dd100_proportion", 4).alias("st_dd100_proportion"), # PG numeric(25,4)
F.round("st_dd200_proportion", 4).alias("st_dd200_proportion"), # PG numeric(25,4)
F.round("ocean_profit_avg", 4).alias("gross_profit_fee_sea"), # PG numeric(25,4)
F.round("air_profit_avg", 4).alias("gross_profit_fee_air"), # PG numeric(25,4)
"is_first_ever_text", # 需求2:全历史首次出现标记(1=当月首次, 0=历史出现过)
"brand_asin_proportion", # 需求3:前三页ASIN数最多品牌的ASIN数占比
"seller_asin_proportion" # 需求3:前三页ASIN数最多卖家的ASIN数占比
)
        # null handling
......@@ -933,7 +997,9 @@ class DwtAbaStAnalytics(Templates):
            "is_high_return_text": 0,
            "amazon_monthly_sales": 0,
            "bsr_orders": 0,
-            "is_hidden_cate": 0
+            "is_hidden_cate": 0,
+            "brand_asin_proportion": -1,  # requirement 3: null numerator (the term has no branded ASINs) → placeholder -1
+            "seller_asin_proportion": -1  # requirement 3: null numerator (the term has no seller accounts) → placeholder -1
        })

        # date field completion
......
......@@ -6,6 +6,7 @@ sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil, DateTypes
from utils.db_util import DBUtil
+from utils.hdfs_utils import HdfsUtils

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
......@@ -31,7 +32,14 @@ if __name__ == '__main__':
    # get the database connection
    engine = DBUtil.get_db_engine(db_type, site_name)

+    # before exporting, verify the Hive partition has data files; exporting an empty
+    # partition would trigger the table swap and wipe the PG data
+    hive_partition_path = f"/home/big_data_selection/dwt/dwt_aba_last_change_rate/site_name={site_name}/date_type={date_type}/date_info={date_info}"
+    hive_files = HdfsUtils.read_list(hive_partition_path)
+    if not hive_files:
+        print(f"[ERROR] no data files in Hive partition, path: {hive_partition_path}; skipping export. Check that the DWT job actually wrote output!")
+        engine.dispose()
+        sys.exit(1)
+    print(f"Hive partition file count: {len(hive_files)}, path: {hive_partition_path}; continuing export")
    # export table
    export_base_tb = f"{site_name}_aba_last_change_rate"
......@@ -76,30 +84,57 @@ if __name__ == '__main__':
"date_type": date_type,
"date_info": date_info
}
# 基础导出字段(所有 date_type 通用)
export_base_cols = [
"search_term",
"search_term_id",
"date_type",
"date_info",
"rank_change_rate",
"bsr_orders_change_rate",
"cn_seller_change_rate",
"fbm_change_rate",
"amazon_change_rate",
"rank_rate_of_change",
"bsr_orders_rate_of_change",
"cn_seller_rate_of_change",
"fbm_rate_of_change",
"amazon_rate_of_change",
"created_time",
"updated_time"
]
# 需求1:12 个新字段为 month 月度专属
# - dwt_aba_last_change_rate_new.py 中 handle_rank_rate_history 仅 month 类型才计算实际值
# - 其他 date_type(day/week/last30day/last365day)走 handle_rank_rate_padding 填 null
# - 既然非 month 全是 null,就不该导出(避免污染 PG,避免 last365day 等独立表强制加列)
if date_type == DateTypes.month.name:
month_extra_cols = [
"rank_rate_last_1_month",
"rank_rate_1_month_ago",
"rank_rate_2_month_ago",
"rank_rate_3_month_ago",
"rank_rate_4_month_ago",
"rank_rate_5_month_ago",
"rank_change_last_1_month",
"rank_change_1_month_ago",
"rank_change_2_month_ago",
"rank_change_3_month_ago",
"rank_change_4_month_ago",
"rank_change_5_month_ago"
]
export_cols = export_base_cols + month_extra_cols
else:
export_cols = export_base_cols
# 导出表名
sh = CommonUtil.build_export_sh(
site_name=site_name,
db_type=db_type,
hive_tb="dwt_aba_last_change_rate",
export_tb=export_tb,
col=[
"search_term",
"search_term_id",
"date_type",
"date_info",
"rank_change_rate",
"bsr_orders_change_rate",
"cn_seller_change_rate",
"fbm_change_rate",
"amazon_change_rate",
"rank_rate_of_change",
"bsr_orders_rate_of_change",
"cn_seller_rate_of_change",
"fbm_rate_of_change",
"amazon_rate_of_change",
"created_time",
"updated_time"
],
col=export_cols,
partition_dict=partition_dict
)
......
......@@ -6,6 +6,7 @@ sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil, DateTypes
from utils.db_util import DBUtil
+from utils.hdfs_utils import HdfsUtils

if __name__ == '__main__':
    # read the input arguments
......@@ -37,6 +38,16 @@ if __name__ == '__main__':
    # get the database connection
    engine = DBUtil.get_db_engine(db_type, site_name)

+    # before exporting, verify the Hive partition has data files; exporting an empty
+    # partition would trigger the table swap and wipe the PG data
+    hive_partition_path = f"/home/big_data_selection/dwt/dwt_aba_st_analytics/site_name={site_name}/date_type={date_type}/date_info={date_info}"
+    hive_files = HdfsUtils.read_list(hive_partition_path)
+    if not hive_files:
+        print(f"[ERROR] no data files in Hive partition, path: {hive_partition_path}; skipping export. Check that the DWT job actually wrote output!")
+        engine.dispose()
+        sys.exit(1)
+    print(f"Hive partition file count: {len(hive_files)}, path: {hive_partition_path}; continuing export")

    suffix = str(date_info).replace("-", "_")
    # export -- base table name
......@@ -175,7 +186,13 @@ if __name__ == '__main__':
    # month-only export fields
    tb_cols = [
        "is_new_market_segment", "color_proportion", "supply_demand", "market_cycle_type", "is_high_return_text",
-        "st_zr_counts", "st_sp_counts", "st_self_asin_counts", "st_self_asin_proportion"
+        "st_zr_counts", "st_sp_counts", "st_self_asin_counts", "st_self_asin_proportion",
+        # requirements 2 + 3: month-only fields (meaningful only in the month flow)
+        # is_first_ever_text depends on the accumulating table dim_st_detail_history (month data only)
+        # brand_asin_proportion / seller_asin_proportion serve the monthly search-term filter page
+        "is_first_ever_text",
+        "brand_asin_proportion",
+        "seller_asin_proportion"
    ]
    # handle the export tables
    export_master_tb = f"{export_base_tb}_{date_type}_{year_str}"
......