Commit de54f924 by chenyuanjie

增加字段asin_rank_in_cate:排名in分类 &&&&拼接

parent fdc90083
...@@ -48,6 +48,16 @@ class DimBsAsinInfo(Templates): ...@@ -48,6 +48,16 @@ class DimBsAsinInfo(Templates):
"fr": "(\d+) en ", "fr": "(\d+) en ",
"it": "(\d+) in ", "it": "(\d+) in ",
} }
# asin_rank_in_cate 解析正则(与 kafka_flow_asin_detail_to_doris.py udf_rank_and_category 一致)
self.pattern_str = {
"us": r"(\d+ in [\w&' -]+)",
"uk": r"(\d+ in [\w&' -]+)",
"de": r"nr (\d+ in [\w&' -]+?)(?=\s*\(|\s+nr\s+\d|$)",
"es": r"nº(\d+ en [\w&' -]+)",
"fr": r"(\d+ en [\w&' -]+)",
"it": r"n (\d+ in [\w&' -]+)",
}
self.u_rank_and_category = F.udf(self.udf_rank_and_category, StringType())
self.partitions_by = ['site_name', 'date_type', 'date_info'] self.partitions_by = ['site_name', 'date_type', 'date_info']
self.reset_partitions(partitions_num=20) self.reset_partitions(partitions_num=20)
self.get_year_week_tuple() self.get_year_week_tuple()
...@@ -172,6 +182,26 @@ class DimBsAsinInfo(Templates): ...@@ -172,6 +182,26 @@ class DimBsAsinInfo(Templates):
return cate_1_id, cate_current_id, cate_1_rank, cate_current_rank return cate_1_id, cate_current_id, cate_1_rank, cate_current_rank
@staticmethod
def udf_rank_and_category(best_sellers_rank, pattern_str, top100_prefix):
    """Extract every "rank + category" segment from a best-sellers-rank string.

    The raw text is cleaned ("." and "," removed, parenthesised notes removed,
    ``top100_prefix`` removed case-insensitively), then ``pattern_str`` is
    applied case-insensitively to the cleaned text itself, so category names
    keep their original casing. The extracted segments are joined with "&&&&".

    Per the original author's note this is meant to mirror the helper of the
    same name in kafka_flow_asin_detail_to_doris.py (not verifiable from here).

    Args:
        best_sellers_rank: Raw best-sellers-rank text; may be None.
        pattern_str: Regex locating one "rank + category" segment. When it
            defines a capture group, group 1 is the extracted segment (the
            same convention as ``re.findall``), so leading markers such as
            "nr " / "nº" / "n " that sit outside the group are dropped.
            Without a capture group the whole match is used.
        top100_prefix: Literal text removed from the input before matching
            (case-insensitive). An empty value disables this step.

    Returns:
        The segments joined by "&&&&", or None when the input is None or
        nothing matches.
    """
    import re

    if best_sellers_rank is None:
        return None

    # Drop "." and "," so thousands separators ("1,234" / "1.234") collapse
    # into a single run of digits that \d+ can match.
    text = str(best_sellers_rank).replace(".", "").replace(",", "")
    # Remove parenthesised notes such as "(See Top 100 in ...)" together with
    # the whitespace in front of them, so the category name that precedes the
    # note does not end with a dangling space.
    text = re.sub(r"\s*\([^)]*\)", "", text)
    if top100_prefix:
        text = re.sub(re.escape(top100_prefix), "", text, flags=re.IGNORECASE)

    # Match case-insensitively on the cleaned text directly. Locating spans on
    # a separately lower-cased copy is unsafe because str.lower() can change
    # the string length (e.g. "İ" becomes two code points), which shifts the
    # spans relative to the original text.
    regex = re.compile(pattern_str, flags=re.IGNORECASE)
    segments = []
    for found in regex.finditer(text):
        segment = found.group(1) if regex.groups else None
        if segment is None:
            # No capture group (or it did not participate): use the full match.
            segment = found.group(0)
        segments.append(segment)

    if not segments:
        return None
    return "&&&&".join(segments)
def read_data(self): def read_data(self):
sql = f"select asin, category_id as asin_bs_cate_current_id_node, category_first_id as asin_bs_cate_1_id_node from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info ='{self.date_info}';" # and date_info>='2023-15' sql = f"select asin, category_id as asin_bs_cate_current_id_node, category_first_id as asin_bs_cate_1_id_node from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info ='{self.date_info}';" # and date_info>='2023-15'
print(f"1. 读取dim_asin_detail表node_id数据: sql -- {sql}") print(f"1. 读取dim_asin_detail表node_id数据: sql -- {sql}")
...@@ -243,6 +273,17 @@ class DimBsAsinInfo(Templates): ...@@ -243,6 +273,17 @@ class DimBsAsinInfo(Templates):
.withColumn('asin_bs_cate_current_rank', .withColumn('asin_bs_cate_current_rank',
self.df_bs_asin_detail.asin_bs_cate_ranks.getField('asin_bs_cate_current_rank')) \ self.df_bs_asin_detail.asin_bs_cate_ranks.getField('asin_bs_cate_current_rank')) \
.drop('asin_bs_cate_ranks') .drop('asin_bs_cate_ranks')
# 新增 asin_rank_in_cate:从原文 asin_bs_sellers_rank 解析排名+分类,&&&& 拼接,保留分类名大小写
top100_prefix = self.pattern1_dict[self.site_name]
pattern_str = self.pattern_str[self.site_name]
self.df_bs_asin_detail = self.df_bs_asin_detail.withColumn(
'asin_rank_in_cate',
self.u_rank_and_category(
F.col('asin_bs_sellers_rank'),
F.lit(pattern_str),
F.lit(top100_prefix),
)
)
# self.df_bs_asin_detail.show(10, truncate=False) # self.df_bs_asin_detail.show(10, truncate=False)
# self.df_save = self.df_asin_node_id.join( # self.df_save = self.df_asin_node_id.join(
# self.df_bs_asin_detail, 'asin', how='left' # self.df_bs_asin_detail, 'asin', how='left'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment