Commit 3a52c2df by chenyuanjie

no message

parent cda0402a
...@@ -680,13 +680,14 @@ def udf_extract_weight_format(weight_str: str): ...@@ -680,13 +680,14 @@ def udf_extract_weight_format(weight_str: str):
# 分类提取-返回: 一级/当前分类id+一级/当前分类排名 # 分类提取-返回: 一级/当前分类id+一级/当前分类排名
# 参考dim_asin_bs_info.py使用 # 参考dim_asin_bs_info.py使用
def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern, def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern,
cate_1_pattern): cate_1_pattern, node_id):
""" """
asin_bs_sellers_rank_lower: 底部分类字符串 asin_bs_sellers_rank_lower: 底部分类字符串
last_herf: 最后一级分类链接 last_herf: 最后一级分类链接
all_best_sellers_href: 所有分类链接 all_best_sellers_href: 所有分类链接
cate_current_pattern: 当前分类排名匹配规则 cate_current_pattern: 当前分类排名匹配规则
cate_1_pattern: 一级分类排名匹配规则 cate_1_pattern: 一级分类排名匹配规则
node_id: 页面头部抓取分类id
""" """
# if (site_name == 'us' and date_type in ['month', 'month_week'] and date_info >= '2023-11') or (site_name != 'us' and date_type in ['week'] and date_info >= '2023-41'): # if (site_name == 'us' and date_type in ['month', 'month_week'] and date_info >= '2023-11') or (site_name != 'us' and date_type in ['week'] and date_info >= '2023-41'):
...@@ -711,7 +712,43 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller ...@@ -711,7 +712,43 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
break break
# 2. 解析一级和当前 分类 + 排名 # 2. 解析一级和当前 分类 + 排名
# 2.1 提取分类 # 2.1 先检查 node_id 是否在 href_list 中
cate_1_id, cate_current_id, cate_1_rank, cate_current_rank = None, None, None, None
if node_id and len(href_list) > 1:
node_id_str = str(node_id)
matched_idx = None
for i, href in enumerate(href_list):
if node_id_str in href: # 判断node_id是否在url中出现
matched_idx = i
break
if matched_idx is not None:
# 提取对应分类ID
cate_current_id = re.findall('bestsellers/(.*)/ref', href_list[matched_idx])
cate_current_id = cate_current_id[0].split("/")[-1] if cate_current_id else None
# 一级分类还是取第一个
cate_1_id = re.findall('bestsellers/(.*)/ref', href_list[0])
cate_1_id = cate_1_id[0].split("/")[0] if cate_1_id else None
# 解析排名
if asin_bs_sellers_rank_lower is not None:
asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
else:
asin_bs_sellers_rank_lower2 = ''
rank_list = re.findall(cate_current_pattern, asin_bs_sellers_rank_lower2)
rank_list = [int(rank) for rank in rank_list]
# 如果 rank_list 长度和 href_list 对齐,则取对应位置的排名
if matched_idx < len(rank_list):
cate_current_rank = rank_list[matched_idx]
# 一级分类排名
if rank_list and cate_1_pattern in asin_bs_sellers_rank_lower:
cate_1_rank = rank_list[0]
return cate_1_id, cate_current_id, cate_1_rank, cate_current_rank
# 2.2 提取分类
if href_list: if href_list:
if len(href_list) == 1: if len(href_list) == 1:
cate_list = re.findall('bestsellers/(.*)/ref', href_list[0]) cate_list = re.findall('bestsellers/(.*)/ref', href_list[0])
...@@ -735,7 +772,7 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller ...@@ -735,7 +772,7 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
else: else:
cate_1_id, cate_current_id = None, None cate_1_id, cate_current_id = None, None
# 2.2 提取排名 # 2.3 提取排名
if asin_bs_sellers_rank_lower is not None: if asin_bs_sellers_rank_lower is not None:
asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "") asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
else: else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment