no message

3a52c2df · chenyuanjie · cda0402a · 3a52c2df
Commit 3a52c2df authored Sep 28, 2025 by chenyuanjie
Show whitespace changes
Inline Side-by-side

Showing with 40 additions and 3 deletions

common_udf.py Pyspark_job/yswg_utils/common_udf.py +40 -3

No files found.
--- a/Pyspark_job/yswg_utils/common_udf.py
+++ b/Pyspark_job/yswg_utils/common_udf.py
@@ -680,13 +680,14 @@ def udf_extract_weight_format(weight_str: str):
 # 分类提取-返回: 一级/当前分类id+一级/当前分类排名
 # 参考dim_asin_bs_info.py使用
 def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern,
-                          cate_1_pattern):
+                          cate_1_pattern, node_id):
    """
    asin_bs_sellers_rank_lower: 底部分类字符串
    last_herf： 最后一级分类链接
    all_best_sellers_href： 所有分类链接
    cate_current_pattern： 当前分类排名匹配规则
    cate_1_pattern： 一级分类排名匹配规则
+    node_id： 页面头部抓取分类id
    """

    # if (site_name == 'us' and date_type in ['month', 'month_week'] and date_info >= '2023-11') or (site_name != 'us' and date_type in ['week'] and date_info >= '2023-41'):
@@ -711,7 +712,43 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
            break

    # 2. 解析一级和当前 分类 + 排名
-    # 2.1 提取分类
+    # 2.1 先检查 node_id 是否在 href_list 中
+    cate_1_id, cate_current_id, cate_1_rank, cate_current_rank = None, None, None, None
+    if node_id and len(href_list) > 1:
+        node_id_str = str(node_id)
+        matched_idx = None
+        for i, href in enumerate(href_list):
+            if node_id_str in href:  # 判断node_id是否在url中出现
+                matched_idx = i
+                break
+
+        if matched_idx is not None:
+            # 提取对应分类ID
+            cate_current_id = re.findall('bestsellers/(.*)/ref', href_list[matched_idx])
+            cate_current_id = cate_current_id[0].split("/")[-1] if cate_current_id else None
+
+            # 一级分类还是取第一个
+            cate_1_id = re.findall('bestsellers/(.*)/ref', href_list[0])
+            cate_1_id = cate_1_id[0].split("/")[0] if cate_1_id else None
+
+            # 解析排名
+            if asin_bs_sellers_rank_lower is not None:
+                asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
+            else:
+                asin_bs_sellers_rank_lower2 = ''
+            rank_list = re.findall(cate_current_pattern, asin_bs_sellers_rank_lower2)
+            rank_list = [int(rank) for rank in rank_list]
+
+            # 如果 rank_list 长度和 href_list 对齐，则取对应位置的排名
+            if matched_idx < len(rank_list):
+                cate_current_rank = rank_list[matched_idx]
+            # 一级分类排名
+            if rank_list and cate_1_pattern in asin_bs_sellers_rank_lower:
+                cate_1_rank = rank_list[0]
+
+            return cate_1_id, cate_current_id, cate_1_rank, cate_current_rank
+
+    # 2.2 提取分类
    if href_list:
        if len(href_list) == 1:
            cate_list = re.findall('bestsellers/(.*)/ref', href_list[0])
@@ -735,7 +772,7 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
    else:
        cate_1_id, cate_current_id = None, None

-    # 2.2 提取排名
+    # 2.3 提取排名
    if asin_bs_sellers_rank_lower is not None:
        asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
    else: