Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
3a52c2df
Commit
3a52c2df
authored
Sep 28, 2025
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
cda0402a
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
40 additions
and
3 deletions
+40
-3
common_udf.py
Pyspark_job/yswg_utils/common_udf.py
+40
-3
No files found.
Pyspark_job/yswg_utils/common_udf.py
View file @
3a52c2df
...
...
@@ -680,13 +680,14 @@ def udf_extract_weight_format(weight_str: str):
# 分类提取-返回: 一级/当前分类id+一级/当前分类排名
# 参考dim_asin_bs_info.py使用
def
udf_parse_bs_category
(
asin_bs_sellers_rank_lower
,
last_herf
,
all_best_sellers_href
,
cate_current_pattern
,
cate_1_pattern
):
cate_1_pattern
,
node_id
):
"""
asin_bs_sellers_rank_lower: 底部分类字符串
last_herf: 最后一级分类链接
all_best_sellers_href: 所有分类链接
cate_current_pattern: 当前分类排名匹配规则
cate_1_pattern: 一级分类排名匹配规则
node_id: 页面头部抓取分类id
"""
# if (site_name == 'us' and date_type in ['month', 'month_week'] and date_info >= '2023-11') or (site_name != 'us' and date_type in ['week'] and date_info >= '2023-41'):
...
...
@@ -711,7 +712,43 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
break
# 2. 解析一级和当前 分类 + 排名
# 2.1 提取分类
# 2.1 先检查 node_id 是否在 href_list 中
cate_1_id
,
cate_current_id
,
cate_1_rank
,
cate_current_rank
=
None
,
None
,
None
,
None
if
node_id
and
len
(
href_list
)
>
1
:
node_id_str
=
str
(
node_id
)
matched_idx
=
None
for
i
,
href
in
enumerate
(
href_list
):
if
node_id_str
in
href
:
# 判断node_id是否在url中出现
matched_idx
=
i
break
if
matched_idx
is
not
None
:
# 提取对应分类ID
cate_current_id
=
re
.
findall
(
'bestsellers/(.*)/ref'
,
href_list
[
matched_idx
])
cate_current_id
=
cate_current_id
[
0
]
.
split
(
"/"
)[
-
1
]
if
cate_current_id
else
None
# 一级分类还是取第一个
cate_1_id
=
re
.
findall
(
'bestsellers/(.*)/ref'
,
href_list
[
0
])
cate_1_id
=
cate_1_id
[
0
]
.
split
(
"/"
)[
0
]
if
cate_1_id
else
None
# 解析排名
if
asin_bs_sellers_rank_lower
is
not
None
:
asin_bs_sellers_rank_lower2
=
asin_bs_sellers_rank_lower
.
replace
(
"."
,
""
)
.
replace
(
","
,
""
)
.
replace
(
" 100 "
,
""
)
else
:
asin_bs_sellers_rank_lower2
=
''
rank_list
=
re
.
findall
(
cate_current_pattern
,
asin_bs_sellers_rank_lower2
)
rank_list
=
[
int
(
rank
)
for
rank
in
rank_list
]
# 如果 rank_list 长度和 href_list 对齐,则取对应位置的排名
if
matched_idx
<
len
(
rank_list
):
cate_current_rank
=
rank_list
[
matched_idx
]
# 一级分类排名
if
rank_list
and
cate_1_pattern
in
asin_bs_sellers_rank_lower
:
cate_1_rank
=
rank_list
[
0
]
return
cate_1_id
,
cate_current_id
,
cate_1_rank
,
cate_current_rank
# 2.2 提取分类
if
href_list
:
if
len
(
href_list
)
==
1
:
cate_list
=
re
.
findall
(
'bestsellers/(.*)/ref'
,
href_list
[
0
])
...
...
@@ -735,7 +772,7 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
else
:
cate_1_id
,
cate_current_id
=
None
,
None
# 2.
2
提取排名
# 2.
3
提取排名
if
asin_bs_sellers_rank_lower
is
not
None
:
asin_bs_sellers_rank_lower2
=
asin_bs_sellers_rank_lower
.
replace
(
"."
,
""
)
.
replace
(
","
,
""
)
.
replace
(
" 100 "
,
""
)
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment