Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
8b03ea2a
Commit
8b03ea2a
authored
Mar 19, 2026
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
隐藏分类补充解析
parent
b75a2b29
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
3 deletions
+20
-3
dwt_flow_asin.py
Pyspark_job/dwt/dwt_flow_asin.py
+20
-3
No files found.
Pyspark_job/dwt/dwt_flow_asin.py
View file @
8b03ea2a
...
...
@@ -521,10 +521,27 @@ class DwtFlowAsin(Templates):
CASE WHEN hide_flag = 1 THEN 1 WHEN category_first_id = 'grocery' and category_id != '6492272011' THEN 1
WHEN category_id in ('21393128011', '21377129011', '21377127011', '21377130011', '21388218011', '21377132011') THEN 1
ELSE 0 END"""
))
.
drop
(
"hide_flag"
)
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"asin_is_need"
,
F
.
expr
(
"""
CASE WHEN category_first_id in ('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed') THEN 1
WHEN asin NOT LIKE 'B0
%
' THEN 1
# 解析 asin_category_desc 取 › 分隔的第一个元素作为补充分类名称
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"desc_category_first_name"
,
F
.
lower
(
F
.
trim
(
F
.
split
(
F
.
col
(
"asin_category_desc"
),
"›"
)
.
getItem
(
0
)))
)
# 读取 Hive 分类维表,获取分类名称与ID的对应关系
sql_dim
=
f
"""
select lower(trim(en_name)) as desc_category_first_name, category_first_id as desc_category_first_id
from dim_bsr_category_tree where site_name = '{self.site_name}' and category_parent_id = 0 and leaf_node = 2
"""
df_bsr_category
=
F
.
broadcast
(
self
.
spark
.
sql
(
sqlQuery
=
sql_dim
))
# join 补充分类ID
self
.
df_asin_detail
=
self
.
df_asin_detail
.
join
(
df_bsr_category
,
on
=
[
'desc_category_first_name'
],
how
=
'left'
)
# 两个分类ID均在过滤列表中才标记为1
need_categories
=
"('mobile-apps', 'audible', 'books', 'music', 'dmusic', 'digital-text', 'magazines', 'movies-tv', 'software', 'videogames', 'amazon-devices', 'boost', 'us-live-explorations', 'amazon-renewed')"
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"asin_is_need"
,
F
.
expr
(
f
"""
CASE WHEN category_first_id in {need_categories}
AND desc_category_first_id in {need_categories} THEN 1
WHEN asin NOT LIKE 'B0
%
' THEN 1
ELSE 0 END"""
))
self
.
df_asin_detail
=
self
.
df_asin_detail
.
drop
(
"desc_category_first_name"
,
"desc_category_first_id"
)
self
.
df_asin_detail
=
self
.
df_asin_detail
.
withColumn
(
"asin_type"
,
F
.
expr
(
"""
CASE WHEN asin_is_self=1 THEN 1 WHEN asin_is_need=1 THEN 2 WHEN asin_is_hide=1 THEN 3 ELSE 0 END"""
))
.
drop
(
"asin_is_self"
,
"asin_is_need"
,
"asin_is_hide"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment