Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
35d1e0b5
Commit
35d1e0b5
authored
Jun 16, 2026
by
hejiangming
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
9e362c9c
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
59 additions
and
0 deletions
+59
-0
dws_aba_word_heat.py
Pyspark_job/dws/dws_aba_word_heat.py
+6
-0
dws_st_theme.py
Pyspark_job/dws/dws_st_theme.py
+53
-0
No files found.
Pyspark_job/dws/dws_aba_word_heat.py
View file @
35d1e0b5
...
@@ -123,6 +123,12 @@ class DwsAbaWordHeat(Templates):
...
@@ -123,6 +123,12 @@ class DwsAbaWordHeat(Templates):
pwd
=
pg_conn
[
"pwd"
],
username
=
pg_conn
[
"username"
],
pwd
=
pg_conn
[
"pwd"
],
username
=
pg_conn
[
"username"
],
query
=
sql_theme
query
=
sql_theme
)
)
# 词典 theme_ch 可能是逗号双值(如 "主题,风格"),且中英文逗号都可能出现。
# 先把全角逗号统一成半角,下游 _build_base_theme 的 split(theme_ch_list, ',') 才能把双值拆成原子主题;
# 否则中文(全角)逗号 "主题,风格" split 不开 → 整块变成一个废主题 token(与 dws_st_theme 同一个病)。
df_theme_raw
=
df_theme_raw
.
withColumn
(
"theme_ch"
,
F
.
regexp_replace
(
"theme_ch"
,
","
,
","
)
)
# 同一 word 在词典里可能有多行主题(如某词既是材质也是细分人群)
# 同一 word 在词典里可能有多行主题(如某词既是材质也是细分人群)
# 这里先按 word 聚合(sort_array 保证幂等);后续 _build_base_theme 还会再做一次 base 维度聚合
# 这里先按 word 聚合(sort_array 保证幂等);后续 _build_base_theme 还会再做一次 base 维度聚合
self
.
df_theme_agg
=
df_theme_raw
.
groupBy
(
'word'
)
.
agg
(
self
.
df_theme_agg
=
df_theme_raw
.
groupBy
(
'word'
)
.
agg
(
...
...
Pyspark_job/dws/dws_st_theme.py
View file @
35d1e0b5
...
@@ -236,12 +236,65 @@ class DwsStTheme(Templates):
...
@@ -236,12 +236,65 @@ class DwsStTheme(Templates):
query
=
sql4
query
=
sql4
)
)
# 词典主题双值拆分:theme_ch / theme_en 成对的逗号双值(如 "主题,风格" ↔ "topic,style")拆成多行
# 必须放在单复数扩展之前:让下游 collect_set 去重、sort_array 排序都基于单个原子主题
self
.
df_theme
=
self
.
_split_theme_multi_value
(
self
.
df_theme
)
# 词典单复数双向扩展(替代业务方手工录复数列)
# 词典单复数双向扩展(替代业务方手工录复数列)
# 业务问题:业务方录入只覆盖一种形式(girl 或 girls),搜索词里的另一种形式正则匹配不上
# 业务问题:业务方录入只覆盖一种形式(girl 或 girls),搜索词里的另一种形式正则匹配不上
# 解决方案:driver 端用 inflect 双向扩展原词 + base + plural 三种形式
# 解决方案:driver 端用 inflect 双向扩展原词 + base + plural 三种形式
# 性能:1w 多行词典在 driver 单机跑只需几秒
# 性能:1w 多行词典在 driver 单机跑只需几秒
self
.
df_theme
=
self
.
_expand_theme_with_plural
(
self
.
df_theme
)
self
.
df_theme
=
self
.
_expand_theme_with_plural
(
self
.
df_theme
)
def _split_theme_multi_value(self, df_theme):
    """
    Split paired, comma-joined theme values into one row per atomic theme.

    ``theme_en`` and ``theme_ch`` may each hold several values joined by a
    comma, positionally paired with each other (e.g. ``"topic,style"`` goes
    with ``"主题" + comma + "风格"``). Each pair is emitted as its own row.

    Background (from the original author's note): the downstream job
    dwt_aba_st_analytics builds st_attribute_label with
    ``concat_ws(',', collect_set(theme_ch))``. If theme_ch still holds the
    whole joined string, collect_set de-duplicates on the full string, so a
    repeated atomic value cannot be removed and the value is never split.

    :param df_theme: dictionary DataFrame; this method reads the columns
        theme_en, theme_ch, label_ch and label_en_lower.
    :return: DataFrame with exactly the columns theme_en, theme_ch, label_ch
        and label_en_lower, one row per (theme_en, theme_ch) pair. Rows whose
        theme_en or theme_ch is null or empty after trimming are removed.
    """
    # Full-width comma (U+FF0C). Written as an escape on purpose: the glyph
    # is easy to confuse with the ASCII comma, and replacing "," with ","
    # would silently make the normalization below a no-op.
    fullwidth_comma = "\uff0c"

    # ------------------------------------------------------------
    # Step 1: normalize full-width commas to ASCII commas.
    # Both comma styles occur in the dictionary, so unify them first and
    # let the split below deal with a single separator only.
    # ------------------------------------------------------------
    df_theme = df_theme.withColumn(
        "theme_en", F.regexp_replace(F.col("theme_en"), fullwidth_comma, ",")
    ).withColumn(
        "theme_ch", F.regexp_replace(F.col("theme_ch"), fullwidth_comma, ",")
    )

    # ------------------------------------------------------------
    # Step 2: split each column into an array, zip the two arrays by
    # position, then explode the zipped array into rows.
    # arrays_zip keeps the i-th English value with the i-th Chinese value;
    # exploding each column on its own would produce a cross product of
    # mismatched pairs. A single-valued row splits into a length-1 array
    # and therefore still yields exactly one row.
    # ------------------------------------------------------------
    df_theme = df_theme.withColumn(
        "theme_pair",
        F.explode(
            F.arrays_zip(
                F.split(F.col("theme_en"), ",").alias("theme_en"),
                F.split(F.col("theme_ch"), ",").alias("theme_ch"),
            )
        ),
    )

    # ------------------------------------------------------------
    # Step 3: pull the atomic values out of the pair, trim them, and drop
    # empty results.
    # trim: values are often typed with a space after the comma, which
    # would otherwise leave a leading-space label.
    # filter: a trailing comma yields an empty element after split; the
    # filter keeps such empty (or null) themes out of the result.
    # ------------------------------------------------------------
    df_theme = df_theme.select(
        F.trim(F.col("theme_pair.theme_en")).alias("theme_en"),
        F.trim(F.col("theme_pair.theme_ch")).alias("theme_ch"),
        F.col("label_ch"),
        F.col("label_en_lower"),
    ).filter(
        "theme_ch is not null and theme_ch != '' and theme_en is not null and theme_en != ''"
    )
    return df_theme
def
_expand_theme_with_plural
(
self
,
df_theme
):
def
_expand_theme_with_plural
(
self
,
df_theme
):
"""
"""
词典单复数双向扩展:业务方录"girl"自动补出"girls"(反之亦然),让搜索词两种形式都能匹配上。
词典单复数双向扩展:业务方录"girl"自动补出"girls"(反之亦然),让搜索词两种形式都能匹配上。
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment