Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
914bde21
Commit
914bde21
authored
Jun 09, 2026
by
hejiangming
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
3ce637b6
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
25 additions
and
3 deletions
+25
-3
dwt_aba_last365.py
Pyspark_job/dwt/dwt_aba_last365.py
+25
-3
No files found.
Pyspark_job/dwt/dwt_aba_last365.py
View file @
914bde21
...
...
@@ -114,7 +114,8 @@ class DwtAbaLast365(object):
st_word_num,
date_info,
st_num,
rank
rank,
st_attribute_label
from dwt_aba_st_analytics
where site_name = '{self.site_name}'
and date_type = '{self.date_type_original}'
...
...
@@ -259,6 +260,7 @@ class DwtAbaLast365(object):
def
handle_month_lastest
(
self
):
# 保留最新的月数据
# 注:st_attribute_label 已不在此处取(改到 handle_agg 按 12 个月合并去重,避免最新月没出现就丢标签)
self
.
df_base_lastest
=
self
.
df_base
.
filter
(
f
"date_info = '{self.date_info}'"
)
.
select
(
"search_term"
,
"rank"
,
"color_proportion"
,
"multi_color_proportion"
,
"multi_size_proportion"
,
"st_ao_avg"
,
"st_ao_val_rate"
,
"supply_demand"
,
"total_asin_num"
,
"new_asin_proportion"
,
"bsr_orders"
,
...
...
@@ -314,7 +316,22 @@ class DwtAbaLast365(object):
# 是否新细分市场 非平均数算法 12个月都是新出现 表明同比年也是新出现 即 sum=12 表示为1 否则都是0
F
.
avg
(
"is_new_market_segment"
)
.
cast
(
IntegerType
())
.
alias
(
"is_new_market_segment"
),
# 同比是否是热搜词 热搜词:最近1月/年中,出现的次数大于80% 如果月热搜词 is_search_text的和>=10 则是热搜词
F
.
expr
(
"sum(is_search_text) / 9.6"
)
.
cast
(
IntegerType
())
.
alias
(
"is_search_text"
)
F
.
expr
(
"sum(is_search_text) / 9.6"
)
.
cast
(
IntegerType
())
.
alias
(
"is_search_text"
),
# st_attribute_label 12 月合并去重(原来在 handle_month_lastest 只取最新月,最新月该词没出现会丢标签)
# 步骤: 单月逗号串 split 成 array → flatten 跨月合并 → array_distinct 去重 → sort_array 排序 → concat_ws 拼回串
# 过滤 '-1' (单月无匹配占位) 防止把占位合并进有效结果;12 月全 -1 时输出空串,nullif 转 null 后由 handle_save 的 na.fill 转 '-1'
F
.
expr
(
"""
nullif(
concat_ws(',',
sort_array(array_distinct(flatten(
collect_set(
case when st_attribute_label != '-1' then split(st_attribute_label, ',') end
)
)))
),
''
)
"""
)
.
alias
(
'st_attribute_label'
)
)
# 行转列的字段
...
...
@@ -595,6 +612,8 @@ class DwtAbaLast365(object):
F
.
col
(
"rank_rate_of_change_lastest"
),
# 最新出现的月份
F
.
col
(
"appear_month_lastest"
),
#搜索词属性标签(12 个月合并去重,逗号分隔字符串,sqoop 导出 PG 后转 VARCHAR[])
F
.
col
(
"st_attribute_label"
),
F
.
lit
(
self
.
site_name
)
.
alias
(
"site_name"
),
F
.
lit
(
self
.
date_type
)
.
alias
(
"date_type"
),
...
...
@@ -633,7 +652,10 @@ class DwtAbaLast365(object):
"top_rank"
,
F
.
col
(
"rank"
)
)
.
na
.
fill
({
"rank"
:
0
,
"top_rank"
:
0
"top_rank"
:
0
,
# handle_agg 里 12 月全 -1 时输出空串 → nullif 转 null → 这里 fillna '-1'
# 占位 "-1" 与 dwt_aba_st_analytics 端规则一致,Java 转 null 返前端
"st_attribute_label"
:
"-1"
})
.
cache
()
def
save_data
(
self
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment