Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
4af80620
Commit
4af80620
authored
May 14, 2026
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
流量选品30天-旧流程临时修改读取数据源
parent
61cd5e9d
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
86 additions
and
42 deletions
+86
-42
kafka_flow_asin_detail.py
Pyspark_job/my_kafka/kafka_flow_asin_detail.py
+43
-21
kafka_rank_asin_detail.py
Pyspark_job/my_kafka/kafka_rank_asin_detail.py
+43
-21
No files found.
Pyspark_job/my_kafka/kafka_flow_asin_detail.py
View file @
4af80620
...
@@ -86,6 +86,7 @@ class KafkaFlowAsinDetail(Templates):
...
@@ -86,6 +86,7 @@ class KafkaFlowAsinDetail(Templates):
"it"
:
"(
\
d+) in "
,
"it"
:
"(
\
d+) in "
,
}
}
# DataFrame初始化
# DataFrame初始化
self
.
date_info_last_month
=
CommonUtil
.
get_month_offset
(
self
.
date_info
,
-
1
)
self
.
date_info_last_year
=
CommonUtil
.
get_month_offset
(
self
.
date_info
,
-
12
)
self
.
date_info_last_year
=
CommonUtil
.
get_month_offset
(
self
.
date_info
,
-
12
)
self
.
df_previous_flow_asin
=
self
.
spark
.
sql
(
"select 1+1;"
)
self
.
df_previous_flow_asin
=
self
.
spark
.
sql
(
"select 1+1;"
)
self
.
df_previous_flow_asin_lastyear
=
self
.
spark
.
sql
(
"select 1+1;"
)
self
.
df_previous_flow_asin_lastyear
=
self
.
spark
.
sql
(
"select 1+1;"
)
...
@@ -853,29 +854,50 @@ class KafkaFlowAsinDetail(Templates):
...
@@ -853,29 +854,50 @@ class KafkaFlowAsinDetail(Templates):
return
df_save
return
df_save
def
read_data
(
self
):
def
read_data
(
self
):
print
(
"1. 读取上个维度的flow_asin"
)
# previous/lastyear 共用:单表读取 + PySpark DataFrame join,不再从 dwt_flow_asin 取数据
sql
=
f
"""
# date_type 固定 month;date_info 必传(previous=上个月,lastyear=去年同月)
select asin, asin_ao_val as previous_asin_ao_val, asin_price as previous_asin_price,
def
_load_baseline
(
date_info_target
,
alias_prefix
):
variation_num as previous_asin_variation_num, asin_rating as previous_asin_rating,
where_clause
=
f
"site_name = '{self.site_name}' and date_type = 'month' and date_info = '{date_info_target}'"
asin_total_comments as previous_asin_total_comments, first_category_rank as previous_first_category_rank,
sql_measure
=
f
"""
bsr_orders as previous_asin_bsr_orders, sales as previous_sales, asin_bought_month as previous_asin_bought_month
select asin, asin_ao_val, asin_bsr_orders
from dwt_flow_asin where site_name = '{self.site_name}' and date_type = '30day'
from dwd_asin_measure
where {where_clause}
"""
"""
print
(
"sql="
,
sql
)
sql_detail
=
f
"""
self
.
df_previous_flow_asin
=
self
.
spark
.
sql
(
sqlQuery
=
sql
)
select asin, asin_price, variation_num, asin_rating, asin_total_comments, asin_bought_month
self
.
df_previous_flow_asin
=
self
.
df_previous_flow_asin
.
repartition
(
self
.
repartition_num
)
.
persist
(
StorageLevel
.
DISK_ONLY
)
from dim_asin_detail
self
.
df_previous_flow_asin
.
show
(
10
,
truncate
=
False
)
where {where_clause}
print
(
"1b. 读取同比去年的flow_asin"
)
sql
=
f
"""
select asin, asin_ao_val as lastyear_asin_ao_val, asin_price as lastyear_asin_price,
variation_num as lastyear_asin_variation_num, asin_rating as lastyear_asin_rating,
asin_total_comments as lastyear_asin_total_comments, first_category_rank as lastyear_first_category_rank,
bsr_orders as lastyear_asin_bsr_orders, sales as lastyear_sales, asin_bought_month as lastyear_asin_bought_month
from dwt_flow_asin where site_name = '{self.site_name}' and date_type = 'month' and date_info = '{self.date_info_last_year}'
"""
"""
print
(
"sql="
,
sql
)
sql_bs
=
f
"""
self
.
df_previous_flow_asin_lastyear
=
self
.
spark
.
sql
(
sqlQuery
=
sql
)
select asin, asin_bs_cate_1_rank
self
.
df_previous_flow_asin_lastyear
=
self
.
df_previous_flow_asin_lastyear
.
repartition
(
self
.
repartition_num
)
.
persist
(
StorageLevel
.
DISK_ONLY
)
from dim_asin_bs_info
where {where_clause}
"""
print
(
f
"sql_measure({alias_prefix})="
,
sql_measure
)
print
(
f
"sql_detail ({alias_prefix})="
,
sql_detail
)
print
(
f
"sql_bs ({alias_prefix})="
,
sql_bs
)
df_m
=
self
.
spark
.
sql
(
sqlQuery
=
sql_measure
)
.
repartition
(
self
.
repartition_num
,
'asin'
)
df_d
=
self
.
spark
.
sql
(
sqlQuery
=
sql_detail
)
.
repartition
(
self
.
repartition_num
,
'asin'
)
df_b
=
self
.
spark
.
sql
(
sqlQuery
=
sql_bs
)
.
repartition
(
self
.
repartition_num
,
'asin'
)
df
=
df_m
.
join
(
df_d
,
on
=
'asin'
,
how
=
'left'
)
.
join
(
df_b
,
on
=
'asin'
,
how
=
'left'
)
return
df
.
select
(
F
.
col
(
'asin'
),
F
.
round
(
F
.
col
(
'asin_ao_val'
),
3
)
.
alias
(
f
'{alias_prefix}_asin_ao_val'
),
F
.
col
(
'asin_price'
)
.
alias
(
f
'{alias_prefix}_asin_price'
),
F
.
col
(
'variation_num'
)
.
alias
(
f
'{alias_prefix}_asin_variation_num'
),
F
.
col
(
'asin_rating'
)
.
alias
(
f
'{alias_prefix}_asin_rating'
),
F
.
col
(
'asin_total_comments'
)
.
alias
(
f
'{alias_prefix}_asin_total_comments'
),
F
.
col
(
'asin_bs_cate_1_rank'
)
.
alias
(
f
'{alias_prefix}_first_category_rank'
),
F
.
col
(
'asin_bsr_orders'
)
.
alias
(
f
'{alias_prefix}_asin_bsr_orders'
),
F
.
round
(
F
.
col
(
'asin_bsr_orders'
)
*
F
.
col
(
'asin_price'
),
2
)
.
alias
(
f
'{alias_prefix}_sales'
),
F
.
col
(
'asin_bought_month'
)
.
alias
(
f
'{alias_prefix}_asin_bought_month'
),
)
.
persist
(
StorageLevel
.
DISK_ONLY
)
print
(
f
"1. 读取上个月维度的flow_asin(date_type=month, date_info={self.date_info_last_month})"
)
self
.
df_previous_flow_asin
=
_load_baseline
(
self
.
date_info_last_month
,
'previous'
)
self
.
df_previous_flow_asin
.
show
(
10
,
truncate
=
False
)
print
(
f
"1b. 读取同比去年的flow_asin(date_type=month, date_info={self.date_info_last_year})"
)
self
.
df_previous_flow_asin_lastyear
=
_load_baseline
(
self
.
date_info_last_year
,
'lastyear'
)
self
.
df_previous_flow_asin_lastyear
.
show
(
10
,
truncate
=
False
)
self
.
df_previous_flow_asin_lastyear
.
show
(
10
,
truncate
=
False
)
print
(
"2. 获取卖家相关信息"
)
print
(
"2. 获取卖家相关信息"
)
sql
=
f
"""
sql
=
f
"""
...
...
Pyspark_job/my_kafka/kafka_rank_asin_detail.py
View file @
4af80620
...
@@ -85,6 +85,7 @@ class KafkaRankAsinDetail(Templates):
...
@@ -85,6 +85,7 @@ class KafkaRankAsinDetail(Templates):
"it"
:
"(
\
d+) in "
,
"it"
:
"(
\
d+) in "
,
}
}
# DataFrame初始化
# DataFrame初始化
self
.
date_info_last_month
=
CommonUtil
.
get_month_offset
(
self
.
date_info
[:
7
],
-
1
)
self
.
date_info_last_year
=
CommonUtil
.
get_month_offset
(
self
.
date_info
[:
7
],
-
12
)
self
.
date_info_last_year
=
CommonUtil
.
get_month_offset
(
self
.
date_info
[:
7
],
-
12
)
self
.
df_previous_flow_asin
=
self
.
spark
.
sql
(
"select 1+1;"
)
self
.
df_previous_flow_asin
=
self
.
spark
.
sql
(
"select 1+1;"
)
self
.
df_previous_flow_asin_lastyear
=
self
.
spark
.
sql
(
"select 1+1;"
)
self
.
df_previous_flow_asin_lastyear
=
self
.
spark
.
sql
(
"select 1+1;"
)
...
@@ -852,29 +853,50 @@ class KafkaRankAsinDetail(Templates):
...
@@ -852,29 +853,50 @@ class KafkaRankAsinDetail(Templates):
return
df_save
return
df_save
def
read_data
(
self
):
def
read_data
(
self
):
print
(
"1. 读取上个维度的flow_asin"
)
# previous/lastyear 共用:单表读取 + PySpark DataFrame join,不再从 dwt_flow_asin 取数据
sql
=
f
"""
# date_type 固定 month;date_info 必传(previous=上个月,lastyear=去年同月)
select asin, asin_ao_val as previous_asin_ao_val, asin_price as previous_asin_price,
def
_load_baseline
(
date_info_target
,
alias_prefix
):
variation_num as previous_asin_variation_num, asin_rating as previous_asin_rating,
where_clause
=
f
"site_name = '{self.site_name}' and date_type = 'month' and date_info = '{date_info_target}'"
asin_total_comments as previous_asin_total_comments, first_category_rank as previous_first_category_rank,
sql_measure
=
f
"""
bsr_orders as previous_asin_bsr_orders, sales as previous_sales, asin_bought_month as previous_asin_bought_month
select asin, asin_ao_val, asin_bsr_orders
from dwt_flow_asin where site_name = '{self.site_name}' and date_type = '30day'
from dwd_asin_measure
where {where_clause}
"""
"""
print
(
"sql="
,
sql
)
sql_detail
=
f
"""
self
.
df_previous_flow_asin
=
self
.
spark
.
sql
(
sqlQuery
=
sql
)
select asin, asin_price, variation_num, asin_rating, asin_total_comments, asin_bought_month
self
.
df_previous_flow_asin
=
self
.
df_previous_flow_asin
.
repartition
(
self
.
repartition_num
)
.
persist
(
StorageLevel
.
DISK_ONLY
)
from dim_asin_detail
self
.
df_previous_flow_asin
.
show
(
10
,
truncate
=
False
)
where {where_clause}
print
(
"1b. 读取同比去年的flow_asin"
)
sql
=
f
"""
select asin, asin_ao_val as lastyear_asin_ao_val, asin_price as lastyear_asin_price,
variation_num as lastyear_asin_variation_num, asin_rating as lastyear_asin_rating,
asin_total_comments as lastyear_asin_total_comments, first_category_rank as lastyear_first_category_rank,
bsr_orders as lastyear_asin_bsr_orders, sales as lastyear_sales, asin_bought_month as lastyear_asin_bought_month
from dwt_flow_asin where site_name = '{self.site_name}' and date_type = 'month' and date_info = '{self.date_info_last_year}'
"""
"""
print
(
"sql="
,
sql
)
sql_bs
=
f
"""
self
.
df_previous_flow_asin_lastyear
=
self
.
spark
.
sql
(
sqlQuery
=
sql
)
select asin, asin_bs_cate_1_rank
self
.
df_previous_flow_asin_lastyear
=
self
.
df_previous_flow_asin_lastyear
.
repartition
(
self
.
repartition_num
)
.
persist
(
StorageLevel
.
DISK_ONLY
)
from dim_asin_bs_info
where {where_clause}
"""
print
(
f
"sql_measure({alias_prefix})="
,
sql_measure
)
print
(
f
"sql_detail ({alias_prefix})="
,
sql_detail
)
print
(
f
"sql_bs ({alias_prefix})="
,
sql_bs
)
df_m
=
self
.
spark
.
sql
(
sqlQuery
=
sql_measure
)
.
repartition
(
self
.
repartition_num
,
'asin'
)
df_d
=
self
.
spark
.
sql
(
sqlQuery
=
sql_detail
)
.
repartition
(
self
.
repartition_num
,
'asin'
)
df_b
=
self
.
spark
.
sql
(
sqlQuery
=
sql_bs
)
.
repartition
(
self
.
repartition_num
,
'asin'
)
df
=
df_m
.
join
(
df_d
,
on
=
'asin'
,
how
=
'left'
)
.
join
(
df_b
,
on
=
'asin'
,
how
=
'left'
)
return
df
.
select
(
F
.
col
(
'asin'
),
F
.
round
(
F
.
col
(
'asin_ao_val'
),
3
)
.
alias
(
f
'{alias_prefix}_asin_ao_val'
),
F
.
col
(
'asin_price'
)
.
alias
(
f
'{alias_prefix}_asin_price'
),
F
.
col
(
'variation_num'
)
.
alias
(
f
'{alias_prefix}_asin_variation_num'
),
F
.
col
(
'asin_rating'
)
.
alias
(
f
'{alias_prefix}_asin_rating'
),
F
.
col
(
'asin_total_comments'
)
.
alias
(
f
'{alias_prefix}_asin_total_comments'
),
F
.
col
(
'asin_bs_cate_1_rank'
)
.
alias
(
f
'{alias_prefix}_first_category_rank'
),
F
.
col
(
'asin_bsr_orders'
)
.
alias
(
f
'{alias_prefix}_asin_bsr_orders'
),
F
.
round
(
F
.
col
(
'asin_bsr_orders'
)
*
F
.
col
(
'asin_price'
),
2
)
.
alias
(
f
'{alias_prefix}_sales'
),
F
.
col
(
'asin_bought_month'
)
.
alias
(
f
'{alias_prefix}_asin_bought_month'
),
)
.
persist
(
StorageLevel
.
DISK_ONLY
)
print
(
f
"1. 读取上个月维度的flow_asin(date_type=month, date_info={self.date_info_last_month})"
)
self
.
df_previous_flow_asin
=
_load_baseline
(
self
.
date_info_last_month
,
'previous'
)
self
.
df_previous_flow_asin
.
show
(
10
,
truncate
=
False
)
print
(
f
"1b. 读取同比去年的flow_asin(date_type=month, date_info={self.date_info_last_year})"
)
self
.
df_previous_flow_asin_lastyear
=
_load_baseline
(
self
.
date_info_last_year
,
'lastyear'
)
self
.
df_previous_flow_asin_lastyear
.
show
(
10
,
truncate
=
False
)
self
.
df_previous_flow_asin_lastyear
.
show
(
10
,
truncate
=
False
)
print
(
"2. 获取卖家相关信息"
)
print
(
"2. 获取卖家相关信息"
)
sql
=
f
"""
sql
=
f
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment