abel_cjy / Amazon-Selection-Data · Commits

Commit 31c1321b, authored Nov 18, 2025 by chenyuanjie
Traffic-based product selection: update the rows with abnormal sales volume in Hive (original message: 流量选品-更新hive中销量异常的数据)
Parent: 7e007607
Showing 2 changed files with 63 additions and 168 deletions.
Pyspark_job/script/rerun_dwt_flow_asin.py: +63 / -21
Pyspark_job/sqoop_export/aba_report_tmp.py: +0 / -147
Pyspark_job/script/rerun_dwt_flow_asin.py @ 31c1321b

@@ -2,32 +2,74 @@

Previous version (removed):

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil

if __name__ == '__main__':
    spark_session = SparkUtil.get_spark_session("rerun-dwt—flow-asin")
    date_list = ["2024-02", "2024-01", "2023-12"]
    for date_info in date_list:
        startParams = {
            "site_name": "us",
            "date_type": "month",
            "date_info": date_info
        }
        print(startParams)
        DolphinschedulerHelper.start_and_watch_process_instance(
            "big_data_selection",
            process_df_name='rerun_dwt_flow_asin_api',
            startParams={
                "site_name": "us",
                "date_type": "month",
                "date_info": date_info
            }
        )

New version (added):

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from utils.db_util import DBUtil
from pyspark.sql import functions as F
from utils.hdfs_utils import HdfsUtils


class RerunDemo(object):

    def __init__(self, site_name, date_type, date_info):
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        app_name = f"{self.__class__.__name__}:{self.site_name}:{self.date_type}:{self.date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)
        self.db_save = "dwt_flow_asin"
        self.partitions_num = 200
        # 分区字段 (partition columns)
        self.partitions_by = ['site_name', 'date_type', 'date_info']
        # 拼接where条件 (partition values used to build the where condition / HDFS path)
        self.partitions_dict = {
            'site_name': self.site_name,
            'date_type': self.date_type,
            'date_info': self.date_info
        }

    def run(self):
        # Read the existing partition from Hive (历史数据 = historical data before the rerun)
        sql = f"""
        select * from {self.db_save} where site_name = '{self.site_name}' and date_type = '{self.date_type}' and date_info = '{self.date_info}';
        """
        df_history_data = self.spark.sql(sql).repartition(self.partitions_num).cache()
        print("重跑前历史数据如下:")
        df_history_data.show(10, True)

        # ASINs flagged as having abnormal sales in the crawler (爬虫) error table
        sql = f"""
        select asin, 1 as flag from {self.site_name}_asin_detail_2025_buysales_err where date_info = '{self.date_info}'
        """
        pg_con_info = DBUtil.get_connection_info("postgresql_14", self.site_name)
        df_asin = SparkUtil.read_jdbc_query(
            session=self.spark,
            url=pg_con_info['url'],
            username=pg_con_info['username'],
            pwd=pg_con_info['pwd'],
            query=sql
        )
        df_asin = df_asin.dropDuplicates(['asin']).cache()
        print("爬虫表数据量为:", df_asin.count())

        # Null out asin_bought_month for flagged ASINs; leave all other rows unchanged
        df_save = df_history_data.join(
            df_asin, 'asin', 'left'
        ).withColumn(
            'asin_bought_month',
            F.when(F.col('flag') == 1, F.lit(None)).otherwise(F.col('asin_bought_month'))
        ).drop('flag').cache()
        CommonUtil.send_wx_msg(["chenyuanjie"], "【rerun_dwt_flow_asin_api】重跑完成", "")

        # Only rewrite the partition when the row count is unchanged
        if df_history_data.count() == df_save.count():
            print(f"当前存储的表名为:{self.db_save}, 分区为{self.partitions_by}")
            hdfs_path = CommonUtil.build_hdfs_path(self.db_save, self.partitions_dict)
            HdfsUtils.delete_file_in_folder(hdfs_path)
            df_save.repartition(self.partitions_num).write.saveAsTable(
                name=self.db_save, format='hive', mode='append', partitionBy=self.partitions_by
            )
            print("重跑成功!")
        else:
            print("重跑失败!")
            exit()
        pass


if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    obj = RerunDemo(site_name=site_name, date_type=date_type, date_info=date_info)
    obj.run()
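For reference, the core of the new rerun logic is the left join against the flagged-ASIN table followed by nulling out asin_bought_month. Below is a minimal, self-contained sketch of that pattern using a local SparkSession and toy data; the DataFrame contents and column values here are made up, and the real script instead reads from Hive and PostgreSQL through the project's SparkUtil/DBUtil helpers.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").appName("null-out-demo").getOrCreate()

# Toy stand-ins for dwt_flow_asin and the buysales-error table
df_history = spark.createDataFrame(
    [("B000A", 120), ("B000B", 35), ("B000C", 980)],
    ["asin", "asin_bought_month"],
)
df_err = spark.createDataFrame([("B000B", 1)], ["asin", "flag"])

# Same pattern as RerunDemo.run(): left join on asin, null the sales figure
# for flagged rows, then drop the helper column.
df_fixed = (
    df_history.join(df_err.dropDuplicates(["asin"]), "asin", "left")
    .withColumn(
        "asin_bought_month",
        F.when(F.col("flag") == 1, F.lit(None)).otherwise(F.col("asin_bought_month")),
    )
    .drop("flag")
)
df_fixed.show()
# B000B ends up with asin_bought_month = null; the other rows are unchanged.

The rewritten script itself takes its parameters from the command line (CommonUtil.get_sys_arg(1..3)), so an invocation presumably looks something like `spark-submit Pyspark_job/script/rerun_dwt_flow_asin.py us month 2023-12`; the exact submit wrapper used on the cluster is not shown in this commit.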
Pyspark_job/sqoop_export/aba_report_tmp.py @ 7e007607 (deleted, 100644 → 0)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.db_util import DBUtil

if __name__ == '__main__':
    # 获取入参 (input parameters, hard-coded here)
    site_name = 'us'
    date_type = 'month'
    date_info = '2023-12'
    db_type = 'postgresql_cluster'
    engine = DBUtil.get_db_engine(db_type, site_name)

    # 导出表--基准字段 (base columns of the export table)
    export_base_cols = [
        "search_term_id", "date_info", "search_term", "bsr_orders", "orders",
        "top100_asin", "top100_orders", "top100_market_share", "top100_is_new",
        "aadd_bsr_orders", "aadd_video_num", "aadd_no_video_num", "no_aadd_no_video_num", "no_aadd_video_num",
        "ao_range_val1", "ao_range_val2", "ao_range_val3", "ao_range_val4", "ao_range_val5", "ao_range_val6",
        "ao_range_val7", "ao_range_val8", "ao_range_val9", "ao_range_val10", "ao_range_val11", "ao_range_val12",
        "ao_range_market_share1", "ao_range_market_share2", "ao_range_market_share3", "ao_range_market_share4",
        "ao_range_market_share5", "ao_range_market_share6", "ao_range_market_share7", "ao_range_market_share8",
        "ao_range_market_share9", "ao_range_market_share10", "ao_range_market_share11", "ao_range_market_share12",
        "launch_time_num1", "launch_time_num2", "launch_time_num3", "launch_time_num4",
        "launch_time_num5", "launch_time_num6", "launch_time_num7", "launch_time_num8",
        "launch_time_market_share1", "launch_time_market_share2", "launch_time_market_share3", "launch_time_market_share4",
        "launch_time_market_share5", "launch_time_market_share6", "launch_time_market_share7", "launch_time_market_share8",
        "top20_asin", "top20_orders", "top20_brand", "top20_brand_new_num_proportion",
        "top20_brand_bsr_oders", "top20_brand_market_share",
        "comments_num1", "comments_num2", "comments_num3", "comments_num4", "comments_num5",
        "comments_num6", "comments_num7", "comments_num8", "comments_num9", "comments_num10",
        "comments_num11", "comments_num12", "comments_num13",
        "comments_num_market_share1", "comments_num_market_share2", "comments_num_market_share3",
        "comments_num_market_share4", "comments_num_market_share5", "comments_num_market_share6",
        "comments_num_market_share7", "comments_num_market_share8", "comments_num_market_share9",
        "comments_num_market_share10", "comments_num_market_share11", "comments_num_market_share12",
        "comments_num_market_share13",
        "buy_box_name", "buy_box_num", "seller_name", "seller_num", "seller_bsr_orders",
        "color_name", "color_num", "total_asin_num", "new_asin_num"
    ]
    export_table = "us_aba_last_month_report_2023_12_copy"
    export_cols = export_base_cols + [
        "new_asin_bsr_orders", "rating_section", "rating_market_share_section",
        "size_name", "size_num",
        "package_num_trend", "package_num_trend_market_share", "package_num_corresponding_asin",
        "price_interval", "price_interval_asin_count", "price_interval_asin_market_share",
        "color_bsr_orders_percent", "size_bsr_orders_percent"
    ]
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    sh = CommonUtil.build_export_sh(
        site_name=site_name,
        db_type=db_type,
        hive_tb="dwt_aba_st_analytics_report",
        export_tb=export_table,
        col=export_cols,
        partition_dict=partition_dict
    )
    client = SSHUtil.get_ssh_client()
    SSHUtil.exec_command_async(client, sh, ignore_err=False)
    client.close()
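Both scripts describe the target Hive partition with the same kind of dict ({'site_name', 'date_type', 'date_info'}). The internals of CommonUtil.build_hdfs_path and build_export_sh are not part of this commit, so the following is only a hypothetical illustration of how such a dict typically maps onto a partition filter and a warehouse sub-path; the function names partition_filter and partition_subpath are invented for this sketch.

# Hypothetical illustration, not the project's actual helper code.
def partition_filter(partition_dict: dict) -> str:
    """Render {'site_name': 'us', ...} as a Hive WHERE-style partition filter."""
    return " and ".join(f"{k}='{v}'" for k, v in partition_dict.items())

def partition_subpath(partition_dict: dict) -> str:
    """Render the same dict as an HDFS partition directory suffix."""
    return "/".join(f"{k}={v}" for k, v in partition_dict.items())

pd = {"site_name": "us", "date_type": "month", "date_info": "2023-12"}
print(partition_filter(pd))   # site_name='us' and date_type='month' and date_info='2023-12'
print(partition_subpath(pd))  # site_name=us/date_type=month/date_info=2023-12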