Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
1777e202
Commit
1777e202
authored
May 28, 2026
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
流量选品月导出Doris-流程控制
parent
30b05aea
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
64 additions
and
20 deletions
+64
-20
dwt_flow_asin_month.py
Pyspark_job/doris_handle/dwt_flow_asin_month.py
+64
-20
No files found.
Pyspark_job/doris_handle/dwt_flow_asin_month.py
View file @
1777e202
...
...
@@ -2,24 +2,30 @@
author: CT
description: 同步 Hive dwt_flow_asin 月维度数据到 Doris dwt.{site}_flow_asin_month
流程:
1) 在 Doris 建当月物化表 selection.{site}_flow_asin_month_{yyyy_mm}(IF NOT EXISTS)
2) Spark 读 Hive dwt_flow_asin 月数据,规范化后写入 dwt.{site}_flow_asin_month
3) Doris 端执行 INSERT OVERWRITE 把 dwt 主表 + 关联 JOIN 物化到 selection 月表
Step 0) 建 selection 月物化表 selection.{site}_flow_asin_month_{yyyy_mm}[_test]
Step 1~3) Spark 读 Hive → 规范化 → 写 Doris dwt.{site}_flow_asin_month
Step 4) Doris INSERT OVERWRITE 物化到 selection 月物化表
Step 5) 更新 MySQL workflow_everyday 流程记录表(仅 formal 模式)
支持 us / uk / de 三站点
支持 formal / test 模式:
- formal:selection 表名无后缀,更新流程记录表
- test :selection 表名加 _test 后缀,不更新流程记录表(dwt 主表不变)
执行示例:
spark-submit dwt_flow_asin_month.py us 2026-05
spark-submit dwt_flow_asin_month.py uk 2026-05
spark-submit dwt_flow_asin_month.py de 2026-05
spark-submit dwt_flow_asin_month.py us 2026-05          # 默认 formal
spark-submit dwt_flow_asin_month.py us 2026-05 formal
spark-submit dwt_flow_asin_month.py us 2026-05 test     # test 模式
"""
import
os
import
sys
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
import
pandas
as
pd
from
pyspark.sql
import
functions
as
F
from
utils.spark_util
import
SparkUtil
from
utils.DorisHelper
import
DorisHelper
from
utils.db_util
import
DBUtil
,
DbTypes
DORIS_DB
=
"dwt"
...
...
@@ -50,9 +56,9 @@ def _exec_doris_sql(sql_list, use_type='selection'):
conn
.
close
()
def
build_create_table_sql
(
site_name
,
date_info_underscor
e
,
date_info
):
"""构建 selection.{
site}_flow_asin_month_{yyyy_mm} 建表语句(与 DDL 一致)"""
table_name
=
f
"{site_name}_flow_asin_month_{date_info_underscore}
"
def
build_create_table_sql
(
table_nam
e
,
date_info
):
"""构建 selection.{
table_name} 建表语句(与 DDL 一致);
table_name
由外层拼接:{site}_flow_asin_month_{yyyy_mm}[_test]""
"
return
f
"""
CREATE TABLE IF NOT EXISTS `selection`.`{table_name}`
(
...
...
@@ -203,9 +209,9 @@ PROPERTIES (
"""
def
build_insert_overwrite_sql
(
site_name
,
date_info_underscor
e
,
date_info
):
"""构建 INSERT OVERWRITE 到 selection.{
site}_flow_asin_month_{yyyy_mm} 的 SQL"""
table_name
=
f
"{site_name}_flow_asin_month_{date_info_underscore}
"
def
build_insert_overwrite_sql
(
site_name
,
table_nam
e
,
date_info
):
"""构建 INSERT OVERWRITE 到 selection.{
table_name} 的 SQL;
table_name
由外层拼接:{site}_flow_asin_month_{yyyy_mm}[_test]""
"
return
f
"""
INSERT OVERWRITE TABLE `selection`.`{table_name}`
SELECT
...
...
@@ -383,20 +389,54 @@ WHERE f.date_info = '{date_info}'
"""
def
main
(
site_name
,
date_info
):
def modify_mission_record_status(site_name, date_info, result_type):
    """Record completion of the monthly flow-selection run in MySQL ``workflow_everyday``.

    Only ``formal`` runs are recorded; for any other ``result_type`` a skip
    message is printed and MySQL is never touched. Modelled on
    ``modify_mission_record_status`` in ``export_es/es_flow_asin.py``.

    Args:
        site_name: Site code. Written to / matched against the ``site_name``
            column and used to build the ``table_name`` value
            ``{site_name}_flow_asin_last_month``.
        date_info: Report month. Written to / matched against ``report_date``.
        result_type: Run mode; the record is written only when this equals
            ``'formal'``.

    Returns:
        None.
    """
    if result_type != 'formal':
        print(f"[Step 5] result_type={result_type},跳过流程记录表更新")
        return

    record_table = 'workflow_everyday'
    record_table_name_field = f'{site_name}_flow_asin_last_month'
    record_type = 'month'

    # NOTE(review): the MySQL connection is always opened with site_name='us',
    # whichever site is being processed — presumably the workflow table lives
    # in the us database; confirm.
    engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name='us')

    # Look for an already-finished record (status_val=14, is_end='是') for this
    # site and month so the row is not rewritten on a re-run.
    select_sql = (
        f"select id from {record_table} where site_name='{site_name}' and date_type='month' "
        f"and report_date='{date_info}' and page='流量选品' and status_val=14 and is_end='是'"
    )
    df_is_finished = pd.read_sql(select_sql, engine_mysql)
    if not df_is_finished.empty:
        print("[Step 5] 流程记录表已存在该记录,跳过")
        return

    replace_sql = f"""
        replace into {record_table} (site_name, report_date, status, status_val, table_name, date_type, page, is_end, remark, export_db_type)
        VALUES ('{site_name}', '{date_info}', '流量选品计算完毕', 14, '{record_table_name_field}', '{record_type}', '流量选品', '是', '流量选品计算完毕', 'doris')
    """
    DBUtil.exec_sql('mysql', 'us', replace_sql)
    print(f"[Step 5] 流程记录表 workflow_everyday 已写入:{site_name} {date_info}")
def
main
(
site_name
,
date_info
,
result_type
=
'formal'
):
assert
site_name
in
SUPPORTED_SITES
,
f
"不支持的站点:{site_name},仅支持 us/uk/de"
assert
result_type
in
(
'formal'
,
'test'
),
f
"不支持的 result_type:{result_type},仅支持 formal/test"
doris_table
=
f
"{site_name}_flow_asin_month"
doris_table
=
f
"{site_name}_flow_asin_month"
# dwt 主表,不区分 test/formal
date_info_underscore
=
date_info
.
replace
(
'-'
,
'_'
)
selection_table
=
f
"{site_name}_flow_asin_month_{date_info_underscore}"
# selection 月物化表:test 模式加 _test 后缀
env_suffix
=
'_test'
if
result_type
==
'test'
else
''
selection_table
=
f
"{site_name}_flow_asin_month_{date_info_underscore}{env_suffix}"
print
(
f
"启动:site={site_name}, date_info={date_info}, result_type={result_type}"
)
print
(
f
" dwt 主表:dwt.{doris_table}(不区分 test/formal)"
)
print
(
f
" selection 物化表:selection.{selection_table}"
)
spark
=
SparkUtil
.
get_spark_session
(
f
"DwtFlowAsinMonth: {site_name} {date_info}"
f
"DwtFlowAsinMonth: {site_name} {date_info}
{result_type}
"
)
# ===== Step 0
:Doris 端建 selection 月物化表(IF NOT EXISTS)
=====
# ===== Step 0
:Doris 端建 selection 月物化表(IF NOT EXISTS)
=====
print
(
f
"[Step 0] Doris 建表 selection.{selection_table}"
)
_exec_doris_sql
([
build_create_table_sql
(
s
ite_name
,
date_info_underscor
e
,
date_info
)])
_exec_doris_sql
([
build_create_table_sql
(
s
election_tabl
e
,
date_info
)])
# ===== Step 1:读 Hive dwt_flow_asin 月数据 =====
sql
=
f
"""
...
...
@@ -631,7 +671,10 @@ def main(site_name, date_info):
# ===== Step 4:Doris 端 INSERT OVERWRITE 到 selection 月物化表 =====
print
(
f
"[Step 4] Doris INSERT OVERWRITE selection.{selection_table}"
)
_exec_doris_sql
([
build_insert_overwrite_sql
(
site_name
,
date_info_underscore
,
date_info
)])
_exec_doris_sql
([
build_insert_overwrite_sql
(
site_name
,
selection_table
,
date_info
)])
# ===== Step 5:流程记录表更新(仅 formal 模式)=====
modify_mission_record_status
(
site_name
,
date_info
,
result_type
)
print
(
"success!"
)
...
...
@@ -639,4 +682,5 @@ def main(site_name, date_info):
if __name__ == "__main__":
    # Usage: spark-submit dwt_flow_asin_month.py <site_name> <date_info> [result_type]
    site_name = sys.argv[1]
    date_info = sys.argv[2]
    # The third argument is optional; a run without it is a 'formal' run.
    result_type = sys.argv[3] if len(sys.argv) > 3 else 'formal'
    # Single invocation: the two-argument call shown next to this one in the
    # rendered diff is the line this commit removed.
    main(site_name, date_info, result_type)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment