Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
ad94b0f3
Commit
ad94b0f3
authored
Jun 26, 2026
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix
parent
ef36ef57
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
50 additions
and
11 deletions
+50
-11
dws_user_selection_pattern.py
Pyspark_job/doris_handle/dws_user_selection_pattern.py
+50
-11
No files found.
Pyspark_job/doris_handle/dws_user_selection_pattern.py
View file @
ad94b0f3
...
@@ -4,8 +4,11 @@
...
@@ -4,8 +4,11 @@
- 串行逐 filter_id 执行,减小数据库压力
- 串行逐 filter_id 执行,减小数据库压力
- 支持断点续算:latest_computed_month 作为水位线,跳过已算月份
- 支持断点续算:latest_computed_month 作为水位线,跳过已算月份
- Doris AGGREGATE KEY(filter_id, asin) + MIN(date_info) 自动保留首次入选月份
- Doris AGGREGATE KEY(filter_id, asin) + MIN(date_info) 自动保留首次入选月份
- us 站点利润率/Keepa 字段每日刷新:近 REFRESH_MONTHS_COUNT 月 DELETE + 重算
- 水位线规则:刷新/补算成功→维护到最新月份;失败→维护到失败的前一个月
"""
"""
import
os
import
os
import
re
import
sys
import
sys
from
datetime
import
datetime
from
datetime
import
datetime
...
@@ -19,7 +22,13 @@ import pandas as pd
...
@@ -19,7 +22,13 @@ import pandas as pd
MYSQL_FILTER_TABLE
=
'flow_increment_filter_sql'
# MySQL 筛选模式日志表
MYSQL_FILTER_TABLE
=
'flow_increment_filter_sql'
# MySQL 筛选模式日志表
DORIS_RESULT_DB
=
'selection'
DORIS_RESULT_DB
=
'selection'
DORIS_RESULT_TABLE
=
'user_selection_pattern'
DORIS_RESULT_TABLE
=
'user_selection_pattern'
SUPPORTED_SITES
=
(
'us'
,
'uk'
,
'de'
)
SUPPORTED_SITES
=
(
'us'
,
'uk'
,
'de'
)
REFRESH_MONTHS_COUNT
=
3
REFRESH_FIELDS
=
{
'ocean_profit'
,
'air_profit'
,
'launch_time'
,
'launch_time_type'
,
'tracking_since'
,
'tracking_since_type'
,
}
# ===== 连接工厂 =====
# ===== 连接工厂 =====
...
@@ -71,6 +80,10 @@ def _get_available_months(doris_cur, site):
...
@@ -71,6 +80,10 @@ def _get_available_months(doris_cur, site):
return
months
return
months
def
_needs_refresh
(
where_sql
):
return
any
(
re
.
search
(
rf
'
\b
{re.escape(field)}
\b
'
,
where_sql
)
for
field
in
REFRESH_FIELDS
)
def
_update_mysql_log
(
mysql_conn
,
filter_id
,
latest_month
,
status
,
msg
):
def
_update_mysql_log
(
mysql_conn
,
filter_id
,
latest_month
,
status
,
msg
):
now
=
datetime
.
now
()
.
strftime
(
'
%
Y-
%
m-
%
d
%
H:
%
M:
%
S'
)
now
=
datetime
.
now
()
.
strftime
(
'
%
Y-
%
m-
%
d
%
H:
%
M:
%
S'
)
if
latest_month
is
not
None
:
if
latest_month
is
not
None
:
...
@@ -110,19 +123,45 @@ def _compute_one_filter(row, doris_cur, mysql_conn, months_by_site):
...
@@ -110,19 +123,45 @@ def _compute_one_filter(row, doris_cur, mysql_conn, months_by_site):
print
(
f
" [SKIP] 站点 {site} 无可用月表"
)
print
(
f
" [SKIP] 站点 {site} 无可用月表"
)
return
return
# 断点续算:从上次水位线的下一个月开始
# 近 N 月刷新(where_sql 涉及每日刷新字段时)
needs_refresh
=
_needs_refresh
(
where_sql
)
refresh_months
=
(
set
(
m
for
m
in
available
[
-
REFRESH_MONTHS_COUNT
:]
if
m
>=
base_month
)
if
(
needs_refresh
and
site
==
'us'
)
else
set
()
)
# 水位线补算,跳过已被刷新覆盖的月份
start_month
=
_next_month
(
latest_done
)
if
latest_done
else
base_month
start_month
=
_next_month
(
latest_done
)
if
latest_done
else
base_month
end_month
=
available
[
-
1
]
new_months
=
[
m
for
m
in
available
if
start_month
<=
m
<=
available
[
-
1
]
and
m
not
in
refresh_months
]
months
=
[
m
for
m
in
available
if
start_month
<=
m
<=
end_month
]
if
not
months
:
all_months
=
sorted
(
refresh_months
|
set
(
new_months
))
print
(
f
" [SKIP] 无新月份需计算(latest_computed={latest_done})"
)
if
not
all_months
:
print
(
f
" [SKIP] 无需计算(latest_computed={latest_done},refresh={needs_refresh})"
)
return
return
print
(
f
" 计算范围:{months[0]} ~ {months[-1]},共 {len(months)} 个月"
)
refresh_sorted
=
sorted
(
refresh_months
)
print
(
f
" 计算范围:{all_months[0]} ~ {all_months[-1]},共 {len(all_months)} 个月"
+
(
f
"(其中刷新:{refresh_sorted[0]} ~ {refresh_sorted[-1]})"
if
refresh_months
else
""
))
# Step 1:批量 DELETE 刷新月份旧数据
if
refresh_months
:
month_in
=
"','"
.
join
(
refresh_sorted
)
delete_sql
=
(
f
"DELETE FROM `{DORIS_RESULT_DB}`.`{DORIS_RESULT_TABLE}` "
f
"WHERE filter_id = {filter_id} AND date_info IN ('{month_in}')"
)
try
:
doris_cur
.
execute
(
delete_sql
)
print
(
f
" [DELETE] 已清除近 {len(refresh_months)} 月旧数据"
)
except
Exception
as
e
:
err
=
str
(
e
)[:
200
]
print
(
f
" [DELETE FAIL] {err}"
)
_update_mysql_log
(
mysql_conn
,
filter_id
,
latest_done
,
'failed'
,
f
'刷新DELETE失败: {err}'
)
return
last_ok_month
=
None
# Step 2:逐月 INSERT
for
month
in
months
:
# DELETE 成功后将水位线退至刷新窗口前一月,防止窗口滑移导致已删月份永久丢失
pre_refresh
=
[
m
for
m
in
available
if
m
<
refresh_sorted
[
0
]]
if
refresh_months
else
[]
last_ok_month
=
pre_refresh
[
-
1
]
if
pre_refresh
else
None
for
month
in
all_months
:
table
=
f
'{site}_flow_asin_month_{month.replace("-", "_")}'
table
=
f
'{site}_flow_asin_month_{month.replace("-", "_")}'
sql
=
f
"""
sql
=
f
"""
INSERT INTO `{DORIS_RESULT_DB}`.`{DORIS_RESULT_TABLE}`
INSERT INTO `{DORIS_RESULT_DB}`.`{DORIS_RESULT_TABLE}`
...
@@ -136,7 +175,7 @@ def _compute_one_filter(row, doris_cur, mysql_conn, months_by_site):
...
@@ -136,7 +175,7 @@ def _compute_one_filter(row, doris_cur, mysql_conn, months_by_site):
"""
"""
try
:
try
:
doris_cur
.
execute
(
sql
)
doris_cur
.
execute
(
sql
)
print
(
f
" [OK] {month}"
)
print
(
f
" [OK] {month}"
+
(
" [刷新]"
if
month
in
refresh_months
else
""
)
)
last_ok_month
=
month
last_ok_month
=
month
except
Exception
as
e
:
except
Exception
as
e
:
err
=
str
(
e
)[:
200
]
err
=
str
(
e
)[:
200
]
...
@@ -146,7 +185,7 @@ def _compute_one_filter(row, doris_cur, mysql_conn, months_by_site):
...
@@ -146,7 +185,7 @@ def _compute_one_filter(row, doris_cur, mysql_conn, months_by_site):
_update_mysql_log
(
_update_mysql_log
(
mysql_conn
,
filter_id
,
last_ok_month
,
'success'
,
mysql_conn
,
filter_id
,
last_ok_month
,
'success'
,
f
'
计算完成,共处理 {len(months)} 个月({months[0]} ~ {
months[-1]})'
f
'
完成:新增 {len(new_months)} 月,刷新 {len(refresh_months)} 月({all_months[0]} ~ {all_
months[-1]})'
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment