Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
e521edcf
Commit
e521edcf
authored
May 28, 2026
by
fangxingjun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
cc57a012
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
117 additions
and
115 deletions
+117
-115
import_st_to_pg14.py
Pyspark_job/listen_program/import_st_to_pg14.py
+48
-46
export_dwt_asin_sync.py
Pyspark_job/sqoop_export/export_dwt_asin_sync.py
+69
-69
No files found.
Pyspark_job/listen_program/import_st_to_pg14.py
View file @
e521edcf
...
...
@@ -10,6 +10,7 @@ sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
# from ..utils.templates import Templates
from
utils.db_util
import
DbTypes
,
DBUtil
from
urllib.parse
import
quote
from
listen_program.wf_month_control
import
wf_month_control
class
ImportStToPg14
(
object
):
...
...
@@ -46,6 +47,7 @@ class ImportStToPg14(object):
sql
=
f
"select `year_month` from selection.date_20_to_30 WHERE year_week='{self.date_info}' and week_day=1;"
df
=
pd
.
read_sql
(
sql
,
con
=
self
.
engine_mysql
)
self
.
date_info
=
list
(
df
.
year_month
)[
0
]
self
.
date_type
=
"month"
def
delete_dirty_data
(
self
):
print
(
f
"删除脏数据, 防止失败执行时报错"
)
...
...
@@ -118,8 +120,6 @@ class ImportStToPg14(object):
return
df_search_term
def
save_data
(
self
):
print
(
f
"存储{self.site_name}_search_term_month: {self.df_save.shape}"
)
self
.
df_save
.
to_sql
(
f
"{self.site_name}_search_term_month"
,
con
=
self
.
engine_pg14
,
index
=
False
,
if_exists
=
"append"
)
...
...
@@ -134,7 +134,9 @@ class ImportStToPg14(object):
print
(
f
"当前没有新增的搜索词同步, 不需要更改进度表, 退出程序"
)
quit
()
# 更改workflow_manager进度表
self
.
update_workflow_manager
()
# self.update_workflow_manager() # 更改到单独的命令执行
wf_month_control
(
site_name
=
self
.
site_name
,
date_type
=
self
.
date_type
,
date_info
=
self
.
date_info
,
spider_name
=
f
'{self.site_name}_spider_st'
,
wf_type
=
"spider"
)
with
self
.
engine_pg14
.
begin
()
as
conn
:
# sql_delete = f"delete from {self.site_name}_search_term_month_syn where (date_info='{self.date_info}' and state=1) or (date_info<'{self.date_info}');"
...
...
@@ -170,49 +172,49 @@ class ImportStToPg14(object):
]
return
[[
search_term
,
url
]
for
url
in
urls
]
def update_workflow_manager(self):
    """Upsert this site's monthly search-term spider row in workflow_manager.

    Inside one ``self.engine_mysql`` transaction, the method looks up the
    priority configured for ``self.site_name``, builds the ansible command
    that launches ``Poll_site_search_term_month.py`` for
    ``self.site_name`` / ``self.date_info``, and runs an
    ``INSERT ... ON DUPLICATE KEY UPDATE`` against
    ``selection.workflow_manager``. On a duplicate key only
    ``spider_is_ready``, ``spider_script``, ``bg_dol_state`` and
    ``spider_state`` are overwritten.

    Raises:
        KeyError: if ``self.site_name`` is not a key of
            ``self.site_name_pri_dict``.
    """
    with self.engine_mysql.begin() as connection:
        # The lookup stays inside the transaction block, so a missing site
        # is reported only after the connection has been opened.
        site_priority = self.site_name_pri_dict[self.site_name]
        site, period = self.site_name, self.date_info
        launch_command = (
            'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 '
            '/mnt/py_spider/threading_spider/Poll_site_search_term_month.py '
            f'{site} {period} >/dev/null 2>&1 &";'
        )
        # The statement is assembled line by line; every piece ends with the
        # newline that separates it from the next piece of the SQL text.
        upsert_sql = (
            "\n"
            "INSERT INTO selection.workflow_manager\n"
            "(\n"
            "workflow_name,\n"
            "site_name,\n"
            "date_type,\n"
            "date_info,\n"
            "priority,\n"
            "spider_name,\n"
            "spider_script,\n"
            "spider_is_ready,\n"
            "spider_state,\n"
            "bg_name,\n"
            "bg_dol_name,\n"
            "bg_dol_state\n"
            ")\n"
            "VALUES\n"
            "(\n"
            "'月全流程',\n"
            f"'{site}',\n"
            "'month',\n"
            f"'{period}',\n"
            f"{site_priority},\n"
            f"'{site}_spider_st',\n"
            f"'{launch_command}',\n"
            "'yes',\n"
            "1,\n"
            f"'{site}_asin_export',\n"
            "'ALL站点-asin同步-pg-api',\n"
            "1\n"
            ")\n"
            "ON DUPLICATE KEY UPDATE\n"
            "spider_is_ready = VALUES(spider_is_ready),\n"
            "spider_script = VALUES(spider_script),\n"
            "bg_dol_state = VALUES(bg_dol_state),\n"
            "spider_state = VALUES(spider_state);\n"
        )
        print(f"workflow_manager进度表---重置爬虫的搜索词抓取进度: {upsert_sql}")
        connection.execute(upsert_sql)
#
def update_workflow_manager(self):
#
with self.engine_mysql.begin() as conn:
#
priority = self.site_name_pri_dict[self.site_name]
#
spider_script = f'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 /mnt/py_spider/threading_spider/Poll_site_search_term_month.py {self.site_name} {self.date_info} >/dev/null 2>&1 &";'
#
update_sql_workflow = f"""
#
INSERT INTO selection.workflow_manager
#
(
#
workflow_name,
#
site_name,
#
date_type,
#
date_info,
#
priority,
#
spider_name,
#
spider_script,
#
spider_is_ready,
#
spider_state,
#
bg_name,
#
bg_dol_name,
#
bg_dol_state
#
)
#
VALUES
#
(
#
'月全流程',
#
'{self.site_name}',
#
'month',
#
'{self.date_info}',
#
{priority},
#
'{self.site_name}_spider_st',
#
'{spider_script}',
#
'yes',
#
1,
# '{self.site_name}_asin_export',
#
'ALL站点-asin同步-pg-api',
#
1
#
)
#
ON DUPLICATE KEY UPDATE
# spider_is_ready = VALUES(spider_is_ready),
# spider_script = VALUES(spider_script),
# bg_dol_state = VALUES(bg_dol_state),
#
spider_state = VALUES(spider_state);
#
"""
#
print(f"workflow_manager进度表---重置爬虫的搜索词抓取进度: {update_sql_workflow}")
#
conn.execute(update_sql_workflow)
def
run
(
self
,
num
=
0
):
while
num
<=
5
:
...
...
Pyspark_job/sqoop_export/export_dwt_asin_sync.py
View file @
e521edcf
...
...
@@ -10,74 +10,74 @@ sys.path.append(os.path.dirname(sys.path[0]))
from
utils.secure_db_client
import
get_remote_engine
def update_workflow_manager(site_name, date_type, date_info):
    """Reset the monthly ASIN-spider row in ``selection.workflow_manager``.

    Nothing is written unless ``date_type == "month"``; for any other value
    the arguments are printed and the function returns.

    For a monthly run two statements are executed, in order:

    1. an ``INSERT ... ON DUPLICATE KEY UPDATE`` that creates the workflow
       row, or on a duplicate key overwrites ``spider_is_ready``,
       ``spider_script``, ``spider_state`` and ``bg_dol_name``;
    2. an ``UPDATE`` that sets ``bg_dol_state=3`` and increments
       ``finished_count`` for that row.

    If connecting or executing raises, the failure is printed and the whole
    sequence is retried after 300 seconds, with no retry limit.

    Args:
        site_name: Site code; must be one of ``"us"``, ``"uk"``, ``"de"``.
        date_type: Period type; only ``"month"`` triggers any work.
        date_info: Period identifier, interpolated into the spider command
            and both SQL statements.

    Raises:
        KeyError: if ``date_type`` is ``"month"`` and ``site_name`` has no
            configured priority.
    """
    print(f"当前执行的参数: {site_name, date_type, date_info}")
    if date_type != "month":
        return

    # Everything that depends only on the arguments is built once, before the
    # retry loop. These values used to be assigned inside the ``try``, so an
    # error raised before the SQL text existed made the ``except`` handler
    # itself fail with UnboundLocalError while formatting its message.
    # An unknown site is a permanent error, so it is raised here instead of
    # entering the retry loop.
    site_name_pri_dict = {
        "us": 2,
        "uk": 4,
        "de": 6,
    }
    priority = site_name_pri_dict[site_name]
    spider_script = (
        'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 '
        '/mnt/py_spider/threading_spider/Poll_site_spider_month.py '
        f'{site_name} {date_type} {date_info} >/dev/null 2>&1 &";'
    )
    update_sql_workflow_spider = (
        "\n"
        "INSERT INTO selection.workflow_manager\n"
        "(\n"
        "workflow_name,\n"
        "site_name,\n"
        "date_type,\n"
        "date_info,\n"
        "priority,\n"
        "spider_name,\n"
        "spider_script,\n"
        "spider_is_ready,\n"
        "spider_state,\n"
        "bg_name,\n"
        "bg_dol_name,\n"
        "bg_dol_state,\n"
        "finished_count\n"
        ")\n"
        "VALUES\n"
        "(\n"
        "'月全流程',\n"
        f"'{site_name}',\n"
        "'month',\n"
        f"'{date_info}',\n"
        f"{priority},\n"
        f"'{site_name}_spider_asin',\n"
        f"'{spider_script}',\n"
        "'yes',\n"
        "1,\n"
        f"'{site_name}_all_cal',\n"
        "'ALL-月流程-ABA+反查+流量选品',\n"
        "1,\n"
        "0\n"
        ")\n"
        "ON DUPLICATE KEY UPDATE\n"
        "spider_is_ready = VALUES(spider_is_ready),\n"
        "spider_script = VALUES(spider_script),\n"
        "spider_state = VALUES(spider_state),\n"
        "bg_dol_name = VALUES(bg_dol_name);\n"
    )
    update_sql_workflow_bg = (
        "update selection.workflow_manager\n"
        "set bg_dol_state=3, finished_count=COALESCE(finished_count, 0) + 1\n"
        f"WHERE workflow_name='月全流程' and site_name='{site_name}' "
        f"and date_type='{date_type}' and date_info='{date_info}' "
        f"and priority={priority}"
    )

    while True:
        try:
            # The engine is always requested with site_name='us', whatever
            # site is being processed. It is created inside the loop so a
            # failed connection attempt is retried as well.
            engine_mysql = get_remote_engine(
                site_name='us',
                db_type='mysql'
            )
            print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}")
            engine_mysql.execute(update_sql_workflow_spider)
            print(f"workflow_manager进度表---更新asin导出进度和完成次数: {update_sql_workflow_bg}")
            engine_mysql.execute(update_sql_workflow_bg)
            break
        except Exception as e:
            # Deliberate best-effort: log the failure, wait five minutes,
            # then run both statements again.
            print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}, {e}, 报错信息: {traceback.format_exc()}")
            time.sleep(300)
#
def update_workflow_manager(site_name, date_type, date_info):
#
print(f"当前执行的参数: {site_name, date_type, date_info}")
#
if date_type == "month":
#
while True:
#
try:
#
site_name_pri_dict = {
#
"us": 2,
#
"uk": 4,
#
"de": 6,
#
}
#
engine_mysql = get_remote_engine(
#
site_name='us',
#
db_type='mysql'
#
)
#
spider_script = f'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 /mnt/py_spider/threading_spider/Poll_site_spider_month.py {site_name} {date_type} {date_info} >/dev/null 2>&1 &";'
#
priority = site_name_pri_dict[site_name]
#
update_sql_workflow_spider = f"""
#
INSERT INTO selection.workflow_manager
#
(
#
workflow_name,
#
site_name,
#
date_type,
#
date_info,
#
priority,
#
spider_name,
#
spider_script,
#
spider_is_ready,
#
spider_state,
#
bg_name,
# bg_dol_name,
#
bg_dol_state,
#
finished_count
#
)
#
VALUES
#
(
#
'月全流程',
#
'{site_name}',
#
'month',
#
'{date_info}',
# {priority},
#
'{site_name}_spider_asin',
#
'{spider_script}',
#
'yes',
#
1,
# '{site_name}_all_cal',
# 'ALL-月流程-ABA+反查+流量选品',
#
1,
#
0
#
)
#
ON DUPLICATE KEY UPDATE
#
spider_is_ready = VALUES(spider_is_ready),
#
spider_script = VALUES(spider_script),
#
spider_state = VALUES(spider_state),
#
bg_dol_name = VALUES(bg_dol_name);
#
"""
#
print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}")
#
engine_mysql.execute(update_sql_workflow_spider)
#
# update_sql_workflow_bg = f"""update selection.workflow_manager
#
set bg_dol_state=3, finished_count=COALESCE(finished_count, 0) + 1
#
WHERE workflow_name='月全流程' and site_name='{site_name}' and date_type='{date_type}' and date_info='{date_info}' and priority={priority}"""
#
print(f"workflow_manager进度表---更新asin导出进度和完成次数: {update_sql_workflow_bg}")
#
engine_mysql.execute(update_sql_workflow_bg)
#
break
#
except Exception as e:
#
print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}, {e}, 报错信息: {traceback.format_exc()}")
#
time.sleep(300)
#
continue
def
export_data
(
site_name
,
date_type
,
date_info
):
...
...
@@ -163,4 +163,4 @@ if __name__ == '__main__':
export_data
(
site_name
,
date_type
,
date_info
)
if
site_name
in
[
"us"
,
"uk"
,
"de"
]
and
date_type
==
"month"
:
get_minid_maxid
(
site_name
,
date_type
,
date_info
)
update_workflow_manager
(
site_name
,
date_type
,
date_info
)
#
update_workflow_manager(site_name, date_type, date_info)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment