Commit e521edcf by fangxingjun

no message

parent cc57a012
......@@ -10,6 +10,7 @@ sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
# from ..utils.templates import Templates
from utils.db_util import DbTypes, DBUtil
from urllib.parse import quote
from listen_program.wf_month_control import wf_month_control
class ImportStToPg14(object):
......@@ -46,6 +47,7 @@ class ImportStToPg14(object):
sql = f"select `year_month` from selection.date_20_to_30 WHERE year_week='{self.date_info}' and week_day=1;"
df = pd.read_sql(sql, con=self.engine_mysql)
self.date_info = list(df.year_month)[0]
self.date_type = "month"
def delete_dirty_data(self):
print(f"删除脏数据, 防止失败执行时报错")
......@@ -118,8 +120,6 @@ class ImportStToPg14(object):
return df_search_term
def save_data(self):
print(f"存储{self.site_name}_search_term_month: {self.df_save.shape}")
self.df_save.to_sql(f"{self.site_name}_search_term_month", con=self.engine_pg14, index=False,
if_exists="append")
......@@ -134,7 +134,9 @@ class ImportStToPg14(object):
print(f"当前没有新增的搜索词同步, 不需要更改进度表, 退出程序")
quit()
# 更改workflow_manager进度表
self.update_workflow_manager()
# self.update_workflow_manager() # 更改到单独的命令执行
wf_month_control(site_name=self.site_name, date_type=self.date_type, date_info=self.date_info,
spider_name=f'{self.site_name}_spider_st', wf_type="spider")
with self.engine_pg14.begin() as conn:
# sql_delete = f"delete from {self.site_name}_search_term_month_syn where (date_info='{self.date_info}' and state=1) or (date_info<'{self.date_info}');"
......@@ -170,49 +172,49 @@ class ImportStToPg14(object):
]
return [[search_term, url] for url in urls]
def update_workflow_manager(self):
    """Upsert this site's monthly row in ``selection.workflow_manager``.

    Opens a transaction on ``self.engine_mysql`` and runs a single
    ``INSERT ... ON DUPLICATE KEY UPDATE`` for ``self.site_name`` and
    ``self.date_info``. The row stores the ansible command that launches
    ``Poll_site_search_term_month.py`` and sets ``spider_is_ready='yes'``,
    ``spider_state=1`` and ``bg_dol_state=1``. The statement text is
    printed before it is executed.

    Raises:
        KeyError: if ``self.site_name`` is not a key of
            ``self.site_name_pri_dict``.
    """
    with self.engine_mysql.begin() as txn:
        # Priority is stored on the row; it comes from the per-site table.
        site_priority = self.site_name_pri_dict[self.site_name]
        # Shell command persisted in the spider_script column.
        launch_cmd = f'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 /mnt/py_spider/threading_spider/Poll_site_search_term_month.py {self.site_name} {self.date_info} >/dev/null 2>&1 &";'
        upsert_sql = f"""
INSERT INTO selection.workflow_manager
(
workflow_name,
site_name,
date_type,
date_info,
priority,
spider_name,
spider_script,
spider_is_ready,
spider_state,
bg_name,
bg_dol_name,
bg_dol_state
)
VALUES
(
'月全流程',
'{self.site_name}',
'month',
'{self.date_info}',
{site_priority},
'{self.site_name}_spider_st',
'{launch_cmd}',
'yes',
1,
'{self.site_name}_asin_export',
'ALL站点-asin同步-pg-api',
1
)
ON DUPLICATE KEY UPDATE
spider_is_ready = VALUES(spider_is_ready),
spider_script = VALUES(spider_script),
bg_dol_state = VALUES(bg_dol_state),
spider_state = VALUES(spider_state);
"""
        print(f"workflow_manager进度表---重置爬虫的搜索词抓取进度: {upsert_sql}")
        txn.execute(upsert_sql)
# def update_workflow_manager(self):
# with self.engine_mysql.begin() as conn:
# priority = self.site_name_pri_dict[self.site_name]
# spider_script = f'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 /mnt/py_spider/threading_spider/Poll_site_search_term_month.py {self.site_name} {self.date_info} >/dev/null 2>&1 &";'
# update_sql_workflow = f"""
# INSERT INTO selection.workflow_manager
# (
# workflow_name,
# site_name,
# date_type,
# date_info,
# priority,
# spider_name,
# spider_script,
# spider_is_ready,
# spider_state,
# bg_name,
# bg_dol_name,
# bg_dol_state
# )
# VALUES
# (
# '月全流程',
# '{self.site_name}',
# 'month',
# '{self.date_info}',
# {priority},
# '{self.site_name}_spider_st',
# '{spider_script}',
# 'yes',
# 1,
# '{self.site_name}_asin_export',
# 'ALL站点-asin同步-pg-api',
# 1
# )
# ON DUPLICATE KEY UPDATE
# spider_is_ready = VALUES(spider_is_ready),
# spider_script = VALUES(spider_script),
# bg_dol_state = VALUES(bg_dol_state),
# spider_state = VALUES(spider_state);
# """
# print(f"workflow_manager进度表---重置爬虫的搜索词抓取进度: {update_sql_workflow}")
# conn.execute(update_sql_workflow)
def run(self, num=0):
while num <= 5:
......
......@@ -10,74 +10,74 @@ sys.path.append(os.path.dirname(sys.path[0]))
from utils.secure_db_client import get_remote_engine
def update_workflow_manager(site_name, date_type, date_info):
    """Reset the monthly ASIN-spider row in ``selection.workflow_manager``.

    Only acts when ``date_type == "month"``; for any other value the
    arguments are printed and the function returns. For a monthly run it
    upserts the workflow row for ``site_name`` / ``date_info`` and then sets
    ``bg_dol_state=3`` and increments ``finished_count`` on that row.
    Failures while connecting or executing are printed and retried every
    300 seconds until both statements succeed.

    Args:
        site_name: Site code; must be one of ``"us"``, ``"uk"``, ``"de"``.
        date_type: Period type; only ``"month"`` triggers any work.
        date_info: Period identifier written to the ``date_info`` column.

    Raises:
        KeyError: if ``date_type`` is ``"month"`` and ``site_name`` has no
            configured priority.
    """
    print(f"当前执行的参数: {site_name, date_type, date_info}")
    if date_type != "month":
        return

    site_name_pri_dict = {
        "us": 2,
        "uk": 4,
        "de": 6,
    }
    # Resolved before the retry loop: an unknown site is a caller error that
    # retrying cannot fix, so it must not be caught by the retry handler.
    priority = site_name_pri_dict[site_name]
    spider_script = f'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 /mnt/py_spider/threading_spider/Poll_site_spider_month.py {site_name} {date_type} {date_info} >/dev/null 2>&1 &";'

    # Both statements are loop-invariant. Building them up front also
    # guarantees the failure message below can always reference the SQL,
    # even when the connection attempt itself is what failed.
    update_sql_workflow_spider = f"""
INSERT INTO selection.workflow_manager
(
workflow_name,
site_name,
date_type,
date_info,
priority,
spider_name,
spider_script,
spider_is_ready,
spider_state,
bg_name,
bg_dol_name,
bg_dol_state,
finished_count
)
VALUES
(
'月全流程',
'{site_name}',
'month',
'{date_info}',
{priority},
'{site_name}_spider_asin',
'{spider_script}',
'yes',
1,
'{site_name}_all_cal',
'ALL-月流程-ABA+反查+流量选品',
1,
0
)
ON DUPLICATE KEY UPDATE
spider_is_ready = VALUES(spider_is_ready),
spider_script = VALUES(spider_script),
spider_state = VALUES(spider_state),
bg_dol_name = VALUES(bg_dol_name);
"""
    update_sql_workflow_bg = f"""update selection.workflow_manager
set bg_dol_state=3, finished_count=COALESCE(finished_count, 0) + 1
WHERE workflow_name='月全流程' and site_name='{site_name}' and date_type='{date_type}' and date_info='{date_info}' and priority={priority}"""

    while True:
        try:
            # The engine is re-created on every attempt so that a failed
            # connection is not reused by the next retry.
            engine_mysql = get_remote_engine(
                site_name='us',
                db_type='mysql'
            )
            print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}")
            engine_mysql.execute(update_sql_workflow_spider)
            print(f"workflow_manager进度表---更新asin导出进度和完成次数: {update_sql_workflow_bg}")
            engine_mysql.execute(update_sql_workflow_bg)
            break
        except Exception as e:
            # Deliberately broad: any failure here is logged and retried.
            print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}, {e}, 报错信息: {traceback.format_exc()}")
            time.sleep(300)
# def update_workflow_manager(site_name, date_type, date_info):
# print(f"当前执行的参数: {site_name, date_type, date_info}")
# if date_type == "month":
# while True:
# try:
# site_name_pri_dict = {
# "us": 2,
# "uk": 4,
# "de": 6,
# }
# engine_mysql = get_remote_engine(
# site_name='us',
# db_type='mysql'
# )
# spider_script = f'ansible dabing_all -f 10 -m shell -a "nohup /usr/local/bin/python3 /mnt/py_spider/threading_spider/Poll_site_spider_month.py {site_name} {date_type} {date_info} >/dev/null 2>&1 &";'
# priority = site_name_pri_dict[site_name]
# update_sql_workflow_spider = f"""
# INSERT INTO selection.workflow_manager
# (
# workflow_name,
# site_name,
# date_type,
# date_info,
# priority,
# spider_name,
# spider_script,
# spider_is_ready,
# spider_state,
# bg_name,
# bg_dol_name,
# bg_dol_state,
# finished_count
# )
# VALUES
# (
# '月全流程',
# '{site_name}',
# 'month',
# '{date_info}',
# {priority},
# '{site_name}_spider_asin',
# '{spider_script}',
# 'yes',
# 1,
# '{site_name}_all_cal',
# 'ALL-月流程-ABA+反查+流量选品',
# 1,
# 0
# )
# ON DUPLICATE KEY UPDATE
# spider_is_ready = VALUES(spider_is_ready),
# spider_script = VALUES(spider_script),
# spider_state = VALUES(spider_state),
# bg_dol_name = VALUES(bg_dol_name);
# """
# print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}")
# engine_mysql.execute(update_sql_workflow_spider)
#
# update_sql_workflow_bg = f"""update selection.workflow_manager
# set bg_dol_state=3, finished_count=COALESCE(finished_count, 0) + 1
# WHERE workflow_name='月全流程' and site_name='{site_name}' and date_type='{date_type}' and date_info='{date_info}' and priority={priority}"""
# print(f"workflow_manager进度表---更新asin导出进度和完成次数: {update_sql_workflow_bg}")
# engine_mysql.execute(update_sql_workflow_bg)
# break
# except Exception as e:
# print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow_spider}, {e}, 报错信息: {traceback.format_exc()}")
# time.sleep(300)
# continue
def export_data(site_name, date_type, date_info):
......@@ -163,4 +163,4 @@ if __name__ == '__main__':
export_data(site_name, date_type, date_info)
if site_name in ["us", "uk", "de"] and date_type == "month":
get_minid_maxid(site_name, date_type, date_info)
update_workflow_manager(site_name, date_type, date_info)
# update_workflow_manager(site_name, date_type, date_info)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment