Commit 66d45910 by fangxingjun

no message

parent 27e0f356
import os
import sys
import time
sys.path.append(os.path.dirname(sys.path[0]))
from utils.secure_db_client import get_remote_engine
def update_workflow_manager(site_name, date_type, date_info):
if date_type == "month":
while True:
try:
site_name_pri_dict = {
"us": 2,
"uk": 4,
"de": 6,
}
engine_mysql = get_remote_engine(
site_name='us',
db_type='mysql'
)
with engine_mysql.begin() as conn:
priority = site_name_pri_dict[site_name]
update_sql_workflow = f"""
INSERT INTO workflow_manager
(
workflow_name,
site_name,
date_type,
date_info,
priority,
spider_name,
spider_is_ready,
spider_state,
bg_name,
bg_dol_state
)
VALUES
(
'月全流程',
'{site_name}',
'month',
'{date_info}',
{priority},
'us_spider_asin',
'yes',
1,
'us_all_cal',
1
)
ON DUPLICATE KEY UPDATE
spider_is_ready = VALUES(spider_is_ready),
spider_state = VALUES(spider_state);
"""
print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}")
conn.execute(update_sql_workflow)
except Exception as e:
time.sleep(300)
continue
def export_data(site_name, date_type, date_info):
engine = get_remote_engine(
site_name=site_name, # -> database "selection"
......@@ -41,3 +97,4 @@ if __name__ == '__main__':
date_type = sys.argv[2] # 参数2:类型:week/4_week/month/quarter/day
date_info = sys.argv[3] # 参数3:年-周/年-月/年-季/年-月-日, 比如: 2022-1
export_data(site_name, date_type, date_info)
update_workflow_manager(site_name, date_type, date_info)
......@@ -22,8 +22,9 @@ if __name__ == '__main__':
# import_table = f"{site_name}_asin_detail_month_{d1}_{d2}"
# if date_type == 'day':
# import_table = f"{site_name}_asin_detail_day_{date_info.replace('-', '_')}"
import_table = f"{site_name}_asin_detail_{date_type}_{date_info.replace('-', '_')}"
check_table = f"{site_name}_all_syn_st_{date_type}_{date_info.replace('-', '_')}"
import_table = f"{site_name}_asin_detail_{date_type.replace('_week', '')}_{date_info.replace('-', '_')}"
check_table = f"{site_name}_all_syn_st_{date_type.replace('_week', '')}_{date_info.replace('-', '_')}"
hive_table = "ods_asin_detail"
partition_dict = {
"site_name": site_name,
......@@ -54,14 +55,14 @@ if __name__ == '__main__':
sql_check_syn = f"select * from {check_table} where state in (1, 2) limit 100"
df = engine.read_sql(sql_check_syn)
if df.shape[0] > 0:
print(f"爬虫还未抓完, 等待5分钟继续")
print(f"asin详情--爬虫还未抓完, 等待5分钟继续")
time.sleep(300)
continue
else:
print("爬虫已经全部抓取完成, 可以同步数据")
print("asin详情--爬虫已经全部抓取完成, 可以同步数据")
break
except Exception as e:
print(f"检查asin是否全部抓取完成报错, 报错信息: {e}, {traceback.format_exc()}")
print(f"asin详情--检查asin是否全部抓取完成报错, 报错信息: {e}, {traceback.format_exc()}")
time.sleep(300)
engine = get_remote_engine(
site_name=site_name,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment