update_workflow_progress=f"update workflow_progress set status_val=3,status='抓取结束' where page='反查搜索词' and date_info='2026-{week}' and site_name='{site}' and date_type='week'"
sql_select_=f"select status_val from workflow_progress where date_info='2026-{week}' and date_type='week' and page='反查搜索词' and site_name='{site}'"
print(sql_select_)
engine_us_mysql=db_engine('us','mysql')
df=engine_us_mysql.read_sql(sql_select_)
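# Bug 3 fix: check df.empty before reading df.status_val[0], so an empty query result no longer raises (IndexError per the original note)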
ifdf.emptyorint(df.status_val[0])notin(1,2):
print('5555555555555555555555555555555555')
lock.release()
else:
print('5555555555555555555555555555555555')
returnFalse
exceptExceptionase:
print(f'查询 workflow_progress 失败: {e}')
lock.release()
returnFalse
update_workflow_progress=f"update workflow_progress set status_val=3,status='抓取结束' where page='反查搜索词' and date_info='2026-{week}' and site_name='{site}' and date_type='week'"
select_day_status_val=f"select status_val from workflow_progress where date_type='day' and page='ASIN详情' and site_name='us' and date_info='{next_date}' and kafka_flow_state=1 and spider_state=1"
sql_select_=f"select status_val from workflow_progress where date_type='day' and page='ASIN详情' and site_name='{site_name}' and date_info='{next_date}' and kafka_flow_state=1 and spider_state=1"
print('sql_select 2222222:',sql_select_)
df_status_dict=engine_mysql.read_sql(sql_select_)
ifnotdf_status_dict.empty:
print('查询kafka是否开启',df_status_dict.status_val[0])
ifdf_status_dict.status_val[0]in(1,2):
update_month_spider_state=f"update workflow_progress set spider_state=2,status_val=2 WHERE site_name='{site_name}' and date_type='day' and date_info='{next_date}' and page='ASIN详情'"
sql_read=f"select id from workflow_progress where date_type='day' and page='ASIN详情' and site_name='{site_name}' and date_info='{next_date}' and kafka_flow_state=3 and spider_state=2"
print('等待es启动::',sql_read)
df_report_date=engine_mysql.read_sql(sql_read)
ifnotdf_report_date.empty:
print('抓取 day asin')
break
else:
n+=1
time.sleep(120)
ifn>25:
break
except:
time.sleep(10)
ifnum_state==3:
print('抓取完成 抓取完成')
iflock_state.acquire(blocking=True):
sql_read=f"select id from workflow_progress where date_type='day' and page='ASIN详情' and site_name='{site_name}' and date_info='{next_date}' and kafka_flow_state=3 and spider_state=2"
print(sql_read,'抓取完成 修改状态 333333444444')
df_report_id=engine_mysql.read_sql(sql_read)
ifnotdf_report_id.empty:
update_month_spider_state=f"update workflow_progress set spider_state=3,status_val=3 WHERE site_name='{site_name}' and date_type='day' and date_info='{next_date}' and page='ASIN详情'"
select_state1_sql=f"select status_val from workflow_progress where site_name='{site}' and date_info='{year_week}' and date_type='week' and page='反查搜索词'"
select_sate_sql=f"select status_val from workflow_progress where site_name='{site}' and date_info='{year_week}' and date_type='week' and page='ASIN详情'"
update_workflow_progress=f"update workflow_progress set status_val=2,status='抓取中' where page='ASIN详情' and date_info='2025-{week}' and site_name='{site}' and date_type='week' and status_val in(1,0)"
spider_state_sql=f"select id from workflow_progress where date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情' and up_spider_state=1"
update_up_spider_state_list=f"update workflow_progress set up_spider_state=3 where date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情' and up_spider_state=1"
update_month_spider_state=f"update workflow_progress set spider_state=2,spider_int={spider_int} WHERE site_name='{site}' and date_type='month' and date_info='{year_month}' and page='ASIN详情'"
spider_state_sql=f"select id from workflow_progress where date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情' and up_spider_state=1"
update_up_spider_state_list=f"update workflow_progress set up_spider_state=3 where date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情' and up_spider_state=1"
update_month_spider_state=f"update workflow_progress set spider_state=2,spider_int={spider_int} WHERE site_name='{site}' and date_type='month' and date_info='{year_month}' and page='ASIN详情'"
sql_read=f"select id from workflow_progress where date_info='{year_month}' and date_type='month' and page='ASIN详情' and site_name='{site}' and kafka_flow_state=3 and spider_state=2"
up_state_sql=f"update workflow_progress set up_spider_state=1 where date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情'"
print('up_state_sql::',up_state_sql)
db_cursor_connect_update(up_state_sql,site)
returnFalse
ifdf_report_date.id[0]:
returnTrue
else:
n+=1
time.sleep(120)
except:
time.sleep(10)
sql_read=f"select id from workflow_progress where date_info='{year_month}' and date_type='month' and page='ASIN详情' and site_name='{site}' and kafka_flow_state=3 and spider_state=2"
up_state_sql=f"update workflow_progress set up_spider_state=1 where date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情'"
print('up_state_sql::',up_state_sql)
db_cursor_connect_update(up_state_sql,site)
returnFalse
ifdf_report_date.id[0]:
returnTrue
else:
n+=1
time.sleep(120)
except:
time.sleep(10)
ifnum==1:
ifsitein['fr','es','it']:
print(f' {site} _all_syn_st 完成')
else:
sql_select_=f"select status_val from workflow_progress where date_info='{year_week}' and date_type='week' and page='ASIN详情' and site_name='{site}'"
update_workflow_progress=f"update workflow_progress set status_val=3,status='ASIN爬取完成',up_spider_state=3 where page='ASIN详情' and date_info='{year_week}' and site_name='{site}' and date_type='week'"
# Reference query for the rows targeted by the month-ASIN update below: SELECT * FROM workflow_progress WHERE site_name='us' AND page='asin详情' AND date_type='month' AND status_val=1 AND status='月ASIN导出完成' AND date_info='<year_month>'
update_month_asin_state=f"update workflow_progress set status_val=3,status='月ASIN抓取完成' WHERE site_name='{site}' and page='asin详情' and date_type='month' and status_val=1 and status='月ASIN导出完成' and date_info='{year_month}'"
update_month_spider_state=f"update workflow_progress set kafka_flow_state=1,spider_state=3,spider_int={spider_int} WHERE site_name='{site}' and date_type='month' and date_info='{year_month}' and page='ASIN详情'"
update_workflow_progress=f"update workflow_progress set status_val=3,status='ASIN爬取完成',up_spider_state=3 where page='ASIN详情' and date_info='{year_week}' and site_name='{site}' and date_type='week'"
update_month_asin_state=f"update workflow_progress set status_val=3,status='月ASIN抓取完成' WHERE site_name='{site}' and page='asin详情' and date_type='month' and status_val=1 and status='月ASIN导出完成' and date_info='{year_month}'"
update_month_spider_state=f"update workflow_progress set kafka_flow_state=1,spider_state=3,spider_int={spider_int} WHERE site_name='{site}' and date_type='month' and date_info='{year_month}' and page='ASIN详情'"