Commit 44057a7b by Peng

no message

parent 6360879f
...@@ -41,7 +41,6 @@ class dow_category_Product: ...@@ -41,7 +41,6 @@ class dow_category_Product:
self.engine_us_mysql = None self.engine_us_mysql = None
self.engine_pg = None self.engine_pg = None
self.num = 0
week = time.strftime("%W") week = time.strftime("%W")
yaer = time.strftime('%Y', time.localtime(time.time())) yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{week}" self.y_w = f"{yaer}-{week}"
...@@ -52,26 +51,14 @@ class dow_category_Product: ...@@ -52,26 +51,14 @@ class dow_category_Product:
self.engine_us_mysql = get_remote_engine(site_name='us', db_type='mysql') self.engine_us_mysql = get_remote_engine(site_name='us', db_type='mysql')
self.engine_pg = get_remote_engine(site_name=site, db_type='postgresql_15_outer') self.engine_pg = get_remote_engine(site_name=site, db_type='postgresql_15_outer')
self.num = 0
week = time.strftime("%W") week = time.strftime("%W")
yaer = time.strftime('%Y', time.localtime(time.time())) yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{week}" self.y_w = f"{yaer}-{week}"
# ---------------------- Driver 管理 & 稳定层 ---------------------- # ---------------------- Driver 管理 & 稳定层 ----------------------
def _kill_edge_process(self):
# 保持你原行为:强制关闭 msedge,减少 profile 被锁导致的奇怪崩溃
try:
os.system("taskkill /F /IM msedge.exe")
except Exception as e:
print("强制关闭msedge.exe失败:", e)
def _build_driver(self): def _build_driver(self):
print('初始化') print('初始化')
try: os.system("taskkill /F /IM msedge.exe")
pr_name = "msedge.exe"
os.system('%s%s' % ("taskkill /F /IM ", pr_name))
except Exception as e:
print("强制关闭chrome.exe失败:", e)
time.sleep(2) time.sleep(2)
edge_options = Options() edge_options = Options()
...@@ -82,7 +69,7 @@ class dow_category_Product: ...@@ -82,7 +69,7 @@ class dow_category_Product:
edge_options.add_argument(r'--user-data-dir=C:\Users\FLA主账号客服维权使用\AppData\Local\Microsoft\Edge\User Data') edge_options.add_argument(r'--user-data-dir=C:\Users\FLA主账号客服维权使用\AppData\Local\Microsoft\Edge\User Data')
edge_options.add_argument('--profile-directory=Default') edge_options.add_argument('--profile-directory=Default')
# 降低“首次运行/恢复弹窗/扩展”对启动的干扰(不影响登录态) # 降低"首次运行/恢复弹窗/扩展"对启动的干扰(不影响登录态)
edge_options.add_argument("--no-first-run") edge_options.add_argument("--no-first-run")
edge_options.add_argument("--no-default-browser-check") edge_options.add_argument("--no-default-browser-check")
edge_options.add_argument("--disable-extensions") edge_options.add_argument("--disable-extensions")
...@@ -128,6 +115,16 @@ class dow_category_Product: ...@@ -128,6 +115,16 @@ class dow_category_Product:
def _jitter(self, a=0.6, b=1.6): def _jitter(self, a=0.6, b=1.6):
time.sleep(random.uniform(a, b)) time.sleep(random.uniform(a, b))
def _check_login_redirect(self, driver):
"""检测是否被重定向到登录页,是则直接退出程序。"""
try:
url = driver.current_url.lower()
if 'signin' in url or 'sign-in' in url or 'ap/signin' in url:
print('[退出] 检测到登录页,session 已过期,请重新登录后再运行')
sys.exit(1)
except Exception:
pass
def _safe_action(self, action_name, fn, driver, site=None): def _safe_action(self, action_name, fn, driver, site=None):
""" """
统一的稳定执行器: 统一的稳定执行器:
...@@ -148,25 +145,28 @@ class dow_category_Product: ...@@ -148,25 +145,28 @@ class dow_category_Product:
msg = str(e).lower() msg = str(e).lower()
print(f"[{action_name}] 第{attempt}次失败:{e}") print(f"[{action_name}] 第{attempt}次失败:{e}")
# 常见“页面崩溃/断连/渲染挂掉”关键词 # 常见"页面崩溃/断连/渲染挂掉"关键词
crash_like = any(x in msg for x in [ crash_like = any(x in msg for x in [
"page crash", "crash", "renderer", "disconnected", "page crash", "crash", "renderer", "disconnected",
"not connected to devtools", "session deleted", "not connected to devtools", "session deleted",
"cannot determine loading status", "target window already closed" "cannot determine loading status", "target window already closed"
]) ])
# Timeout(页面未加载/JS超时)也值得先 refresh 一次,避免直接重启整个浏览器
should_retry = crash_like or isinstance(e, TimeoutException)
# 先尝试 refresh(刷新后能继续跑) # 先尝试 refresh(刷新后能继续跑)
if attempt < self.MAX_ACTION_RETRY and crash_like: if attempt < self.MAX_ACTION_RETRY and should_retry:
try: try:
print(f"[{action_name}] 尝试 refresh 恢复...") print(f"[{action_name}] 尝试 refresh 恢复...")
driver.refresh() driver.refresh()
self._jitter(3, 6) self._jitter(3, 6)
# 恢复后重新定位到 category-insights # 恢复后重新定位到 category-insights
if site: if site:
self._ensure_category_insights(driver, site) self._ensure_category_insights(driver, site)
continue continue
except Exception as e2: except Exception as e2:
print(f"[{action_name}] refresh 也失败:{e2}") print(f"[{action_name}] refresh 也失败,触发重启:{e2}")
raise NeedRestart(f"[{action_name}] refresh 失败:{e2}")
# 走到这里:说明需要重启 driver # 走到这里:说明需要重启 driver
break break
except Exception as e: except Exception as e:
...@@ -203,6 +203,7 @@ class dow_category_Product: ...@@ -203,6 +203,7 @@ class dow_category_Product:
return True return True
self._safe_action("open_category_insights", _open, driver, site=site) self._safe_action("open_category_insights", _open, driver, site=site)
self._check_login_redirect(driver) # 导航完成后检测是否被重定向到登录页
def _click_site(): def _click_site():
self._select_site_radio(driver, site) self._select_site_radio(driver, site)
...@@ -219,7 +220,6 @@ class dow_category_Product: ...@@ -219,7 +220,6 @@ class dow_category_Product:
time.sleep(1) time.sleep(1)
html = etree.HTML(driver.page_source) html = etree.HTML(driver.page_source)
self.save_category(html) self.save_category(html)
print(333333333333333333333333)
global syn_state global syn_state
syn_state = True syn_state = True
...@@ -239,7 +239,7 @@ class dow_category_Product: ...@@ -239,7 +239,7 @@ class dow_category_Product:
for Category in Category_list: for Category in Category_list:
try: try:
# 关键:每个大循环都做一次“崩溃检测+必要时恢复” # 关键:每个大循环都做一次"崩溃检测+必要时恢复"
self._safe_action("loop_healthcheck", lambda: True, driver, site=site) self._safe_action("loop_healthcheck", lambda: True, driver, site=site)
self.cilik_site(driver) self.cilik_site(driver)
...@@ -343,8 +343,10 @@ class dow_category_Product: ...@@ -343,8 +343,10 @@ class dow_category_Product:
'//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()' '//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()'
) )
if ratio_list: if ratio_list:
search_ratio = re.findall(r'(.*?)‰', ratio_list[0])[0] _sr = re.findall(r'(.*?)‰', ratio_list[0])
return_ratio = re.findall(r'(.*?)%', ratio_list[1])[0] search_ratio = _sr[0] if _sr else None
_rr = re.findall(r'(.*?)%', ratio_list[1]) if len(ratio_list) > 1 else []
return_ratio = _rr[0] if _rr else None
else: else:
search_ratio = None search_ratio = None
return_ratio = None return_ratio = None
...@@ -353,8 +355,10 @@ class dow_category_Product: ...@@ -353,8 +355,10 @@ class dow_category_Product:
'//div[@class="big-text-section-name"][1]/div[@class="sub-text"]/text()' '//div[@class="big-text-section-name"][1]/div[@class="sub-text"]/text()'
) )
if product_ratio_list: if product_ratio_list:
product_average = re.findall(r'(.*?)‰', product_ratio_list[0])[0] _pa = re.findall(r'(.*?)‰', product_ratio_list[0])
return_product_average = re.findall(r'(.*?)%', product_ratio_list[1])[0] product_average = _pa[0] if _pa else None
_rpa = re.findall(r'(.*?)%', product_ratio_list[1]) if len(product_ratio_list) > 1 else []
return_product_average = _rpa[0] if _rpa else None
else: else:
product_average = None product_average = None
return_product_average = None return_product_average = None
...@@ -403,11 +407,11 @@ class dow_category_Product: ...@@ -403,11 +407,11 @@ class dow_category_Product:
total = 0.0 total = 0.0
for num_str in sta_list: for num_str in sta_list:
total += float(num_str) total += float(num_str)
results = [float(num) / total if float(num) != 0 else 0 for num in sta_list] results = [float(num) / total if total > 0 and float(num) != 0 else 0 for num in sta_list]
five_star = round(results[0], 2) five_star = round(results[0], 2) if len(results) > 0 else 0
three_star = round(results[1], 2) three_star = round(results[1], 2) if len(results) > 1 else 0
two_star = round(results[2], 2) two_star = round(results[2], 2) if len(results) > 2 else 0
one_star = round(results[3], 2) one_star = round(results[3], 2) if len(results) > 3 else 0
else: else:
five_star = 0 five_star = 0
three_star = 0 three_star = 0
...@@ -425,7 +429,7 @@ class dow_category_Product: ...@@ -425,7 +429,7 @@ class dow_category_Product:
pattern = '' pattern = ''
matches_list = re.findall(pattern, big_text_Advertisement) matches_list = re.findall(pattern, big_text_Advertisement)
ad_spend = matches_list[0] if matches_list else None ad_spend = matches_list[0] if matches_list else None
majority_spend = matches_list[1] if matches_list else None majority_spend = matches_list[1] if len(matches_list) > 1 else None
else: else:
ad_spend = 0 ad_spend = 0
majority_spend = 0 majority_spend = 0
...@@ -487,7 +491,10 @@ class dow_category_Product: ...@@ -487,7 +491,10 @@ class dow_category_Product:
save_Category_list.append([ save_Category_list.append([
Category_name, Product_name[0], Keyword[0], Category_name, Product_name[0], Keyword[0],
float(search_ratio), float(product_average), float(return_ratio), float(return_product_average), float(search_ratio) if search_ratio is not None else None,
float(product_average) if product_average is not None else None,
float(return_ratio) if return_ratio is not None else None,
float(return_product_average) if return_product_average is not None else None,
self.y_w, big_text_sller, big_text_brand, big_text_asin, big_text_new_asin, self.y_w, big_text_sller, big_text_brand, big_text_asin, big_text_new_asin,
big_text_per_asin, big_text_Advertisement, big_text_star, big_brand_int, big_text_per_asin, big_text_Advertisement, big_text_star, big_brand_int,
big_asin_int, big_new_asin_int, big_per_asin_int, five_star, three_star, two_star, big_asin_int, big_new_asin_int, big_per_asin_int, five_star, three_star, two_star,
...@@ -533,6 +540,7 @@ class dow_category_Product: ...@@ -533,6 +540,7 @@ class dow_category_Product:
'news_adv_spend', 'news_majority_spend' 'news_adv_spend', 'news_majority_spend'
]) ])
df = df.where(pd.notnull(df), None)
self.engine_pg.to_sql(df, f'{site}_aba_profit_category_insights', if_exists="append") self.engine_pg.to_sql(df, f'{site}_aba_profit_category_insights', if_exists="append")
print('存储成功 pg') print('存储成功 pg')
break break
...@@ -571,7 +579,6 @@ class dow_category_Product: ...@@ -571,7 +579,6 @@ class dow_category_Product:
driver.close() driver.close()
driver.quit() driver.quit()
print('重新启动 浏览器,') print('重新启动 浏览器,')
self.run()
raise NeedRestart("num>1 trigger restart") raise NeedRestart("num>1 trigger restart")
except NeedRestart as e: except NeedRestart as e:
...@@ -606,7 +613,8 @@ class dow_category_Product: ...@@ -606,7 +613,8 @@ class dow_category_Product:
majority_spend = 0 majority_spend = 0
if input: if input:
products_aggregate_sales = input.get('products_aggregate_sales', [])[0] _agg = input.get('products_aggregate_sales', [])
products_aggregate_sales = _agg[0] if _agg else None
if products_aggregate_sales: if products_aggregate_sales:
split = products_aggregate_sales.split("|") split = products_aggregate_sales.split("|")
sales_amount_str = self.safe_get(split, 1, '').partition("$")[-1] sales_amount_str = self.safe_get(split, 1, '').partition("$")[-1]
...@@ -683,8 +691,10 @@ class dow_category_Product: ...@@ -683,8 +691,10 @@ class dow_category_Product:
most_popular_keyword_list = [] most_popular_keyword_list = []
if most_popular_list: if most_popular_list:
for most_popular in most_popular_list: for most_popular in most_popular_list:
most_popular_keyword = most_popular.xpath('.//div[2]/text()')[0] _kw_list = most_popular.xpath('.//div[2]/text()')
most_popular_b_nums = most_popular.xpath('.//div/b/text()')[0] most_popular_keyword = _kw_list[0] if _kw_list else None
_bn_list = most_popular.xpath('.//div/b/text()')
most_popular_b_nums = _bn_list[0] if _bn_list else None
most_popular_keyword_list.append({ most_popular_keyword_list.append({
"most_popular_keywords": most_popular_keyword, "most_popular_keywords": most_popular_keyword,
'most_popular_search_nums': most_popular_b_nums 'most_popular_search_nums': most_popular_b_nums
...@@ -692,8 +702,10 @@ class dow_category_Product: ...@@ -692,8 +702,10 @@ class dow_category_Product:
top_ratio_list = html_top.xpath('//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()') top_ratio_list = html_top.xpath('//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()')
if top_ratio_list: if top_ratio_list:
search_ratio = re.findall(r'(.*?)‰', top_ratio_list[0])[0] _sr = re.findall(r'(.*?)‰', top_ratio_list[0])
return_ratio = re.findall(r'(.*?)%', top_ratio_list[1])[0] search_ratio = _sr[0] if _sr else None
_rr = re.findall(r'(.*?)%', top_ratio_list[1]) if len(top_ratio_list) > 1 else []
return_ratio = _rr[0] if _rr else None
else: else:
search_ratio = None search_ratio = None
return_ratio = None return_ratio = None
...@@ -739,6 +751,8 @@ class dow_category_Product: ...@@ -739,6 +751,8 @@ class dow_category_Product:
time.sleep(1.5) time.sleep(1.5)
return self.analysis_top_Newly_html(driver) return self.analysis_top_Newly_html(driver)
return self._safe_action("click_newly_launched", _do, driver, site=self.site_name) return self._safe_action("click_newly_launched", _do, driver, site=self.site_name)
except NeedRestart:
raise
except Exception: except Exception:
return "{}" return "{}"
...@@ -747,19 +761,31 @@ class dow_category_Product: ...@@ -747,19 +761,31 @@ class dow_category_Product:
self.mysql_connect(site=self.site_name) self.mysql_connect(site=self.site_name)
select_sql = 'select category from seller_category_insights_syn where state =1' select_sql = 'select category from seller_category_insights_syn where state =1'
df = self.engine_pg.read_sql(select_sql) df = self.engine_pg.read_sql(select_sql)
if df.shape[0] > 0:
category_list = list(df.category) category_list = list(df.category)
print(category_list) print(category_list)
if category_list: if category_list:
return category_list return category_list
else: else:
return None
else:
for i in range(5):
try:
self.mysql_connect() self.mysql_connect()
workflow_everyday_list = [ workflow_everyday_list = [
[self.site_name, self.y_w, '类目分析抓取完成', 3, f'{self.site_name}_aba_profit_category_insights', 'week', '类目分析', '是'] [self.site_name, self.y_w, '类目分析抓取完成', 3, f'{self.site_name}_aba_profit_category_insights', 'week', '类目分析', '是']
] ]
print(workflow_everyday_list)
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, columns=[ df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, columns=[
'site_name', 'date_info', 'status', 'status_val', 'table_name', 'date_type', 'page', 'is_end' 'site_name', 'date_info', 'status', 'status_val', 'table_name', 'date_type', 'page', 'is_end'
]) ])
self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append') self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append')
break
except:
print('存储报错 类目分析抓取完成')
self.mysql_connect(site=self.site_name)
time.sleep(10)
return None
def save_category(self, html): def save_category(self, html):
Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div') Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div')
...@@ -769,6 +795,8 @@ class dow_category_Product: ...@@ -769,6 +795,8 @@ class dow_category_Product:
for Category in Category_list: for Category in Category_list:
Category_name = Category.xpath('./@id') Category_name = Category.xpath('./@id')
Category_label = Category.xpath('.//@label') Category_label = Category.xpath('.//@label')
if not Category_name or not Category_label:
continue
self.category_item[Category_label[0]] = Category_name[0] self.category_item[Category_label[0]] = Category_name[0]
Categorys_list.append(Category_name[0]) Categorys_list.append(Category_name[0])
Categorys_list_syn.append([Category_label[0]]) Categorys_list_syn.append([Category_label[0]])
...@@ -779,7 +807,7 @@ class dow_category_Product: ...@@ -779,7 +807,7 @@ class dow_category_Product:
if syn_state is False: if syn_state is False:
now = datetime.datetime.now() now = datetime.datetime.now()
is_monday = (now.weekday() == 0) is_monday = (now.weekday() == 0)
is_9_am = (now.hour == 11) # 保持你原判断 is_9_am = (now.hour == 11)
if is_monday and is_9_am: if is_monday and is_9_am:
TRUNCATE_SQL = 'TRUNCATE seller_category_insights_syn' TRUNCATE_SQL = 'TRUNCATE seller_category_insights_syn'
conn.execute(TRUNCATE_SQL) conn.execute(TRUNCATE_SQL)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment