Commit 44057a7b by Peng

no message

parent 6360879f
......@@ -41,7 +41,6 @@ class dow_category_Product:
self.engine_us_mysql = None
self.engine_pg = None
self.num = 0
week = time.strftime("%W")
yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{week}"
......@@ -52,26 +51,14 @@ class dow_category_Product:
self.engine_us_mysql = get_remote_engine(site_name='us', db_type='mysql')
self.engine_pg = get_remote_engine(site_name=site, db_type='postgresql_15_outer')
self.num = 0
week = time.strftime("%W")
yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{week}"
# ---------------------- Driver 管理 & 稳定层 ----------------------
def _kill_edge_process(self):
# 保持你原行为:强制关闭 msedge,减少 profile 被锁导致的奇怪崩溃
try:
os.system("taskkill /F /IM msedge.exe")
except Exception as e:
print("强制关闭msedge.exe失败:", e)
def _build_driver(self):
print('初始化')
try:
pr_name = "msedge.exe"
os.system('%s%s' % ("taskkill /F /IM ", pr_name))
except Exception as e:
print("强制关闭chrome.exe失败:", e)
os.system("taskkill /F /IM msedge.exe")
time.sleep(2)
edge_options = Options()
......@@ -82,7 +69,7 @@ class dow_category_Product:
edge_options.add_argument(r'--user-data-dir=C:\Users\FLA主账号客服维权使用\AppData\Local\Microsoft\Edge\User Data')
edge_options.add_argument('--profile-directory=Default')
# 降低“首次运行/恢复弹窗/扩展”对启动的干扰(不影响登录态)
# 降低"首次运行/恢复弹窗/扩展"对启动的干扰(不影响登录态)
edge_options.add_argument("--no-first-run")
edge_options.add_argument("--no-default-browser-check")
edge_options.add_argument("--disable-extensions")
......@@ -128,6 +115,16 @@ class dow_category_Product:
def _jitter(self, a=0.6, b=1.6):
time.sleep(random.uniform(a, b))
def _check_login_redirect(self, driver):
"""检测是否被重定向到登录页,是则直接退出程序。"""
try:
url = driver.current_url.lower()
if 'signin' in url or 'sign-in' in url or 'ap/signin' in url:
print('[退出] 检测到登录页,session 已过期,请重新登录后再运行')
sys.exit(1)
except Exception:
pass
def _safe_action(self, action_name, fn, driver, site=None):
"""
统一的稳定执行器:
......@@ -148,25 +145,28 @@ class dow_category_Product:
msg = str(e).lower()
print(f"[{action_name}] 第{attempt}次失败:{e}")
# 常见“页面崩溃/断连/渲染挂掉”关键词
# 常见"页面崩溃/断连/渲染挂掉"关键词
crash_like = any(x in msg for x in [
"page crash", "crash", "renderer", "disconnected",
"not connected to devtools", "session deleted",
"cannot determine loading status", "target window already closed"
])
# Timeout(页面未加载/JS超时)也值得先 refresh 一次,避免直接重启整个浏览器
should_retry = crash_like or isinstance(e, TimeoutException)
# 先尝试 refresh(刷新后能继续跑)
if attempt < self.MAX_ACTION_RETRY and crash_like:
if attempt < self.MAX_ACTION_RETRY and should_retry:
try:
print(f"[{action_name}] 尝试 refresh 恢复...")
driver.refresh()
self._jitter(3, 6)
# 恢复后重新定位到 category-insights
# 恢复后重新定位到 category-insights
if site:
self._ensure_category_insights(driver, site)
continue
except Exception as e2:
print(f"[{action_name}] refresh 也失败:{e2}")
print(f"[{action_name}] refresh 也失败,触发重启:{e2}")
raise NeedRestart(f"[{action_name}] refresh 失败:{e2}")
# 走到这里:说明需要重启 driver
break
except Exception as e:
......@@ -203,6 +203,7 @@ class dow_category_Product:
return True
self._safe_action("open_category_insights", _open, driver, site=site)
self._check_login_redirect(driver) # 导航完成后检测是否被重定向到登录页
def _click_site():
self._select_site_radio(driver, site)
......@@ -219,7 +220,6 @@ class dow_category_Product:
time.sleep(1)
html = etree.HTML(driver.page_source)
self.save_category(html)
print(333333333333333333333333)
global syn_state
syn_state = True
......@@ -239,7 +239,7 @@ class dow_category_Product:
for Category in Category_list:
try:
# 关键:每个大循环都做一次“崩溃检测+必要时恢复”
# 关键:每个大循环都做一次"崩溃检测+必要时恢复"
self._safe_action("loop_healthcheck", lambda: True, driver, site=site)
self.cilik_site(driver)
......@@ -343,8 +343,10 @@ class dow_category_Product:
'//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()'
)
if ratio_list:
search_ratio = re.findall(r'(.*?)‰', ratio_list[0])[0]
return_ratio = re.findall(r'(.*?)%', ratio_list[1])[0]
_sr = re.findall(r'(.*?)‰', ratio_list[0])
search_ratio = _sr[0] if _sr else None
_rr = re.findall(r'(.*?)%', ratio_list[1]) if len(ratio_list) > 1 else []
return_ratio = _rr[0] if _rr else None
else:
search_ratio = None
return_ratio = None
......@@ -353,8 +355,10 @@ class dow_category_Product:
'//div[@class="big-text-section-name"][1]/div[@class="sub-text"]/text()'
)
if product_ratio_list:
product_average = re.findall(r'(.*?)‰', product_ratio_list[0])[0]
return_product_average = re.findall(r'(.*?)%', product_ratio_list[1])[0]
_pa = re.findall(r'(.*?)‰', product_ratio_list[0])
product_average = _pa[0] if _pa else None
_rpa = re.findall(r'(.*?)%', product_ratio_list[1]) if len(product_ratio_list) > 1 else []
return_product_average = _rpa[0] if _rpa else None
else:
product_average = None
return_product_average = None
......@@ -403,11 +407,11 @@ class dow_category_Product:
total = 0.0
for num_str in sta_list:
total += float(num_str)
results = [float(num) / total if float(num) != 0 else 0 for num in sta_list]
five_star = round(results[0], 2)
three_star = round(results[1], 2)
two_star = round(results[2], 2)
one_star = round(results[3], 2)
results = [float(num) / total if total > 0 and float(num) != 0 else 0 for num in sta_list]
five_star = round(results[0], 2) if len(results) > 0 else 0
three_star = round(results[1], 2) if len(results) > 1 else 0
two_star = round(results[2], 2) if len(results) > 2 else 0
one_star = round(results[3], 2) if len(results) > 3 else 0
else:
five_star = 0
three_star = 0
......@@ -425,7 +429,7 @@ class dow_category_Product:
pattern = ''
matches_list = re.findall(pattern, big_text_Advertisement)
ad_spend = matches_list[0] if matches_list else None
majority_spend = matches_list[1] if matches_list else None
majority_spend = matches_list[1] if len(matches_list) > 1 else None
else:
ad_spend = 0
majority_spend = 0
......@@ -487,7 +491,10 @@ class dow_category_Product:
save_Category_list.append([
Category_name, Product_name[0], Keyword[0],
float(search_ratio), float(product_average), float(return_ratio), float(return_product_average),
float(search_ratio) if search_ratio is not None else None,
float(product_average) if product_average is not None else None,
float(return_ratio) if return_ratio is not None else None,
float(return_product_average) if return_product_average is not None else None,
self.y_w, big_text_sller, big_text_brand, big_text_asin, big_text_new_asin,
big_text_per_asin, big_text_Advertisement, big_text_star, big_brand_int,
big_asin_int, big_new_asin_int, big_per_asin_int, five_star, three_star, two_star,
......@@ -533,6 +540,7 @@ class dow_category_Product:
'news_adv_spend', 'news_majority_spend'
])
df = df.where(pd.notnull(df), None)
self.engine_pg.to_sql(df, f'{site}_aba_profit_category_insights', if_exists="append")
print('存储成功 pg')
break
......@@ -571,7 +579,6 @@ class dow_category_Product:
driver.close()
driver.quit()
print('重新启动 浏览器,')
self.run()
raise NeedRestart("num>1 trigger restart")
except NeedRestart as e:
......@@ -606,7 +613,8 @@ class dow_category_Product:
majority_spend = 0
if input:
products_aggregate_sales = input.get('products_aggregate_sales', [])[0]
_agg = input.get('products_aggregate_sales', [])
products_aggregate_sales = _agg[0] if _agg else None
if products_aggregate_sales:
split = products_aggregate_sales.split("|")
sales_amount_str = self.safe_get(split, 1, '').partition("$")[-1]
......@@ -683,8 +691,10 @@ class dow_category_Product:
most_popular_keyword_list = []
if most_popular_list:
for most_popular in most_popular_list:
most_popular_keyword = most_popular.xpath('.//div[2]/text()')[0]
most_popular_b_nums = most_popular.xpath('.//div/b/text()')[0]
_kw_list = most_popular.xpath('.//div[2]/text()')
most_popular_keyword = _kw_list[0] if _kw_list else None
_bn_list = most_popular.xpath('.//div/b/text()')
most_popular_b_nums = _bn_list[0] if _bn_list else None
most_popular_keyword_list.append({
"most_popular_keywords": most_popular_keyword,
'most_popular_search_nums': most_popular_b_nums
......@@ -692,8 +702,10 @@ class dow_category_Product:
top_ratio_list = html_top.xpath('//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()')
if top_ratio_list:
search_ratio = re.findall(r'(.*?)‰', top_ratio_list[0])[0]
return_ratio = re.findall(r'(.*?)%', top_ratio_list[1])[0]
_sr = re.findall(r'(.*?)‰', top_ratio_list[0])
search_ratio = _sr[0] if _sr else None
_rr = re.findall(r'(.*?)%', top_ratio_list[1]) if len(top_ratio_list) > 1 else []
return_ratio = _rr[0] if _rr else None
else:
search_ratio = None
return_ratio = None
......@@ -739,6 +751,8 @@ class dow_category_Product:
time.sleep(1.5)
return self.analysis_top_Newly_html(driver)
return self._safe_action("click_newly_launched", _do, driver, site=self.site_name)
except NeedRestart:
raise
except Exception:
return "{}"
......@@ -747,19 +761,31 @@ class dow_category_Product:
self.mysql_connect(site=self.site_name)
select_sql = 'select category from seller_category_insights_syn where state =1'
df = self.engine_pg.read_sql(select_sql)
category_list = list(df.category)
print(category_list)
if category_list:
return category_list
if df.shape[0] > 0:
category_list = list(df.category)
print(category_list)
if category_list:
return category_list
else:
return None
else:
self.mysql_connect()
workflow_everyday_list = [
[self.site_name, self.y_w, '类目分析抓取完成', 3, f'{self.site_name}_aba_profit_category_insights', 'week', '类目分析', '是']
]
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, columns=[
'site_name', 'date_info', 'status', 'status_val', 'table_name', 'date_type', 'page', 'is_end'
])
self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append')
for i in range(5):
try:
self.mysql_connect()
workflow_everyday_list = [
[self.site_name, self.y_w, '类目分析抓取完成', 3, f'{self.site_name}_aba_profit_category_insights', 'week', '类目分析', '是']
]
print(workflow_everyday_list)
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, columns=[
'site_name', 'date_info', 'status', 'status_val', 'table_name', 'date_type', 'page', 'is_end'
])
self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append')
break
except:
print('存储报错 类目分析抓取完成')
self.mysql_connect(site=self.site_name)
time.sleep(10)
return None
def save_category(self, html):
Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div')
......@@ -769,6 +795,8 @@ class dow_category_Product:
for Category in Category_list:
Category_name = Category.xpath('./@id')
Category_label = Category.xpath('.//@label')
if not Category_name or not Category_label:
continue
self.category_item[Category_label[0]] = Category_name[0]
Categorys_list.append(Category_name[0])
Categorys_list_syn.append([Category_label[0]])
......@@ -779,7 +807,7 @@ class dow_category_Product:
if syn_state is False:
now = datetime.datetime.now()
is_monday = (now.weekday() == 0)
is_9_am = (now.hour == 11) # 保持你原判断
is_9_am = (now.hour == 11)
if is_monday and is_9_am:
TRUNCATE_SQL = 'TRUNCATE seller_category_insights_syn'
conn.execute(TRUNCATE_SQL)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment