本次主要解决页面加载完成后未显示详情的问题：增加判断，未显示时不再继续往下执行，避免后续报错。新增：15s 内页面源码中没有该 id 时，重新请求页面。

d4dde086 · Peng · 2a634fbb · d4dde086
Commit d4dde086 authored Jan 21, 2026 by Peng
Hide whitespace changes
Inline Side-by-side

Showing with 88 additions and 77 deletions

H10_spider.py py_spider/amazon_spider/H10_spider.py +88 -77

No files found.
--- a/py_spider/amazon_spider/H10_spider.py
+++ b/py_spider/amazon_spider/H10_spider.py
@@ -66,6 +66,38 @@ class H10():
        s.connect(('baidu.com', 0))
        ip = s.getsockname()[0]
        # You are viewing a demo of Cerebro
+        """
+        H10测试账号
+        账号：yswg006@hotmail.com  # 124  126 共用
+        密码：Chianbugye@8346148
+        yswg304@outlook.com # 122 
+        Chinabuye@467138
+        YSWGHF422023@outlook.com # 244
+        soundasia422023@
+        CherryY2023@outlook.com # 245
+        20230322Yy@
+        H10961961@outlook.com # 246
+        soundasia961961@
+        X18756082657@outlook.com # 247
+        Zyx13075039897@
+        wretyu2023@outlook.com # 127
+        Sffgserter@1
+        a18756082657@outlook.com # 121
+        12345678Ll@
+        账号：yashengweige678@outlook.com # 120
+        密码：987654321yswg@
+        账号：yswg12345678@outlook.com # 126  信用卡有问题
+        密码：yswg654321@
+        """
        user_pw_dict = {
            '192.168.10.244': [r'C:\Users\win10-244\Downloads', 'YSWGHF422023@outlook.com', 'soundasia422023@'],
            '192.168.10.245': [r'C:\Users\win10-245\Downloads', 'CherryY2023@outlook.com', '20230322Yy@'],
@@ -75,7 +107,7 @@ class H10():
            '192.168.0.121': [r'C:\Users\1\Downloads', 'a18756082657@outlook.com', '12345678Ll@'],
            '192.168.0.126': [r'C:\Users\Administrator\Downloads', 'yswg12345678@outlook.com', 'yswg654321@'],
            '192.168.0.127': [r'C:\Users\1\Downloads', 'wretyu2023@outlook.com', 'Sffgserter@1'],
-            '192.168.0.122': [r'C:\Users\1\Downloads', 'yashengweige678@outlook.com', '987654321yswg@'],
+            '192.168.0.122': [r'C:\Users\1\Downloads', 'yswg304@outlook.com', 'Chinabuye@467138'],
            '192.168.0.124': [r'C:\Users\1\Downloads', 'yswg006@hotmail.com', 'Chianbugye@8346148'],
        }
        user_pw_list = user_pw_dict.get(ip)
@@ -395,9 +427,19 @@ class H10():
            except:
                pass
+    def wait_page(self, timeout=30):  # Wait until the page's input is clickable; returns True on success, False on TimeoutException.
+        start = time.time()  # reference time, used only for the elapsed-time log lines below
+        wait = WebDriverWait(self.driver, timeout)  # explicit wait on self.driver, bounded by `timeout`
+        try:
+            wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="re-container"]//input')))  # ready signal: an <input> under the element with id "re-container"
+            print("wait_page ok, used:", time.time() - start)
+            return True
+        except TimeoutException:  # the input did not become clickable before the wait expired
+            print("wait_page timeout, used:", time.time() - start)
+            return False  # webdrvier_html treats False as "page not loaded" and moves on to its next loop iteration
    def webdrvier_html(self, asin, asinstype):
        # 点击选择站点
-        for i in range(4):
+        for i in range(5):
            try:
                _url = self.driver.current_url
                if "concurrent-sessions" in _url or 'signin' in _url:
@@ -405,7 +447,10 @@ class H10():
                if asin not in self.err_asin_list and self.useremail_state:
                    print('cerebro界面', self.site_name_url)
                    self.driver.get(f'https://members.helium10.com/cerebro?accountId={self.account_id}')
-                    time.sleep(10)
+                    if not self.wait_page(timeout=15):
+                        print('页面未加载出来')
+                        continue
+                    time.sleep(2)
                    if 'You are viewing a demo of Cerebro' in self.driver.page_source:
                        print(self.email_name, '账号过期')
                        self.driver.refresh()
@@ -448,11 +493,11 @@ class H10():
                    try:
                        self.driver.execute_script(
                            f"""document.querySelector("img[loading='lazy']").click()""")
-                        time.sleep(1.5)
+                        time.sleep(1)
                    except:
                        self.driver.execute_script(
                            f"""document.querySelector("img[alt='{alt}']").click()""")
-                        time.sleep(1.5)
+                        time.sleep(1)
                    self.verify()
                    # 切换站点
                    self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""")
@@ -472,7 +517,7 @@ class H10():
                    # 点击 get keyword
                    time.sleep(1)
                    self.driver.execute_script('document.querySelector("#CerebroSearchButtons > button").click()')
-                    time.sleep(2)
+                    time.sleep(1)
                    html = self.driver.page_source
                    if 'You have reached the limit of the uses' in html:
                        self.useremail_state = False
@@ -493,7 +538,7 @@ class H10():
                    time.sleep(2)
                    try:
                        if 'searched this product before' in html or '先前已搜索过此产品' in html:
-                            print('33333333333')
+                            print('33333333333444444')
                            self.driver.execute_script(
                                """document.querySelector("button[data-testid='runnewsearch']").click()""")
                            sleep(randint(3, 8))
@@ -502,7 +547,7 @@ class H10():
                        print('点击 run 报错')
                    # 点击下载
                    self.driver.execute_script('window.scrollBy(0, 300);')
-                    time.sleep(2)
+                    time.sleep(1)
                    html = self.driver.page_source
                    if 'You have reached the limit of the uses' in html:
                        self.useremail_state = False
@@ -518,10 +563,9 @@ class H10():
                        break
                    elif 'errorCodes.undefined' in html:
                        continue
-                    sleep(randint(13, 28))
+                    sleep(randint(15, 30))
-                    time.sleep(5)
                    self.verify()
-                    time.sleep(2.5)
+                    time.sleep(2)
                    if 'Wrong entered data or no results' in html:
                        print('没有报告可下载2222', asin)
                        self.err_asin_list.append(asin)
@@ -532,17 +576,26 @@ class H10():
                        break
                    elif 'errorCodes.undefined' in html:
                        continue
-                    time.sleep(5)
+                    elif 'errors.common.502' in html:
-                    html = self.driver.page_source
+                        print('没有报告可下载333', asin)
-                    resp = etree.HTML(html)
+                        self.err_asin_list.append(asin)
-                    try:
+                        break
-                        div_class = resp.xpath(
-                            '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
-                    except:
-                        print('报错22222222222222')
                    if asinstype:
-                        time.sleep(2)
+                        try:
+                            print('点击显示下拉框')
+                            button_js = 'document.querySelector("#CerebroFilter > div > div.sc-dzXNMW.dufncf > div.sc-hFCjLd.igMWUF > div > button").click()'
+                            self.driver.execute_script(button_js)
+                            time.sleep(2)
+                            html = self.driver.page_source
+                            resp = etree.HTML(html)
+                            print('Amazons Choice获取元素')
+                            time.sleep(2)
+                            div_class = resp.xpath(
+                                '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
+                        except:
+                            print('报错22222222222222')
                        print('点击选择亚马逊精选 勾选')
+                        time.sleep(2)
                        try:
                            script = f"""
                                    const elements = document.querySelectorAll("div[class='{div_class[0]}']>div");
@@ -553,7 +606,7 @@ class H10():
                            if i == 2:
                                self.err_asins_adv_list.append(asin)
                        self.driver.execute_script(script)
-                        time.sleep(2)
+                        time.sleep(1)
                        html1 = self.driver.page_source
                        resp1 = etree.HTML(html1)
                        span_class = resp1.xpath(
@@ -561,15 +614,15 @@ class H10():
                        # 选择亚马逊精选参数1
                        self.driver.execute_script(
                            f"""document.querySelector("div[class='{span_class}']").click()""")
-                        time.sleep(2)
+                        time.sleep(1)
                        # 选择亚马逊精选参数2
                        self.driver.execute_script(
                            f"""document.querySelector("div[class='{span_class}']").click()""")
-                        time.sleep(2)
+                        time.sleep(1)
                        # 点击添加
                        self.driver.execute_script(
                            """document.querySelector("button[data-testid='applyfilters']").click()""")
-                    time.sleep(6.5)
+                    time.sleep(3)
                    # 下载报告
                    # 点击下载csv按钮
                    self.driver.execute_script(
@@ -742,57 +795,6 @@ class H10():
                    print('重新下载文件222：', asin, path)
                    self.webdrvier_html(asin, None)
                    self.if_csv_path(file_path)
-        # columns = pd.read_csv(file_path, nrows=0).columns.tolist()
-        #
-        # def contains_chinese(text):
-        #     return bool(re.search(r'[\u4e00-\u9fff]', text))
-        # is_chinese_header = any(contains_chinese(col) for col in columns)
-        # if is_chinese_header:
-        #     print("表头是中文")
-        #     columns_to_include_zh = ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
-        #                              '广告推广ASIN 数',
-        #                              '竞品数', 'CPR', '标题密度', '亚马逊推荐', '自然',
-        #                              '亚马逊推荐排名', '广告排名', '自然排名']
-        #     df = pd.read_csv(file_path, usecols=columns_to_include_zh)
-        #     # 中文 -> 英文映射
-        #     df.rename(columns={
-        #         '关键词词组': 'keyword',
-        #         'Cerebro IQ 得分': 'cerebro_iq_score',
-        #         '搜索量': 'search_volume',
-        #         '搜索量趋势': 'search_volume_trend',
-        #         '广告推广ASIN 数': 'sponsored_asins',
-        #         '竞品数': 'competing_product',
-        #         'CPR': 'cpr',
-        #         '标题密度': 'title_desity',
-        #         '亚马逊推荐': 'amazon_recommended',
-        #         '自然': 'organic',
-        #         '亚马逊推荐排名': 'amazon_recommended_rank',
-        #         '广告排名': 'sponsored_rank',
-        #         '自然排名': 'organic_rank'
-        #     }, inplace=True)
-        # else:
-        #     print("表头是英文")
-        #     columns_to_include_en = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
-        #                              'Sponsored ASINs',
-        #                              'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
-        #                              'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
-        #     df = pd.read_csv(file_path, usecols=columns_to_include_en)
-        #     df.rename(columns={
-        #         'Keyword Phrase': 'keyword',
-        #         'Cerebro IQ Score': 'cerebro_iq_score',
-        #         'Search Volume': 'search_volume',
-        #         'Search Volume Trend': 'search_volume_trend',
-        #         'Sponsored ASINs': 'sponsored_asins',
-        #         'Competing Products': 'competing_product',
-        #         'CPR': 'cpr',
-        #         'Title Density': 'title_desity',
-        #         'Amazon Recommended': 'amazon_recommended',
-        #         'Organic': 'organic',
-        #         'Amazon Rec. Rank': 'amazon_recommended_rank',
-        #         'Sponsored Rank': 'sponsored_rank',
-        #         'Organic Rank': 'organic_rank'
-        #     }, inplace=True)
        header_config = {
            "chinese": {
                "columns": ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
@@ -973,6 +975,15 @@ class H10():
                previous_date_str = previous_date.strftime("%Y-%m-%d")
                file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{previous_date_str}.csv'
                print('file_pathsave_competition2222', file_path)
+                state = self.if_csv_path(file_path)
+                if state == False:
+                    time.sleep(3)
+                    file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
+                    print('file_pathsave_competition3333', file_path)
+                    state = self.if_csv_path(file_path)
+                    if state == False:
+                        self.nex_page(self.asin_list, asinstype=1)
            # 创建一个字典来映射原始列名和新的列名
            columns = pd.read_csv(file_path, nrows=0).columns.tolist()
            def contains_chinese(text):
@@ -1079,8 +1090,8 @@ class H10():
        else:
            path = r'C:\Users\ASUS\Downloads'
            print('当前路径：', path)
-            self.email_name = 'yashengweige678@outlook.com'
+            self.email_name = 'yswg006@hotmail.com'
-            self.pw = '987654321yswg@'  # 'yashengweige678@outlook.com', '987654321yswg@'
+            self.pw = 'Chianbugye@8346148'  # 'yashengweige678@outlook.com', '987654321yswg@'
        self.web_drver()
        while True:
            self.data = {}