spider · branch selection-new

Commit 1612910c, authored Feb 03, 2026 by Peng
Commit message: no message
Parent: 179f825d

Showing 3 changed files, with 476 additions and 298 deletions:
  py_spider/amazon_every_day_spider/Get_Cookies.py    +6    -6
  py_spider/amazon_every_day_spider/get_cookies2.py   +300  -171
  py_spider/amazon_spider/H10_spider.py               +170  -121

py_spider/amazon_every_day_spider/Get_Cookies.py  (view file @ 1612910c)
...
@@ -151,10 +151,10 @@ def get_cookie(site='us', zipCode='10010'):

if __name__ == '__main__':
    while True:
        get_cookie(site='us', zipCode='10010')
        # get_cookie(site='de', zipCode='10115')
        # get_cookie(site='uk', zipCode='W1S 3PR')
        # get_cookie(site='it', zipCode='00185')
        # get_cookie(site='es', zipCode='28001')
        # get_cookie(site='fr', zipCode='75019')
        # get_cookie(site='ca', zipCode='M5B 2H1')
        time.sleep(random.uniform(60.5, 180.5))
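With every marketplace except 'us' now commented out, re-enabling a site means editing the loop body by hand. A hypothetical table-driven variant of the same loop (SITES is an invented name; get_cookie, random, and time are the ones already used in this file):

SITES = [
    ('us', '10010'),
    # ('de', '10115'), ('uk', 'W1S 3PR'), ('it', '00185'),
    # ('es', '28001'), ('fr', '75019'), ('ca', 'M5B 2H1'),
]

if __name__ == '__main__':
    while True:
        for site, zip_code in SITES:
            get_cookie(site=site, zipCode=zip_code)  # one harvest per enabled site
        time.sleep(random.uniform(60.5, 180.5))      # same jittered pause as above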
py_spider/amazon_every_day_spider/get_cookies2.py  (view file @ 1612910c)
# import json
# import re
#
# from curl_cffi import requests
# from lxml import etree
#
# requ_see = requests.Session()
# headers = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate, br, zstd',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'no-cache',
# 'Pragma': 'no-cache',
# 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
# 'Sec-Ch-Ua-Mobile': '?0',
# 'Sec-Ch-Ua-Platform': ' "Windows"',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': ' none',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
# }
# asin_resp = requ_see.get('https://www.amazon.co.uk/dp/B0714LLB2T?th=1', headers=headers)
# print("Step 1: requested the homepage")
# html_xpath = etree.HTML(asin_resp.text)
# ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
# print("First request sent; postcode on the page:", ingress)
# # url2 = 'https://www.amazon.co.uk/nav/ajax/hMenuDesktopFirstLayer?ajaxTemplate=hMenuDesktopFirstLayer&pageType=Detail&hmDataAjaxHint=1&isFreshRegion=false&isFreshCustomer=false&isPrimeMember=false&isPrimeDay=false&isBackup=false&firstName=false&navDeviceType=desktop&hashCustomerAndSessionId=8b35c8413eaf45f3509509691ec91ce8cc82c3f3&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6407668806-AL2_aarch64&languageCode=en_GB&customerCountryCode=US'
# # requ_see.get(url2, headers=headers)
#
# data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
# data_modal = json.loads(data_a_modal[0])
# print(data_modal)
# headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
# clkci_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
# headers['Referer'] = 'https://www.amazon.co.uk/dp/B0714LLB2T?th=1'
# print(headers, 23333333)
# clkci_resp = requ_see.get(clkci_url, headers=headers)
# print(clkci_resp.text)
# CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
# print("CSRF_TOKEN:", CSRF_TOKEN)
# address_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/address-change?actionSource=glow'
#
# headers_post = {
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'Accept-Encoding': 'gzip', }
# headers_post['Origin'] = 'https://www.amazon.co.uk'
# headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
# headers_post['Sec-Fetch-Site'] = 'same-origin'
# headers_post['Sec-Fetch-Mode'] = 'cors'
# headers_post['Sec-Fetch-Dest'] = 'empty'
# headers_post['Accept'] = '*/*'
# headers_post['Content-Type'] = 'application/json'
# headers_post['X-Requested-With'] = 'XMLHttpRequest'
#
# address_json = {"locationType": "LOCATION_INPUT", "zipCode": "W1S 3PR", "deviceType": "web", "storeContext": "grocery",
# "pageType": "Detail", "actionSource": "glow"}
# address_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False, impersonate="chrome")
#
# submit_headers = {
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'Accept-Encoding': 'gzip',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# }
# submit_headers['Accept'] = '*/*'
# submit_headers['Referer'] = 'https://www.amazon.co.uk/dp/B0714LLB2T?th=1'
# submit_headers['X-Requested-With'] = 'XMLHttpRequest'
# submit_headers['Sec-Fetch-Site'] = 'same-origin'
# submit_headers['Sec-Fetch-Mode'] = 'cors'
# submit_headers['Sec-Fetch-Dest'] = 'empty'
# detail_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
# submit_resp = requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
# print(submit_resp.text)
# print(submit_resp.cookies.get_dict(), '322222222')
# print(requ_see.cookies.get_dict(), '433333333')
# cookie_dict = requ_see.cookies.get_dict()
# index_resp = requests.get('https://www.amazon.co.uk', headers=headers, cookies=cookie_dict, verify=False,
# impersonate="chrome")
# index_xpath = etree.HTML(index_resp.text)
# ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
# print("Latest postcode:", ingress)
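# The commented-out prototype above walks Amazon's "glow" location modal for the
# UK site: request a product page, read anti-csrftoken-a2z from the
# nav-global-location-data-modal-action span, trade it for a CSRF_TOKEN via
# get-rendered-address-selections, POST the new postcode to address-change, and
# confirm with get-location-label. get_cookie() below generalizes the same
# handshake to every marketplace.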
'Store to pg'
'Fetch cookies for the smaller-language sites'
import sys
import os
import pandas as pd

sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from curl_cffi import requests
import json
from lxml import etree
import re
import random
import time
import traceback
from amazon_every_day_spider.secure_db_client import get_remote_engine

item = {}
print(sys.executable)

"""
Packaging command: cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
"""
headers_num_int = 0


def get_cookie(site='us', zipCode='10010'):
    try:
        params_site = {
            "us": "pet-supplies",
            "de": "toys",
            "uk": "grocery",
            "it": "luggage",
            "es": "apparel",
            "fr": "kitchen",
            "ca": "beauty",
            "mx": "beauty",
            "au": "fashion",
            "ae": "generic",
            "br": "generic",
            "nl": "home-improvement",
            "pl": "beauty",
            "se": "beauty",
            "tr": "home-improvement",  # grocery
        }
        n = random.randint(110, 120)
        ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
        print(ua)
        if site == "us":
            index_url = 'https://www.amazon.com'
            url_asin = 'https://www.amazon.com/dp/B0009X29WK'
            host = 'www.amazon.com'
        elif site == 'uk':
            index_url = 'https://www.amazon.co.uk'  # site url
            url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T'
            host = 'www.amazon.co.uk'
        elif site == 'de':
            index_url = 'https://www.amazon.de'
            url_asin = 'https://www.amazon.de/dp/B00006YYXM'
            host = 'www.amazon.de'
        elif site == 'fr':
            index_url = 'https://www.amazon.fr'
            url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
            host = 'www.amazon.fr'
        elif site == 'es':
            index_url = 'https://www.amazon.es'
            url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
            host = 'www.amazon.es'
        elif site == 'it':
            index_url = 'https://www.amazon.it'
            url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
            host = 'www.amazon.it'
        elif site == 'ca':
            index_url = 'https://www.amazon.ca'
            url_asin = 'https://www.amazon.ca/dp/B08H3JPH74'
            host = 'www.amazon.ca'
        elif site == 'mx':
            index_url = "https://www.amazon.com.mx"
            url_asin = 'https://www.amazon.com.mx/dp/B08H3JPH74'
            host = 'www.amazon.com.mx'
        elif site == 'ae':
            index_url = "https://www.amazon.ae"
            url_asin = 'https://www.amazon.ae/dp/B08H3JPH74'
            host = 'www.amazon.ae'
        elif site == 'au':
            index_url = "https://www.amazon.com.au"
            url_asin = 'https://www.amazon.com.au/dp/B0D1YFSYGQ'
            host = 'www.amazon.com.au'
        elif site == 'tr':
            index_url = "https://www.amazon.com.tr"
            url_asin = 'https://www.amazon.com.tr/dp/B08SPXK5WC'
            host = 'www.amazon.com.tr'
        elif site == 'be':
            index_url = "https://www.amazon.com.be"
            url_asin = 'https://www.amazon.com.be/dp/B01B7O6JH0'
            host = 'www.amazon.com.be'
        elif site == 'jp':
            index_url = "https://www.amazon.co.jp"
            url_asin = 'https://www.amazon.co.jp/dp/B08H3JPH74'
            host = 'www.amazon.co.jp'
        elif site == 'nl':
            index_url = "https://www.amazon.nl"
            url_asin = 'https://www.amazon.nl/dp/B01COWDLGG'
            host = 'www.amazon.nl'
        elif site == 'pl':
            index_url = "https://www.amazon.pl"
            url_asin = 'https://www.amazon.pl/dp/B08H3JPH74'
            host = 'www.amazon.pl'
        elif site == 'se':
            index_url = "https://www.amazon.se"
            url_asin = 'https://www.amazon.se/dp/B08H3JPH74'
            host = 'www.amazon.se'
        elif site == 'br':
            index_url = "https://www.amazon.com.br"
            url_asin = 'https://www.amazon.com.br/dp/B08SPXK5WC'
            host = 'www.amazon.com.br'
        if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
            engine_us = get_remote_engine('us', 'mysql')
        else:
            engine_us = get_remote_engine(site, 'mysql')
        requ_see = requests.Session()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Device-Memory": "8",
            "Downlink": "1.25",
            "Dpr": "0.75",
            "Ect": "3g",
            "Pragma": "no-cache",
            "Rtt": "300",
            "Sec-Ch-Device-Memory": "8",
            "Sec-Ch-Dpr": "0.75",
            "Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{n}", "Google Chrome";v="{n}"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Ch-Ua-Platform-Version": '"10.0.0"',
            "Sec-Ch-Viewport-Width": "2560",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ua,
            "Viewport-Width": "2560",
        }
        if site in ['au', 'pl']:
            del headers['Accept-Encoding']
            headers['Sec-Ch-Ua'] = '"Google Chrome";v="143", "Chromium";v="143", "Not A(Brand";v="24"'
            headers['priority'] = "u=0, i"
            headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
        asin_resp = requ_see.get(url_asin, headers=headers, impersonate="chrome")
        k = ""
        print("Step 1: request the asin page:", url_asin)
        for i in (0, random.randint(0, 5)):  # two iterations: i = 0 and one random i
            k += random.choice(alphabet)
            # headers[k] = str(uuid.uuid4())
        html_xpath = etree.HTML(asin_resp.text)
        ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("First request sent; postcode on the page:", ingress)
        data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
        data_modal = json.loads(data_a_modal[0])
        print('Got the anti-csrftoken-a2z parameter:', data_modal)
        # csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
        headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
        # /portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal
        clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections'
        # clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
        headers['Referer'] = url_asin
        print('Step 2: click the location widget')
        params = {
            "deviceType": "desktop",
            "pageType": "Detail",
            "storeContext": params_site.get(site),
            "actionSource": "desktop-modal",
        }
        clkci_resp = requ_see.get(clkci_url, headers=headers, params=params, impersonate="chrome")
        CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
        print("CSRF_TOKEN:", CSRF_TOKEN)
        address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
        # if site in ['tr', 'be', 'nl', 'pl', 'se']:
        #     url = f"https://{host}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
        headers_post = {
            'Host': host,
            'anti-csrftoken-a2z': CSRF_TOKEN,
            'accept': 'text/html,*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/json',
            'origin': index_url,
            'referer': f'https://{host}/dp/B0009X29WK?th=1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            # 'x-amzn-flow-closure-id': '1768269613',
            'TE': 'trailers',
            'x-requested-with': 'XMLHttpRequest',
        }
        if site in ['uk', 'it', 'es', 'fr']:
            del headers_post['referer']
        address_json = {"locationType": "LOCATION_INPUT", "zipCode": f"{zipCode}", "deviceType": "web",
                        "storeContext": params_site.get(site), "pageType": "Detail", "actionSource": "glow"}
        if site == 'au':
            address_json['locationType'] = 'POSTAL_CODE_WITH_CITY'
            address_json['city'] = 'WARWICK FARM'
        elif site == 'ae':
            address_json['locationType'] = 'CITY'
            address_json['city'] = 'Abu Dhabi'
            address_json['pageType'] = 'Gateway'
        elif site == 'nl':
            del address_json['zipCode']
            address_json['locationType'] = 'COUNTRY'
            address_json['district'] = 'NL'
            address_json['countryCode'] = 'NL'
        print('Step 3: submit the postcode')
        print(address_url)
        post_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False,
                                  impersonate="chrome")
        print(post_resp.text)
        print(post_resp)
        submit_headers = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        submit_headers['Accept'] = '*/*'
        submit_headers['Referer'] = url_asin
        submit_headers['X-Requested-With'] = 'XMLHttpRequest'
        submit_headers['Sec-Fetch-Site'] = 'same-origin'
        submit_headers['Sec-Fetch-Mode'] = 'cors'
        submit_headers['Sec-Fetch-Dest'] = 'empty'
        print('Step 4: confirm the address')
        detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=pet-supplies&pageType=Detail&actionSource=desktop-modal'
        # detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
        requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
        print(requ_see.cookies.get_dict())
        print('Step 4: clicked done')
        cookie_dict = requ_see.cookies.get_dict()
        index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict, verify=False,
                                  impersonate="chrome")
        print("Step 5: request the homepage and check whether the postcode changed")
        index_xpath = etree.HTML(index_resp.text)
        ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("Latest postcode:", ingress)
        if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip() or 'M5B 2H' in ingress[0].strip():
            print(f"*************** Got the {site} site cookie for postcode {zipCode} ********************")
            cookies = json.dumps(cookie_dict, ensure_ascii=False)
            item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
            print(item)
            if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
                # build the DataFrame
                df = pd.DataFrame([{"cookies": cookies, 'site': site}])
                # store to the database
                engine_us.to_sql(df, 'other_site_cookies', if_exists="append")
                print(f"Saved {site} into other_site_cookies")
            else:
                # build the DataFrame
                df = pd.DataFrame([{"cookies": cookies}])
                # store to the database
                engine_us.to_sql(df, f"{site}_comment_cookies", if_exists="append")
                print(f"Saved {site} into {site}_comment_cookies")
        print('\n')
    except Exception as e:
        print(f"Error fetching the {site} site cookie; switching to the next site", e)
        print("Error", f"\n{traceback.format_exc()}")
    time.sleep(random.uniform(2.5, 5.5))
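Downstream spiders read these rows back out of MySQL. A minimal sketch of the reading side, under stated assumptions: the table and column names follow the to_sql calls above, get_remote_engine returns a connection object pandas accepts, and the auto-increment id column is a guess about the schema.

import json
import pandas as pd

def load_latest_cookie(engine, site):
    # table routing mirrors get_cookie(): the big six sites get their own
    # table, everything else lands in other_site_cookies
    if site in ['us', 'uk', 'fr', 'es', 'it', 'de']:
        table = f"{site}_comment_cookies"
    else:
        table = 'other_site_cookies'
    df = pd.read_sql(f"SELECT cookies FROM {table} ORDER BY id DESC LIMIT 1", engine)
    return json.loads(df['cookies'].iloc[0])  # back to the dict json.dumps produced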
...
@@ -219,10 +343,15 @@ if __name__ == '__main__':

    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    # get_cookie(site='it', zipCode='85')
    get_cookie(site='es', zipCode='28001')
    get_cookie(site='fr', zipCode='75019')
    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    # ae
    time.sleep(random.uniform(30.5, 70.5))
    # au
    # be
    # br
    # ca
    # mx
    # nl
    # pl
    # se
    # tr
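Every request in this file goes through curl_cffi rather than plain requests, because Amazon fingerprints TLS. The detail that matters is impersonate="chrome", which makes curl_cffi present one of its bundled Chrome TLS fingerprints; a two-line demonstration:

from curl_cffi import requests

# impersonate="chrome" selects the newest Chrome profile curl_cffi ships;
# a plain HTTP client here is far more likely to hit a bot wall
resp = requests.get("https://www.amazon.com", impersonate="chrome")
print(resp.status_code)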
py_spider/amazon_spider/H10_spider.py  (view file @ 1612910c)
...
@@ -20,6 +20,7 @@ from selenium import webdriver

from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
...
@@ -122,73 +123,45 @@ class H10():

        self.site_name = 'us'
        self.engine = get_remote_engine(self.site_name, 'mysql')

    def web_drver(self, is_login=True):
        opt = Options()
        # stability & resources
        opt.add_argument("--no-sandbox")
        opt.add_argument("--disable-dev-shm-usage")
        opt.add_argument("--disable-gpu")
        opt.add_argument("--window-size=1920,1080")
        opt.add_argument("--disable-notifications")
        opt.add_argument("--disable-extensions")
        opt.add_argument("--disable-background-networking")
        opt.add_argument("--disable-background-timer-throttling")
        opt.add_argument("--disable-renderer-backgrounding")
        opt.add_argument("--disable-features=Translate,BackForwardCache")
        # ✅ fix: the flag only works with the leading --
        opt.add_argument("--disable-blink-features=AutomationControlled")
        opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
        # ✅ stay logged in: pinned profile (cookies are not cleared)
        opt.add_argument(r"--user-data-dir=C:\selenium\chrome_profile")
        opt.add_argument(r"--profile-directory=Default")
        # ✅ lighter: disable images/fonts (optional, usually does not affect login)
        prefs = {
            "profile.managed_default_content_settings.images": 1,  # NB: in Chrome prefs 1 allows, 2 blocks
            "profile.managed_default_content_settings.fonts": 1,
            "profile.default_content_setting_values.notifications": 1,
        }
        opt.add_experimental_option("prefs", prefs)
        # ✅ faster: do not wait for every resource to finish loading (optional)
        opt.page_load_strategy = "eager"
        try:
            self.driver = webdriver.Chrome(options=opt)
        except:
            service = Service(r"D:\EXE\webdrvier版本\120\chromedriver.exe")
            self.driver = webdriver.Chrome(service=service, options=opt)
        self.driver.maximize_window()
        if is_login:
            self.longin()

    def activate_recaptcha(self, api):
        """
...
@@ -437,27 +410,60 @@ class H10():

        except TimeoutException:
            print("wait_page timeout, used:", time.time() - start)
            return False

    def click_button(self):
        try:
            print('Click to open the dropdown')
            button_js = 'document.querySelector("#CerebroFilter > div > div.sc-bZEumQ.ilswiy > div.sc-DnZRP.etdxo > div > button").click()'
            self.driver.execute_script(button_js)
        except:
            self.driver.find_element(By.XPATH, '//button[@data-testid="showMoreButton"]').click()
        time.sleep(2)
        html = self.driver.page_source
        resp = etree.HTML(html)
        print("Locating the Amazon's Choice element")
        time.sleep(2)
        div_class = resp.xpath(
            '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
        print("Ticking the Amazon's Choice filter")
        time.sleep(2)
        return div_class

    def click_Choice(self):
        html = self.driver.page_source
        resp = etree.HTML(html)
        print("Locating the Amazon's Choice element")
        time.sleep(2)
        div_class = resp.xpath(
            '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
        print("Ticking the Amazon's Choice filter")
        time.sleep(2)
        if div_class:
            return div_class
        else:
            return None

    def webdrvier_html(self, asin, asinstype):
        refresh_num = 0
        # click to choose the site
        for i in range(6):
            try:
                _url = self.driver.current_url
                self.id_url = f'https://members.helium10.com/cerebro?accountId={self.account_id}'
                self.driver.get(self.id_url)
                if "concurrent-sessions" in _url or 'signin' in _url:
                    self.longin()
                if asin not in self.err_asin_list and self.useremail_state:
                    print('cerebro page', self.site_name_url)
                    if not self.wait_page(timeout=50):
                        self.driver.refresh()
                        print('Page did not finish loading')
                        continue
                    sleep(randint(10, 15))
                    if 'You are viewing a demo of Cerebro' in self.driver.page_source:
                        print(self.email_name, 'account expired')
                        self.driver.refresh()
                        continue
                        # self.useremail_state = False
                        # self.send_ms('You are viewing a demo of Cerebro')
                    self.verify()
                    if self.site_name_url == 'Amazon.co.uk':
                        self.site_name_csv = 'GB'
...
@@ -494,25 +500,38 @@ class H10():

            try:
                self.driver.execute_script(
                    f"""document.querySelector("img[loading='lazy']").click()""")
                time.sleep(1.5)
            except:
                self.driver.execute_script(
                    f"""document.querySelector("img[alt='{alt}']").click()""")
                time.sleep(1.5)
            self.verify()
            # switch the marketplace
            self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""")
            time.sleep(2)
            # enter the asin
            print('Entering asin', asin)
            wait = WebDriverWait(self.driver, 5)
            try:
                send_asins_xpath = '//*[@id="re-container"]//div[@id="findKeywordSearch"]//input'
                wait.until(EC.element_to_be_clickable((By.XPATH, send_asins_xpath)))
            except TimeoutException:
                try:
                    send_asins_xpath = '//*[@id="re-container"]//input[contains(@class,"sc-blmEgr sc-cxgeGX")]'
                    wait.until(EC.element_to_be_clickable((By.XPATH, send_asins_xpath)))
                except:
                    send_asins_xpath = '//*[@id="re-container"]//div[@data-value="0"]//input'
            if ',' in asin:
                _asin_lsit = asin.split(',')
                for _asin in _asin_lsit:
                    self.driver.find_element(By.XPATH, send_asins_xpath).send_keys(f'{_asin},')
                    time.sleep(1.5)
            else:
                self.driver.find_element(By.XPATH, send_asins_xpath).send_keys(f'{asin},')
                time.sleep(2)
            if 'detected. Please check the ASINs and try again' in self.driver.page_source:
                self.err_asin_list.append(asin)
                break
            # tick "exclude variations"
            self.driver.execute_script("""document.querySelector("input[name='excludeVariations']").click()""")
            # click "get keywords"
...
@@ -532,13 +551,17 @@ class H10():

                    print('Middle-box word download: no report available')
                    self.err_asins_adv_list.append(asin)
                    break
                elif '拒绝访问' in html:
                    print('Access denied: no quota left')
                    self.err_asins_adv_list.append(asin)
                    break
                elif 'errorCodes.undefined' in html:
                    continue
                html = self.driver.page_source
                self.verify()
                time.sleep(3.5)
                try:
                    html = self.driver.page_source
                    if 'searched this product before' in html or '先前已搜索过此产品' in html or '运行新搜索' in html or '从历史数据加载' in html:
                        print('33333333333444444')
                        self.driver.execute_script(
                            """document.querySelector("button[data-testid='runnewsearch']").click()""")
...
@@ -562,9 +585,16 @@ class H10():

                    print('Middle-box word download: no report available')
                    self.err_asins_adv_list.append(asin)
                    break
                elif '拒绝访问' in html:
                    print('Access denied: no quota left')
                    self.err_asins_adv_list.append(asin)
                    break
                elif 'errorCodes.undefined' in html:
                    continue
                if asinstype:
                    sleep(randint(20, 38))
                else:
                    sleep(randint(8, 15))
                self.verify()
                time.sleep(2)
                if 'Wrong entered data or no results' in html:
...
@@ -583,20 +613,11 @@ class H10():

                    break
                if asinstype:
                    try:
                        div_class = self.click_Choice()
                        if div_class is None:
                            div_class = self.click_button()
                    except:
                        print('Error 22222222222222')
                        div_class = self.click_button()
                    try:
                        script = f"""
                        const elements = document.querySelectorAll("div[class='{div_class[0]}']>div");
...
@@ -604,14 +625,17 @@ class H10():

                            secondElement.click();
                        """
                    except:
                        print('Error: script script 2323232323232323')
                        if i == 2:
                            self.err_asins_adv_list.append(asin)
                        continue
                    self.driver.execute_script(script)
                    time.sleep(1)
                    html1 = self.driver.page_source
                    resp1 = etree.HTML(html1)
                    span_class = resp1.xpath(
                        '//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class|//span[contains(text(),"已分析的产品")]/parent::div/following-sibling::div/@class')[0]
                    # select Amazon's Choice parameter 1
                    self.driver.execute_script(
                        f"""document.querySelector("div[class='{span_class}']").click()""")
...
@@ -632,15 +656,20 @@ class H10():

                    time.sleep(1)
                    print('Click to choose csv')
                    self.driver.execute_script("""document.querySelector("div[data-testid='csv']").click()""")
                    time.sleep(8)
                break
            except Exception as e:
                print('Detailed error', e)
                print(traceback.format_exc(), e)
                if i == 5:
                    self.driver.refresh()
                    time.sleep(1)
                if i == 2:
                    self.err_asin_list.append(asin)
                refresh_num += 1
                if refresh_num > 4:
                    print('More than 4 retries; clearing the caches')
                    self.enable_no_cache()
                    self.clear_http_cache()
                    self.clear_cache_but_keep_cookies('https://members.helium10.com/')
                    refresh_num = 0
                time.sleep(2)
                continue

    def nex_page(self, asin_list, asinstype=None):
...
@@ -761,7 +790,7 @@ class H10():

            with open(file_path, 'r', encoding='utf-8') as f:
                f.read()
                f.close()
            print('File found; the path is valid:', file_path)
            return True
        except:
            print('The file path does not exist')
...
@@ -795,7 +824,6 @@ class H10():

        if state == False:
            print('Re-download file 222:', asin, path)
            self.webdrvier_html(asin, None)
        header_config = {
            "chinese": {
                "columns": ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
...
@@ -968,7 +996,7 @@ class H10():

        file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
        print('file_pathsave_competition1111111', file_path)
        state = self.if_csv_path(file_path)
        if state == False:
            current_date = datetime.date.today()
            # compute the previous day's date
            previous_date = current_date - datetime.timedelta(days=1)
...
@@ -983,10 +1011,17 @@ class H10():

            print('file_pathsave_competition3333', file_path)
            state = self.if_csv_path(file_path)
            if state == False:
                print('Re-download file 3333333333 :', asin_list, path)
                # self.webdrvier_html(','.join(asin_list), 1)
                if self.is_nex_pag:
                    self.nex_page(self.asin_list, asinstype=1)
                    self.is_nex_pag = False
                file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
        # build a dict mapping the original column names to the new ones
        columns = pd.read_csv(file_path, nrows=0).columns.tolist()

        def contains_chinese(text):
            return bool(re.search(r'[\u4e00-\u9fff]', text))
...
@@ -1075,6 +1110,25 @@ class H10():

        print(data)
        requests.post(url=url, data=data, timeout=15)

    def enable_no_cache(self):
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.execute_cdp_cmd("Network.setCacheDisabled", {"cacheDisabled": True})

    def clear_http_cache(self):
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})

    def clear_cache_but_keep_cookies(self, origin: str):
        # 1) clear the HTTP cache
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})
        # 2) clear the deeper caches: cache storage / service workers / appcache
        self.driver.execute_cdp_cmd("Storage.clearDataForOrigin",
                                    {"origin": origin,
                                     "storageTypes": "appcache,cache_storage,service_workers"})

    def run(self):
        user_pw_list = self.get_ip_address()
        if user_pw_list:
...
@@ -1091,33 +1145,20 @@ class H10():

        else:
            path = r'C:\Users\ASUS\Downloads'
        print('Current path:', path)
        self.email_name = 'yswg304@outlook.com'
        # 'yswg304@outlook.com', 'Chinabuye@467138'
        self.pw = 'Chinabuye@467138'  # 'yashengweige678@outlook.com', '987654321yswg@'
        self.web_drver()
        loop = 0
        while True:
            self.data = {}
            self.sku_list = []
            self.err_asins_adv_list = []
            try:
                self.driver.refresh()
            except:
                continue
            time.sleep(4)
            self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})
            # 2) (optional) clearing performance timings does not affect the login
            self.driver.execute_script("window.performance.clearResourceTimings();")
            time.sleep(5)
            login_url = self.driver.current_url
            if "concurrent-sessions" in login_url or 'signin' in login_url:
                self.longin()
            for site in ['us', 'uk', 'de', 'fr', 'es', 'it', 'mx']:
                self.is_nex_pag = True
                print(site)
                if site == 'uk':
                    self.site_url = 'Amazon.co.uk'
...
@@ -1139,6 +1180,7 @@ class H10():

            self.mysql_connect(site)
            # fetch the skus that have not been crawled yet
            self.read_db_sku()
            for sku_token in self.sku_data_list:
                sku_token_list = sku_token.split('|-|')
                sku = sku_token_list[0]
...
@@ -1176,6 +1218,13 @@ class H10():

                self.mysql_connect(site)
                time.sleep(randint(20, 50))
            loop += 1
            # ✅ clear the caches again every 30 rounds (tune to your workload)
            if loop % 30 == 0:
                print('Clearing the caches')
                self.enable_no_cache()
                self.clear_http_cache()
                self.clear_cache_but_keep_cookies('https://members.helium10.com/')
            for i in range(10):
                print(f"The current hour is {datetime.datetime.now().hour}; outside the run window, exiting the loop.")
                hour = datetime.datetime.now().hour
...
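The cache hygiene added in this commit is all Chrome DevTools Protocol. The same pattern works outside the class; a standalone sketch, assuming only selenium and a chromedriver on PATH:

from selenium import webdriver

driver = webdriver.Chrome()
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd("Network.clearBrowserCache", {})    # HTTP cache only
driver.execute_cdp_cmd("Storage.clearDataForOrigin", {     # deeper caches; cookies untouched
    "origin": "https://members.helium10.com/",
    "storageTypes": "appcache,cache_storage,service_workers",
})
driver.quit()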