Commit 3a4d4a7e by Peng

no message

parent 16efe939
...@@ -3,13 +3,16 @@ import random ...@@ -3,13 +3,16 @@ import random
import re import re
import time import time
import traceback import traceback
import pandas as pd import pandas as pd
from curl_cffi import requests from curl_cffi import requests
from lxml import etree from lxml import etree
import os
from amazon_every_day_spider.secure_db_client import get_remote_engine import sys
if getattr(sys, 'frozen', False):
# exe 运行时,把 exe 所在目录加入 path,动态读取 secure_db_client.py
sys.path.insert(0, os.path.dirname(sys.executable))
from secure_db_client import get_remote_engine
import uuid
""" """
打包命令:cd /d E:\Git_new\spider\py_spider 打包命令:cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
...@@ -21,73 +24,96 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -21,73 +24,96 @@ def get_cookie(site='us', zipCode='10010'):
try: try:
if site == "us": if site == "us":
index_url = 'https://www.amazon.com' index_url = 'https://www.amazon.com'
url_asin = 'https://www.amazon.com/dp/B0009X29WK' url_asin = 'https://www.amazon.com/dp/B0DB1GHRYL?th=1'
host = 'www.amazon.com'
elif site == 'uk': elif site == 'uk':
index_url = 'https://www.amazon.co.uk' # 站点url index_url = 'https://www.amazon.co.uk'
url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T' # 站点url url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T'
host = 'www.amazon.co.uk'
elif site == 'de': elif site == 'de':
index_url = 'https://www.amazon.de' index_url = 'https://www.amazon.de'
url_asin = 'https://www.amazon.de/dp/B00006YYXM' url_asin = 'https://www.amazon.de/dp/B00006YYXM'
host = 'www.amazon.de'
elif site == 'fr': elif site == 'fr':
index_url = 'https://www.amazon.fr' index_url = 'https://www.amazon.fr'
url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5' url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
host = 'www.amazon.fr'
elif site == 'es': elif site == 'es':
index_url = 'https://www.amazon.es' index_url = 'https://www.amazon.es'
url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6' url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
host = 'www.amazon.es'
elif site == 'it': elif site == 'it':
index_url = 'https://www.amazon.it' index_url = 'https://www.amazon.it'
url_asin = 'https://www.amazon.it/dp/B0F3C16GTF' url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
host = 'www.amazon.it'
elif site == 'ca': elif site == 'ca':
index_url = 'https://www.amazon.ca' index_url = 'https://www.amazon.ca'
url_asin = 'https://www.amazon.ca//dp/B08H3JPH74' url_asin = 'https://www.amazon.ca/dp/B08H3JPH74'
host = 'www.amazon.ca'
if site == 'ca': if site == 'ca':
engine_us = get_remote_engine('us', 'mysql') engine_us = get_remote_engine('us', 'mysql')
else: else:
engine_us = get_remote_engine(site, 'mysql') engine_us = get_remote_engine(site, 'mysql')
requ_see = requests.Session() requ_see = requests.Session()
n = random.randint(120, 142)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 6900)}.{random.randint(1, 181)} Safari/537.36'
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'connection': 'close',
'Accept-Encoding': 'gzip, deflate, br, zstd', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9', 'accept-language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache', 'accept-encoding': 'gzip, deflate, br, zstd',
'Pragma': 'no-cache', 'cache-control': 'no-cache',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Sec-Ch-Ua-Mobile': '?0', 'sec-ch-ua-mobile': '?0',
'Sec-Ch-Ua-Platform': ' "Windows"', 'user-agent': ua,
'Sec-Fetch-Dest': 'document', "pragma": "no-cache",
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': ' none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
} }
asin_resp = requ_see.get(url_asin, headers=headers)
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
k = ""
for i in (0, random.randint(0, 26)):
k += random.choice(alphabet)
headers[k] = str(uuid.uuid4())
asin_resp = requ_see.get(url_asin, headers=headers,verify=False,
impersonate="chrome")
print("第一步 请求asin首页:", url_asin) print("第一步 请求asin首页:", url_asin)
# 检测是否返回验证码拦截页,自动提交过关
if 'validateCaptcha' in asin_resp.text or 'opfcaptcha' in asin_resp.text:
print(f'[{site}] 检测到验证码拦截页,自动提交过关...')
cap_xpath = etree.HTML(asin_resp.text)
amzn = cap_xpath.xpath("//input[@name='amzn']/@value")
amzn_r = cap_xpath.xpath("//input[@name='amzn-r']/@value")
field_keywords = cap_xpath.xpath("//input[@name='field-keywords']/@value")
if amzn and amzn_r and field_keywords:
captcha_url = f"{index_url}/errors/validateCaptcha"
params = {'amzn': amzn[0], 'amzn-r': amzn_r[0], 'field-keywords': field_keywords[0]}
asin_resp = requ_see.get(captcha_url, params=params, headers=headers, verify=False, impersonate="chrome")
print(f'[{site}] 验证码提交完成,状态码: {asin_resp.status_code}')
else:
print(f'[{site}] 验证码参数提取失败,跳过')
return
html_xpath = etree.HTML(asin_resp.text) html_xpath = etree.HTML(asin_resp.text)
ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()") ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
print("第一次发送请求,获取邮编:", ingress) print("第一次发送请求,获取邮编:", ingress)
data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal") data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
if not data_a_modal:
print(f'[{site}] 未找到 modal 元素,跳过')
return
data_modal = json.loads(data_a_modal[0]) data_modal = json.loads(data_a_modal[0])
print('获取参数anti-csrftoken-a2z:', data_modal) print('获取参数anti-csrftoken-a2z:', data_modal)
headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z'] headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS' clkci_url = index_url + data_modal['url']
headers['Referer'] = url_asin
print('第二步点击') print('第二步点击')
clkci_resp = requ_see.get(clkci_url, headers=headers) clkci_resp = requ_see.get(clkci_url, headers=headers,verify=False,
CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0] impersonate="chrome")
csrf_list = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)
if not csrf_list:
print(f'[{site}] 未找到 CSRF_TOKEN,跳过')
return
CSRF_TOKEN = csrf_list[0]
print("CSRF_TOKEN:", CSRF_TOKEN) print("CSRF_TOKEN:", CSRF_TOKEN)
address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow' address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
headers_post = { headers_post = {
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': ua,
'Accept-Encoding': 'gzip', } 'Accept-Encoding': 'gzip', }
headers_post['Origin'] = index_url headers_post['Origin'] = index_url
headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
...@@ -106,7 +132,7 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -106,7 +132,7 @@ def get_cookie(site='us', zipCode='10010'):
impersonate="chrome") impersonate="chrome")
print(post_resp.text) print(post_resp.text)
submit_headers = { submit_headers = {
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': ua,
'Accept-Encoding': 'gzip', 'Accept-Encoding': 'gzip',
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
} }
...@@ -121,7 +147,7 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -121,7 +147,7 @@ def get_cookie(site='us', zipCode='10010'):
requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome") requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
print(requ_see.cookies.get_dict()) print(requ_see.cookies.get_dict())
cookie_dict = requ_see.cookies.get_dict() cookie_dict = requ_see.cookies.get_dict()
index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict, verify=False, index_resp = requ_see.get(index_url, headers=headers, verify=False,
impersonate="chrome") impersonate="chrome")
index_xpath = etree.HTML(index_resp.text) index_xpath = etree.HTML(index_resp.text)
ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()") ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
...@@ -149,12 +175,14 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -149,12 +175,14 @@ def get_cookie(site='us', zipCode='10010'):
if __name__ == '__main__':
    # Endless cookie-refresh loop. The primary sites (us/de/uk) are
    # refreshed every round; the secondary EU sites (it/es/fr) only every
    # 10th round (count == 0 included, so they also run on the first pass).
    count = 0
    while True:
        get_cookie(site='us', zipCode='10010')
        get_cookie(site='de', zipCode='10115')
        get_cookie(site='uk', zipCode='W1S 3PR')
        if count % 10 == 0:
            get_cookie(site='it', zipCode='00185')
            get_cookie(site='es', zipCode='28001')
            get_cookie(site='fr', zipCode='75019')
        count += 1
        # Randomized pause between rounds (~1-3 minutes) to avoid a
        # fixed request cadence.
        time.sleep(random.uniform(60.5, 180.5))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment