Commit 3a4d4a7e by Peng

no message

parent 16efe939
...@@ -3,13 +3,16 @@ import random ...@@ -3,13 +3,16 @@ import random
import re import re
import time import time
import traceback import traceback
import pandas as pd import pandas as pd
from curl_cffi import requests from curl_cffi import requests
from lxml import etree from lxml import etree
import os
from amazon_every_day_spider.secure_db_client import get_remote_engine import sys
if getattr(sys, 'frozen', False):
# exe 运行时,把 exe 所在目录加入 path,动态读取 secure_db_client.py
sys.path.insert(0, os.path.dirname(sys.executable))
from secure_db_client import get_remote_engine
import uuid
""" """
打包命令:cd /d E:\Git_new\spider\py_spider 打包命令:cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
...@@ -21,73 +24,96 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -21,73 +24,96 @@ def get_cookie(site='us', zipCode='10010'):
try: try:
if site == "us": if site == "us":
index_url = 'https://www.amazon.com' index_url = 'https://www.amazon.com'
url_asin = 'https://www.amazon.com/dp/B0009X29WK' url_asin = 'https://www.amazon.com/dp/B0DB1GHRYL?th=1'
host = 'www.amazon.com'
elif site == 'uk': elif site == 'uk':
index_url = 'https://www.amazon.co.uk' # 站点url index_url = 'https://www.amazon.co.uk'
url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T' # 站点url url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T'
host = 'www.amazon.co.uk'
elif site == 'de': elif site == 'de':
index_url = 'https://www.amazon.de' index_url = 'https://www.amazon.de'
url_asin = 'https://www.amazon.de/dp/B00006YYXM' url_asin = 'https://www.amazon.de/dp/B00006YYXM'
host = 'www.amazon.de'
elif site == 'fr': elif site == 'fr':
index_url = 'https://www.amazon.fr' index_url = 'https://www.amazon.fr'
url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5' url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
host = 'www.amazon.fr'
elif site == 'es': elif site == 'es':
index_url = 'https://www.amazon.es' index_url = 'https://www.amazon.es'
url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6' url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
host = 'www.amazon.es'
elif site == 'it': elif site == 'it':
index_url = 'https://www.amazon.it' index_url = 'https://www.amazon.it'
url_asin = 'https://www.amazon.it/dp/B0F3C16GTF' url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
host = 'www.amazon.it'
elif site == 'ca': elif site == 'ca':
index_url = 'https://www.amazon.ca' index_url = 'https://www.amazon.ca'
url_asin = 'https://www.amazon.ca//dp/B08H3JPH74' url_asin = 'https://www.amazon.ca/dp/B08H3JPH74'
host = 'www.amazon.ca'
if site == 'ca': if site == 'ca':
engine_us = get_remote_engine('us', 'mysql') engine_us = get_remote_engine('us', 'mysql')
else: else:
engine_us = get_remote_engine(site, 'mysql') engine_us = get_remote_engine(site, 'mysql')
requ_see = requests.Session() requ_see = requests.Session()
n = random.randint(120, 142)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 6900)}.{random.randint(1, 181)} Safari/537.36'
headers = { headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'connection': 'close',
'Accept-Encoding': 'gzip, deflate, br, zstd', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9', 'accept-language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache', 'accept-encoding': 'gzip, deflate, br, zstd',
'Pragma': 'no-cache', 'cache-control': 'no-cache',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Sec-Ch-Ua-Mobile': '?0', 'sec-ch-ua-mobile': '?0',
'Sec-Ch-Ua-Platform': ' "Windows"', 'user-agent': ua,
'Sec-Fetch-Dest': 'document', "pragma": "no-cache",
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': ' none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
} }
asin_resp = requ_see.get(url_asin, headers=headers)
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
k = ""
for i in (0, random.randint(0, 26)):
k += random.choice(alphabet)
headers[k] = str(uuid.uuid4())
asin_resp = requ_see.get(url_asin, headers=headers,verify=False,
impersonate="chrome")
print("第一步 请求asin首页:", url_asin) print("第一步 请求asin首页:", url_asin)
# 检测是否返回验证码拦截页,自动提交过关
if 'validateCaptcha' in asin_resp.text or 'opfcaptcha' in asin_resp.text:
print(f'[{site}] 检测到验证码拦截页,自动提交过关...')
cap_xpath = etree.HTML(asin_resp.text)
amzn = cap_xpath.xpath("//input[@name='amzn']/@value")
amzn_r = cap_xpath.xpath("//input[@name='amzn-r']/@value")
field_keywords = cap_xpath.xpath("//input[@name='field-keywords']/@value")
if amzn and amzn_r and field_keywords:
captcha_url = f"{index_url}/errors/validateCaptcha"
params = {'amzn': amzn[0], 'amzn-r': amzn_r[0], 'field-keywords': field_keywords[0]}
asin_resp = requ_see.get(captcha_url, params=params, headers=headers, verify=False, impersonate="chrome")
print(f'[{site}] 验证码提交完成,状态码: {asin_resp.status_code}')
else:
print(f'[{site}] 验证码参数提取失败,跳过')
return
html_xpath = etree.HTML(asin_resp.text) html_xpath = etree.HTML(asin_resp.text)
ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()") ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
print("第一次发送请求,获取邮编:", ingress) print("第一次发送请求,获取邮编:", ingress)
data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal") data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
if not data_a_modal:
print(f'[{site}] 未找到 modal 元素,跳过')
return
data_modal = json.loads(data_a_modal[0]) data_modal = json.loads(data_a_modal[0])
print('获取参数anti-csrftoken-a2z:', data_modal) print('获取参数anti-csrftoken-a2z:', data_modal)
headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z'] headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS' clkci_url = index_url + data_modal['url']
headers['Referer'] = url_asin
print('第二步点击') print('第二步点击')
clkci_resp = requ_see.get(clkci_url, headers=headers) clkci_resp = requ_see.get(clkci_url, headers=headers,verify=False,
CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0] impersonate="chrome")
csrf_list = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)
if not csrf_list:
print(f'[{site}] 未找到 CSRF_TOKEN,跳过')
return
CSRF_TOKEN = csrf_list[0]
print("CSRF_TOKEN:", CSRF_TOKEN) print("CSRF_TOKEN:", CSRF_TOKEN)
address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow' address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
headers_post = { headers_post = {
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': ua,
'Accept-Encoding': 'gzip', } 'Accept-Encoding': 'gzip', }
headers_post['Origin'] = index_url headers_post['Origin'] = index_url
headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
...@@ -106,7 +132,7 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -106,7 +132,7 @@ def get_cookie(site='us', zipCode='10010'):
impersonate="chrome") impersonate="chrome")
print(post_resp.text) print(post_resp.text)
submit_headers = { submit_headers = {
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'User-Agent': ua,
'Accept-Encoding': 'gzip', 'Accept-Encoding': 'gzip',
'Accept-Language': 'zh-CN,zh;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9',
} }
...@@ -121,7 +147,7 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -121,7 +147,7 @@ def get_cookie(site='us', zipCode='10010'):
requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome") requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
print(requ_see.cookies.get_dict()) print(requ_see.cookies.get_dict())
cookie_dict = requ_see.cookies.get_dict() cookie_dict = requ_see.cookies.get_dict()
index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict, verify=False, index_resp = requ_see.get(index_url, headers=headers, verify=False,
impersonate="chrome") impersonate="chrome")
index_xpath = etree.HTML(index_resp.text) index_xpath = etree.HTML(index_resp.text)
ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()") ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
...@@ -149,12 +175,14 @@ def get_cookie(site='us', zipCode='10010'): ...@@ -149,12 +175,14 @@ def get_cookie(site='us', zipCode='10010'):
if __name__ == '__main__':
    # Endless cookie-refresh loop. The primary sites (us/de/uk) are
    # refreshed every round; the secondary EU sites (it/es/fr) only every
    # 10th round (count == 0 included, so they also run on the first pass).
    count = 0
    while True:
        get_cookie(site='us', zipCode='10010')
        get_cookie(site='de', zipCode='10115')
        get_cookie(site='uk', zipCode='W1S 3PR')
        if count % 10 == 0:
            get_cookie(site='it', zipCode='00185')
            get_cookie(site='es', zipCode='28001')
            get_cookie(site='fr', zipCode='75019')
        count += 1
        # Randomized pause between rounds (~1-3 minutes) to avoid a
        # fixed request cadence.
        time.sleep(random.uniform(60.5, 180.5))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment