Commit 6dd760f3 by Peng

no message

parent 3f158caf
'存储到pg'
'获取小语言cookie'
import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # make the parent directory importable

import json
import re
import random
import time
import traceback
import urllib3

import pandas as pd
# curl_cffi's requests clone is used (not the standard `requests` package)
# because the flow below relies on TLS fingerprint impersonation (impersonate="chrome").
from curl_cffi import requests
from lxml import etree
from amazon_every_day_spider.secure_db_client import get_remote_engine

# Requests are made with verify=False; silence the resulting TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
"""
打包命令:cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
"""
def get_cookie(site='us', zipCode='10010'):
    """Obtain Amazon session cookies with the given delivery zip code applied.

    Walks the "change delivery location" flow on one Amazon storefront:
      1. load a known ASIN detail page (collects initial session cookies),
      2. open the location modal to harvest the CSRF token,
      3. POST the zip code to the address-change endpoint,
      4. confirm via the get-location-label endpoint,
      5. reload the home page and verify the displayed location changed.
    On success the session cookies are serialized to JSON and appended to the
    site's MySQL cookie table via ``get_remote_engine``.

    Args:
        site: storefront key — one of 'us', 'uk', 'de', 'fr', 'es', 'it', 'ca'.
        zipCode: postal code to set for that storefront.

    Returns:
        None. All failures are caught, logged, and followed by a short sleep.
    """
    try:
        # Per-site index URL, a known ASIN detail page, and host name.
        # ``host`` is assigned but not used below — kept for parity with the source.
        if site == "us":
            index_url = 'https://www.amazon.com'
            url_asin = 'https://www.amazon.com/dp/B0009X29WK'
            host = 'www.amazon.com'
        elif site == 'uk':
            index_url = 'https://www.amazon.co.uk'  # site url
            url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T'  # site url
            host = 'www.amazon.co.uk'
        elif site == 'de':
            index_url = 'https://www.amazon.de'
            url_asin = 'https://www.amazon.de/dp/B00006YYXM'
            host = 'www.amazon.de'
        elif site == 'fr':
            index_url = 'https://www.amazon.fr'
            url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
            host = 'www.amazon.fr'
        elif site == 'es':
            index_url = 'https://www.amazon.es'
            url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
            host = 'www.amazon.es'
        elif site == 'it':
            index_url = 'https://www.amazon.it'
            url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
            host = 'www.amazon.it'
        elif site == 'ca':
            index_url = 'https://www.amazon.ca'
            # was 'https://www.amazon.ca//dp/...' — fixed accidental double slash
            url_asin = 'https://www.amazon.ca/dp/B08H3JPH74'
            host = 'www.amazon.ca'
        # 'ca' cookies are written through the US database connection;
        # every other site has its own engine keyed by site name.
        if site == 'ca':
            engine_us = get_remote_engine('us', 'mysql')
        else:
            engine_us = get_remote_engine(site, 'mysql')
        requ_see = requests.Session()
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': ' "Windows"',  # NOTE(review): leading space kept from source — confirm intentional
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': ' none',  # NOTE(review): leading space kept from source — confirm intentional
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        # Step 1: request the ASIN detail page; this seeds the session cookies
        # and exposes the location-modal metadata in the page markup.
        asin_resp = requ_see.get(url_asin, headers=headers)
        print("第一步 请求asin首页:", url_asin)
        html_xpath = etree.HTML(asin_resp.text)
        ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("第一次发送请求,获取邮编:", ingress)
        data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
        data_modal = json.loads(data_a_modal[0])
        print('获取参数anti-csrftoken-a2z:', data_modal)
        headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
        # Step 2: open the rendered address-selection modal; its HTML embeds
        # the CSRF_TOKEN required by the address-change endpoint.
        clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
        headers['Referer'] = url_asin
        print('第二步点击')
        clkci_resp = requ_see.get(clkci_url, headers=headers)
        CSRF_TOKEN = re.findall(r'CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
        print("CSRF_TOKEN:", CSRF_TOKEN)
        # Step 3: POST the zip code to the glow address-change endpoint.
        address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
        headers_post = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip',
        }
        headers_post['Origin'] = index_url
        headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
        headers_post['Sec-Fetch-Site'] = 'same-origin'
        headers_post['Sec-Fetch-Mode'] = 'cors'
        headers_post['Sec-Fetch-Dest'] = 'empty'
        headers_post['Accept'] = '*/*'
        headers_post['Content-Type'] = 'application/json'
        headers_post['X-Requested-With'] = 'XMLHttpRequest'
        address_json = {"locationType": "LOCATION_INPUT", "zipCode": f"{zipCode}", "deviceType": "web",
                        "storeContext": "grocery",
                        "pageType": "Detail", "actionSource": "glow"}
        print('第三步 输入 邮编')
        post_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False,
                                  impersonate="chrome")
        print(post_resp.text)
        # Step 4: confirm the address change (equivalent of clicking "Done").
        submit_headers = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        submit_headers['Accept'] = '*/*'
        submit_headers['Referer'] = url_asin
        submit_headers['X-Requested-With'] = 'XMLHttpRequest'
        submit_headers['Sec-Fetch-Site'] = 'same-origin'
        submit_headers['Sec-Fetch-Mode'] = 'cors'
        submit_headers['Sec-Fetch-Dest'] = 'empty'
        print('第四步。提交')
        detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
        requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
        print(requ_see.cookies.get_dict())
        cookie_dict = requ_see.cookies.get_dict()
        # Step 5: reload the home page with the accumulated cookies and check
        # that the displayed delivery location now matches the requested zip.
        index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict, verify=False,
                                  impersonate="chrome")
        index_xpath = etree.HTML(index_resp.text)
        ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("获取最新邮编:", ingress)
        # Amazon may redisplay a truncated/partial code; the extra literals
        # cover the UK ("W1S 3PR") and CA ("M5B 2H1") display formats.
        if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip() or 'M5B 2H' in ingress[0].strip():
            print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
            cookies = json.dumps(cookie_dict, ensure_ascii=False)
            item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
            print(item)
            if site == 'ca':
                # 'ca' rows go to a shared table, tagged with the site name.
                df = pd.DataFrame([{"cookies": cookies, "type": "DB", 'site': 'ca'}])
                engine_us.to_sql(df, 'other_site_cookies', if_exists="append")
            else:
                # Every other site has a dedicated `<site>_cookies` table.
                df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
                engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
            print('\n')
    except Exception as e:
        # Best-effort: log the failure and let the caller move on to the next site.
        print(f"获取 {site} 站点 cookie 报错,切换下一个站点", e)
        print("报错", f"\n{traceback.format_exc()}")
        time.sleep(random.uniform(2.5, 5.5))
if __name__ == '__main__':
    # NOTE(review): reconstructed from a mangled diff hunk; the original may
    # wrap these calls in a retry loop (a trailing sleep follows them) — confirm.
    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    get_cookie(site='it', zipCode='00185')
    get_cookie(site='es', zipCode='28001')
    get_cookie(site='fr', zipCode='75019')
    get_cookie(site='ca', zipCode='M5B 2H1')
    time.sleep(random.uniform(60.5, 180.5))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment