Commit 1612910c by Peng

no message

parent 179f825d
......@@ -151,10 +151,10 @@ def get_cookie(site='us', zipCode='10010'):
if __name__ == '__main__':
while True:
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
get_cookie(site='it', zipCode='00185')
get_cookie(site='es', zipCode='28001')
get_cookie(site='fr', zipCode='75019')
get_cookie(site='ca', zipCode='M5B 2H1')
# get_cookie(site='de', zipCode='10115')
# get_cookie(site='uk', zipCode='W1S 3PR')
# get_cookie(site='it', zipCode='00185')
# get_cookie(site='es', zipCode='28001')
# get_cookie(site='fr', zipCode='75019')
# get_cookie(site='ca', zipCode='M5B 2H1')
time.sleep(random.uniform(60.5, 180.5))
# import json
# import re
#
# from curl_cffi import requests
# from lxml import etree
#
# requ_see = requests.Session()
# headers = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate, br, zstd',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'no-cache',
# 'Pragma': 'no-cache',
# 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
# 'Sec-Ch-Ua-Mobile': '?0',
# 'Sec-Ch-Ua-Platform': ' "Windows"',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': ' none',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
# }
# asin_resp = requ_see.get('https://www.amazon.co.uk/dp/B0714LLB2T?th=1', headers=headers)
# print("第一步 请求首页")
# html_xpath = etree.HTML(asin_resp.text)
# ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
# print("第一次发送请求,获取邮编:", ingress)
# # url2 = 'https://www.amazon.co.uk/nav/ajax/hMenuDesktopFirstLayer?ajaxTemplate=hMenuDesktopFirstLayer&pageType=Detail&hmDataAjaxHint=1&isFreshRegion=false&isFreshCustomer=false&isPrimeMember=false&isPrimeDay=false&isBackup=false&firstName=false&navDeviceType=desktop&hashCustomerAndSessionId=8b35c8413eaf45f3509509691ec91ce8cc82c3f3&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6407668806-AL2_aarch64&languageCode=en_GB&customerCountryCode=US'
# # requ_see.get(url2, headers=headers)
#
# data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
# data_modal = json.loads(data_a_modal[0])
# print(data_modal)
# headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
# clkci_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
# headers['Referer'] = 'https://www.amazon.co.uk/dp/B0714LLB2T?th=1'
# print(headers, 23333333)
# clkci_resp = requ_see.get(clkci_url, headers=headers)
# print(clkci_resp.text)
# CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
# print("CSRF_TOKEN:", CSRF_TOKEN)
# address_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/address-change?actionSource=glow'
#
# headers_post = {
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'Accept-Encoding': 'gzip', }
# headers_post['Origin'] = 'https://www.amazon.co.uk'
# headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
# headers_post['Sec-Fetch-Site'] = 'same-origin'
# headers_post['Sec-Fetch-Mode'] = 'cors'
# headers_post['Sec-Fetch-Dest'] = 'empty'
# headers_post['Accept'] = '*/*'
# headers_post['Content-Type'] = 'application/json'
# headers_post['X-Requested-With'] = 'XMLHttpRequest'
#
# address_json = {"locationType": "LOCATION_INPUT", "zipCode": "W1S 3PR", "deviceType": "web", "storeContext": "grocery",
# "pageType": "Detail", "actionSource": "glow"}
# address_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False, impersonate="chrome")
#
# submit_headers = {
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'Accept-Encoding': 'gzip',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# }
# submit_headers['Accept'] = '*/*'
# submit_headers['Referer'] = 'https://www.amazon.co.uk/dp/B0714LLB2T?th=1'
# submit_headers['X-Requested-With'] = 'XMLHttpRequest'
# submit_headers['Sec-Fetch-Site'] = 'same-origin'
# submit_headers['Sec-Fetch-Mode'] = 'cors'
# submit_headers['Sec-Fetch-Dest'] = 'empty'
# detail_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
# submit_resp = requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
# print(submit_resp.text)
# print(submit_resp.cookies.get_dict(), '322222222')
# print(requ_see.cookies.get_dict(), '433333333')
# cookie_dict = requ_see.cookies.get_dict()
# index_resp = requests.get('https://www.amazon.co.uk', headers=headers, cookies=cookie_dict, verify=False,
# impersonate="chrome")
# index_xpath = etree.HTML(index_resp.text)
# ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
# print("获取最新邮编:", ingress)
'存储到pg'
'获取小语言cookie'
import sys
import os
import pandas as pd
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from curl_cffi import requests
import json
from lxml import etree
import re
import random
import re
import time
from secure_db_client import get_remote_engine
import traceback
item = {}
import pandas as pd
from curl_cffi import requests
from lxml import etree
import sys
print(sys.executable)
from amazon_every_day_spider.secure_db_client import get_remote_engine
"""
打包命令:cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
"""
headers_num_int = 0
def get_cookie(site='us', zipCode='10010'):
try:
params_site = {
"us": "pet-supplies",
"de": "toys",
"uk": "grocery",
"it": "luggage",
"es": "apparel",
"fr": "kitchen",
"ca": "beauty",
"mx": "beauty",
"au": "fashion",
"ae": "generic",
"br": "generic",
"nl": "home-improvement",
"pl": "beauty",
"se": "beauty",
"tr": "home-improvement",
# grocery
}
if site == "us":
url_ = 'https://www.amazon.com'
index_url = 'https://www.amazon.com'
url_asin = 'https://www.amazon.com/dp/B0009X29WK'
host = 'www.amazon.com'
elif site == 'uk':
url_ = 'https://www.amazon.co.uk' # 站点url
index_url = 'https://www.amazon.co.uk' # 站点url
url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T' # 站点url
host = 'www.amazon.co.uk'
elif site == 'de':
url_ = 'https://www.amazon.de'
index_url = 'https://www.amazon.de'
url_asin = 'https://www.amazon.de/dp/B00006YYXM'
host = 'www.amazon.de'
elif site == 'fr':
url_ = 'https://www.amazon.fr'
index_url = 'https://www.amazon.fr'
url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
host = 'www.amazon.fr'
elif site == 'es':
url_ = 'https://www.amazon.es'
index_url = 'https://www.amazon.es'
url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
host = 'www.amazon.es'
elif site == 'it':
url_ = 'https://www.amazon.it'
index_url = 'https://www.amazon.it'
url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
host = 'www.amazon.it'
elif site == 'ca':
index_url = 'https://www.amazon.ca'
url_asin = 'https://www.amazon.ca/dp/B08H3JPH74'
host = 'www.amazon.ca'
elif site == 'mx':
index_url = "https://www.amazon.com.mx"
url_asin = 'https://www.amazon.com.mx/dp/B08H3JPH74'
host = 'www.amazon.com.mx'
elif site == 'ae':
index_url = "https://www.amazon.ae"
url_asin = 'https://www.amazon.ae/dp/B08H3JPH74'
host = 'www.amazon.ae'
elif site == 'au':
index_url = "https://www.amazon.com.au"
url_asin = 'https://www.amazon.com.au/dp/B0D1YFSYGQ'
host = 'www.amazon.com.au'
elif site == 'tr':
index_url = "https://www.amazon.com.tr"
url_asin = 'https://www.amazon.com.tr/dp/B08SPXK5WC'
host = 'www.amazon.com.tr'
elif site == 'be':
index_url = "https://www.amazon.com.be"
url_asin = 'https://www.amazon.com.be/dp/B01B7O6JH0'
host = 'www.amazon.com.be'
elif site == 'jp':
index_url = "https://www.amazon.co.jp"
url_asin = 'https://www.amazon.co.jp/dp/B08H3JPH74'
host = 'www.amazon.co.jp'
elif site == 'nl':
index_url = "https://www.amazon.nl"
url_asin = 'https://www.amazon.nl/dp/B01COWDLGG'
host = 'www.amazon.nl'
elif site == 'pl':
index_url = "https://www.amazon.pl"
url_asin = 'https://www.amazon.pl/dp/B08H3JPH74'
host = 'www.amazon.pl'
elif site == 'se':
index_url = "https://www.amazon.se"
url_asin = 'https://www.amazon.se/dp/B08H3JPH74'
host = 'www.amazon.se'
elif site == 'br':
index_url = "https://www.amazon.com.br"
url_asin = 'https://www.amazon.com.br/dp/B08SPXK5WC'
host = 'www.amazon.com.br'
if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
engine_us = get_remote_engine('us', 'mysql')
else:
engine_us = get_remote_engine(site, 'mysql')
n = random.randint(110, 120)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
print(ua)
requ_see = requests.Session()
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Device-Memory": "8",
"Downlink": "1.25",
"Dpr": "0.75",
"Ect": "3g",
"Pragma": "no-cache",
"Rtt": "300",
"Sec-Ch-Device-Memory": "8",
"Sec-Ch-Dpr": "0.75",
"Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{n}", "Google Chrome";v="{n}"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Ch-Ua-Platform-Version": '"10.0.0"',
"Sec-Ch-Viewport-Width": "2560",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": ua,
"Viewport-Width": "2560",
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': ' "Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': ' none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
if site in ['au', 'pl']:
del headers['Accept-Encoding']
headers['Sec-Ch-Ua'] = "\"Google Chrome\";v=\"143\", \"Chromium\";v=\"143\", \"Not A(Brand\";v=\"24\""
headers['priority'] = "u=0, i"
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
k = ""
for i in (0, random.randint(0, 5)):
k += random.choice(alphabet)
# headers[k] = str(uuid.uuid4())
sess = requests.Session()
resp_ = sess.get(url_, headers=headers, timeout=15, verify=False)
cookie = resp_.headers.get('set-cookie')
print("第一步 请求首页", url_)
cookies_dict = {i.split("=")[0]: i.split("=")[-1] for i in cookie.split("; ")}
html_xpath = etree.HTML(resp_.text)
asin_resp = requ_see.get(url_asin, headers=headers, impersonate="chrome")
print("第一步 请求asin首页:", url_asin)
html_xpath = etree.HTML(asin_resp.text)
ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
print("第一次发送请求,获取邮编:", ingress)
data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
data_modal = json.loads(data_a_modal[0])
# if site != 'us':
# csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
# url_post = url_ + '/privacyprefs/retail/v1/acceptall'
# dada_post = {
# "anti-csrftoken-a2z": csrftoken,
# "accept": "all"
# }
# resp_post = sess.post(url_post, headers=headers, cookies=cookies_dict, timeout=15, data=dada_post,
# verify=False)
# cookie_post = resp_post.headers.get('set-cookie')
# cookies_dict_post = {i.split("=")[0]: i.split("=")[-1] for i in cookie_post.split("; ")}
# cookies_dict_post.update(cookies_dict)
# else:
cookies_dict_post = cookies_dict
# if site == 'us':
# get_token_headers = {
# 'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
# 'referer': url_,
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
# }
# print(get_token_headers, '23232')
# else:
get_token_headers = {
'accept': 'text/html,*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
'cache-control': 'no-cache',
'referer': url_,
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'x-requested-with': 'XMLHttpRequest',
print('获取参数anti-csrftoken-a2z:', data_modal)
headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
# /portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal
clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections'
# clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
headers['Referer'] = url_asin
print('第二步点击')
params = {
"deviceType": "desktop",
"pageType": "Detail",
"storeContext": params_site.get(site),
"actionSource": "desktop-modal"
}
data_modal_url = url_ + data_modal['url']
print('第二步 拼接url 点击更改位置:', data_modal_url)
data_modal_resp = sess.get(data_modal_url, headers=get_token_headers, cookies=cookies_dict_post,
timeout=15, verify=False)
data_modal_cookie = data_modal_resp.headers.get('set-cookie')
CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', data_modal_resp.text)[0]
clkci_resp = requ_see.get(clkci_url, headers=headers, params=params, impersonate="chrome")
CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
print("CSRF_TOKEN:", CSRF_TOKEN)
try:
data_modal_cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in data_modal_cookie.split("; ")}
data_modal_cookie_dict.update(cookies_dict)
except:
data_modal_cookie_dict = cookies_dict_post
url_2 = url_ + '/portal-migration/hz/glow/address-change?actionSource=glow'
print('url_2:', url_2)
# {"locationType":"LOCATION_INPUT","zipCode":"10010","deviceType":"web","storeContext":"generic","pageType":"Gateway","actionSource":"glow"}
data = {"locationType": "LOCATION_INPUT", "zipCode": zipCode, "storeContext": "generic", "deviceType": "web",
"pageType": "Gateway", "actionSource": "glow"}
print(data)
post_headers = {
'anti-csrftoken-a2z': CSRF_TOKEN,
# https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow
# https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow
# https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow
address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
# if site in ['tr', 'be', 'nl', 'pl', 'se']:
# url = f"https://{host}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
headers_post = {
'Host': host,
'accept': 'text/html,*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'no-cache',
'content-length': '138',
'anti-csrftoken-a2z': CSRF_TOKEN,
'content-type': 'application/json',
'device-memory': '8',
'downlink': '10',
'dpr': '1',
'ect': '4g',
'origin': url_,
'pragma': 'no-cache',
'referer': url_,
'rtt': '250',
'sec-ch-device-memory': '8',
'sec-ch-dpr': '1',
'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-ch-ua-platform-version': '"10.0.0"',
'sec-ch-viewport-width': '1920',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'TE': 'trailers',
'origin': index_url,
'referer': f'https://{host}/dp/B0009X29WK?th=1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'x-amzn-flow-closure-id': '1768269613',
'x-requested-with': 'XMLHttpRequest'
}
if site in ['uk', 'it', 'es', 'fr']:
del headers_post['referer']
address_json = {"locationType": "LOCATION_INPUT", "zipCode": f"{zipCode}", "deviceType": "web",
"storeContext": params_site.get(site),
"pageType": "Detail", "actionSource": "glow"}
if site == 'au':
address_json['locationType'] = 'POSTAL_CODE_WITH_CITY'
address_json['city'] = 'WARWICK FARM'
elif site == 'ae':
address_json['locationType'] = 'CITY'
address_json['city'] = 'Abu Dhabi'
address_json['pageType'] = 'Gateway'
elif site == 'nl':
del address_json['zipCode']
address_json['locationType'] = 'COUNTRY'
address_json['district'] = 'NL'
address_json['countryCode'] = 'NL'
print('第三步 输入 邮编')
print(address_url)
post_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False,
impersonate="chrome")
print(post_resp.text)
print(post_resp)
submit_headers = {
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept-Encoding': 'gzip',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
print('第三步 发送post 请求 输入 邮编 点击确定')
resp_2 = sess.post(url_2, headers=post_headers, json=data, cookies=data_modal_cookie_dict,
timeout=15, verify=False)
print(resp_2.text)
post_cookies = resp_2.headers.get('set-cookie')
try:
post_cookies_dict = {i.split("=")[0]: i.split("=")[-1] for i in post_cookies.split("; ")}
post_cookies_dict.update(data_modal_cookie_dict)
except:
post_cookies_dict = data_modal_cookie_dict
done_url = url_ + "/portal-migration/hz/glow/get-location-label?storeContext=generic&pageType=Gateway&actionSource=desktop-modal"
print('第四步,点击完成,')
done_resp = sess.get(done_url, headers=headers, cookies=post_cookies_dict, timeout=15, verify=False)
print(done_resp.text, 'done_respdone_respdone_respdone_resp')
done_cookies_dict = sess.cookies.get_dict()
print('done_cookies_dict::', done_cookies_dict)
print("第五步,请求首页,获取邮编,是否修改成功")
index_resp = sess.get(url_, headers=headers, timeout=15, cookies=done_cookies_dict, verify=False)
index_resp_cookies = sess.cookies.get_dict()
print(sess.cookies.get_dict(), '2222222222222222')
submit_headers['Accept'] = '*/*'
submit_headers['Referer'] = url_asin
submit_headers['X-Requested-With'] = 'XMLHttpRequest'
submit_headers['Sec-Fetch-Site'] = 'same-origin'
submit_headers['Sec-Fetch-Mode'] = 'cors'
submit_headers['Sec-Fetch-Dest'] = 'empty'
print('第四步。提交')
detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=pet-supplies&pageType=Detail&actionSource=desktop-modal'
# detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
print(requ_see.cookies.get_dict())
cookie_dict = requ_see.cookies.get_dict()
index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict, verify=False,
impersonate="chrome")
index_xpath = etree.HTML(index_resp.text)
ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
print("获取最新邮编:", ingress)
if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip():
if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip() or 'M5B 2H' in ingress[0].strip():
print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
cookies = json.dumps(index_resp_cookies, ensure_ascii=False)
cookies = json.dumps(cookie_dict, ensure_ascii=False)
item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
print(item)
# 构造 DataFrame
df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
df = pd.DataFrame([{"cookies": cookies, 'site': site}])
# 存储到数据库
engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
engine_us.to_sql(df, 'other_site_cookies', if_exists="append")
print(f"入库成功 {site} other_site_cookies")
else:
# 构造 DataFrame
df = pd.DataFrame([{"cookies": cookies}])
# 存储到数据库
engine_us.to_sql(df, f"{site}_comment_cookies", if_exists="append")
print(f"入库成功 {site} {site}_comment_cookies")
print('\n')
except Exception as e:
print(f"获取 {site} 站点 cookie 报错,切换下一个站点",e)
print(f"获取 {site} 站点 cookie 报错,切换下一个站点", e)
print("报错", f"\n{traceback.format_exc()}")
time.sleep(random.uniform(2.5, 5.5))
......@@ -219,10 +343,15 @@ if __name__ == '__main__':
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
# get_cookie(site='it', zipCode='85')
get_cookie(site='es', zipCode='28001')
get_cookie(site='fr', zipCode='75019')
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
time.sleep(random.uniform(30.5, 70.5))
time.sleep(random.uniform(60.5, 180.5))
# ae
# au
# be
# br
# ca
# mx
# nl
# pl
# se
# tr
......@@ -20,6 +20,7 @@ from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
......@@ -122,72 +123,44 @@ class H10():
self.site_name = 'us'
self.engine = get_remote_engine(self.site_name, 'mysql')
def web_drver(self):
    """Build a configured Chrome WebDriver and store it on ``self.driver``.

    Fixes relative to the previous body:

    * every command-line switch now starts with two ASCII hyphens; the
      earlier spellings used an en dash, a single hyphen or a leading
      space, none of which is the name of the intended switch;
    * ``prefs`` and ``excludeSwitches`` are each set once, because a second
      ``add_experimental_option`` call with the same name replaces the
      first (the notifications preference was being discarded);
    * the repeated ``--disable-gpu`` entries are collapsed into one.
    """
    chrome_options = Options()

    # Disable hardware acceleration to avoid heavy CPU usage.
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-notifications")
    # Ignore certificate / SSL errors.
    chrome_options.add_argument("--ignore-certificate-errors")
    chrome_options.add_argument("--ignore-ssl-errors")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    # Run Chrome without its sandbox.
    chrome_options.add_argument("--no-sandbox")
    # Drop the automation switch and silence irrelevant logging.
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-automation", "enable-logging"]
    )
    # Incognito mode is intentionally left disabled.
    # chrome_options.add_argument('--incognito')
    chrome_options.add_argument("--start-maximized")
    # Enable print preview.
    chrome_options.add_argument("--enable-print-preview")
    # Add a bookmark button to the toolbar.
    chrome_options.add_argument("--bookmark-menu")
    # Enable bookmark sync.
    chrome_options.add_argument("--enable-sync")
    # NOTE(review): the three switches below were previously written with an
    # en dash and therefore never matched their intended names; they are now
    # spelled as the original comments describe. Confirm they are wanted.
    chrome_options.add_argument("--allow-running-insecure-content")  # allow insecure content
    chrome_options.add_argument("--disable-web-security")  # turn off the security policy
    chrome_options.add_argument("--disable-xss-auditor")  # disable XSS protection
    chrome_options.add_argument("--window-size=1920,1080")
    # Allow notifications and automatic multi-file downloads without a
    # prompt; both keys must live in the same ``prefs`` dict.
    chrome_options.add_experimental_option(
        "prefs",
        {
            "profile.default_content_setting_values.notifications": 1,
            "profile.default_content_setting_values.automatic_downloads": 1,
        },
    )

    # Create the Chrome instance with the options assembled above.
    self.driver = webdriver.Chrome(options=chrome_options)
def web_drver(self, is_login=True):
opt = Options()
# 稳定性 & 资源
opt.add_argument("--no-sandbox")
opt.add_argument("--disable-dev-shm-usage")
opt.add_argument("--disable-gpu")
opt.add_argument("--window-size=1920,1080")
opt.add_argument("--disable-notifications")
opt.add_argument("--disable-extensions")
opt.add_argument("--disable-background-networking")
opt.add_argument("--disable-background-timer-throttling")
opt.add_argument("--disable-renderer-backgrounding")
opt.add_argument("--disable-features=Translate,BackForwardCache")
# 设置headers
self.driver.execute_cdp_cmd("Network.setExtraHTTPHeaders",
{"headers":
{
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}
})
# ✅ 修正:必须带 --
opt.add_argument("--disable-blink-features=AutomationControlled")
opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# 防止网站检测selenium的webdriver
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => False
})
"""})
# ✅ 保持登录:固定 profile(不会清 cookie)
opt.add_argument(r"--user-data-dir=C:\selenium\chrome_profile")
opt.add_argument(r"--profile-directory=Default")
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """const toBlob=HTMLCanvasElement.prototype.toBlob;const toDataURL=HTMLCanvasElement.prototype.toDataURL;const getImageData=CanvasRenderingContext2D.prototype.getImageData;function noisify(canvas,context){if(context){const shift={'r':Math.floor(Math.random()*10)-5,'g':Math.floor(Math.random()*10)-5,'b':Math.floor(Math.random()*10)-5,'a':Math.floor(Math.random()*10)-5};const width=canvas.width;const height=canvas.height;if(width&&height){const imageData=getImageData.apply(context,[0,0,width,height]);for(let i=0;i<height;i++){for(let j=0;j<width;j++){const n=((i*(width*4))+(j*4));imageData.data[n+0]=imageData.data[n+0]+shift.r;imageData.data[n+1]=imageData.data[n+1]+shift.g;imageData.data[n+2]=imageData.data[n+2]+shift.b;imageData.data[n+3]=imageData.data[n+3]+shift.a}}window.top.postMessage("canvas-fingerprint-defender-alert",'*');context.putImageData(imageData,0,0)}}}Object.defineProperty(HTMLCanvasElement.prototype,"toBlob",{"value":function(){noisify(this,this.getContext("2d"));return toBlob.apply(this,arguments)}});Object.defineProperty(HTMLCanvasElement.prototype,"toDataURL",{"value":function(){noisify(this,this.getContext("2d"));return toDataURL.apply(this,arguments)}});Object.defineProperty(CanvasRenderingContext2D.prototype,"getImageData",{"value":function(){noisify(this.canvas,this);return getImageData.apply(this,arguments)}});document.documentElement.dataset.cbscriptallow=true;if(document.documentElement.dataset.cbscriptallow!=="true"){const iframes=[...window.top.document.querySelectorAll("iframe[sandbox]")];for(var 
i=0;i<iframes.length;i++){if(iframes[i].contentWindow){if(iframes[i].contentWindow.CanvasRenderingContext2D){iframes[i].contentWindow.CanvasRenderingContext2D.prototype.getImageData=CanvasRenderingContext2D.prototype.getImageData}if(iframes[i].contentWindow.HTMLCanvasElement){iframes[i].contentWindow.HTMLCanvasElement.prototype.toBlob=HTMLCanvasElement.prototype.toBlob;iframes[i].contentWindow.HTMLCanvasElement.prototype.toDataURL=HTMLCanvasElement.prototype.toDataURL}}}}""", })
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """var config={"random":{"value":function(){return Math.random()},"item":function(e){var rand=e.length*config.random.value();return e[Math.floor(rand)]},"number":function(power){var tmp=[];for(var i=0;i<power.length;i++){tmp.push(Math.pow(2,power[i]))}return config.random.item(tmp)},"int":function(power){var tmp=[];for(var i=0;i<power.length;i++){var n=Math.pow(2,power[i]);tmp.push(new Int32Array([n,n]))}return config.random.item(tmp)},"float":function(power){var tmp=[];for(var i=0;i<power.length;i++){var n=Math.pow(2,power[i]);tmp.push(new Float32Array([1,n]))}return config.random.item(tmp)}},"spoof":{"webgl":{"buffer":function(target){var proto=target.prototype?target.prototype:target.__proto__;const bufferData=proto.bufferData;Object.defineProperty(proto,"bufferData",{"value":function(){var index=Math.floor(config.random.value()*arguments[1].length);var noise=arguments[1][index]!==undefined?0.1*config.random.value()*arguments[1][index]:0;arguments[1][index]=arguments[1][index]+noise;window.top.postMessage("webgl-fingerprint-defender-alert",'*');return bufferData.apply(this,arguments)}})},"parameter":function(target){var proto=target.prototype?target.prototype:target.__proto__;const getParameter=proto.getParameter;Object.defineProperty(proto,"getParameter",{"value":function(){window.top.postMessage("webgl-fingerprint-defender-alert",'*');if(arguments[0]===3415)return 0;else if(arguments[0]===3414)return 24;else if(arguments[0]===36348)return 30;else if(arguments[0]===7936)return"WebKit";else if(arguments[0]===37445)return"Google Inc.";else if(arguments[0]===7937)return"WebKit WebGL";else if(arguments[0]===3379)return config.random.number([14,15]);else if(arguments[0]===36347)return config.random.number([12,13]);else if(arguments[0]===34076)return config.random.number([14,15]);else if(arguments[0]===34024)return config.random.number([14,15]);else if(arguments[0]===3386)return config.random.int([13,14,15]);else if(arguments[0]===3413)return 
config.random.number([1,2,3,4]);else if(arguments[0]===3412)return config.random.number([1,2,3,4]);else if(arguments[0]===3411)return config.random.number([1,2,3,4]);else if(arguments[0]===3410)return config.random.number([1,2,3,4]);else if(arguments[0]===34047)return config.random.number([1,2,3,4]);else if(arguments[0]===34930)return config.random.number([1,2,3,4]);else if(arguments[0]===34921)return config.random.number([1,2,3,4]);else if(arguments[0]===35660)return config.random.number([1,2,3,4]);else if(arguments[0]===35661)return config.random.number([4,5,6,7,8]);else if(arguments[0]===36349)return config.random.number([10,11,12,13]);else if(arguments[0]===33902)return config.random.float([0,10,11,12,13]);else if(arguments[0]===33901)return config.random.float([0,10,11,12,13]);else if(arguments[0]===37446)return config.random.item(["Graphics","HD Graphics","Intel(R) HD Graphics"]);else if(arguments[0]===7938)return config.random.item(["WebGL 1.0","WebGL 1.0 (OpenGL)","WebGL 1.0 (OpenGL Chromium)"]);else if(arguments[0]===35724)return config.random.item(["WebGL","WebGL GLSL","WebGL GLSL ES","WebGL GLSL ES (OpenGL Chromium"]);return getParameter.apply(this,arguments)}})}}}};config.spoof.webgl.buffer(WebGLRenderingContext);config.spoof.webgl.buffer(WebGL2RenderingContext);config.spoof.webgl.parameter(WebGLRenderingContext);config.spoof.webgl.parameter(WebGL2RenderingContext);document.documentElement.dataset.wgscriptallow=true;if(document.documentElement.dataset.wgscriptallow!=="true"){const iframes=[...window.top.document.querySelectorAll("iframe[sandbox]")];for(var 
i=0;i<iframes.length;i++){if(iframes[i].contentWindow){if(iframes[i].contentWindow.WebGLRenderingContext){iframes[i].contentWindow.WebGLRenderingContext.prototype.bufferData=WebGLRenderingContext.prototype.bufferData;iframes[i].contentWindow.WebGLRenderingContext.prototype.getParameter=WebGLRenderingContext.prototype.getParameter}if(iframes[i].contentWindow.WebGL2RenderingContext){iframes[i].contentWindow.WebGL2RenderingContext.prototype.bufferData=WebGL2RenderingContext.prototype.bufferData;iframes[i].contentWindow.WebGL2RenderingContext.prototype.getParameter=WebGL2RenderingContext.prototype.getParameter}}}}"""})
self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """var rand={"noise":function(){var SIGN=Math.random()<Math.random()?-1:1;return Math.floor(Math.random()+SIGN*Math.random())},"sign":function(){const tmp=[-1,-1,-1,-1,-1,-1,+1,-1,-1,-1];const index=Math.floor(Math.random()*tmp.length);return tmp[index]}};Object.defineProperty(HTMLElement.prototype,"offsetHeight",{get(){const height=Math.floor(this.getBoundingClientRect().height);const valid=height&&rand.sign()===1;const result=valid?height+rand.noise():height;return result}});Object.defineProperty(HTMLElement.prototype,"offsetWidth",{get(){const width=Math.floor(this.getBoundingClientRect().width);const valid=width&&rand.sign()===1;const result=valid?width+rand.noise():width;return result}});document.documentElement.dataset.fbscriptallow=true;if(document.documentElement.dataset.fbscriptallow!=="true"){const iframes=[...window.top.document.querySelectorAll("iframe[sandbox]")];for(var i=0;i<iframes.length;i++){if(iframes[i].contentWindow){if(iframes[i].contentWindow.HTMLElement){iframes[i].contentWindow.HTMLElement.prototype.offsetWidth=HTMLElement.prototype.offsetWidth;iframes[i].contentWindow.HTMLElement.prototype.offsetHeight=HTMLElement.prototype.offsetHeight}}}}"""})
# ✅ 减负:禁用图片/字体(可选,通常不影响登录)
prefs = {
"profile.managed_default_content_settings.images": 1,
"profile.managed_default_content_settings.fonts": 1,
"profile.default_content_setting_values.notifications": 1,
}
opt.add_experimental_option("prefs", prefs)
# ✅ 更快:不要等所有资源加载完(可选)
opt.page_load_strategy = "eager"
try:
self.driver = webdriver.Chrome(options=opt)
except:
service = Service(r"D:\EXE\webdrvier版本\120\chromedriver.exe")
self.driver = webdriver.Chrome(service=service, options=opt)
self.driver.maximize_window()
if is_login:
self.longin()
def activate_recaptcha(self, api):
......@@ -437,27 +410,60 @@ class H10():
except TimeoutException:
print("wait_page timeout, used:", time.time() - start)
return False
def click_button(self):
    """Expand the Cerebro filter panel, then scrape the Amazon's Choice row.

    The "show more" control is clicked through JavaScript first; if that
    fails, the button is located by its ``data-testid`` and clicked through
    Selenium. After a short pause the page is scraped via ``click_Choice``.

    Returns:
        list: the ``class`` attribute values found by ``click_Choice``;
        an empty list when nothing matched.

    Raises:
        Whatever the fallback ``find_element(...).click()`` raises when the
        button cannot be found either.
    """
    try:
        print('点击显示下拉框')
        button_js = 'document.querySelector("#CerebroFilter > div > div.sc-bZEumQ.ilswiy > div.sc-DnZRP.etdxo > div > button").click()'
        self.driver.execute_script(button_js)
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit must propagate instead of causing a second click.
        # NOTE(review): the CSS selector above relies on what look like
        # generated class names, so this fallback is presumably the stable path.
        self.driver.find_element(By.XPATH, '//button[@data-testid="showMoreButton"]').click()
    # Give the expanded panel time to render before reading the page.
    time.sleep(2)
    # click_Choice performs the same page scrape (including its pauses and
    # log lines) but returns None when nothing matched; keep this method's
    # list return type by normalising that to an empty list.
    return self.click_Choice() or []
def click_Choice(self):
    """Scrape the current page for the Amazon's Choice filter row.

    Parses the driver's current page source and looks up the ``class``
    attribute of the div that follows the parent of the "Amazon Choice" /
    "Amazon's Choice" label.

    Returns:
        The non-empty XPath result, or ``None`` when nothing matched.
    """
    tree = etree.HTML(self.driver.page_source)
    print('Amazons Choice获取元素')
    time.sleep(2)
    choice_xpath = '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class'''
    matches = tree.xpath(choice_xpath)
    print('点击选择亚马逊精选 勾选')
    time.sleep(2)
    # An empty result is reported as None so callers can test `is None`.
    return matches or None
def webdrvier_html(self, asin, asinstype):
refresh_num = 0
# 点击选择站点
for i in range(6):
try:
_url = self.driver.current_url
self.id_url = f'https://members.helium10.com/cerebro?accountId={self.account_id}'
self.driver.get(self.id_url)
if "concurrent-sessions" in _url or 'signin' in _url:
self.longin()
if asin not in self.err_asin_list and self.useremail_state:
print('cerebro界面', self.site_name_url)
self.driver.get(f'https://members.helium10.com/cerebro?accountId={self.account_id}')
if not self.wait_page(timeout=50):
self.driver.refresh()
if not self.wait_page(timeout=35):
print('页面未加载出来')
continue
time.sleep(2)
sleep(randint(10, 15))
if 'You are viewing a demo of Cerebro' in self.driver.page_source:
print(self.email_name, '账号过期')
self.driver.refresh()
continue
# self.useremail_state = False
# self.send_ms('You are viewing a demo of Cerebro')
self.verify()
if self.site_name_url == 'Amazon.co.uk':
self.site_name_csv = 'GB'
......@@ -494,25 +500,38 @@ class H10():
try:
self.driver.execute_script(
f"""document.querySelector("img[loading='lazy']").click()""")
time.sleep(1)
time.sleep(1.5)
except:
self.driver.execute_script(
f"""document.querySelector("img[alt='{alt}']").click()""")
time.sleep(1)
time.sleep(1.5)
self.verify()
# 切换站点
self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""")
time.sleep(1.5)
time.sleep(2)
# 输入asin
print('输入asin', asin)
wait = WebDriverWait(self.driver, 5)
try:
send_asins_xpath = '//*[@id="re-container"]//div[@id="findKeywordSearch"]//input'
wait.until(EC.element_to_be_clickable((By.XPATH, send_asins_xpath)))
except TimeoutException:
try:
send_asins_xpath = '//*[@id="re-container"]//input[contains(@class,"sc-blmEgr sc-cxgeGX")]'
wait.until(EC.element_to_be_clickable((By.XPATH, send_asins_xpath)))
except:
send_asins_xpath = '//*[@id="re-container"]//div[@data-value="0"]//input'
if ',' in asin:
_asin_lsit = asin.split(',')
for _asin in _asin_lsit:
self.driver.find_element(By.XPATH, '//*[@id="re-container"]//input').send_keys(f'{_asin},')
self.driver.find_element(By.XPATH,send_asins_xpath).send_keys(f'{_asin},')
time.sleep(1.5)
else:
self.driver.find_element(By.XPATH, '//*[@id="re-container"]//input').send_keys(f'{asin},')
time.sleep(1)
self.driver.find_element(By.XPATH,send_asins_xpath).send_keys(f'{asin},')
time.sleep(2)
if 'detected. Please check the ASINs and try again' in self.driver.page_source:
self.err_asin_list.append(asin)
break
# 勾选排除变体
self.driver.execute_script("""document.querySelector("input[name='excludeVariations']").click()""")
# 点击 get keyword
......@@ -532,13 +551,17 @@ class H10():
print('中间框下载词 没有报告')
self.err_asins_adv_list.append(asin)
break
elif '拒绝访问' in html:
print('拒绝访问 没有次数')
self.err_asins_adv_list.append(asin)
break
elif 'errorCodes.undefined' in html:
continue
html = self.driver.page_source
self.verify()
time.sleep(2)
time.sleep(3.5)
try:
if 'searched this product before' in html or '先前已搜索过此产品' in html:
html = self.driver.page_source
if 'searched this product before' in html or '先前已搜索过此产品' in html or '运行新搜索' in html or '从历史数据加载' in html:
print('33333333333444444')
self.driver.execute_script(
"""document.querySelector("button[data-testid='runnewsearch']").click()""")
......@@ -562,9 +585,16 @@ class H10():
print('中间框下载词 没有报告')
self.err_asins_adv_list.append(asin)
break
elif '拒绝访问' in html:
print('拒绝访问 没有次数')
self.err_asins_adv_list.append(asin)
break
elif 'errorCodes.undefined' in html:
continue
sleep(randint(15, 30))
if asinstype:
sleep(randint(20, 38))
else:
sleep(randint(8, 15))
self.verify()
time.sleep(2)
if 'Wrong entered data or no results' in html:
......@@ -583,20 +613,11 @@ class H10():
break
if asinstype:
try:
print('点击显示下拉框')
button_js = 'document.querySelector("#CerebroFilter > div > div.sc-dzXNMW.dufncf > div.sc-hFCjLd.igMWUF > div > button").click()'
self.driver.execute_script(button_js)
time.sleep(2)
html = self.driver.page_source
resp = etree.HTML(html)
print('Amazons Choice获取元素')
time.sleep(2)
div_class = resp.xpath(
'''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
div_class = self.click_Choice()
if div_class is None:
div_class = self.click_button()
except:
print('报错22222222222222')
print('点击选择亚马逊精选 勾选')
time.sleep(2)
div_class = self.click_button()
try:
script = f"""
const elements = document.querySelectorAll("div[class='{div_class[0]}']>div");
......@@ -604,14 +625,17 @@ class H10():
secondElement.click();
"""
except:
print('报错:scrip t script 2323232323232323')
if i == 2:
self.err_asins_adv_list.append(asin)
continue
self.driver.execute_script(script)
time.sleep(1)
html1 = self.driver.page_source
resp1 = etree.HTML(html1)
span_class = resp1.xpath(
'//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class|//span[contains(text(),"已分析的产品")]/parent::div/following-sibling::div/@class')[0]
'//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class|//span[contains(text(),"已分析的产品")]/parent::div/following-sibling::div/@class')[
0]
# 选择亚马逊精选参数1
self.driver.execute_script(
f"""document.querySelector("div[class='{span_class}']").click()""")
......@@ -632,15 +656,20 @@ class H10():
time.sleep(1)
print('点击选择csv')
self.driver.execute_script("""document.querySelector("div[data-testid='csv']").click()""")
time.sleep(15)
time.sleep(8)
break
except Exception as e:
print('详细报错')
print(traceback.format_exc(), e)
self.driver.refresh()
time.sleep(1)
if i == 2:
print('详细报错',e)
if i == 5:
self.err_asin_list.append(asin)
refresh_num += 1
if refresh_num > 4:
print('超过4次。清除缓存')
self.enable_no_cache()
self.clear_http_cache()
self.clear_cache_but_keep_cookies('https://members.helium10.com/')
refresh_num = 0
time.sleep(2)
continue
def nex_page(self, asin_list, asinstype=None):
......@@ -761,7 +790,7 @@ class H10():
with open(file_path, 'r', encoding='utf-8') as f:
f.read()
f.close()
print('找到文件:路径有效:',file_path)
print('找到文件:路径有效:', file_path)
return True
except:
print('文件路径不存在')
......@@ -795,7 +824,6 @@ class H10():
if state == False:
print('重新下载文件222:', asin, path)
self.webdrvier_html(asin, None)
self.if_csv_path(file_path)
header_config = {
"chinese": {
"columns": ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
......@@ -968,7 +996,7 @@ class H10():
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
print('file_pathsave_competition1111111', file_path)
state = self.if_csv_path(file_path)
if state==False:
if state == False:
current_date = datetime.date.today()
# 计算前一天日期
previous_date = current_date - datetime.timedelta(days=1)
......@@ -983,10 +1011,17 @@ class H10():
print('file_pathsave_competition3333', file_path)
state = self.if_csv_path(file_path)
if state == False:
print('重新下载文件3333333333 :', asin_list, path)
# self.webdrvier_html(','.join(asin_list), 1)
if self.is_nex_pag:
self.nex_page(self.asin_list, asinstype=1)
self.is_nex_pag = False
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
# 创建一个字典来映射原始列名和新的列名
columns = pd.read_csv(file_path, nrows=0).columns.tolist()
def contains_chinese(text):
return bool(re.search(r'[\u4e00-\u9fff]', text))
......@@ -1075,6 +1110,25 @@ class H10():
print(data)
requests.post(url=url, data=data, timeout=15)
def enable_no_cache(self):
    """Enable the DevTools Network domain and send ``setCacheDisabled``.

    Issues two CDP commands through the driver, in order:
    ``Network.enable`` and ``Network.setCacheDisabled`` with
    ``cacheDisabled=True``.
    """
    commands = (
        ("Network.enable", {}),
        ("Network.setCacheDisabled", {"cacheDisabled": True}),
    )
    for method, params in commands:
        self.driver.execute_cdp_cmd(method, params)
def clear_http_cache(self):
    """Enable the DevTools Network domain and clear the browser cache.

    Issues ``Network.enable`` followed by ``Network.clearBrowserCache``
    through the driver's CDP channel.
    """
    send_cdp = self.driver.execute_cdp_cmd
    for method in ("Network.enable", "Network.clearBrowserCache"):
        send_cdp(method, {})
def clear_cache_but_keep_cookies(self, origin: str):
    """Clear cached data for ``origin`` without requesting cookie removal.

    Args:
        origin: origin string passed unchanged to
            ``Storage.clearDataForOrigin``.

    The storage types requested are ``appcache``, ``cache_storage`` and
    ``service_workers``; cookies are not in that list.
    """
    send_cdp = self.driver.execute_cdp_cmd

    # 1) Clear the HTTP cache.
    send_cdp("Network.enable", {})
    send_cdp("Network.clearBrowserCache", {})

    # 2) Clear the deeper caches: cache storage / service workers / appcache.
    storage_request = {
        "origin": origin,
        "storageTypes": "appcache,cache_storage,service_workers"
    }
    send_cdp("Storage.clearDataForOrigin", storage_request)
def run(self):
user_pw_list = self.get_ip_address()
if user_pw_list:
......@@ -1091,33 +1145,20 @@ class H10():
else:
path = r'C:\Users\ASUS\Downloads'
print('当前路径:', path)
self.email_name = 'yswg006@hotmail.com'
self.pw = 'Chianbugye@8346148' # 'yashengweige678@outlook.com', '987654321yswg@'
self.email_name = 'yswg304@outlook.com'
# 'yswg304@outlook.com', 'Chinabuye@467138'
self.pw = 'Chinabuye@467138' # 'yashengweige678@outlook.com', '987654321yswg@'
self.web_drver()
loop = 0
while True:
self.data = {}
self.sku_list = []
self.err_asins_adv_list = []
try:
self.driver.refresh()
except:
continue
time.sleep(4)
self.driver.execute_script("localStorage.clear();") # 清除本地存储
time.sleep(0.5)
self.driver.execute_script("sessionStorage.clear();") # 清除会话存储
time.sleep(0.5)
self.driver.execute_script(
"caches.keys().then(function(names) { for (let name of names) { caches.delete(name); } });")
self.driver.execute_script("window.performance.clearResourceTimings();")
self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})
# 2) (可选)清性能 timings 不影响登录
self.driver.execute_script("window.performance.clearResourceTimings();")
time.sleep(5)
login_url = self.driver.current_url
if "concurrent-sessions" in login_url or 'signin' in login_url:
self.longin()
for site in ['us', 'uk', 'de', 'fr', 'es', 'it', 'mx']:
self.is_nex_pag = True
print(site)
if site == 'uk':
self.site_url = 'Amazon.co.uk'
......@@ -1139,6 +1180,7 @@ class H10():
self.mysql_connect(site)
# 获取未抓取的sku
self.read_db_sku()
for sku_token in self.sku_data_list:
sku_token_list = sku_token.split('|-|')
sku = sku_token_list[0]
......@@ -1176,6 +1218,13 @@ class H10():
self.mysql_connect(site)
time.sleep(randint(20, 50))
loop += 1
# ✅ 每 30 次再清一次缓存(按你任务调整)
if loop % 30 == 0:
print('清除缓存')
self.enable_no_cache()
self.clear_http_cache()
self.clear_cache_but_keep_cookies('https://members.helium10.com/')
for i in range(10):
print(f"当前时间 {datetime.datetime.now().hour} 点,超出运行时段退出循环。")
hour = datetime.datetime.now().hour
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment