spider · branch selection-new

Commit 1612910c, authored Feb 03, 2026 by Peng
Commit message: no message
Parent: 179f825d

Showing 3 changed files, with 476 additions and 298 deletions:
  py_spider/amazon_every_day_spider/Get_Cookies.py    +6    -6
  py_spider/amazon_every_day_spider/get_cookies2.py   +300  -171
  py_spider/amazon_spider/H10_spider.py               +170  -121

py_spider/amazon_every_day_spider/Get_Cookies.py  (view file @ 1612910c)
...
@@ -151,10 +151,10 @@ def get_cookie(site='us', zipCode='10010'):

if __name__ == '__main__':
    while True:
        get_cookie(site='us', zipCode='10010')
        # get_cookie(site='de', zipCode='10115')
        # get_cookie(site='uk', zipCode='W1S 3PR')
        # get_cookie(site='it', zipCode='00185')
        # get_cookie(site='es', zipCode='28001')
        # get_cookie(site='fr', zipCode='75019')
        # get_cookie(site='ca', zipCode='M5B 2H1')
        time.sleep(random.uniform(60.5, 180.5))
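With every marketplace except 'us' now commented out, re-enabling a site means editing the loop body by hand. A hypothetical table-driven variant of the same loop (SITES is an invented name; get_cookie, random, and time are the ones already used in this file):

SITES = [
    ('us', '10010'),
    # ('de', '10115'), ('uk', 'W1S 3PR'), ('it', '00185'),
    # ('es', '28001'), ('fr', '75019'), ('ca', 'M5B 2H1'),
]

if __name__ == '__main__':
    while True:
        for site, zip_code in SITES:
            get_cookie(site=site, zipCode=zip_code)  # one harvest per enabled site
        time.sleep(random.uniform(60.5, 180.5))      # same jittered pause as above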
py_spider/amazon_every_day_spider/get_cookies2.py  (view file @ 1612910c)
# import json
# import re
#
# from curl_cffi import requests
# from lxml import etree
#
# requ_see = requests.Session()
# headers = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate, br, zstd',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'no-cache',
# 'Pragma': 'no-cache',
# 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
# 'Sec-Ch-Ua-Mobile': '?0',
# 'Sec-Ch-Ua-Platform': ' "Windows"',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': ' none',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
# }
# asin_resp = requ_see.get('https://www.amazon.co.uk/dp/B0714LLB2T?th=1', headers=headers)
# print("Step 1: requested the homepage")
# html_xpath = etree.HTML(asin_resp.text)
# ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
# print("First request sent; postcode on the page:", ingress)
# # url2 = 'https://www.amazon.co.uk/nav/ajax/hMenuDesktopFirstLayer?ajaxTemplate=hMenuDesktopFirstLayer&pageType=Detail&hmDataAjaxHint=1&isFreshRegion=false&isFreshCustomer=false&isPrimeMember=false&isPrimeDay=false&isBackup=false&firstName=false&navDeviceType=desktop&hashCustomerAndSessionId=8b35c8413eaf45f3509509691ec91ce8cc82c3f3&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6407668806-AL2_aarch64&languageCode=en_GB&customerCountryCode=US'
# # requ_see.get(url2, headers=headers)
#
# data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
# data_modal = json.loads(data_a_modal[0])
# print(data_modal)
# headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
# clkci_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
# headers['Referer'] = 'https://www.amazon.co.uk/dp/B0714LLB2T?th=1'
# print(headers, 23333333)
# clkci_resp = requ_see.get(clkci_url, headers=headers)
# print(clkci_resp.text)
# CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
# print("CSRF_TOKEN:", CSRF_TOKEN)
# address_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/address-change?actionSource=glow'
#
# headers_post = {
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'Accept-Encoding': 'gzip', }
# headers_post['Origin'] = 'https://www.amazon.co.uk'
# headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
# headers_post['Sec-Fetch-Site'] = 'same-origin'
# headers_post['Sec-Fetch-Mode'] = 'cors'
# headers_post['Sec-Fetch-Dest'] = 'empty'
# headers_post['Accept'] = '*/*'
# headers_post['Content-Type'] = 'application/json'
# headers_post['X-Requested-With'] = 'XMLHttpRequest'
#
# address_json = {"locationType": "LOCATION_INPUT", "zipCode": "W1S 3PR", "deviceType": "web", "storeContext": "grocery",
# "pageType": "Detail", "actionSource": "glow"}
# address_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False, impersonate="chrome")
#
# submit_headers = {
# 'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# 'Accept-Encoding': 'gzip',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# }
# submit_headers['Accept'] = '*/*'
# submit_headers['Referer'] = 'https://www.amazon.co.uk/dp/B0714LLB2T?th=1'
# submit_headers['X-Requested-With'] = 'XMLHttpRequest'
# submit_headers['Sec-Fetch-Site'] = 'same-origin'
# submit_headers['Sec-Fetch-Mode'] = 'cors'
# submit_headers['Sec-Fetch-Dest'] = 'empty'
# detail_url = 'https://www.amazon.co.uk/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
# submit_resp = requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
# print(submit_resp.text)
# print(submit_resp.cookies.get_dict(), '322222222')
# print(requ_see.cookies.get_dict(), '433333333')
# cookie_dict = requ_see.cookies.get_dict()
# index_resp = requests.get('https://www.amazon.co.uk', headers=headers, cookies=cookie_dict, verify=False,
# impersonate="chrome")
# index_xpath = etree.HTML(index_resp.text)
# ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
# print("Latest postcode:", ingress)
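# The commented-out prototype above walks Amazon's "glow" location modal for the
# UK site: request a product page, read anti-csrftoken-a2z from the
# nav-global-location-data-modal-action span, trade it for a CSRF_TOKEN via
# get-rendered-address-selections, POST the new postcode to address-change, and
# confirm with get-location-label. get_cookie() below generalizes the same
# handshake to every marketplace.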
'Store to pg'
'Fetch cookies for the smaller-language sites'
import sys
import os
import pandas as pd

sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from curl_cffi import requests
import json
from lxml import etree
import re
import random
import time
import traceback
from amazon_every_day_spider.secure_db_client import get_remote_engine

item = {}
print(sys.executable)

"""
Packaging command: cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
"""
headers_num_int = 0


def get_cookie(site='us', zipCode='10010'):
    try:
        params_site = {
            "us": "pet-supplies",
            "de": "toys",
            "uk": "grocery",
            "it": "luggage",
            "es": "apparel",
            "fr": "kitchen",
            "ca": "beauty",
            "mx": "beauty",
            "au": "fashion",
            "ae": "generic",
            "br": "generic",
            "nl": "home-improvement",
            "pl": "beauty",
            "se": "beauty",
            "tr": "home-improvement",  # grocery
        }
        n = random.randint(110, 120)
        ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
        print(ua)
        if site == "us":
            index_url = 'https://www.amazon.com'
            url_asin = 'https://www.amazon.com/dp/B0009X29WK'
            host = 'www.amazon.com'
        elif site == 'uk':
            index_url = 'https://www.amazon.co.uk'  # site url
            url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T'
            host = 'www.amazon.co.uk'
        elif site == 'de':
            index_url = 'https://www.amazon.de'
            url_asin = 'https://www.amazon.de/dp/B00006YYXM'
            host = 'www.amazon.de'
        elif site == 'fr':
            index_url = 'https://www.amazon.fr'
            url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
            host = 'www.amazon.fr'
        elif site == 'es':
            index_url = 'https://www.amazon.es'
            url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
            host = 'www.amazon.es'
        elif site == 'it':
            index_url = 'https://www.amazon.it'
            url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
            host = 'www.amazon.it'
        elif site == 'ca':
            index_url = 'https://www.amazon.ca'
            url_asin = 'https://www.amazon.ca/dp/B08H3JPH74'
            host = 'www.amazon.ca'
        elif site == 'mx':
            index_url = "https://www.amazon.com.mx"
            url_asin = 'https://www.amazon.com.mx/dp/B08H3JPH74'
            host = 'www.amazon.com.mx'
        elif site == 'ae':
            index_url = "https://www.amazon.ae"
            url_asin = 'https://www.amazon.ae/dp/B08H3JPH74'
            host = 'www.amazon.ae'
        elif site == 'au':
            index_url = "https://www.amazon.com.au"
            url_asin = 'https://www.amazon.com.au/dp/B0D1YFSYGQ'
            host = 'www.amazon.com.au'
        elif site == 'tr':
            index_url = "https://www.amazon.com.tr"
            url_asin = 'https://www.amazon.com.tr/dp/B08SPXK5WC'
            host = 'www.amazon.com.tr'
        elif site == 'be':
            index_url = "https://www.amazon.com.be"
            url_asin = 'https://www.amazon.com.be/dp/B01B7O6JH0'
            host = 'www.amazon.com.be'
        elif site == 'jp':
            index_url = "https://www.amazon.co.jp"
            url_asin = 'https://www.amazon.co.jp/dp/B08H3JPH74'
            host = 'www.amazon.co.jp'
        elif site == 'nl':
            index_url = "https://www.amazon.nl"
            url_asin = 'https://www.amazon.nl/dp/B01COWDLGG'
            host = 'www.amazon.nl'
        elif site == 'pl':
            index_url = "https://www.amazon.pl"
            url_asin = 'https://www.amazon.pl/dp/B08H3JPH74'
            host = 'www.amazon.pl'
        elif site == 'se':
            index_url = "https://www.amazon.se"
            url_asin = 'https://www.amazon.se/dp/B08H3JPH74'
            host = 'www.amazon.se'
        elif site == 'br':
            index_url = "https://www.amazon.com.br"
            url_asin = 'https://www.amazon.com.br/dp/B08SPXK5WC'
            host = 'www.amazon.com.br'
        if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
            engine_us = get_remote_engine('us', 'mysql')
        else:
            engine_us = get_remote_engine(site, 'mysql')
        requ_see = requests.Session()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Device-Memory": "8",
            "Downlink": "1.25",
            "Dpr": "0.75",
            "Ect": "3g",
            "Pragma": "no-cache",
            "Rtt": "300",
            "Sec-Ch-Device-Memory": "8",
            "Sec-Ch-Dpr": "0.75",
            "Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{n}", "Google Chrome";v="{n}"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Ch-Ua-Platform-Version": '"10.0.0"',
            "Sec-Ch-Viewport-Width": "2560",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ua,
            "Viewport-Width": "2560",
        }
        if site in ['au', 'pl']:
            del headers['Accept-Encoding']
            headers['Sec-Ch-Ua'] = '"Google Chrome";v="143", "Chromium";v="143", "Not A(Brand";v="24"'
            headers['priority'] = "u=0, i"
            headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
        asin_resp = requ_see.get(url_asin, headers=headers, impersonate="chrome")
        k = ""
        print("Step 1: request the asin page:", url_asin)
        for i in (0, random.randint(0, 5)):  # two iterations: i = 0 and one random i
            k += random.choice(alphabet)
            # headers[k] = str(uuid.uuid4())
        html_xpath = etree.HTML(asin_resp.text)
        ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("First request sent; postcode on the page:", ingress)
        data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
        data_modal = json.loads(data_a_modal[0])
        print('Got the anti-csrftoken-a2z parameter:', data_modal)
        # csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
        headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']
        # /portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal
        clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections'
        # clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
        headers['Referer'] = url_asin
        print('Step 2: click the location widget')
        params = {
            "deviceType": "desktop",
            "pageType": "Detail",
            "storeContext": params_site.get(site),
            "actionSource": "desktop-modal",
        }
        clkci_resp = requ_see.get(clkci_url, headers=headers, params=params, impersonate="chrome")
        CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
        print("CSRF_TOKEN:", CSRF_TOKEN)
        address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
        # if site in ['tr', 'be', 'nl', 'pl', 'se']:
        #     url = f"https://{host}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
        headers_post = {
            'Host': host,
            'anti-csrftoken-a2z': CSRF_TOKEN,
            'accept': 'text/html,*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/json',
            'origin': index_url,
            'referer': f'https://{host}/dp/B0009X29WK?th=1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            # 'x-amzn-flow-closure-id': '1768269613',
            'TE': 'trailers',
            'x-requested-with': 'XMLHttpRequest',
        }
        if site in ['uk', 'it', 'es', 'fr']:
            del headers_post['referer']
        address_json = {"locationType": "LOCATION_INPUT", "zipCode": f"{zipCode}", "deviceType": "web",
                        "storeContext": params_site.get(site), "pageType": "Detail", "actionSource": "glow"}
        if site == 'au':
            address_json['locationType'] = 'POSTAL_CODE_WITH_CITY'
            address_json['city'] = 'WARWICK FARM'
        elif site == 'ae':
            address_json['locationType'] = 'CITY'
            address_json['city'] = 'Abu Dhabi'
            address_json['pageType'] = 'Gateway'
        elif site == 'nl':
            del address_json['zipCode']
            address_json['locationType'] = 'COUNTRY'
            address_json['district'] = 'NL'
            address_json['countryCode'] = 'NL'
        print('Step 3: submit the postcode')
        print(address_url)
        post_resp = requ_see.post(address_url, headers=headers_post, json=address_json, verify=False,
                                  impersonate="chrome")
        print(post_resp.text)
        print(post_resp)
        submit_headers = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        submit_headers['Accept'] = '*/*'
        submit_headers['Referer'] = url_asin
        submit_headers['X-Requested-With'] = 'XMLHttpRequest'
        submit_headers['Sec-Fetch-Site'] = 'same-origin'
        submit_headers['Sec-Fetch-Mode'] = 'cors'
        submit_headers['Sec-Fetch-Dest'] = 'empty'
        print('Step 4: confirm the address')
        detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=pet-supplies&pageType=Detail&actionSource=desktop-modal'
        # detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
        requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
        print(requ_see.cookies.get_dict())
        print('Step 4: clicked done')
        cookie_dict = requ_see.cookies.get_dict()
        index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict, verify=False,
                                  impersonate="chrome")
        print("Step 5: request the homepage and check whether the postcode changed")
        index_xpath = etree.HTML(index_resp.text)
        ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("Latest postcode:", ingress)
        if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip() or 'M5B 2H' in ingress[0].strip():
            print(f"*************** Got the {site} site cookie for postcode {zipCode} ********************")
            cookies = json.dumps(cookie_dict, ensure_ascii=False)
            item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
            print(item)
            if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
                # build the DataFrame
                df = pd.DataFrame([{"cookies": cookies, 'site': site}])
                # store to the database
                engine_us.to_sql(df, 'other_site_cookies', if_exists="append")
                print(f"Saved {site} into other_site_cookies")
            else:
                # build the DataFrame
                df = pd.DataFrame([{"cookies": cookies}])
                # store to the database
                engine_us.to_sql(df, f"{site}_comment_cookies", if_exists="append")
                print(f"Saved {site} into {site}_comment_cookies")
        print('\n')
    except Exception as e:
        print(f"Error fetching the {site} site cookie; switching to the next site", e)
        print("Error", f"\n{traceback.format_exc()}")
    time.sleep(random.uniform(2.5, 5.5))
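Downstream spiders read these rows back out of MySQL. A minimal sketch of the reading side, under stated assumptions: the table and column names follow the to_sql calls above, get_remote_engine returns a connection object pandas accepts, and the auto-increment id column is a guess about the schema.

import json
import pandas as pd

def load_latest_cookie(engine, site):
    # table routing mirrors get_cookie(): the big six sites get their own
    # table, everything else lands in other_site_cookies
    if site in ['us', 'uk', 'fr', 'es', 'it', 'de']:
        table = f"{site}_comment_cookies"
    else:
        table = 'other_site_cookies'
    df = pd.read_sql(f"SELECT cookies FROM {table} ORDER BY id DESC LIMIT 1", engine)
    return json.loads(df['cookies'].iloc[0])  # back to the dict json.dumps produced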
...
@@ -219,10 +343,15 @@ if __name__ == '__main__':

    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    # get_cookie(site='it', zipCode='85')
    get_cookie(site='es', zipCode='28001')
    get_cookie(site='fr', zipCode='75019')
    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    # ae
    time.sleep(random.uniform(30.5, 70.5))
    # au
    # be
    # br
    # ca
    # mx
    # nl
    # pl
    # se
    # tr
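Every request in this file goes through curl_cffi rather than plain requests, because Amazon fingerprints TLS. The detail that matters is impersonate="chrome", which makes curl_cffi present one of its bundled Chrome TLS fingerprints; a two-line demonstration:

from curl_cffi import requests

# impersonate="chrome" selects the newest Chrome profile curl_cffi ships;
# a plain HTTP client here is far more likely to hit a bot wall
resp = requests.get("https://www.amazon.com", impersonate="chrome")
print(resp.status_code)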
py_spider/amazon_spider/H10_spider.py  (view file @ 1612910c)
...
@@ -20,6 +20,7 @@ from selenium import webdriver

from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
...
@@ -122,73 +123,45 @@ class H10():

        self.site_name = 'us'
        self.engine = get_remote_engine(self.site_name, 'mysql')

    def web_drver(self, is_login=True):
        opt = Options()
        # stability & resources
        opt.add_argument("--no-sandbox")
        opt.add_argument("--disable-dev-shm-usage")
        opt.add_argument("--disable-gpu")
        opt.add_argument("--window-size=1920,1080")
        opt.add_argument("--disable-notifications")
        opt.add_argument("--disable-extensions")
        opt.add_argument("--disable-background-networking")
        opt.add_argument("--disable-background-timer-throttling")
        opt.add_argument("--disable-renderer-backgrounding")
        opt.add_argument("--disable-features=Translate,BackForwardCache")
        # ✅ fix: the flag only works with the leading --
        opt.add_argument("--disable-blink-features=AutomationControlled")
        opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
        # ✅ stay logged in: pinned profile (cookies are not cleared)
        opt.add_argument(r"--user-data-dir=C:\selenium\chrome_profile")
        opt.add_argument(r"--profile-directory=Default")
        # ✅ lighter: disable images/fonts (optional, usually does not affect login)
        prefs = {
            "profile.managed_default_content_settings.images": 1,  # NB: in Chrome prefs 1 allows, 2 blocks
            "profile.managed_default_content_settings.fonts": 1,
            "profile.default_content_setting_values.notifications": 1,
        }
        opt.add_experimental_option("prefs", prefs)
        # ✅ faster: do not wait for every resource to finish loading (optional)
        opt.page_load_strategy = "eager"
        try:
            self.driver = webdriver.Chrome(options=opt)
        except:
            service = Service(r"D:\EXE\webdrvier版本\120\chromedriver.exe")
            self.driver = webdriver.Chrome(service=service, options=opt)
        self.driver.maximize_window()
        if is_login:
            self.longin()

    def activate_recaptcha(self, api):
        """
...
@@ -437,27 +410,60 @@ class H10():

        except TimeoutException:
            print("wait_page timeout, used:", time.time() - start)
            return False

    def click_button(self):
        try:
            print('Click to open the dropdown')
            button_js = 'document.querySelector("#CerebroFilter > div > div.sc-bZEumQ.ilswiy > div.sc-DnZRP.etdxo > div > button").click()'
            self.driver.execute_script(button_js)
        except:
            self.driver.find_element(By.XPATH, '//button[@data-testid="showMoreButton"]').click()
        time.sleep(2)
        html = self.driver.page_source
        resp = etree.HTML(html)
        print("Locating the Amazon's Choice element")
        time.sleep(2)
        div_class = resp.xpath(
            '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
        print("Ticking the Amazon's Choice filter")
        time.sleep(2)
        return div_class

    def click_Choice(self):
        html = self.driver.page_source
        resp = etree.HTML(html)
        print("Locating the Amazon's Choice element")
        time.sleep(2)
        div_class = resp.xpath(
            '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
        print("Ticking the Amazon's Choice filter")
        time.sleep(2)
        if div_class:
            return div_class
        else:
            return None

    def webdrvier_html(self, asin, asinstype):
        refresh_num = 0
        # click to choose the site
        for i in range(6):
            try:
                _url = self.driver.current_url
                self.id_url = f'https://members.helium10.com/cerebro?accountId={self.account_id}'
                self.driver.get(self.id_url)
                if "concurrent-sessions" in _url or 'signin' in _url:
                    self.longin()
                if asin not in self.err_asin_list and self.useremail_state:
                    print('cerebro page', self.site_name_url)
                    if not self.wait_page(timeout=50):
                        self.driver.refresh()
                        print('Page did not finish loading')
                        continue
                    sleep(randint(10, 15))
                    if 'You are viewing a demo of Cerebro' in self.driver.page_source:
                        print(self.email_name, 'account expired')
                        self.driver.refresh()
                        continue
                        # self.useremail_state = False
                        # self.send_ms('You are viewing a demo of Cerebro')
                    self.verify()
                    if self.site_name_url == 'Amazon.co.uk':
                        self.site_name_csv = 'GB'
...
@@ -494,25 +500,38 @@ class H10():

            try:
                self.driver.execute_script(
                    f"""document.querySelector("img[loading='lazy']").click()""")
                time.sleep(1.5)
            except:
                self.driver.execute_script(
                    f"""document.querySelector("img[alt='{alt}']").click()""")
                time.sleep(1.5)
            self.verify()
            # switch the marketplace
            self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""")
            time.sleep(2)
            # enter the asin
            print('Entering asin', asin)
            wait = WebDriverWait(self.driver, 5)
            try:
                send_asins_xpath = '//*[@id="re-container"]//div[@id="findKeywordSearch"]//input'
                wait.until(EC.element_to_be_clickable((By.XPATH, send_asins_xpath)))
            except TimeoutException:
                try:
                    send_asins_xpath = '//*[@id="re-container"]//input[contains(@class,"sc-blmEgr sc-cxgeGX")]'
                    wait.until(EC.element_to_be_clickable((By.XPATH, send_asins_xpath)))
                except:
                    send_asins_xpath = '//*[@id="re-container"]//div[@data-value="0"]//input'
            if ',' in asin:
                _asin_lsit = asin.split(',')
                for _asin in _asin_lsit:
                    self.driver.find_element(By.XPATH, send_asins_xpath).send_keys(f'{_asin},')
                    time.sleep(1.5)
            else:
                self.driver.find_element(By.XPATH, send_asins_xpath).send_keys(f'{asin},')
                time.sleep(2)
            if 'detected. Please check the ASINs and try again' in self.driver.page_source:
                self.err_asin_list.append(asin)
                break
            # tick "exclude variations"
            self.driver.execute_script("""document.querySelector("input[name='excludeVariations']").click()""")
            # click "get keywords"
...
@@ -532,13 +551,17 @@ class H10():

                    print('Middle-box word download: no report available')
                    self.err_asins_adv_list.append(asin)
                    break
                elif '拒绝访问' in html:
                    print('Access denied: no quota left')
                    self.err_asins_adv_list.append(asin)
                    break
                elif 'errorCodes.undefined' in html:
                    continue
                html = self.driver.page_source
                self.verify()
                time.sleep(3.5)
                try:
                    html = self.driver.page_source
                    if 'searched this product before' in html or '先前已搜索过此产品' in html or '运行新搜索' in html or '从历史数据加载' in html:
                        print('33333333333444444')
                        self.driver.execute_script(
                            """document.querySelector("button[data-testid='runnewsearch']").click()""")
...
@@ -562,9 +585,16 @@ class H10():

                    print('Middle-box word download: no report available')
                    self.err_asins_adv_list.append(asin)
                    break
                elif '拒绝访问' in html:
                    print('Access denied: no quota left')
                    self.err_asins_adv_list.append(asin)
                    break
                elif 'errorCodes.undefined' in html:
                    continue
                if asinstype:
                    sleep(randint(20, 38))
                else:
                    sleep(randint(8, 15))
                self.verify()
                time.sleep(2)
                if 'Wrong entered data or no results' in html:
...
@@ -583,20 +613,11 @@ class H10():

                    break
                if asinstype:
                    try:
                        div_class = self.click_Choice()
                        if div_class is None:
                            div_class = self.click_button()
                    except:
                        print('Error 22222222222222')
                        div_class = self.click_button()
                    try:
                        script = f"""
                        const elements = document.querySelectorAll("div[class='{div_class[0]}']>div");
...
@@ -604,14 +625,17 @@ class H10():

                            secondElement.click();
                        """
                    except:
                        print('Error: script script 2323232323232323')
                        if i == 2:
                            self.err_asins_adv_list.append(asin)
                        continue
                    self.driver.execute_script(script)
                    time.sleep(1)
                    html1 = self.driver.page_source
                    resp1 = etree.HTML(html1)
                    span_class = resp1.xpath(
                        '//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class|//span[contains(text(),"已分析的产品")]/parent::div/following-sibling::div/@class')[0]
                    # select Amazon's Choice parameter 1
                    self.driver.execute_script(
                        f"""document.querySelector("div[class='{span_class}']").click()""")
...
@@ -632,15 +656,20 @@ class H10():

                    time.sleep(1)
                    print('Click to choose csv')
                    self.driver.execute_script("""document.querySelector("div[data-testid='csv']").click()""")
                    time.sleep(8)
                break
            except Exception as e:
                print('Detailed error', e)
                print(traceback.format_exc(), e)
                if i == 5:
                    self.driver.refresh()
                    time.sleep(1)
                if i == 2:
                    self.err_asin_list.append(asin)
                refresh_num += 1
                if refresh_num > 4:
                    print('More than 4 retries; clearing the caches')
                    self.enable_no_cache()
                    self.clear_http_cache()
                    self.clear_cache_but_keep_cookies('https://members.helium10.com/')
                    refresh_num = 0
                time.sleep(2)
                continue

    def nex_page(self, asin_list, asinstype=None):
...
@@ -761,7 +790,7 @@ class H10():

            with open(file_path, 'r', encoding='utf-8') as f:
                f.read()
                f.close()
            print('File found; the path is valid:', file_path)
            return True
        except:
            print('The file path does not exist')
...
@@ -795,7 +824,6 @@ class H10():

        if state == False:
            print('Re-download file 222:', asin, path)
            self.webdrvier_html(asin, None)
        header_config = {
            "chinese": {
                "columns": ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
...
@@ -968,7 +996,7 @@ class H10():

        file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
        print('file_pathsave_competition1111111', file_path)
        state = self.if_csv_path(file_path)
        if state == False:
            current_date = datetime.date.today()
            # compute the previous day's date
            previous_date = current_date - datetime.timedelta(days=1)
...
@@ -983,10 +1011,17 @@ class H10():

            print('file_pathsave_competition3333', file_path)
            state = self.if_csv_path(file_path)
            if state == False:
                print('Re-download file 3333333333 :', asin_list, path)
                # self.webdrvier_html(','.join(asin_list), 1)
                if self.is_nex_pag:
                    self.nex_page(self.asin_list, asinstype=1)
                    self.is_nex_pag = False
                file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
        # build a dict mapping the original column names to the new ones
        columns = pd.read_csv(file_path, nrows=0).columns.tolist()

        def contains_chinese(text):
            return bool(re.search(r'[\u4e00-\u9fff]', text))
...
@@ -1075,6 +1110,25 @@ class H10():

        print(data)
        requests.post(url=url, data=data, timeout=15)

    def enable_no_cache(self):
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.execute_cdp_cmd("Network.setCacheDisabled", {"cacheDisabled": True})

    def clear_http_cache(self):
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})

    def clear_cache_but_keep_cookies(self, origin: str):
        # 1) clear the HTTP cache
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})
        # 2) clear the deeper caches: cache storage / service workers / appcache
        self.driver.execute_cdp_cmd("Storage.clearDataForOrigin",
                                    {"origin": origin,
                                     "storageTypes": "appcache,cache_storage,service_workers"})

    def run(self):
        user_pw_list = self.get_ip_address()
        if user_pw_list:
...
@@ -1091,33 +1145,20 @@ class H10():

        else:
            path = r'C:\Users\ASUS\Downloads'
        print('Current path:', path)
        self.email_name = 'yswg304@outlook.com'
        # 'yswg304@outlook.com', 'Chinabuye@467138'
        self.pw = 'Chinabuye@467138'  # 'yashengweige678@outlook.com', '987654321yswg@'
        self.web_drver()
        loop = 0
        while True:
            self.data = {}
            self.sku_list = []
            self.err_asins_adv_list = []
            try:
                self.driver.refresh()
            except:
                continue
            time.sleep(4)
            self.driver.execute_cdp_cmd("Network.clearBrowserCache", {})
            # 2) (optional) clearing performance timings does not affect the login
            self.driver.execute_script("window.performance.clearResourceTimings();")
            time.sleep(5)
            login_url = self.driver.current_url
            if "concurrent-sessions" in login_url or 'signin' in login_url:
                self.longin()
            for site in ['us', 'uk', 'de', 'fr', 'es', 'it', 'mx']:
                self.is_nex_pag = True
                print(site)
                if site == 'uk':
                    self.site_url = 'Amazon.co.uk'
...
@@ -1139,6 +1180,7 @@ class H10():

            self.mysql_connect(site)
            # fetch the skus that have not been crawled yet
            self.read_db_sku()
            for sku_token in self.sku_data_list:
                sku_token_list = sku_token.split('|-|')
                sku = sku_token_list[0]
...
@@ -1176,6 +1218,13 @@ class H10():

                self.mysql_connect(site)
                time.sleep(randint(20, 50))
            loop += 1
            # ✅ clear the caches again every 30 rounds (tune to your workload)
            if loop % 30 == 0:
                print('Clearing the caches')
                self.enable_no_cache()
                self.clear_http_cache()
                self.clear_cache_but_keep_cookies('https://members.helium10.com/')
            for i in range(10):
                print(f"The current hour is {datetime.datetime.now().hour}; outside the run window, exiting the loop.")
                hour = datetime.datetime.now().hour
...
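The cache hygiene added in this commit is all Chrome DevTools Protocol. The same pattern works outside the class; a standalone sketch, assuming only selenium and a chromedriver on PATH:

from selenium import webdriver

driver = webdriver.Chrome()
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd("Network.clearBrowserCache", {})    # HTTP cache only
driver.execute_cdp_cmd("Storage.clearDataForOrigin", {     # deeper caches; cookies untouched
    "origin": "https://members.helium10.com/",
    "storageTypes": "appcache,cache_storage,service_workers",
})
driver.quit()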