Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
3a4d4a7e
Commit
3a4d4a7e
authored
Apr 17, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
16efe939
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
71 additions
and
43 deletions
+71
-43
Get_Cookies.py
py_spider/amazon_every_day_spider/Get_Cookies.py
+71
-43
No files found.
py_spider/amazon_every_day_spider/Get_Cookies.py
View file @
3a4d4a7e
...
@@ -3,13 +3,16 @@ import random
...
@@ -3,13 +3,16 @@ import random
import
re
import
re
import
time
import
time
import
traceback
import
traceback
import
pandas
as
pd
import
pandas
as
pd
from
curl_cffi
import
requests
from
curl_cffi
import
requests
from
lxml
import
etree
from
lxml
import
etree
import
os
from
amazon_every_day_spider.secure_db_client
import
get_remote_engine
import
sys
if
getattr
(
sys
,
'frozen'
,
False
):
# exe 运行时,把 exe 所在目录加入 path,动态读取 secure_db_client.py
sys
.
path
.
insert
(
0
,
os
.
path
.
dirname
(
sys
.
executable
))
from
secure_db_client
import
get_remote_engine
import
uuid
"""
"""
打包命令:cd /d E:
\
Git_new
\
spider
\
py_spider
打包命令:cd /d E:
\
Git_new
\
spider
\
py_spider
pyinstaller -F amazon_every_day_spider
\
Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
pyinstaller -F amazon_every_day_spider
\
Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
...
@@ -21,73 +24,96 @@ def get_cookie(site='us', zipCode='10010'):
...
@@ -21,73 +24,96 @@ def get_cookie(site='us', zipCode='10010'):
try
:
try
:
if
site
==
"us"
:
if
site
==
"us"
:
index_url
=
'https://www.amazon.com'
index_url
=
'https://www.amazon.com'
url_asin
=
'https://www.amazon.com/dp/B0009X29WK'
url_asin
=
'https://www.amazon.com/dp/B0DB1GHRYL?th=1'
host
=
'www.amazon.com'
elif
site
==
'uk'
:
elif
site
==
'uk'
:
index_url
=
'https://www.amazon.co.uk'
# 站点url
index_url
=
'https://www.amazon.co.uk'
url_asin
=
'https://www.amazon.co.uk/dp/B0714LLB2T'
# 站点url
url_asin
=
'https://www.amazon.co.uk/dp/B0714LLB2T'
host
=
'www.amazon.co.uk'
elif
site
==
'de'
:
elif
site
==
'de'
:
index_url
=
'https://www.amazon.de'
index_url
=
'https://www.amazon.de'
url_asin
=
'https://www.amazon.de/dp/B00006YYXM'
url_asin
=
'https://www.amazon.de/dp/B00006YYXM'
host
=
'www.amazon.de'
elif
site
==
'fr'
:
elif
site
==
'fr'
:
index_url
=
'https://www.amazon.fr'
index_url
=
'https://www.amazon.fr'
url_asin
=
'https://www.amazon.fr/dp/B0FK9JNPM5'
url_asin
=
'https://www.amazon.fr/dp/B0FK9JNPM5'
host
=
'www.amazon.fr'
elif
site
==
'es'
:
elif
site
==
'es'
:
index_url
=
'https://www.amazon.es'
index_url
=
'https://www.amazon.es'
url_asin
=
'https://www.amazon.es/dp/B0FDFVY9J6'
url_asin
=
'https://www.amazon.es/dp/B0FDFVY9J6'
host
=
'www.amazon.es'
elif
site
==
'it'
:
elif
site
==
'it'
:
index_url
=
'https://www.amazon.it'
index_url
=
'https://www.amazon.it'
url_asin
=
'https://www.amazon.it/dp/B0F3C16GTF'
url_asin
=
'https://www.amazon.it/dp/B0F3C16GTF'
host
=
'www.amazon.it'
elif
site
==
'ca'
:
elif
site
==
'ca'
:
index_url
=
'https://www.amazon.ca'
index_url
=
'https://www.amazon.ca'
url_asin
=
'https://www.amazon.ca//dp/B08H3JPH74'
url_asin
=
'https://www.amazon.ca/dp/B08H3JPH74'
host
=
'www.amazon.ca'
if
site
==
'ca'
:
if
site
==
'ca'
:
engine_us
=
get_remote_engine
(
'us'
,
'mysql'
)
engine_us
=
get_remote_engine
(
'us'
,
'mysql'
)
else
:
else
:
engine_us
=
get_remote_engine
(
site
,
'mysql'
)
engine_us
=
get_remote_engine
(
site
,
'mysql'
)
requ_see
=
requests
.
Session
()
requ_see
=
requests
.
Session
()
n
=
random
.
randint
(
120
,
142
)
ua
=
f
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 6900)}.{random.randint(1, 181)} Safari/537.36'
headers
=
{
headers
=
{
'Accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'connection'
:
'close'
,
'Accept-Encoding'
:
'gzip, deflate, br, zstd'
,
'accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'accept-language'
:
'zh-CN,zh;q=0.9'
,
'Cache-Control'
:
'no-cache'
,
'accept-encoding'
:
'gzip, deflate, br, zstd'
,
'Pragma'
:
'no-cache'
,
'cache-control'
:
'no-cache'
,
'Sec-Ch-Ua'
:
'"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"'
,
'content-type'
:
'application/x-www-form-urlencoded;charset=UTF-8'
,
'Sec-Ch-Ua-Mobile'
:
'?0'
,
'sec-ch-ua-mobile'
:
'?0'
,
'Sec-Ch-Ua-Platform'
:
' "Windows"'
,
'user-agent'
:
ua
,
'Sec-Fetch-Dest'
:
'document'
,
"pragma"
:
"no-cache"
,
'Sec-Fetch-Mode'
:
'navigate'
,
'Sec-Fetch-Site'
:
' none'
,
'Sec-Fetch-User'
:
'?1'
,
'Upgrade-Insecure-Requests'
:
'1'
,
'User-Agent'
:
' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
}
asin_resp
=
requ_see
.
get
(
url_asin
,
headers
=
headers
)
alphabet
=
[
'a'
,
'b'
,
'c'
,
'd'
,
'e'
,
'f'
,
'g'
,
'h'
,
'i'
,
'j'
,
'k'
,
'l'
,
'm'
,
'n'
,
'o'
,
'p'
,
'q'
,
'r'
,
's'
,
't'
,
'u'
,
'v'
,
'w'
,
'x'
,
'y'
,
'z'
]
k
=
""
for
i
in
(
0
,
random
.
randint
(
0
,
26
)):
k
+=
random
.
choice
(
alphabet
)
headers
[
k
]
=
str
(
uuid
.
uuid4
())
asin_resp
=
requ_see
.
get
(
url_asin
,
headers
=
headers
,
verify
=
False
,
impersonate
=
"chrome"
)
print
(
"第一步 请求asin首页:"
,
url_asin
)
print
(
"第一步 请求asin首页:"
,
url_asin
)
# 检测是否返回验证码拦截页,自动提交过关
if
'validateCaptcha'
in
asin_resp
.
text
or
'opfcaptcha'
in
asin_resp
.
text
:
print
(
f
'[{site}] 检测到验证码拦截页,自动提交过关...'
)
cap_xpath
=
etree
.
HTML
(
asin_resp
.
text
)
amzn
=
cap_xpath
.
xpath
(
"//input[@name='amzn']/@value"
)
amzn_r
=
cap_xpath
.
xpath
(
"//input[@name='amzn-r']/@value"
)
field_keywords
=
cap_xpath
.
xpath
(
"//input[@name='field-keywords']/@value"
)
if
amzn
and
amzn_r
and
field_keywords
:
captcha_url
=
f
"{index_url}/errors/validateCaptcha"
params
=
{
'amzn'
:
amzn
[
0
],
'amzn-r'
:
amzn_r
[
0
],
'field-keywords'
:
field_keywords
[
0
]}
asin_resp
=
requ_see
.
get
(
captcha_url
,
params
=
params
,
headers
=
headers
,
verify
=
False
,
impersonate
=
"chrome"
)
print
(
f
'[{site}] 验证码提交完成,状态码: {asin_resp.status_code}'
)
else
:
print
(
f
'[{site}] 验证码参数提取失败,跳过'
)
return
html_xpath
=
etree
.
HTML
(
asin_resp
.
text
)
html_xpath
=
etree
.
HTML
(
asin_resp
.
text
)
ingress
=
html_xpath
.
xpath
(
"//span[@id='glow-ingress-line2']/text()"
)
ingress
=
html_xpath
.
xpath
(
"//span[@id='glow-ingress-line2']/text()"
)
print
(
"第一次发送请求,获取邮编:"
,
ingress
)
print
(
"第一次发送请求,获取邮编:"
,
ingress
)
data_a_modal
=
html_xpath
.
xpath
(
"//span[@id='nav-global-location-data-modal-action']/@data-a-modal"
)
data_a_modal
=
html_xpath
.
xpath
(
"//span[@id='nav-global-location-data-modal-action']/@data-a-modal"
)
if
not
data_a_modal
:
print
(
f
'[{site}] 未找到 modal 元素,跳过'
)
return
data_modal
=
json
.
loads
(
data_a_modal
[
0
])
data_modal
=
json
.
loads
(
data_a_modal
[
0
])
print
(
'获取参数anti-csrftoken-a2z:'
,
data_modal
)
print
(
'获取参数anti-csrftoken-a2z:'
,
data_modal
)
headers
[
'Anti-Csrftoken-A2z'
]
=
data_modal
[
'ajaxHeaders'
][
'anti-csrftoken-a2z'
]
headers
[
'Anti-Csrftoken-A2z'
]
=
data_modal
[
'ajaxHeaders'
][
'anti-csrftoken-a2z'
]
clkci_url
=
f
'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
clkci_url
=
index_url
+
data_modal
[
'url'
]
headers
[
'Referer'
]
=
url_asin
print
(
'第二步点击'
)
print
(
'第二步点击'
)
clkci_resp
=
requ_see
.
get
(
clkci_url
,
headers
=
headers
)
clkci_resp
=
requ_see
.
get
(
clkci_url
,
headers
=
headers
,
verify
=
False
,
CSRF_TOKEN
=
re
.
findall
(
'CSRF_TOKEN : "(.*?)",'
,
clkci_resp
.
text
)[
0
]
impersonate
=
"chrome"
)
csrf_list
=
re
.
findall
(
'CSRF_TOKEN : "(.*?)",'
,
clkci_resp
.
text
)
if
not
csrf_list
:
print
(
f
'[{site}] 未找到 CSRF_TOKEN,跳过'
)
return
CSRF_TOKEN
=
csrf_list
[
0
]
print
(
"CSRF_TOKEN:"
,
CSRF_TOKEN
)
print
(
"CSRF_TOKEN:"
,
CSRF_TOKEN
)
address_url
=
f
'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
address_url
=
f
'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
headers_post
=
{
headers_post
=
{
'User-Agent'
:
' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'User-Agent'
:
ua
,
'Accept-Encoding'
:
'gzip'
,
}
'Accept-Encoding'
:
'gzip'
,
}
headers_post
[
'Origin'
]
=
index_url
headers_post
[
'Origin'
]
=
index_url
headers_post
[
'Anti-Csrftoken-A2z'
]
=
CSRF_TOKEN
headers_post
[
'Anti-Csrftoken-A2z'
]
=
CSRF_TOKEN
...
@@ -106,7 +132,7 @@ def get_cookie(site='us', zipCode='10010'):
...
@@ -106,7 +132,7 @@ def get_cookie(site='us', zipCode='10010'):
impersonate
=
"chrome"
)
impersonate
=
"chrome"
)
print
(
post_resp
.
text
)
print
(
post_resp
.
text
)
submit_headers
=
{
submit_headers
=
{
'User-Agent'
:
' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
,
'User-Agent'
:
ua
,
'Accept-Encoding'
:
'gzip'
,
'Accept-Encoding'
:
'gzip'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
'Accept-Language'
:
'zh-CN,zh;q=0.9'
,
}
}
...
@@ -121,7 +147,7 @@ def get_cookie(site='us', zipCode='10010'):
...
@@ -121,7 +147,7 @@ def get_cookie(site='us', zipCode='10010'):
requ_see
.
get
(
detail_url
,
headers
=
submit_headers
,
verify
=
False
,
impersonate
=
"chrome"
)
requ_see
.
get
(
detail_url
,
headers
=
submit_headers
,
verify
=
False
,
impersonate
=
"chrome"
)
print
(
requ_see
.
cookies
.
get_dict
())
print
(
requ_see
.
cookies
.
get_dict
())
cookie_dict
=
requ_see
.
cookies
.
get_dict
()
cookie_dict
=
requ_see
.
cookies
.
get_dict
()
index_resp
=
requ
ests
.
get
(
index_url
,
headers
=
headers
,
cookies
=
cookie_dict
,
verify
=
False
,
index_resp
=
requ
_see
.
get
(
index_url
,
headers
=
headers
,
verify
=
False
,
impersonate
=
"chrome"
)
impersonate
=
"chrome"
)
index_xpath
=
etree
.
HTML
(
index_resp
.
text
)
index_xpath
=
etree
.
HTML
(
index_resp
.
text
)
ingress
=
index_xpath
.
xpath
(
"//span[@id='glow-ingress-line2']/text()"
)
ingress
=
index_xpath
.
xpath
(
"//span[@id='glow-ingress-line2']/text()"
)
...
@@ -149,12 +175,14 @@ def get_cookie(site='us', zipCode='10010'):
...
@@ -149,12 +175,14 @@ def get_cookie(site='us', zipCode='10010'):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
count
=
0
while
True
:
while
True
:
get_cookie
(
site
=
'us'
,
zipCode
=
'10010'
)
get_cookie
(
site
=
'us'
,
zipCode
=
'10010'
)
# get_cookie(site='de', zipCode='10115')
get_cookie
(
site
=
'de'
,
zipCode
=
'10115'
)
# get_cookie(site='uk', zipCode='W1S 3PR')
get_cookie
(
site
=
'uk'
,
zipCode
=
'W1S 3PR'
)
# get_cookie(site='it', zipCode='00185')
if
count
%
10
==
0
:
# get_cookie(site='es', zipCode='28001')
get_cookie
(
site
=
'it'
,
zipCode
=
'00185'
)
# get_cookie(site='fr', zipCode='75019')
get_cookie
(
site
=
'es'
,
zipCode
=
'28001'
)
# get_cookie(site='ca', zipCode='M5B 2H1')
get_cookie
(
site
=
'fr'
,
zipCode
=
'75019'
)
count
+=
1
time
.
sleep
(
random
.
uniform
(
60.5
,
180.5
))
time
.
sleep
(
random
.
uniform
(
60.5
,
180.5
))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment