Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
6dd760f3
Commit
6dd760f3
authored
Jan 12, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
3f158caf
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
114 additions
and
193 deletions
+114
-193
Get_Cookies.py
py_spider/amazon_every_day_spider/Get_Cookies.py
+114
-193
No files found.
py_spider/amazon_every_day_spider/Get_Cookies.py
View file @
6dd760f3
'存储到pg'
'获取小语言cookie'
import sys
import os

# Make the parent directory importable so sibling packages resolve
# (needed both when run from source and from the pyinstaller bundle).
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory

import json
import random
import re
import time
import traceback
import uuid

import pandas as pd
import urllib3
import requests  # stdlib-installed requests; intentionally shadowed below by curl_cffi
from lxml import etree

import py_ja3
from secure_db_client import get_remote_engine  # shadowed below by the package-qualified import

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Module-level state shared by the spider.
item = {}
headers_num_int = 0

# NOTE: these later imports deliberately shadow the earlier `requests` and
# `get_remote_engine` bindings — curl_cffi's requests supports `impersonate=`,
# and the package-qualified secure_db_client is what the pyinstaller bundle sees.
from curl_cffi import requests
from amazon_every_day_spider.secure_db_client import get_remote_engine

"""
打包命令:cd /d E:\Git_new\spider\py_spider
pyinstaller -F amazon_every_day_spider\Get_Cookies.py --clean --paths . --collect-submodules amazon_every_day_spider
"""
def get_cookie(site='us', zipCode='10010'):
    """Fetch a zip-code-localized Amazon cookie for *site* and store it in MySQL.

    Simulates the browser flow of changing the delivery address on Amazon:
    request the home page, open the location modal, POST the zip code, then
    confirm and re-check the page shows the new postcode.  On success the
    session cookies are JSON-serialized and appended to the per-site
    ``{site}_cookies`` table (plus ``other_site_cookies`` for ``ca``).

    :param site: Amazon marketplace code: us / uk / de / fr / es / it / ca.
    :param zipCode: postal code to set (format depends on the site).

    All failures are caught, logged, and swallowed so the caller can simply
    move on to the next site.
    """
    try:
        # --- Per-site URLs.  url_/index_url: storefront root, url_asin: a
        # known-live product page used to read the current delivery postcode.
        if site == "us":
            url_ = 'https://www.amazon.com'
            index_url = 'https://www.amazon.com'
            url_asin = 'https://www.amazon.com/dp/B0009X29WK'
            host = 'www.amazon.com'
        elif site == 'uk':
            url_ = 'https://www.amazon.co.uk'
            index_url = 'https://www.amazon.co.uk'
            url_asin = 'https://www.amazon.co.uk/dp/B0714LLB2T'
            host = 'www.amazon.co.uk'
        elif site == 'de':
            url_ = 'https://www.amazon.de'
            index_url = 'https://www.amazon.de'
            url_asin = 'https://www.amazon.de/dp/B00006YYXM'
            host = 'www.amazon.de'
        elif site == 'fr':
            url_ = 'https://www.amazon.fr'
            index_url = 'https://www.amazon.fr'
            url_asin = 'https://www.amazon.fr/dp/B0FK9JNPM5'
            host = 'www.amazon.fr'
        elif site == 'es':
            url_ = 'https://www.amazon.es'
            index_url = 'https://www.amazon.es'
            url_asin = 'https://www.amazon.es/dp/B0FDFVY9J6'
            host = 'www.amazon.es'
        elif site == 'it':
            url_ = 'https://www.amazon.it'
            index_url = 'https://www.amazon.it'
            url_asin = 'https://www.amazon.it/dp/B0F3C16GTF'
            host = 'www.amazon.it'
        elif site == 'ca':
            # NOTE(review): the original ca branch set only index_url/url_asin/host;
            # url_ would then be undefined and every later use would raise NameError.
            # Setting url_ here too — confirm against the full file.
            url_ = 'https://www.amazon.ca'
            index_url = 'https://www.amazon.ca'
            url_asin = 'https://www.amazon.ca//dp/B08H3JPH74'
            host = 'www.amazon.ca'

        # ca cookies are stored through the us engine; every other site uses its own.
        if site == 'ca':
            engine_us = get_remote_engine('us', 'mysql')
        else:
            engine_us = get_remote_engine(site, 'mysql')

        # Randomize the Chrome major version so consecutive runs don't share a UA.
        n = random.randint(120, 130)
        ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
        print(ua)

        requ_see = requests.Session()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Device-Memory": "8",
            "Downlink": "1.25",
            "Dpr": "0.75",
            "Ect": "3g",
            "Pragma": "no-cache",
            "Rtt": "300",
            "Sec-Ch-Device-Memory": "8",
            "Sec-Ch-Dpr": "0.75",
            # Fix: the brand list must carry the Chrome *major version* (n), not
            # the full User-Agent string that was interpolated here before.
            "Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{n}", "Google Chrome";v="{n}"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Ch-Ua-Platform-Version": '"10.0.0"',
            "Sec-Ch-Viewport-Width": "2560",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": ua,
            "Viewport-Width": "2560",
        }

        # Build a short random header name (currently unused — the line that
        # injected it into headers is commented out below).
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
        k = ""
        # Fix: the original iterated the 2-tuple (0, randint(0, 5)) — always
        # exactly two iterations; a range over the random length was intended.
        for _ in range(random.randint(0, 5)):
            k += random.choice(alphabet)
        # headers[k] = str(uuid.uuid4())

        # Session with a custom JA3 TLS fingerprint adapter mounted for this host.
        sess = requests.Session()
        sess.mount(url_, py_ja3.DESAdapter())
        resp_ = sess.get(url_, headers=headers, timeout=15, verify=False)
        cookie = resp_.headers.get('set-cookie')
        print("第一步 请求首页", url_)
        # Crude Set-Cookie parse: split on "; " and keep name -> last "="-field.
        cookies_dict = {i.split("=")[0]: i.split("=")[-1] for i in cookie.split("; ")}
        html_xpath = etree.HTML(resp_.text)

        # NOTE(review): the diff view showed an orphaned header-dict literal here
        # (static Chrome/120 UA etc.) whose assignment line was lost; it was
        # unreachable/invalid as shown, so it is omitted — confirm against the
        # full file that nothing assigned it.

        asin_resp = requ_see.get(url_asin, headers=headers)
        print("第一步 请求asin首页:", url_asin)
        html_xpath = etree.HTML(asin_resp.text)
        # Current delivery postcode shown in the nav bar.
        ingress = html_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("第一次发送请求,获取邮编:", ingress)
        # The location modal's config blob carries the CSRF header and modal URL.
        data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
        data_modal = json.loads(data_a_modal[0])

        # if site != 'us':
        #     csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
        #     url_post = url_ + '/privacyprefs/retail/v1/acceptall'
        #     dada_post = {
        #         "anti-csrftoken-a2z": csrftoken,
        #         "accept": "all"
        #     }
        #     resp_post = sess.post(url_post, headers=headers, cookies=cookies_dict, timeout=15, data=dada_post,
        #                           verify=False)
        #     cookie_post = resp_post.headers.get('set-cookie')
        #     cookies_dict_post = {i.split("=")[0]: i.split("=")[-1] for i in cookie_post.split("; ")}
        #     cookies_dict_post.update(cookies_dict)
        # else:
        cookies_dict_post = cookies_dict

        # if site == 'us':
        #     get_token_headers = {
        #         'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
        #         'referer': url_,
        #         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        #     }
        #     print(get_token_headers, '23232')
        # else:
        get_token_headers = {
            'accept': 'text/html,*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
            'cache-control': 'no-cache',
            'referer': url_,
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
            'viewport-width': '1920',
            'x-requested-with': 'XMLHttpRequest',
        }

        # Step 2: open the "change location" modal.
        data_modal_url = url_ + data_modal['url']
        print('第二步 拼接url 点击更改位置:', data_modal_url)
        data_modal_resp = sess.get(data_modal_url, headers=get_token_headers, cookies=cookies_dict_post,
                                   timeout=15, verify=False)
        data_modal_cookie = data_modal_resp.headers.get('set-cookie')
        CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', data_modal_resp.text)[0]
        print('获取参数anti-csrftoken-a2z:', data_modal)
        headers['Anti-Csrftoken-A2z'] = data_modal['ajaxHeaders']['anti-csrftoken-a2z']

        clkci_url = f'{index_url}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Detail&storeContext=grocery&actionSource=desktop-modal&toasterType=AIS_INGRESS'
        headers['Referer'] = url_asin
        print('第二步点击')
        clkci_resp = requ_see.get(clkci_url, headers=headers)
        # Refresh the CSRF token from the rendered-selections response.
        CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', clkci_resp.text)[0]
        print("CSRF_TOKEN:", CSRF_TOKEN)

        try:
            data_modal_cookie_dict = {i.split("=")[0]: i.split("=")[-1]
                                      for i in data_modal_cookie.split("; ")}
            data_modal_cookie_dict.update(cookies_dict)
        except Exception:
            # The modal response may carry no Set-Cookie header (data_modal_cookie is None).
            data_modal_cookie_dict = cookies_dict_post

        url_2 = url_ + '/portal-migration/hz/glow/address-change?actionSource=glow'
        print('url_2:', url_2)
        # {"locationType":"LOCATION_INPUT","zipCode":"10010","deviceType":"web","storeContext":"generic","pageType":"Gateway","actionSource":"glow"}
        data = {
            "locationType": "LOCATION_INPUT",
            "zipCode": zipCode,
            "storeContext": "generic",
            "deviceType": "web",
            "pageType": "Gateway",
            "actionSource": "glow"
        }
        print(data)
        post_headers = {
            'anti-csrftoken-a2z': CSRF_TOKEN,
            'accept': 'text/html,*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-length': '138',
            'content-type': 'application/json',
            'device-memory': '8',
            'downlink': '10',
            'dpr': '1',
            'ect': '4g',
            'origin': url_,
            'pragma': 'no-cache',
            'referer': url_,
            'rtt': '250',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '1',
            'sec-ch-ua': '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-ch-ua-platform-version': '"10.0.0"',
            'sec-ch-viewport-width': '1920',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
            'viewport-width': '1920',
            'TE': 'trailers',
            'x-requested-with': 'XMLHttpRequest',
        }  # fix: dict was left unclosed in the source as shown

        address_url = f'{index_url}/portal-migration/hz/glow/address-change?actionSource=glow'
        headers_post = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip',
        }
        headers_post['Origin'] = index_url
        headers_post['Anti-Csrftoken-A2z'] = CSRF_TOKEN
        headers_post['Sec-Fetch-Site'] = 'same-origin'
        headers_post['Sec-Fetch-Mode'] = 'cors'
        headers_post['Sec-Fetch-Dest'] = 'empty'
        headers_post['Accept'] = '*/*'
        headers_post['Content-Type'] = 'application/json'
        headers_post['X-Requested-With'] = 'XMLHttpRequest'
        address_json = {
            "locationType": "LOCATION_INPUT",
            "zipCode": f"{zipCode}",
            "deviceType": "web",
            "storeContext": "grocery",
            "pageType": "Detail",
            "actionSource": "glow"
        }

        # Step 3: submit the zip code (curl_cffi session, browser-impersonated TLS).
        print('第三步 输入 邮编')
        post_resp = requ_see.post(address_url, headers=headers_post, json=address_json,
                                  verify=False, impersonate="chrome")
        print(post_resp.text)

        submit_headers = {
            'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept-Encoding': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

        # Step 3 (ja3 session): same address-change POST through the TLS-adapter session.
        print('第三步 发送post 请求 输入 邮编 点击确定')
        resp_2 = sess.post(url_2, headers=post_headers, json=data, cookies=data_modal_cookie_dict,
                           timeout=15, verify=False)
        print(resp_2.text)
        post_cookies = resp_2.headers.get('set-cookie')
        try:
            post_cookies_dict = {i.split("=")[0]: i.split("=")[-1]
                                 for i in post_cookies.split("; ")}
            post_cookies_dict.update(data_modal_cookie_dict)
        except Exception:
            # No Set-Cookie on the POST response; fall back to the modal cookies.
            post_cookies_dict = data_modal_cookie_dict

        # Step 4: confirm ("done") the new location.
        done_url = url_ + "/portal-migration/hz/glow/get-location-label?storeContext=generic&pageType=Gateway&actionSource=desktop-modal"
        print('第四步,点击完成,')
        done_resp = sess.get(done_url, headers=headers, cookies=post_cookies_dict,
                             timeout=15, verify=False)
        print(done_resp.text, 'done_respdone_respdone_respdone_resp')
        done_cookies_dict = sess.cookies.get_dict()
        print('done_cookies_dict::', done_cookies_dict)

        # Step 5: reload the home page and verify the postcode actually changed.
        print("第五步,请求首页,获取邮编,是否修改成功")
        index_resp = sess.get(url_, headers=headers, timeout=15, cookies=done_cookies_dict,
                              verify=False)
        index_resp_cookies = sess.cookies.get_dict()
        print(sess.cookies.get_dict(), '2222222222222222')

        submit_headers['Accept'] = '*/*'
        submit_headers['Referer'] = url_asin
        submit_headers['X-Requested-With'] = 'XMLHttpRequest'
        submit_headers['Sec-Fetch-Site'] = 'same-origin'
        submit_headers['Sec-Fetch-Mode'] = 'cors'
        submit_headers['Sec-Fetch-Dest'] = 'empty'
        print('第四步。提交')
        detail_url = f'{index_url}/portal-migration/hz/glow/get-location-label?storeContext=grocery&pageType=Detail&actionSource=desktop-modal'
        requ_see.get(detail_url, headers=submit_headers, verify=False, impersonate="chrome")
        print(requ_see.cookies.get_dict())
        cookie_dict = requ_see.cookies.get_dict()

        index_resp = requests.get(index_url, headers=headers, cookies=cookie_dict,
                                  verify=False, impersonate="chrome")
        index_xpath = etree.HTML(index_resp.text)
        ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
        print("获取最新邮编:", ingress)

        # Accept a match on the requested zip, or the UK/CA prefixes Amazon
        # displays ("W1S 3..." / "M5B 2H...").  Fix: the original wrapped an
        # inner check (which knew about 'M5B 2H') in an outer check that did
        # not, so the ca site could never reach the save path.
        shown = ingress[0].strip()
        if zipCode in shown or "W1S 3" in shown or 'M5B 2H' in shown:
            print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
            # Persist the curl_cffi session cookies (the earlier serialization of
            # index_resp_cookies was dead code — immediately overwritten).
            cookies = json.dumps(cookie_dict, ensure_ascii=False)
            item = {
                "site": site,
                'zipCode': shown,
                'cookie': cookies
            }
            print(item)
            # Fix: the original wrote f"{site}_cookies" unconditionally AND
            # again in the else branch below — a double insert for non-ca
            # sites.  Only the branched write is kept.
            if site == 'ca':
                df = pd.DataFrame([{"cookies": cookies, "type": "DB", 'site': 'ca'}])
                # ca cookies go to the shared table on the us engine.
                engine_us.to_sql(df, 'other_site_cookies', if_exists="append")
            else:
                df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
                engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
        print('\n')
    except Exception as e:
        # Best-effort: log and let the caller move on to the next site.
        print(f"获取 {site} 站点 cookie 报错,切换下一个站点", e)
        print("报错", f"\n{traceback.format_exc()}")
    # Polite delay between sites regardless of success.
    time.sleep(random.uniform(2.5, 5.5))
if __name__ == '__main__':
    # Refresh cookies for every marketplace, one after another.  get_cookie
    # swallows its own errors, so a failing site never stops the run.
    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    # get_cookie(site='it', zipCode='85')
    # NOTE(review): the it zip was split across lines in the extracted diff
    # ('001' + '85'); reconstructed as Rome's 00185 — confirm against the file.
    get_cookie(site='it', zipCode='00185')
    get_cookie(site='es', zipCode='28001')
    get_cookie(site='fr', zipCode='75019')
    # Second pass over the main sites before the slower ca run.
    get_cookie(site='us', zipCode='10010')
    get_cookie(site='de', zipCode='10115')
    get_cookie(site='uk', zipCode='W1S 3PR')
    time.sleep(random.uniform(30.5, 70.5))
    get_cookie(site='ca', zipCode='M5B 2H1')
    time.sleep(random.uniform(60.5, 180.5))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment