selection-new / spider · Commits · efc3b2e1

Commit efc3b2e1, authored Feb 14, 2026 by Peng
Commit message: no message
Parent commit: 6078b1ae

Showing 1 changed file with 84 additions and 40 deletions:
py_spider/utils/requests_param.py (+84 / -40)
import time
import re
import pandas as pd
import sys
import hashlib
# import requests
import json
import os
import urllib3
import random
import uuid
from urllib.parse import urlparse
from threading import Lock
from lxml import etree

sys.path.append(os.path.dirname(sys.path[0]))  # add the parent directory to sys.path
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
@@ -19,6 +20,8 @@ from amazon_spider.VPS_IP import is_internet_available
from datetime import datetime, timedelta
import traceback
+from curl_cffi import requests
+from kafka.errors import KafkaTimeoutError

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings()
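The newly imported curl_cffi exposes a requests-compatible client whose main draw is browser TLS-fingerprint impersonation. A minimal sketch of that API, assuming a recent curl_cffi where the generic 'chrome' impersonation target is available; the URL and options are illustrative, not from the commit:

# Minimal curl_cffi sketch (illustrative; not part of the commit).
from curl_cffi import requests

resp = requests.get(
    'https://www.amazon.com/',
    impersonate='chrome',  # present a real Chrome TLS/HTTP2 fingerprint
    timeout=30,
)
print(resp.status_code)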
@@ -31,6 +34,9 @@ class Requests_param_val(BaseUtils):
        self.proxy_name = 'Kdl_h10'
        print("site name:", self.site_name, 'crawl project', "proxy ip:", self.proxy_name)
        self.cookies_queue = Queue()  # cookie queue
+        self.kafuka_producer_str = self.kafuka_connect()
+        self.next_page_lock = Lock()
+        self.headers_num_int_s = 0

    def init_db_names(self):
        self.engine_pg = self.pg_connect()
@@ -38,7 +44,7 @@ class Requests_param_val(BaseUtils):
        self.db_cookies = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_cookies'][2:]
        self.db_ip_16yun = DB_REQUESTS_ASIN_PARAMS['db_ip_16yun']

-    def get_site_url(self, site_name):
+    def get_site_url(self, site_name='us'):
        if site_name == "us":
            self.site_url = 'https://www.amazon.com/'
            self.host = 'www.amazon.com'
@@ -97,7 +103,7 @@ class Requests_param_val(BaseUtils):
            except:
                break

-    def get_cookie(self):
+    def get_cookie(self, num=None):
        print("fetching cookies to return")
        while True:
            if is_internet_available():
@@ -110,10 +116,11 @@ class Requests_param_val(BaseUtils):
                    self.engine = self.mysql_connect()
                    self.engine_pg = self.pg_connect()
                    with self.engine.begin() as conn:
-                        sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit 350;'
+                        if num:
+                            sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit {num};'
+                        else:
+                            sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit 300;'
                        print("cookie query:", sql_read)
                        # a = conn.execute(sql_read)
                        # df_read = pd.DataFrame(a, columns=['cookies', 'id'])
                        df_read = self.engine.read_sql(sql_read)
                        clientPriceList = list(df_read.cookies + "|-|" + df_read.id.astype("U"))
                        for ck in clientPriceList:
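The cookie rows are flattened into single '|-|'-delimited strings via pandas column concatenation; a self-contained sketch with made-up rows:

# Sketch of the "cookies|-|id" flattening (rows are made up).
import pandas as pd

df_read = pd.DataFrame({'cookies': ['{"a":1}', '{"b":2}'], 'id': [17, 18]})
client_list = list(df_read.cookies + "|-|" + df_read.id.astype("U"))
print(client_list)  # ['{"a":1}|-|17', '{"b":2}|-|18']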
@@ -130,9 +137,9 @@ class Requests_param_val(BaseUtils):
    def db_column(self, site):
        if site in ('us', 'de', 'uk'):
-            asin_detail_table = f'select * from {site}_asin_detail_month_2025 limit 1'
+            asin_detail_table = f'select * from {site}_asin_detail_month_2026 limit 1'
        else:
-            asin_detail_table = f'select * from {site}_asin_detail_2025 limit 1'
+            asin_detail_table = f'select * from {site}_asin_detail_2026 limit 1'
        print(asin_detail_table)
        # df = pd.read_sql(asin_detail_table, con=self.engine_pg)
        df = self.engine_pg.read_sql(asin_detail_table)
@@ -141,10 +148,8 @@ class Requests_param_val(BaseUtils):
            columns_list.remove('id')
            columns_list.remove('updated_time')
            columns_list.remove('category_state')
        if site in ('fr', 'es', 'it'):
            columns_list.append('week')
-        print(len(columns_list))
-        print(columns_list)
        return columns_list

    # check whether the returned page source is correct
@@ -167,7 +172,8 @@ class Requests_param_val(BaseUtils):
    def check_amazon_allow_redirects(self, response_url, asin):
-        if ("keywords" in response_url) or ("dp/" not in response_url) or ("ref=" in response_url and "encoding=" in response_url) or (asin not in response_url) or (
-                "ASIN=" in response_url and "ref_=lx_bd" in response_url) or ('ref=rd_fr_' in response_url and f'ref=rd_fr_{asin}' in response_url):
+        if ("keywords" in response_url) or ("dp/" not in response_url) or ("ref=" in response_url and "encoding=" in response_url) or (asin not in response_url) or (
+                "ASIN=" in response_url and "ref_=lx_bd" in response_url) or ('ref=rd_fr_' in response_url and f'ref=rd_fr_{asin}' in response_url) \
+                or ('&ASIN=' in response_url):
            return True
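To make the expanded redirect filter concrete, a small illustrative probe; the URLs and ASIN are invented:

# Illustrative inputs for the redirect check (URLs and ASIN are invented).
asin = 'B0TEST1234'
search_redirect = 'https://www.amazon.com/s?keywords=usb+cable'  # flagged: contains "keywords", lacks "dp/" and the asin
product_page = f'https://www.amazon.com/dp/{asin}?th=1'          # passes: no clause matches, so it is not treated as a bad redirect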
@@ -175,7 +181,8 @@ class Requests_param_val(BaseUtils):
    # check whether the delivery postcode is correct.
    def check_amazon_ingress(self, ingress):
-        if ("中国大陆" in ingress) or ("China" in ingress) or ("Hong" in ingress) or ("Chine" in ingress) or (
-                "Cina" in ingress) or ("Update location" in ingress) or ('香' in ingress) or ("location" in ingress):
+        if ("中国大陆" in ingress) or ("China" in ingress) or ("Hong" in ingress) or ("Chine" in ingress) or (
+                "Cina" in ingress) or ("Update location" in ingress) or ('香' in ingress) or ("location" in ingress) \
+                or ('Vereinigte' in ingress):
            return True

    # check whether the request hit a captcha:
@@ -190,30 +197,31 @@ class Requests_param_val(BaseUtils):
    # assemble the request headers
    def requests_amazon_headers(self, host=None, site_url=None, asin=None, scraper_url=None):
-        n = random.randint(118, 124)
+        n = random.randint(120, 142)
        # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36
        ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
        # ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        headers = {
            'connection': 'close',
-            'authority': host,
-            'accept': 'text/html,*/*',
+            'authority': urlparse(self.site_url).hostname,
+            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'accept-encoding': 'gzip, deflate, br, zstd',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'sec-ch-ua-mobile': '?0',
            'user-agent': ua,
            "Host": self.host,
            "Pragma": "no-cache",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "pragma": "no-cache",
        }
-        # if asin:
-        #     headers['origin'] = f'{site_url}dp/{asin}'
-        #     headers['referer'] = f'{site_url}dp/{asin}'
+        if asin:
+            headers['origin'] = f'{site_url}dp/{asin}'
+            headers['referer'] = f'{site_url}?th=1'
        if scraper_url:
            headers['origin'] = scraper_url
            headers['referer'] = scraper_url
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
        k = ""
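The widened randint range means the generated user agents now claim Chrome majors 120 through 142. A standalone sketch of the string being built:

# Standalone sketch of the randomized user-agent string.
import random

n = random.randint(120, 142)
ua = (f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      f'(KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36')
print(ua)  # e.g. Mozilla/5.0 (...) Chrome/131.0.2042.97 Safari/537.36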
@@ -245,7 +253,8 @@ class Requests_param_val(BaseUtils):
        if ingress:
-            if ("Page Not Found" in resp.text) or ("We are sorry! This Gift Card is not available" in resp.text) or ("500 - An error occurred" in resp.text) or ("Sorry! Something went wrong!" in resp.text):
+            if ("Page Not Found" in resp.text) or ("We are sorry! This Gift Card is not available" in resp.text) or (
+                    "500 - An error occurred" in resp.text) or ("Sorry! Something went wrong!" in resp.text):
                return None
            else:
                return resp.text
@@ -276,7 +285,6 @@ class Requests_param_val(BaseUtils):
    def get_cookie_str(self, cookies_queue):
        while 1:
            cookie_str = cookies_queue.get()
            if len(cookie_str) > 50:
                try:
                    cookie_lsit = json.loads(cookie_str)
                except:
@@ -297,8 +305,13 @@ class Requests_param_val(BaseUtils):
                for k, v in cookie_lsit.items():
                    cookie_str = cookie_str + str(k) + '=' + str(v) + ';'
                break
            else:
                break
+        if self.site_name == 'uk':
+            cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=GBP;').replace('i18n-prefs=USD;', 'i18n-prefs=GBP;')
+        elif self.site_name == 'de':
+            cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=EUR;').replace('i18n-prefs=USD;', 'i18n-prefs=EUR;')
+        elif self.site_name == 'us':
+            cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=USD;')
        return cookie_str

    # Get the auto-increment id range: for the given site, read the ids from the matching monthly/weekly syn table.
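The added branch normalizes the i18n-prefs currency cookie to the target marketplace before the cookie string is returned; a tiny worked example with an invented cookie string:

# Worked example of the currency-cookie rewrite (cookie string is made up).
cookie_str = 'session-id=123-4567;i18n-prefs=HKD;ubid-main=abc;'
site_name = 'uk'
if site_name == 'uk':
    cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=GBP;').replace('i18n-prefs=USD;', 'i18n-prefs=GBP;')
print(cookie_str)  # session-id=123-4567;i18n-prefs=GBP;ubid-main=abc;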
@@ -312,10 +325,10 @@ class Requests_param_val(BaseUtils):
        with self.engine.begin() as conn:
            if self.site_name in ('us', 'de', 'uk'):
                if state == 3 and minid_maxid:
-                    sql_update = f"""UPDATE {self.site_name}_syn_asin_all_minid_maxid set state=3 where minid_maxid='{minid_maxid}' and yaer_month = '2025_{month}'"""
+                    sql_update = f"""UPDATE {self.site_name}_syn_asin_all_minid_maxid set state=3 where minid_maxid='{minid_maxid}' and yaer_month = '2026_{month}'"""
                    print(sql_update)
                    conn.execute(sql_update)
-                sql_read = f"""SELECT id, minid_maxid FROM {self.site_name}_syn_asin_all_minid_maxid WHERE STATE = 1 and yaer_month = '2025_{month}' LIMIT 1"""
+                sql_read = f"""SELECT id, minid_maxid FROM {self.site_name}_syn_asin_all_minid_maxid WHERE STATE = 1 and yaer_month = '2026_{month}' LIMIT 1"""
                print('sql_read:::', sql_read)
            else:
                if state == 2 and minid_maxid:
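The year bump changes the rendered SQL; for a sample call this is what the update statement expands to (identifiers and values invented; 'yaer_month' is the column's actual spelling in the source schema):

# Rendered form of the updated f-string SQL (values are made up).
site_name, month, minid_maxid = 'us', '07', '1000_2000'
sql_update = (f"UPDATE {site_name}_syn_asin_all_minid_maxid set state=3 "
              f"where minid_maxid='{minid_maxid}' and yaer_month = '2026_{month}'")
print(sql_update)
# UPDATE us_syn_asin_all_minid_maxid set state=3 where minid_maxid='1000_2000' and yaer_month = '2026_07'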
@@ -348,5 +361,35 @@ class Requests_param_val(BaseUtils):
        # get the hexadecimal representation of the hash
        md5_hex_digest = md5_hash.hexdigest()
        return md5_hex_digest

-if __name__ == '__main__':
-    Requests_param_val(site_name='uk').get_minid_maxid(month='07', state=1)
+    def on_send_success(self, record_metadata):
+        print(f"message sent successfully: {record_metadata.topic}-{record_metadata.partition}-{record_metadata.offset}")
+
+    def on_send_error(self, excp):
+        print("message send failed", excp)
+
+    def send_kafka(self, items=None, html_data=None, topic=None, num=3):
+        print('sending data to Kafka')
+        for i in range(5):
+            try:
+                if items:
+                    del items['div_id_list']
+                    future = self.kafuka_producer_str.send(topic, json.dumps(items))
+                    future.add_callback(self.on_send_success).add_errback(self.on_send_error)
+                    future.get(30)
+                if html_data:
+                    future = self.kafuka_producer_str.send(topic, html_data)
+                    future.add_callback(self.on_send_success).add_errback(self.on_send_error)
+                    future.get(30)
+                print('data sent to Kafka successfully')
+                with self.next_page_lock:
+                    self.headers_num_int_s += 1
+                    if self.headers_num_int_s % 10 == 0:
+                        self.kafuka_producer_str.flush()
+                break
+            except Exception as e:
+                print(f"kafka send failed (attempt {i + 1}/5)", e)
+                time.sleep(2)
+                if i >= 1 and i % 2 == 1:
+                    self.kafuka_producer_str = self.kafuka_connect(acks=True)
\ No newline at end of file
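The new send_kafka method relies on kafka-python's future-based producer: send() returns a future, the chained add_callback/add_errback hooks report delivery, and future.get(timeout) blocks until the broker acks, making each send effectively synchronous. A minimal standalone sketch of that pattern; the broker address and topic are invented:

# Minimal kafka-python send-with-callbacks sketch (broker and topic are invented).
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')

def on_success(md):
    print(f"delivered: {md.topic}-{md.partition}-{md.offset}")

def on_error(exc):
    print("delivery failed:", exc)

future = producer.send('demo-topic', b'payload')
future.add_callback(on_success).add_errback(on_error)
future.get(timeout=30)  # block until acked (or raise on failure)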