Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
4825673f
Commit
4825673f
authored
Mar 17, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
解决多个任务操作队列读取操作,避免某个线程被挂起。通过ai进行优化整体代码,多线程相互争夺资源的控制
parent
0d70b338
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
350 additions
and
336 deletions
+350
-336
asin_detail_pg.py
py_spider/amazon_spider/asin_detail_pg.py
+350
-336
No files found.
py_spider/amazon_spider/asin_detail_pg.py
View file @
4825673f
...
...
@@ -2,14 +2,14 @@ import sys
import
os
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
# 上级目录
from
amazon_params
import
py_ja3
#
from amazon_params import py_ja3
from
amazon_save_db.save_asin_detail_pg
import
Save_asin_detail
from
utils.asin_parse
import
ParseAsinUs
from
queue
import
Queue
from
amazon_params
import
py_ja3
from
queue
import
Queue
,
Empty
import
time
import
re
from
lxml
import
etree
import
requests
import
urllib3
import
threading
from
func_timeout.exceptions
import
FunctionTimedOut
...
...
@@ -17,7 +17,8 @@ import traceback
from
datetime
import
datetime
import
gzip
import
json
# from curl_cffi import requests as curl
from
curl_cffi
import
requests
# import requests as requests2
from
kafka.errors
import
KafkaTimeoutError
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
urllib3
.
disable_warnings
()
...
...
@@ -42,6 +43,7 @@ class async_asin_pg():
self
.
asin_not_div_id_dp_list
=
[]
# 返回html没有包含div @id=dp,状态13
self
.
asin_list_update
=
[]
# 3
self
.
cookies_queue
=
Queue
()
# cookie队列
self
.
cookie_refill_lock
=
threading
.
Lock
()
# cookie重填锁
self
.
item_queue
=
Queue
()
# 存储 item 详情数据队列
self
.
queries_asin_queue
=
Queue
()
# 需要爬取的asin队列
self
.
buyBox_list
=
[]
# 卖家名称 url 列表
...
...
@@ -70,356 +72,368 @@ class async_asin_pg():
self
.
topic_detail_month
=
f
'{self.site_name}_asin_detail_month_2026_{self.month_}'
self
.
topic_asin_html
=
f
'asin_html_2026_{self.month_}'
self
.
asin_video_list
=
[]
# 修复:sess 改为类成员变量,只 mount 一次
self
.
sess
=
requests
.
Session
()
self
.
sess
.
mount
(
self
.
site_url
,
py_ja3
.
DESAdapter
())
def
get_asin
(
self
):
while
True
:
if
not
self
.
queries_asin_queue
.
empty
():
querys
=
self
.
queries_asin_queue
.
get
()
try
:
querys
=
self
.
queries_asin_queue
.
get_nowait
()
except
Empty
:
print
(
f
"当前线程-已完成-爬取-跳出循环"
)
break
with
self
.
cookie_refill_lock
:
if
self
.
cookies_queue
.
empty
():
cookies_dict
=
self
.
reuests_para_val
.
get_cookie
()
self
.
cookie_dict_delete_id
=
cookies_dict
for
ck
in
cookies_dict
.
values
():
self
.
cookies_queue
.
put
(
ck
)
# 获取组装cookie
cookie_str
=
self
.
reuests_para_val
.
get_cookie_str
(
self
.
cookies_queue
)
query
=
str
(
querys
)
.
split
(
'|'
)
is_variat
=
query
[
2
]
asin
=
query
[
0
]
date_info
=
query
[
1
]
data_type_asin
=
int
(
query
[
3
])
volume_str
=
query
[
4
]
weight_str
=
query
[
5
]
headers
=
self
.
reuests_para_val
.
requests_amazon_headers
(
host
=
self
.
host
,
site_url
=
self
.
site_url
,
asin
=
asin
,
scraper_url
=
None
)
headers
[
"cookie"
]
=
cookie_str
self
.
month_
=
date_info
.
split
(
'-'
)[
1
]
if
self
.
headers_num_int
>
20
:
# 亚马逊出现超过20次ip已经被封锁。退出抓取切换ip。
break
if
is_variat
==
'1'
:
scraper_url
=
self
.
site_url
+
'dp/'
+
query
[
0
]
+
"?th=1&psc=1"
else
:
scraper_url
=
self
.
site_url
+
'dp/'
+
query
[
0
]
+
'?th=1'
self
.
request_total_count_list
.
append
(
4
)
print
(
'scraper_url::'
,
scraper_url
)
try
:
resp
=
self
.
sess
.
get
(
scraper_url
,
headers
=
headers
,
timeout
=
10
,
verify
=
False
)
# with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if
self
.
reuests_para_val
.
check_amazon_yzm
(
resp
):
self
.
yzm_err_total_list
.
append
(
1
)
self
.
headers_num_int
+=
1
self
.
requests_error_asin_list
.
append
(
query
[
0
])
continue
except
Exception
as
e
:
self
.
asin_request_errp_total_list
.
append
(
2
)
print
(
"请求错误错误: "
,
e
)
if
'Received response with content-encoding: gzip'
in
str
(
e
):
self
.
asin_not_found_list
.
append
(
asin
)
else
:
self
.
requests_error_asin_list
.
append
(
query
[
0
])
# 获取组装cookie
cookie_str
=
self
.
reuests_para_val
.
get_cookie_str
(
self
.
cookies_queue
)
query
=
str
(
querys
)
.
split
(
'|'
)
is_variat
=
query
[
2
]
asin
=
query
[
0
]
date_info
=
query
[
1
]
data_type_asin
=
int
(
query
[
3
])
volume_str
=
query
[
4
]
weight_str
=
query
[
5
]
headers
=
self
.
reuests_para_val
.
requests_amazon_headers
(
host
=
self
.
host
,
site_url
=
self
.
site_url
,
asin
=
asin
,
scraper_url
=
None
)
headers
[
"cookie"
]
=
cookie_str
self
.
month_
=
date_info
.
split
(
'-'
)[
1
]
if
self
.
headers_num_int
>
20
:
# 亚马逊出现超过20次ip已经被封锁。退出抓取切换ip。
break
if
is_variat
==
'1'
:
scraper_url
=
self
.
site_url
+
'dp/'
+
query
[
0
]
+
"?th=1&psc=1"
else
:
scraper_url
=
self
.
site_url
+
'dp/'
+
query
[
0
]
+
'?th=1'
self
.
request_total_count_list
.
append
(
4
)
print
(
'scraper_url::'
,
scraper_url
)
try
:
# sess = requests2.Session()
# sess.mount(self.site_url, py_ja3.DESAdapter())
# resp = requests.get(scraper_url, headers=headers,
# timeout=20)
resp
=
requests
.
get
(
scraper_url
,
headers
=
headers
,
timeout
=
30
,
verify
=
False
,
impersonate
=
"chrome"
)
# with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if
self
.
reuests_para_val
.
check_amazon_yzm
(
resp
):
print
(
'出现验证码::'
,
'#'
*
80
)
self
.
yzm_err_total_list
.
append
(
1
)
self
.
headers_num_int
+=
1
self
.
requests_error_asin_list
.
append
(
query
[
0
])
continue
response_url
=
resp
.
url
response
=
resp
.
text
response_s
=
etree
.
HTML
(
response
)
self
.
success_asin_total_list
.
append
(
3
)
if
self
.
reuests_para_val
.
check_amazon_not_page
(
response
):
except
Exception
as
e
:
self
.
asin_request_errp_total_list
.
append
(
2
)
print
(
"请求错误错误: "
,
e
)
if
'Received response with content-encoding: gzip'
in
str
(
e
):
self
.
asin_not_found_list
.
append
(
asin
)
continue
if
self
.
reuests_para_val
.
check_amazon_page
(
response
,
response_url
):
self
.
asin_not_redirect_list
.
append
(
asin
)
continue
if
self
.
reuests_para_val
.
check_amazon_allow_redirects
(
response_url
,
asin
):
self
.
asin_not_redirect_list
.
append
(
asin
)
continue
# 获取邮编
try
:
ingress
=
response_s
.
xpath
(
"//span[@id='glow-ingress-line2']/text()"
)
except
Exception
as
e
:
self
.
asin_not_response_list
.
append
(
asin
)
continue
try
:
ingress
=
ingress
[
0
]
.
strip
()
except
:
ingress
=
None
print
(
ingress
,
' 打印 邮编 '
,
resp
.
url
)
if
ingress
:
if
self
.
reuests_para_val
.
check_amazon_ingress
(
ingress
):
try
:
cookie_ubid_main_id
=
re
.
findall
(
r'ubid-main=(.*?);'
,
cookie_str
)[
0
]
except
:
cookie_ubid_main_id
=
re
.
findall
(
r'session-id=(.*?);'
,
cookie_str
)[
0
]
for
cookie_key_value
in
self
.
cookie_dict_delete_id
.
items
():
if
cookie_ubid_main_id
in
cookie_key_value
[
1
]:
self
.
delete_cookies_list
.
append
(
cookie_key_value
[
0
])
self
.
requests_error_asin_list
.
append
(
asin
)
continue
div_dp
=
response_s
.
xpath
(
'//div[@id="dp"]'
)
if
div_dp
:
# 解析resp=_response_text, asin=asin
items
=
ParseAsinUs
(
resp
=
response
,
asin
=
asin
,
month
=
self
.
month_
,
date_info
=
date_info
,
site_name
=
self
.
site_name
)
.
xpath_html
()
new_date
=
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
item
=
{
'asin'
:
items
[
"asin"
],
'week'
:
items
[
"week"
],
'month'
:
items
[
"month"
],
'title'
:
items
[
"title"
],
'img_url'
:
items
[
"img_url"
],
'rating'
:
items
[
"rating"
],
'total_comments'
:
items
[
"total_comments"
],
'price'
:
items
[
"price"
],
"rank"
:
items
[
"rank"
],
'category'
:
items
[
"category"
],
'launch_time'
:
items
[
"launch_time"
],
'volume'
:
items
[
"volume"
],
'weight'
:
items
[
"weight"
],
"page_inventory"
:
items
[
"page_inventory"
],
"buy_box_seller_type"
:
items
[
"buy_box_seller_type"
],
"asin_vartion_list"
:
items
[
"asin_vartion_list"
],
'title_len'
:
items
[
"title_len"
],
'img_num'
:
items
[
"img_num"
],
'img_type'
:
items
[
"img_type"
],
'activity_type'
:
items
[
"activity_type"
],
'one_two_val'
:
items
[
"one_two_val"
],
'three_four_val'
:
items
[
"three_four_val"
],
'eight_val'
:
items
[
"eight_val"
],
'qa_num'
:
items
[
"qa_num"
],
'five_star'
:
items
[
"five_star"
],
'four_star'
:
items
[
"four_star"
],
'three_star'
:
items
[
"three_star"
],
'two_star'
:
items
[
"two_star"
],
'one_star'
:
items
[
"one_star"
],
'low_star'
:
items
[
"low_star"
],
'together_asin'
:
items
[
"together_asin"
],
'brand'
:
items
[
"brand"
],
'ac_name'
:
items
[
"ac_name"
],
'material'
:
items
[
"material"
],
'node_id'
:
items
[
"node_id"
],
'data_type'
:
data_type_asin
,
'sp_num'
:
items
[
"sp_num"
],
'describe'
:
items
[
"describe"
],
'date_info'
:
date_info
,
'weight_str'
:
items
[
"weight_str"
],
'package_quantity'
:
items
[
'package_quantity'
],
'pattern_name'
:
items
[
'pattern_name'
],
'seller_id'
:
items
[
"seller_id"
],
'variat_num'
:
items
[
'variat_num'
],
'site_name'
:
self
.
site_name
,
'best_sellers_rank'
:
items
[
"best_sellers_rank"
],
'best_sellers_herf'
:
items
[
"best_sellers_herf"
],
'account_url'
:
items
[
"account_url"
],
'account_name'
:
items
[
"account_name"
],
'parentAsin'
:
items
[
"parentAsin"
],
'asinUpdateTime'
:
new_date
,
'follow_sellers'
:
items
[
'sellers_num'
],
'spider_int'
:
self
.
spider_int
,
'all_best_sellers_herf'
:
items
[
'all_best_sellers_herf'
],
'product_description'
:
items
[
'product_description'
],
'buy_sales'
:
items
[
'buySales'
],
'image_view'
:
items
[
'image_view'
],
'product_json'
:
items
[
'product_json'
],
'product_detail_json'
:
items
[
'productdetail_json'
],
'review_ai_text'
:
items
[
'review_ai_text'
],
'review_label_json'
:
items
[
'review_label_json'
],
'lob_asin_json'
:
items
[
'lob_asin_json'
],
'sp_initial_seen_asins_json'
:
items
[
'sp_initial_seen_asins_json'
],
'sp_4stars_initial_seen_asins_json'
:
items
[
'sp_4stars_initial_seen_asins_json'
],
'sp_delivery_initial_seen_asins_json'
:
items
[
'sp_delivery_initial_seen_asins_json'
],
'compare_similar_asin_json'
:
items
[
'compare_similar_asin_json'
],
'customer_reviews_json'
:
items
[
'customer_reviews_json'
],
'together_asin_json'
:
items
[
'together_asin_json'
],
'min_match_asin_json'
:
items
[
'min_match_asin_json'
],
'seller_json'
:
items
[
'seller_json'
],
'created_time'
:
new_date
,
'current_asin'
:
items
[
'current_asin'
],
'parent_asin'
:
items
[
"parentAsin"
],
'div_id_list'
:
items
[
'div_id_list'
],
'bundles_this_asins_json'
:
items
[
'bundles_this_asins_data_json'
],
'video_m3u8_url'
:
items
[
"video_m3u8"
],
'result_list_json'
:
items
[
'result_list_json'
],
'bundle_asin_component_json'
:
items
[
'bundle_asin_component_json'
],
'review_json_list'
:
items
[
'review_json_list'
],
'fbm_delivery_price'
:
items
[
'fbm_delivery_price'
]
}
if
self
.
site_name
in
[
'uk'
,
'de'
,
'fr'
,
'es'
,
'it'
]:
item
[
'five_six_val'
]
=
items
[
'five_six_val'
]
else
:
print
(
'2233请求错误错误::'
,
'#'
*
80
)
self
.
requests_error_asin_list
.
append
(
query
[
0
])
continue
response_url
=
resp
.
url
response
=
resp
.
text
response_s
=
etree
.
HTML
(
response
)
self
.
success_asin_total_list
.
append
(
3
)
if
self
.
reuests_para_val
.
check_amazon_not_page
(
response
):
self
.
asin_not_found_list
.
append
(
asin
)
continue
if
self
.
reuests_para_val
.
check_amazon_page
(
response
,
response_url
):
self
.
asin_not_redirect_list
.
append
(
asin
)
continue
if
self
.
reuests_para_val
.
check_amazon_allow_redirects
(
response_url
,
asin
):
self
.
asin_not_redirect_list
.
append
(
asin
)
continue
# 获取邮编
try
:
ingress
=
response_s
.
xpath
(
"//span[@id='glow-ingress-line2']/text()"
)
except
Exception
as
e
:
self
.
asin_not_response_list
.
append
(
asin
)
continue
try
:
ingress
=
ingress
[
0
]
.
strip
()
except
:
ingress
=
None
print
(
ingress
,
' 打印 邮编 '
,
resp
.
url
)
if
ingress
:
if
self
.
reuests_para_val
.
check_amazon_ingress
(
ingress
):
ubid_list
=
re
.
findall
(
r'ubid-main=(.*?);'
,
cookie_str
)
if
ubid_list
:
cookie_ubid_main_id
=
ubid_list
[
0
]
else
:
item
[
'five_six_val'
]
=
None
# 第二次请求
_response_text
=
None
if
item
[
'variat_num'
]
>
0
and
is_variat
==
'0'
:
self
.
request_total_count_list
.
append
(
4
)
_url
=
self
.
site_url
+
'dp/'
+
asin
+
"?th=1&psc=1"
print
(
'第二次请求:'
,
_url
)
try
:
_response_text
=
None
_response_text
=
self
.
reuests_para_val
.
requests_amazon
(
headers
=
headers
,
scraper_url
=
_url
)
if
_response_text
:
_items
=
ParseAsinUs
(
resp
=
_response_text
,
asin
=
asin
,
month
=
self
.
month_
,
date_info
=
date_info
,
site_name
=
self
.
site_name
)
.
xpath_html
()
if
_items
[
"volume"
]
and
item
[
'volume'
]
is
None
:
item
[
'volume'
]
=
_items
[
"volume"
]
if
_items
[
'result_list_json'
]
and
item
[
'result_list_json'
]
is
None
:
item
[
'result_list_json'
]
=
_items
[
"result_list_json"
]
if
_items
[
"weight_str"
]
and
item
[
'weight_str'
]
is
None
:
item
[
'weight_str'
]
=
_items
[
"weight_str"
]
if
_items
[
"weight"
]
and
item
[
'weight'
]
is
None
:
item
[
'weight'
]
=
_items
[
"weight"
]
if
item
[
"rank"
]
is
None
:
item
[
"rank"
]
=
_items
[
"rank"
]
if
item
[
"launch_time"
]
is
None
:
item
[
"launch_time"
]
=
_items
[
"launch_time"
]
if
item
[
'product_description'
]
is
None
:
item
[
'product_description'
]
=
_items
[
"product_description"
]
if
item
[
"price"
]
is
None
:
item
[
"price"
]
=
_items
[
"price"
]
elif
item
[
"price"
]
<
1
:
item
[
"price"
]
=
_items
[
"price"
]
if
item
[
"buy_sales"
]
is
None
:
item
[
"buy_sales"
]
=
_items
[
"buySales"
]
if
item
[
'buy_box_seller_type'
]
is
None
or
item
[
'buy_box_seller_type'
]
==
4
:
item
[
"buy_box_seller_type"
]
=
_items
[
"buy_box_seller_type"
]
if
item
[
'page_inventory'
]
==
0
or
item
[
'page_inventory'
]
==
3
:
item
[
"page_inventory"
]
=
_items
[
"page_inventory"
]
if
item
[
'account_name'
]
is
None
:
item
[
"account_name"
]
=
_items
[
"account_name"
]
if
item
[
'seller_id'
]
is
None
:
item
[
"seller_id"
]
=
_items
[
"seller_id"
]
if
item
[
'seller_json'
]
is
None
:
item
[
"seller_json"
]
=
_items
[
"seller_json"
]
if
item
[
'five_star'
]
is
None
:
item
[
'five_star'
]
=
_items
[
"five_star"
]
if
item
[
'four_star'
]
is
None
:
item
[
'four_star'
]
=
_items
[
"four_star"
]
# 修复:删除重复的 four_star 检查
if
item
[
'two_star'
]
is
None
:
item
[
'two_star'
]
=
_items
[
"two_star"
]
if
item
[
'one_star'
]
is
None
:
item
[
'one_star'
]
=
_items
[
"one_star"
]
if
item
[
'low_star'
]
is
None
:
item
[
'low_star'
]
=
_items
[
"low_star"
]
if
item
[
'category'
]
is
None
:
item
[
'category'
]
=
_items
[
"category"
]
if
item
[
'node_id'
]
is
None
:
item
[
'node_id'
]
=
_items
[
"node_id"
]
if
item
[
'review_json_list'
]
is
None
:
item
[
'review_json_list'
]
=
_items
[
"review_json_list"
]
if
item
[
'fbm_delivery_price'
]
is
None
:
item
[
'fbm_delivery_price'
]
=
_items
[
"fbm_delivery_price"
]
if
item
[
'review_ai_text'
]
is
None
:
item
[
'review_ai_text'
]
=
_items
[
"review_ai_text"
]
except
:
pass
_response_text_var
=
None
if
item
[
"buy_box_seller_type"
]
==
4
and
item
[
'page_inventory'
]
==
3
and
item
[
'variat_num'
]
>
0
and
\
items
[
"asin_variation_list"
]:
self
.
request_total_count_list
.
append
(
4
)
try
:
if
asin
!=
items
[
"asin_variation_list"
][
0
][
0
]:
_to_asin
=
items
[
"asin_variation_list"
][
0
][
0
]
elif
len
(
items
[
"asin_variation_list"
])
>
1
:
_to_asin
=
items
[
"asin_variation_list"
][
1
][
0
]
else
:
_to_asin
=
item
[
'parentAsin'
]
_url
=
self
.
site_url
+
'dp/'
+
_to_asin
+
"?th=1&psc=1"
print
(
'请求asin 出现缺货,拿变体asin进行请求:'
,
_url
)
_response_text_var
=
self
.
reuests_para_val
.
requests_amazon
(
headers
=
headers
,
scraper_url
=
_url
)
_to_items
=
ParseAsinUs
(
resp
=
_response_text_var
,
asin
=
asin
,
month
=
self
.
month_
,
date_info
=
date_info
,
site_name
=
self
.
site_name
)
.
xpath_html
()
session_list
=
re
.
findall
(
r'session-id=(.*?);'
,
cookie_str
)
cookie_ubid_main_id
=
session_list
[
0
]
if
session_list
else
None
for
cookie_key_value
in
self
.
cookie_dict_delete_id
.
items
():
if
cookie_ubid_main_id
in
cookie_key_value
[
1
]:
self
.
delete_cookies_list
.
append
(
cookie_key_value
[
0
])
print
(
ingress
,
'邮编 错误 ::'
,
'#'
*
80
)
self
.
requests_error_asin_list
.
append
(
asin
)
continue
else
:
self
.
requests_error_asin_list
.
append
(
asin
)
continue
div_dp
=
response_s
.
xpath
(
'//div[@id="dp"]'
)
if
div_dp
:
items
=
ParseAsinUs
(
resp
=
response
,
asin
=
asin
,
month
=
self
.
month_
,
date_info
=
date_info
,
site_name
=
self
.
site_name
)
.
xpath_html
()
new_date
=
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
item
=
{
'asin'
:
items
[
"asin"
],
'week'
:
items
[
"week"
],
'month'
:
items
[
"month"
],
'title'
:
items
[
"title"
],
'img_url'
:
items
[
"img_url"
],
'rating'
:
items
[
"rating"
],
'total_comments'
:
items
[
"total_comments"
],
'price'
:
items
[
"price"
],
"rank"
:
items
[
"rank"
],
'category'
:
items
[
"category"
],
'launch_time'
:
items
[
"launch_time"
],
'volume'
:
items
[
"volume"
],
'weight'
:
items
[
"weight"
],
"page_inventory"
:
items
[
"page_inventory"
],
"buy_box_seller_type"
:
items
[
"buy_box_seller_type"
],
"asin_vartion_list"
:
items
[
"asin_vartion_list"
],
'title_len'
:
items
[
"title_len"
],
'img_num'
:
items
[
"img_num"
],
'img_type'
:
items
[
"img_type"
],
'activity_type'
:
items
[
"activity_type"
],
'one_two_val'
:
items
[
"one_two_val"
],
'three_four_val'
:
items
[
"three_four_val"
],
'eight_val'
:
items
[
"eight_val"
],
'qa_num'
:
items
[
"qa_num"
],
'five_star'
:
items
[
"five_star"
],
'four_star'
:
items
[
"four_star"
],
'three_star'
:
items
[
"three_star"
],
'two_star'
:
items
[
"two_star"
],
'one_star'
:
items
[
"one_star"
],
'low_star'
:
items
[
"low_star"
],
'together_asin'
:
items
[
"together_asin"
],
'brand'
:
items
[
"brand"
],
'ac_name'
:
items
[
"ac_name"
],
'material'
:
items
[
"material"
],
'node_id'
:
items
[
"node_id"
],
'data_type'
:
data_type_asin
,
'sp_num'
:
items
[
"sp_num"
],
'describe'
:
items
[
"describe"
],
'date_info'
:
date_info
,
'weight_str'
:
items
[
"weight_str"
],
'package_quantity'
:
items
[
'package_quantity'
],
'pattern_name'
:
items
[
'pattern_name'
],
'seller_id'
:
items
[
"seller_id"
],
'variat_num'
:
items
[
'variat_num'
],
'site_name'
:
self
.
site_name
,
'best_sellers_rank'
:
items
[
"best_sellers_rank"
],
'best_sellers_herf'
:
items
[
"best_sellers_herf"
],
'account_url'
:
items
[
"account_url"
],
'account_name'
:
items
[
"account_name"
],
'parentAsin'
:
items
[
"parentAsin"
],
'asinUpdateTime'
:
new_date
,
'follow_sellers'
:
items
[
'sellers_num'
],
'spider_int'
:
self
.
spider_int
,
'all_best_sellers_herf'
:
items
[
'all_best_sellers_herf'
],
'product_description'
:
items
[
'product_description'
],
'buy_sales'
:
items
[
'buySales'
],
'image_view'
:
items
[
'image_view'
],
'product_json'
:
items
[
'product_json'
],
'product_detail_json'
:
items
[
'productdetail_json'
],
'review_ai_text'
:
items
[
'review_ai_text'
],
'review_label_json'
:
items
[
'review_label_json'
],
'lob_asin_json'
:
items
[
'lob_asin_json'
],
'sp_initial_seen_asins_json'
:
items
[
'sp_initial_seen_asins_json'
],
'sp_4stars_initial_seen_asins_json'
:
items
[
'sp_4stars_initial_seen_asins_json'
],
'sp_delivery_initial_seen_asins_json'
:
items
[
'sp_delivery_initial_seen_asins_json'
],
'compare_similar_asin_json'
:
items
[
'compare_similar_asin_json'
],
'customer_reviews_json'
:
items
[
'customer_reviews_json'
],
'together_asin_json'
:
items
[
'together_asin_json'
],
'min_match_asin_json'
:
items
[
'min_match_asin_json'
],
'seller_json'
:
items
[
'seller_json'
],
'created_time'
:
new_date
,
'current_asin'
:
items
[
'current_asin'
],
'parent_asin'
:
items
[
"parentAsin"
],
'div_id_list'
:
items
[
'div_id_list'
],
'bundles_this_asins_json'
:
items
[
'bundles_this_asins_data_json'
],
'video_m3u8_url'
:
items
[
"video_m3u8"
],
'result_list_json'
:
items
[
'result_list_json'
],
'bundle_asin_component_json'
:
items
[
'bundle_asin_component_json'
],
'review_json_list'
:
items
[
'review_json_list'
],
'fbm_delivery_price'
:
items
[
'fbm_delivery_price'
]
}
if
self
.
site_name
in
[
'uk'
,
'de'
,
'fr'
,
'es'
,
'it'
]:
item
[
'five_six_val'
]
=
items
[
'five_six_val'
]
else
:
item
[
'five_six_val'
]
=
None
# 第二次请求
_response_text
=
None
if
item
[
'variat_num'
]
>
0
and
is_variat
==
'0'
:
self
.
request_total_count_list
.
append
(
4
)
_url
=
self
.
site_url
+
'dp/'
+
asin
+
"?th=1&psc=1"
print
(
'第二次请求:'
,
_url
)
try
:
_response_text
=
None
_response_text
=
self
.
reuests_para_val
.
requests_amazon
(
headers
=
headers
,
scraper_url
=
_url
,
sess
=
None
)
if
_response_text
:
_items
=
ParseAsinUs
(
resp
=
_response_text
,
asin
=
asin
,
month
=
self
.
month_
,
date_info
=
date_info
,
site_name
=
self
.
site_name
)
.
xpath_html
()
if
_items
[
"volume"
]
and
item
[
'volume'
]
is
None
:
item
[
'volume'
]
=
_items
[
"volume"
]
if
_items
[
'result_list_json'
]
and
item
[
'result_list_json'
]
is
None
:
item
[
'result_list_json'
]
=
_items
[
"result_list_json"
]
if
_items
[
"weight_str"
]
and
item
[
'weight_str'
]
is
None
:
item
[
'weight_str'
]
=
_items
[
"weight_str"
]
if
_items
[
"weight"
]
and
item
[
'weight'
]
is
None
:
item
[
'weight'
]
=
_items
[
"weight"
]
if
item
[
"rank"
]
is
None
:
item
[
"rank"
]
=
_items
[
"rank"
]
if
item
[
"launch_time"
]
is
None
:
item
[
"launch_time"
]
=
_items
[
"launch_time"
]
if
item
[
'product_description'
]
is
None
:
item
[
'product_description'
]
=
_items
[
"product_description"
]
if
item
[
"price"
]
is
None
:
item
[
"price"
]
=
_items
[
"price"
]
elif
item
[
"price"
]
<
1
:
item
[
"price"
]
=
_items
[
"price"
]
if
item
[
"buy_sales"
]
is
None
:
item
[
"buy_sales"
]
=
_items
[
"buySales"
]
if
item
[
'buy_box_seller_type'
]
is
None
or
item
[
'buy_box_seller_type'
]
==
4
:
item
[
"buy_box_seller_type"
]
=
_to_items
[
"buy_box_seller_type"
]
item
[
"buy_box_seller_type"
]
=
_items
[
"buy_box_seller_type"
]
if
item
[
'page_inventory'
]
==
0
or
item
[
'page_inventory'
]
==
3
:
item
[
"page_inventory"
]
=
_items
[
"page_inventory"
]
if
item
[
'account_name'
]
is
None
:
item
[
"account_name"
]
=
_
to_
items
[
"account_name"
]
item
[
"account_name"
]
=
_items
[
"account_name"
]
if
item
[
'seller_id'
]
is
None
:
item
[
"seller_id"
]
=
_
to_
items
[
"seller_id"
]
item
[
"seller_id"
]
=
_items
[
"seller_id"
]
if
item
[
'seller_json'
]
is
None
:
item
[
"seller_json"
]
=
_to_items
[
"seller_json"
]
except
Exception
as
e
:
print
(
'请求asin 第二次请求 报错:'
,
f
"
\n
{traceback.format_exc()}"
)
if
item
[
'volume'
]
is
None
and
volume_str
!=
'null'
:
item
[
'volume'
]
=
volume_str
if
item
[
'weight_str'
]
is
None
and
weight_str
!=
'null'
:
item
[
'weight_str'
]
=
weight_str
new_date_hour
=
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d:
%
H"
)
num
=
self
.
reuests_para_val
.
get_hour
(
new_date_hour
)
self
.
hour_total_count_list
.
append
(
num
)
print
(
"+++++++++++item['buy_sales']::"
,
item
[
'buy_sales'
])
if
not
item
[
'title'
]
or
not
item
[
'img_url'
]:
self
.
asin_not_sure_list
.
append
(
asin
)
continue
# 修复:检查 img_url 是否包含无效值
img_url_invalid
=
False
if
item
[
'img_url'
]
and
len
(
item
[
'img_url'
]
.
strip
())
>
2
:
for
key
in
[
'None'
,
'null'
,
'none'
]:
if
key
in
item
[
'img_url'
]:
img_url_invalid
=
True
break
if
img_url_invalid
:
self
.
asin_not_sure_list
.
append
(
asin
)
continue
# 修复:检查 title 是否包含无效值
title_invalid
=
False
if
item
[
'title'
]
and
len
(
item
[
'title'
]
.
strip
())
>
2
:
for
key
in
[
'None'
,
'null'
,
'none'
]:
if
key
in
item
[
'title'
]:
title_invalid
=
True
break
if
title_invalid
:
self
.
asin_not_sure_list
.
append
(
asin
)
continue
print
(
'itemitem:::'
,
item
)
# 上架时间 排名 重量 底部信息 如果都为None 重新抓取
if
item
[
"launch_time"
]
is
None
and
item
[
"rank"
]
is
None
and
item
[
'weight'
]
is
None
and
item
[
'product_detail_json'
]
is
None
and
len
(
items
[
'div_id_list'
])
<
1
:
print
(
'上架时间 排名 重量 底部信息 如果都为None 重新抓取:::'
,
asin
)
self
.
requests_error_asin_list
.
append
(
asin
)
continue
if
(
self
.
reuests_para_val
.
check_contain_chinese
(
item
[
'title'
]))
or
(
self
.
reuests_para_val
.
check_contain_chinese
(
item
[
'category'
])):
self
.
asin_not_sure_list
.
append
(
asin
)
continue
if
items
[
"buyBox_list"
]:
self
.
buyBox_list
.
extend
(
items
[
"buyBox_list"
])
if
items
[
"buyBoxname_asin_list"
]:
self
.
buyBoxname_asin_list
.
extend
(
items
[
"buyBoxname_asin_list"
])
if
items
[
"bs_category_asin_list_pg"
]:
self
.
bs_category_asin_list_pg
.
extend
(
items
[
"bs_category_asin_list_pg"
])
if
items
[
"asin_variation_list"
]:
item
[
'variat_list'
]
=
json
.
dumps
(
items
[
"asin_variation_list"
])
# 变体
else
:
item
[
'variat_list'
]
=
None
item
[
'asin_vartion_list'
]
=
items
[
"asin_variation_list"
]
if
items
[
"all_img_video_list"
]:
item
[
'img_list'
]
=
json
.
dumps
(
items
[
"all_img_video_list"
])
else
:
item
[
'img_list'
]
=
None
if
item
[
'img_list'
]
is
None
:
item
[
'img_list'
]
=
[]
self
.
item_queue
.
put
(
item
)
# 获取字段值为None的字段名称写入redis进行统计
none_keys
=
[
key
for
key
,
value
in
item
.
items
()
if
(
value
is
None
)
or
(
value
==
-
1
and
key
==
'price'
)
or
(
value
==
0
and
key
in
[
'weight'
,
'total_comments'
,
'rating'
])]
for
key
in
[
'parentAsin'
,
'week'
,
'all_best_sellers_herf'
,
'best_sellers_rank'
,
'seller_id'
,
'account_url'
,
'product_json'
,
'product_detail_json'
,
'review_ai_text'
,
'lob_asin_json'
,
'sp_initial_seen_asins_json'
,
'sp_4stars_initial_seen_asins_json'
,
'sp_delivery_initial_seen_asins_json'
,
'compare_similar_asin_json'
,
'customer_reviews_json'
,
'together_asin_json'
,
'min_match_asin_json'
,
'product_description'
,
'variat_num'
,
'qa_num'
,
'asin_vartion_list'
,
'review_label_json'
,
'seller_json'
,
'current_asin'
,
'five_six_val'
,
'best_sellers_herf'
,
'bundles_this_asins_json'
]:
if
key
in
none_keys
:
none_keys
.
remove
(
key
)
log_time
=
time
.
strftime
(
'
%
Y-
%
m-
%
d'
,
time
.
localtime
(
time
.
time
()))
try
:
self
.
redis14
.
rpush
(
f
'{self.site_name}_{log_time}_asin_detail_is_none'
,
*
none_keys
)
item
[
"seller_json"
]
=
_items
[
"seller_json"
]
if
item
[
'five_star'
]
is
None
:
item
[
'five_star'
]
=
_items
[
"five_star"
]
if
item
[
'four_star'
]
is
None
:
item
[
'four_star'
]
=
_items
[
"four_star"
]
if
item
[
'three_star'
]
is
None
:
item
[
'three_star'
]
=
_items
[
"three_star"
]
if
item
[
'two_star'
]
is
None
:
item
[
'two_star'
]
=
_items
[
"two_star"
]
if
item
[
'one_star'
]
is
None
:
item
[
'one_star'
]
=
_items
[
"one_star"
]
if
item
[
'low_star'
]
is
None
:
item
[
'low_star'
]
=
_items
[
"low_star"
]
if
item
[
'category'
]
is
None
:
item
[
'category'
]
=
_items
[
"category"
]
if
item
[
'node_id'
]
is
None
:
item
[
'node_id'
]
=
_items
[
"node_id"
]
if
item
[
'review_json_list'
]
is
None
:
item
[
'review_json_list'
]
=
_items
[
"review_json_list"
]
if
item
[
'fbm_delivery_price'
]
is
None
:
item
[
'fbm_delivery_price'
]
=
_items
[
"fbm_delivery_price"
]
if
item
[
'review_ai_text'
]
is
None
:
item
[
'review_ai_text'
]
=
_items
[
"review_ai_text"
]
except
:
pass
self
.
reuests_para_val
.
send_kafka
(
items
=
item
,
topic
=
self
.
topic_detail_month
)
print
(
asin
,
'rank 排名:'
,
item
[
'rank'
])
if
item
[
'rank'
]
is
not
None
and
item
[
'rank'
]
<
9000
:
# requests_num 代表不同类型url请求返回的源码。
if
_response_text_var
:
# 请求asin 出现缺货,拿变体asin进行请求
requests_num
=
2
response_gzip
=
self
.
compress_string
(
_response_text_var
)
elif
_response_text
:
# 发现有变体。导入asin没有标记。重新请求第二次请求
requests_num
=
1
response_gzip
=
self
.
compress_string
(
_response_text
)
_response_text_var
=
None
if
item
[
"buy_box_seller_type"
]
==
4
and
item
[
'page_inventory'
]
==
3
and
item
[
'variat_num'
]
>
0
and
\
items
[
"asin_variation_list"
]:
self
.
request_total_count_list
.
append
(
4
)
try
:
if
asin
!=
items
[
"asin_variation_list"
][
0
][
0
]:
_to_asin
=
items
[
"asin_variation_list"
][
0
][
0
]
elif
len
(
items
[
"asin_variation_list"
])
>
1
:
_to_asin
=
items
[
"asin_variation_list"
][
1
][
0
]
else
:
requests_num
=
0
# 第一次请求返回源码
response_gzip
=
self
.
compress_string
(
response
)
html_data
=
f
'{self.site_name}|-||=|-|=||-|{asin}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{requests_num}'
self
.
reuests_para_val
.
send_kafka
(
html_data
=
html_data
,
topic
=
self
.
topic_asin_html
)
_to_asin
=
item
[
'parentAsin'
]
_url
=
self
.
site_url
+
'dp/'
+
_to_asin
+
"?th=1&psc=1"
print
(
'请求asin 出现缺货,拿变体asin进行请求:'
,
_url
)
_response_text_var
=
self
.
reuests_para_val
.
requests_amazon
(
headers
=
headers
,
scraper_url
=
_url
,
sess
=
None
)
_to_items
=
ParseAsinUs
(
resp
=
_response_text_var
,
asin
=
asin
,
month
=
self
.
month_
,
date_info
=
date_info
,
site_name
=
self
.
site_name
)
.
xpath_html
()
if
item
[
'buy_box_seller_type'
]
is
None
or
item
[
'buy_box_seller_type'
]
==
4
:
item
[
"buy_box_seller_type"
]
=
_to_items
[
"buy_box_seller_type"
]
if
item
[
'account_name'
]
is
None
:
item
[
"account_name"
]
=
_to_items
[
"account_name"
]
if
item
[
'seller_id'
]
is
None
:
item
[
"seller_id"
]
=
_to_items
[
"seller_id"
]
if
item
[
'seller_json'
]
is
None
:
item
[
"seller_json"
]
=
_to_items
[
"seller_json"
]
except
Exception
as
e
:
print
(
'请求asin 第二次请求 报错:'
,
f
"
\n
{traceback.format_exc()}"
)
if
item
[
'volume'
]
is
None
and
volume_str
!=
'null'
:
item
[
'volume'
]
=
volume_str
if
item
[
'weight_str'
]
is
None
and
weight_str
!=
'null'
:
item
[
'weight_str'
]
=
weight_str
new_date_hour
=
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d:
%
H"
)
num
=
self
.
reuests_para_val
.
get_hour
(
new_date_hour
)
self
.
hour_total_count_list
.
append
(
num
)
print
(
"+++++++++++item['buy_sales']::"
,
item
[
'buy_sales'
])
if
not
item
[
'title'
]
or
not
item
[
'img_url'
]:
self
.
asin_not_sure_list
.
append
(
asin
)
continue
# 修复:检查 img_url 是否包含无效值
img_url_invalid
=
False
if
item
[
'img_url'
]
and
len
(
item
[
'img_url'
]
.
strip
())
>
2
:
for
key
in
[
'None'
,
'null'
,
'none'
]:
if
key
in
item
[
'img_url'
]:
img_url_invalid
=
True
break
if
img_url_invalid
:
self
.
asin_not_sure_list
.
append
(
asin
)
continue
# 修复:检查 title 是否包含无效值
title_invalid
=
False
if
item
[
'title'
]
and
len
(
item
[
'title'
]
.
strip
())
>
2
:
for
key
in
[
'None'
,
'null'
,
'none'
]:
if
key
in
item
[
'title'
]:
title_invalid
=
True
break
if
title_invalid
:
self
.
asin_not_sure_list
.
append
(
asin
)
continue
print
(
'itemitem:::'
,
item
)
# 上架时间 排名 重量 底部信息 如果都为None 重新抓取
if
item
[
"launch_time"
]
is
None
and
item
[
"rank"
]
is
None
and
item
[
'weight'
]
is
None
and
item
[
'product_detail_json'
]
is
None
and
len
(
items
[
'div_id_list'
])
<
1
:
print
(
'上架时间 排名 重量 底部信息 如果都为None 重新抓取:::'
,
asin
)
print
(
ingress
,
'上架时间 排名 重量 底部信息 如果都为None ::'
,
'#'
*
80
)
self
.
requests_error_asin_list
.
append
(
asin
)
continue
if
(
self
.
reuests_para_val
.
check_contain_chinese
(
item
[
'title'
]))
or
(
self
.
reuests_para_val
.
check_contain_chinese
(
item
[
'category'
])):
self
.
asin_not_sure_list
.
append
(
asin
)
continue
if
items
[
"buyBox_list"
]:
self
.
buyBox_list
.
extend
(
items
[
"buyBox_list"
])
if
items
[
"buyBoxname_asin_list"
]:
self
.
buyBoxname_asin_list
.
extend
(
items
[
"buyBoxname_asin_list"
])
if
items
[
"bs_category_asin_list_pg"
]:
self
.
bs_category_asin_list_pg
.
extend
(
items
[
"bs_category_asin_list_pg"
])
if
items
[
"asin_variation_list"
]:
item
[
'variat_list'
]
=
json
.
dumps
(
items
[
"asin_variation_list"
])
# 变体
else
:
item
[
'variat_list'
]
=
None
item
[
'asin_vartion_list'
]
=
items
[
"asin_variation_list"
]
if
items
[
"all_img_video_list"
]:
item
[
'img_list'
]
=
json
.
dumps
(
items
[
"all_img_video_list"
])
else
:
if
'Click the button below to continue shopping'
in
response
:
self
.
requests_error_asin_list
.
append
(
query
[
0
])
item
[
'img_list'
]
=
None
if
item
[
'img_list'
]
is
None
:
item
[
'img_list'
]
=
json
.
dumps
([])
self
.
item_queue
.
put
(
item
)
# 获取字段值为None的字段名称写入redis进行统计
none_keys
=
[
key
for
key
,
value
in
item
.
items
()
if
(
value
is
None
)
or
(
value
==
-
1
and
key
==
'price'
)
or
(
value
==
0
and
key
in
[
'weight'
,
'total_comments'
,
'rating'
])]
for
key
in
[
'parentAsin'
,
'week'
,
'all_best_sellers_herf'
,
'best_sellers_rank'
,
'seller_id'
,
'account_url'
,
'product_json'
,
'product_detail_json'
,
'review_ai_text'
,
'lob_asin_json'
,
'sp_initial_seen_asins_json'
,
'sp_4stars_initial_seen_asins_json'
,
'sp_delivery_initial_seen_asins_json'
,
'compare_similar_asin_json'
,
'customer_reviews_json'
,
'together_asin_json'
,
'min_match_asin_json'
,
'product_description'
,
'variat_num'
,
'qa_num'
,
'asin_vartion_list'
,
'review_label_json'
,
'seller_json'
,
'current_asin'
,
'five_six_val'
,
'best_sellers_herf'
,
'bundles_this_asins_json'
]:
if
key
in
none_keys
:
none_keys
.
remove
(
key
)
log_time
=
time
.
strftime
(
'
%
Y-
%
m-
%
d'
,
time
.
localtime
(
time
.
time
()))
try
:
self
.
redis14
.
rpush
(
f
'{self.site_name}_{log_time}_asin_detail_is_none'
,
*
none_keys
)
except
:
pass
self
.
reuests_para_val
.
send_kafka
(
items
=
item
,
topic
=
self
.
topic_detail_month
)
print
(
asin
,
'rank 排名:'
,
item
[
'rank'
])
if
item
[
'rank'
]
is
not
None
and
item
[
'rank'
]
<
9000
:
# requests_num 代表不同类型url请求返回的源码。
if
_response_text_var
:
# 请求asin 出现缺货,拿变体asin进行请求
requests_num
=
2
response_gzip
=
self
.
compress_string
(
_response_text_var
)
elif
_response_text
:
# 发现有变体。导入asin没有标记。重新请求第二次请求
requests_num
=
1
response_gzip
=
self
.
compress_string
(
_response_text
)
else
:
print
(
'状态13'
,
asin
)
self
.
asin_not_div_id_dp_list
.
append
(
asin
)
continue
requests_num
=
0
# 第一次请求返回源码
response_gzip
=
self
.
compress_string
(
response
)
html_data
=
f
'{self.site_name}|-||=|-|=||-|{asin}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{requests_num}'
self
.
reuests_para_val
.
send_kafka
(
html_data
=
html_data
,
topic
=
self
.
topic_asin_html
)
else
:
print
(
f
"当前线程-已完成-爬取-跳出循环"
)
break
if
'Click the button below to continue shopping'
in
response
:
print
(
ingress
,
'Click the button below to continue shopping ::'
,
'#'
*
80
)
self
.
requests_error_asin_list
.
append
(
query
[
0
])
else
:
print
(
'状态13'
,
asin
)
self
.
asin_not_div_id_dp_list
.
append
(
asin
)
continue
# 压缩字符串
def
compress_string
(
self
,
input_string
):
...
...
@@ -474,7 +488,7 @@ class async_asin_pg():
# site_name=self.site_name).xpath_html()
# print(items)
asin_list
=
self
.
save_asin_detail
.
read_db_data
()
# asin_list = ['B0F
V8W9T52
|2025-01|1|1|null|null']
# asin_list = ['B0F
M433BGV
|2025-01|1|1|null|null']
if
asin_list
:
for
asin
in
asin_list
:
self
.
queries_asin_queue
.
put
(
asin
)
...
...
@@ -484,7 +498,7 @@ class async_asin_pg():
for
ck
in
cookies_dict
.
values
():
self
.
cookies_queue
.
put
(
ck
)
html_thread
=
[]
for
i
in
range
(
2
5
):
for
i
in
range
(
2
0
):
thread2
=
threading
.
Thread
(
target
=
self
.
get_asin
)
thread2
.
start
()
html_thread
.
append
(
thread2
)
...
...
@@ -557,4 +571,4 @@ class async_asin_pg():
pass
# if __name__ == '__main__':
# async_asin_pg(month=12
, spider_int=1, week=14,site_name='us').run()
# async_asin_pg(month='02'
, spider_int=1, week=14,site_name='us').run()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment