Commit 4825673f by Peng

解决多个线程同时读取任务队列时的问题,避免某个线程被挂起。通过 AI 对整体代码进行优化,控制多线程之间对资源的相互争夺。

parent 0d70b338
...@@ -2,14 +2,14 @@ import sys ...@@ -2,14 +2,14 @@ import sys
import os import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录 sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from amazon_params import py_ja3 # from amazon_params import py_ja3
from amazon_save_db.save_asin_detail_pg import Save_asin_detail from amazon_save_db.save_asin_detail_pg import Save_asin_detail
from utils.asin_parse import ParseAsinUs from utils.asin_parse import ParseAsinUs
from queue import Queue from amazon_params import py_ja3
from queue import Queue, Empty
import time import time
import re import re
from lxml import etree from lxml import etree
import requests
import urllib3 import urllib3
import threading import threading
from func_timeout.exceptions import FunctionTimedOut from func_timeout.exceptions import FunctionTimedOut
...@@ -17,7 +17,8 @@ import traceback ...@@ -17,7 +17,8 @@ import traceback
from datetime import datetime from datetime import datetime
import gzip import gzip
import json import json
# from curl_cffi import requests as curl from curl_cffi import requests
# import requests as requests2
from kafka.errors import KafkaTimeoutError from kafka.errors import KafkaTimeoutError
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings() urllib3.disable_warnings()
...@@ -42,6 +43,7 @@ class async_asin_pg(): ...@@ -42,6 +43,7 @@ class async_asin_pg():
self.asin_not_div_id_dp_list = [] # 返回html没有包含div @id=dp,状态13 self.asin_not_div_id_dp_list = [] # 返回html没有包含div @id=dp,状态13
self.asin_list_update = [] # 3 self.asin_list_update = [] # 3
self.cookies_queue = Queue() # cookie队列 self.cookies_queue = Queue() # cookie队列
self.cookie_refill_lock = threading.Lock() # cookie重填锁
self.item_queue = Queue() # 存储 item 详情数据队列 self.item_queue = Queue() # 存储 item 详情数据队列
self.queries_asin_queue = Queue() # 需要爬取的asin队列 self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.buyBox_list = [] # 卖家名称 url 列表 self.buyBox_list = [] # 卖家名称 url 列表
...@@ -70,356 +72,368 @@ class async_asin_pg(): ...@@ -70,356 +72,368 @@ class async_asin_pg():
self.topic_detail_month = f'{self.site_name}_asin_detail_month_2026_{self.month_}' self.topic_detail_month = f'{self.site_name}_asin_detail_month_2026_{self.month_}'
self.topic_asin_html = f'asin_html_2026_{self.month_}' self.topic_asin_html = f'asin_html_2026_{self.month_}'
self.asin_video_list = [] self.asin_video_list = []
# 修复:sess 改为类成员变量,只 mount 一次
self.sess = requests.Session()
self.sess.mount(self.site_url, py_ja3.DESAdapter())
def get_asin(self): def get_asin(self):
while True: while True:
if not self.queries_asin_queue.empty(): try:
querys = self.queries_asin_queue.get() querys = self.queries_asin_queue.get_nowait()
except Empty:
print(f"当前线程-已完成-爬取-跳出循环")
break
with self.cookie_refill_lock:
if self.cookies_queue.empty(): if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie() cookies_dict = self.reuests_para_val.get_cookie()
self.cookie_dict_delete_id = cookies_dict self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values(): for ck in cookies_dict.values():
self.cookies_queue.put(ck) self.cookies_queue.put(ck)
# 获取组装cookie # 获取组装cookie
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue) cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
query = str(querys).split('|') query = str(querys).split('|')
is_variat = query[2] is_variat = query[2]
asin = query[0] asin = query[0]
date_info = query[1] date_info = query[1]
data_type_asin = int(query[3]) data_type_asin = int(query[3])
volume_str = query[4] volume_str = query[4]
weight_str = query[5] weight_str = query[5]
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url, headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None) asin=asin, scraper_url=None)
headers["cookie"] = cookie_str headers["cookie"] = cookie_str
self.month_ = date_info.split('-')[1] self.month_ = date_info.split('-')[1]
if self.headers_num_int > 20: # 亚马逊出现超过20次ip已经被封锁。退出抓取切换ip。 if self.headers_num_int > 20: # 亚马逊出现超过20次ip已经被封锁。退出抓取切换ip。
break break
if is_variat == '1': if is_variat == '1':
scraper_url = self.site_url + 'dp/' + query[0] + "?th=1&psc=1" scraper_url = self.site_url + 'dp/' + query[0] + "?th=1&psc=1"
else: else:
scraper_url = self.site_url + 'dp/' + query[0] + '?th=1' scraper_url = self.site_url + 'dp/' + query[0] + '?th=1'
self.request_total_count_list.append(4) self.request_total_count_list.append(4)
print('scraper_url::', scraper_url) print('scraper_url::', scraper_url)
try: try:
resp = self.sess.get(scraper_url, headers=headers, # sess = requests2.Session()
timeout=10, verify=False) # sess.mount(self.site_url, py_ja3.DESAdapter())
# with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f: # resp = requests.get(scraper_url, headers=headers,
# f.write(resp.text) # timeout=20)
if self.reuests_para_val.check_amazon_yzm(resp): resp = requests.get(scraper_url, headers=headers,
self.yzm_err_total_list.append(1) timeout=30, verify=False, impersonate="chrome")
self.headers_num_int += 1 # with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
self.requests_error_asin_list.append(query[0]) # f.write(resp.text)
continue if self.reuests_para_val.check_amazon_yzm(resp):
except Exception as e: print('出现验证码::','#' * 80)
self.asin_request_errp_total_list.append(2) self.yzm_err_total_list.append(1)
print("请求错误错误: ", e) self.headers_num_int += 1
if 'Received response with content-encoding: gzip' in str(e): self.requests_error_asin_list.append(query[0])
self.asin_not_found_list.append(asin)
else:
self.requests_error_asin_list.append(query[0])
continue continue
response_url = resp.url except Exception as e:
response = resp.text self.asin_request_errp_total_list.append(2)
response_s = etree.HTML(response) print("请求错误错误: ", e)
self.success_asin_total_list.append(3) if 'Received response with content-encoding: gzip' in str(e):
if self.reuests_para_val.check_amazon_not_page(response):
self.asin_not_found_list.append(asin) self.asin_not_found_list.append(asin)
continue else:
if self.reuests_para_val.check_amazon_page(response, response_url): print('2233请求错误错误::', '#' * 80)
self.asin_not_redirect_list.append(asin) self.requests_error_asin_list.append(query[0])
continue continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin): response_url = resp.url
self.asin_not_redirect_list.append(asin) response = resp.text
continue response_s = etree.HTML(response)
# 获取邮编 self.success_asin_total_list.append(3)
try: if self.reuests_para_val.check_amazon_not_page(response):
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()") self.asin_not_found_list.append(asin)
except Exception as e: continue
self.asin_not_response_list.append(asin) if self.reuests_para_val.check_amazon_page(response, response_url):
continue self.asin_not_redirect_list.append(asin)
try: continue
ingress = ingress[0].strip() if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin):
except: self.asin_not_redirect_list.append(asin)
ingress = None continue
print(ingress, ' 打印 邮编 ', resp.url) # 获取邮编
if ingress: try:
if self.reuests_para_val.check_amazon_ingress(ingress): ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
try: except Exception as e:
cookie_ubid_main_id = re.findall(r'ubid-main=(.*?);', cookie_str)[0] self.asin_not_response_list.append(asin)
except: continue
cookie_ubid_main_id = re.findall(r'session-id=(.*?);', cookie_str)[0] try:
for cookie_key_value in self.cookie_dict_delete_id.items(): ingress = ingress[0].strip()
if cookie_ubid_main_id in cookie_key_value[1]: except:
self.delete_cookies_list.append(cookie_key_value[0]) ingress = None
self.requests_error_asin_list.append(asin) print(ingress, ' 打印 邮编 ', resp.url)
continue if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
div_dp = response_s.xpath('//div[@id="dp"]') ubid_list = re.findall(r'ubid-main=(.*?);', cookie_str)
if div_dp: if ubid_list:
# 解析resp=_response_text, asin=asin cookie_ubid_main_id = ubid_list[0]
items = ParseAsinUs(resp=response, asin=asin, month=self.month_, date_info=date_info,
site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'week': items["week"], 'month': items["month"],
'title': items["title"],
'img_url': items["img_url"],
'rating': items["rating"],
'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"],
'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"], 'data_type': data_type_asin,
'sp_num': items["sp_num"], 'describe': items["describe"], 'date_info': date_info,
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'spider_int': self.spider_int, 'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json'],
'review_json_list':items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
}
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val']
else: else:
item['five_six_val'] = None session_list = re.findall(r'session-id=(.*?);', cookie_str)
# 第二次请求 cookie_ubid_main_id = session_list[0] if session_list else None
_response_text = None for cookie_key_value in self.cookie_dict_delete_id.items():
if item['variat_num'] > 0 and is_variat == '0': if cookie_ubid_main_id in cookie_key_value[1]:
self.request_total_count_list.append(4) self.delete_cookies_list.append(cookie_key_value[0])
_url = self.site_url + 'dp/' + asin + "?th=1&psc=1" print(ingress,'邮编 错误 ::', '#' * 80)
print('第二次请求:', _url) self.requests_error_asin_list.append(asin)
try: continue
_response_text = None else:
_response_text = self.reuests_para_val.requests_amazon(headers=headers, scraper_url=_url) self.requests_error_asin_list.append(asin)
if _response_text: continue
_items = ParseAsinUs(resp=_response_text, asin=asin, month=self.month_, div_dp = response_s.xpath('//div[@id="dp"]')
date_info=date_info, if div_dp:
site_name=self.site_name).xpath_html() items = ParseAsinUs(resp=response, asin=asin, month=self.month_, date_info=date_info,
if _items["volume"] and item['volume'] is None: site_name=self.site_name).xpath_html()
item['volume'] = _items["volume"] new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
if _items['result_list_json'] and item['result_list_json'] is None: item = {'asin': items["asin"], 'week': items["week"], 'month': items["month"],
item['result_list_json'] = _items["result_list_json"] 'title': items["title"],
if _items["weight_str"] and item['weight_str'] is None: 'img_url': items["img_url"],
item['weight_str'] = _items["weight_str"] 'rating': items["rating"],
if _items["weight"] and item['weight'] is None: 'total_comments': items["total_comments"],
item['weight'] = _items["weight"] 'price': items["price"], "rank": items["rank"], 'category': items["category"],
if item["rank"] is None: 'launch_time': items["launch_time"],
item["rank"] = _items["rank"] 'volume': items["volume"],
if item["launch_time"] is None: 'weight': items["weight"], "page_inventory": items["page_inventory"],
item["launch_time"] = _items["launch_time"] "buy_box_seller_type": items["buy_box_seller_type"],
if item['product_description'] is None: "asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
item['product_description'] = _items["product_description"] 'img_num': items["img_num"], 'img_type': items["img_type"],
if item["price"] is None: 'activity_type': items["activity_type"],
item["price"] = _items["price"] 'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
elif item["price"] < 1: 'eight_val': items["eight_val"],
item["price"] = _items["price"] 'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
if item["buy_sales"] is None: 'three_star': items["three_star"],
item["buy_sales"] = _items["buySales"] 'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4: 'together_asin': items["together_asin"],
item["buy_box_seller_type"] = _items["buy_box_seller_type"] 'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
if item['page_inventory'] == 0 or item['page_inventory'] == 3: 'node_id': items["node_id"], 'data_type': data_type_asin,
item["page_inventory"] = _items["page_inventory"] 'sp_num': items["sp_num"], 'describe': items["describe"], 'date_info': date_info,
if item['account_name'] is None: 'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
item["account_name"] = _items["account_name"] 'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
if item['seller_id'] is None: 'variat_num': items['variat_num'],
item["seller_id"] = _items["seller_id"] 'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
if item['seller_json'] is None: 'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
item["seller_json"] = _items["seller_json"] 'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
if item['five_star'] is None: 'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
item['five_star'] = _items["five_star"] 'spider_int': self.spider_int, 'all_best_sellers_herf': items['all_best_sellers_herf'],
if item['four_star'] is None: 'product_description': items['product_description'], 'buy_sales': items['buySales'],
item['four_star'] = _items["four_star"] 'image_view': items['image_view'], 'product_json': items['product_json'],
# 修复:删除重复的 four_star 检查 'product_detail_json': items['productdetail_json'],
if item['two_star'] is None: 'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
item['two_star'] = _items["two_star"] 'lob_asin_json': items['lob_asin_json'],
if item['one_star'] is None: 'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
item['one_star'] = _items["one_star"] 'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
if item['low_star'] is None: 'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
item['low_star'] = _items["low_star"] 'compare_similar_asin_json': items['compare_similar_asin_json'],
if item['category'] is None: 'customer_reviews_json': items['customer_reviews_json'],
item['category'] = _items["category"] 'together_asin_json': items['together_asin_json'],
if item['node_id'] is None: 'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
item['node_id'] = _items["node_id"] 'created_time': new_date, 'current_asin': items['current_asin'],
if item['review_json_list'] is None: 'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
item['review_json_list'] = _items["review_json_list"] 'bundles_this_asins_json': items['bundles_this_asins_data_json'],
if item['fbm_delivery_price'] is None: 'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
item['fbm_delivery_price'] = _items["fbm_delivery_price"] 'bundle_asin_component_json':items['bundle_asin_component_json'],
if item['review_ai_text'] is None: 'review_json_list':items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
item['review_ai_text'] = _items["review_ai_text"] }
except: if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
pass item['five_six_val'] = items['five_six_val']
_response_text_var = None else:
if item["buy_box_seller_type"] == 4 and item['page_inventory'] == 3 and item['variat_num'] > 0 and \ item['five_six_val'] = None
items["asin_variation_list"]: # 第二次请求
self.request_total_count_list.append(4) _response_text = None
try: if item['variat_num'] > 0 and is_variat == '0':
if asin != items["asin_variation_list"][0][0]: self.request_total_count_list.append(4)
_to_asin = items["asin_variation_list"][0][0] _url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
elif len(items["asin_variation_list"]) > 1: print('第二次请求:', _url)
_to_asin = items["asin_variation_list"][1][0] try:
else: _response_text = None
_to_asin = item['parentAsin'] _response_text = self.reuests_para_val.requests_amazon(headers=headers, scraper_url=_url, sess=None)
_url = self.site_url + 'dp/' + _to_asin + "?th=1&psc=1" if _response_text:
print('请求asin 出现缺货,拿变体asin进行请求:', _url) _items = ParseAsinUs(resp=_response_text, asin=asin, month=self.month_,
_response_text_var = self.reuests_para_val.requests_amazon(headers=headers, date_info=date_info,
scraper_url=_url) site_name=self.site_name).xpath_html()
_to_items = ParseAsinUs(resp=_response_text_var, asin=asin, month=self.month_, if _items["volume"] and item['volume'] is None:
date_info=date_info, item['volume'] = _items["volume"]
site_name=self.site_name).xpath_html() if _items['result_list_json'] and item['result_list_json'] is None:
item['result_list_json'] = _items["result_list_json"]
if _items["weight_str"] and item['weight_str'] is None:
item['weight_str'] = _items["weight_str"]
if _items["weight"] and item['weight'] is None:
item['weight'] = _items["weight"]
if item["rank"] is None:
item["rank"] = _items["rank"]
if item["launch_time"] is None:
item["launch_time"] = _items["launch_time"]
if item['product_description'] is None:
item['product_description'] = _items["product_description"]
if item["price"] is None:
item["price"] = _items["price"]
elif item["price"] < 1:
item["price"] = _items["price"]
if item["buy_sales"] is None:
item["buy_sales"] = _items["buySales"]
if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4: if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4:
item["buy_box_seller_type"] = _to_items["buy_box_seller_type"] item["buy_box_seller_type"] = _items["buy_box_seller_type"]
if item['page_inventory'] == 0 or item['page_inventory'] == 3:
item["page_inventory"] = _items["page_inventory"]
if item['account_name'] is None: if item['account_name'] is None:
item["account_name"] = _to_items["account_name"] item["account_name"] = _items["account_name"]
if item['seller_id'] is None: if item['seller_id'] is None:
item["seller_id"] = _to_items["seller_id"] item["seller_id"] = _items["seller_id"]
if item['seller_json'] is None: if item['seller_json'] is None:
item["seller_json"] = _to_items["seller_json"] item["seller_json"] = _items["seller_json"]
except Exception as e: if item['five_star'] is None:
print('请求asin 第二次请求 报错:', f"\n{traceback.format_exc()}") item['five_star'] = _items["five_star"]
if item['four_star'] is None:
if item['volume'] is None and volume_str != 'null': item['four_star'] = _items["four_star"]
item['volume'] = volume_str if item['three_star'] is None:
if item['weight_str'] is None and weight_str != 'null': item['three_star'] = _items["three_star"]
item['weight_str'] = weight_str if item['two_star'] is None:
new_date_hour = datetime.now().strftime("%Y-%m-%d:%H") item['two_star'] = _items["two_star"]
num = self.reuests_para_val.get_hour(new_date_hour) if item['one_star'] is None:
self.hour_total_count_list.append(num) item['one_star'] = _items["one_star"]
print("+++++++++++item['buy_sales']::", item['buy_sales']) if item['low_star'] is None:
if not item['title'] or not item['img_url']: item['low_star'] = _items["low_star"]
self.asin_not_sure_list.append(asin) if item['category'] is None:
continue item['category'] = _items["category"]
# 修复:检查 img_url 是否包含无效值 if item['node_id'] is None:
img_url_invalid = False item['node_id'] = _items["node_id"]
if item['img_url'] and len(item['img_url'].strip()) > 2: if item['review_json_list'] is None:
for key in ['None', 'null', 'none']: item['review_json_list'] = _items["review_json_list"]
if key in item['img_url']: if item['fbm_delivery_price'] is None:
img_url_invalid = True item['fbm_delivery_price'] = _items["fbm_delivery_price"]
break if item['review_ai_text'] is None:
if img_url_invalid: item['review_ai_text'] = _items["review_ai_text"]
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 title 是否包含无效值
title_invalid = False
if item['title'] and len(item['title'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['title']:
title_invalid = True
break
if title_invalid:
self.asin_not_sure_list.append(asin)
continue
print('itemitem:::', item)
# 上架时间 排名 重量 底部信息 如果都为None 重新抓取
if item["launch_time"] is None and item["rank"] is None and item['weight'] is None and item[
'product_detail_json'] is None and len(items['div_id_list']) < 1:
print('上架时间 排名 重量 底部信息 如果都为None 重新抓取:::', asin)
self.requests_error_asin_list.append(asin)
continue
if (self.reuests_para_val.check_contain_chinese(item['title'])) or (
self.reuests_para_val.check_contain_chinese(item['category'])):
self.asin_not_sure_list.append(asin)
continue
if items["buyBox_list"]:
self.buyBox_list.extend(items["buyBox_list"])
if items["buyBoxname_asin_list"]:
self.buyBoxname_asin_list.extend(items["buyBoxname_asin_list"])
if items["bs_category_asin_list_pg"]:
self.bs_category_asin_list_pg.extend(items["bs_category_asin_list_pg"])
if items["asin_variation_list"]:
item['variat_list'] = json.dumps(items["asin_variation_list"]) # 变体
else:
item['variat_list'] = None
item['asin_vartion_list'] = items["asin_variation_list"]
if items["all_img_video_list"]:
item['img_list'] = json.dumps(items["all_img_video_list"])
else:
item['img_list'] = None
if item['img_list'] is None:
item['img_list'] = []
self.item_queue.put(item)
# 获取字段值为None的字段名称写入redis进行统计
none_keys = [key for key, value in item.items() if
(value is None) or (value == -1 and key == 'price') or (
value == 0 and key in ['weight', 'total_comments', 'rating'])]
for key in ['parentAsin', 'week', 'all_best_sellers_herf', 'best_sellers_rank', 'seller_id',
'account_url', 'product_json', 'product_detail_json', 'review_ai_text', 'lob_asin_json',
'sp_initial_seen_asins_json', 'sp_4stars_initial_seen_asins_json',
'sp_delivery_initial_seen_asins_json', 'compare_similar_asin_json',
'customer_reviews_json', 'together_asin_json', 'min_match_asin_json',
'product_description', 'variat_num', 'qa_num', 'asin_vartion_list', 'review_label_json',
'seller_json', 'current_asin', 'five_six_val', 'best_sellers_herf',
'bundles_this_asins_json'
]:
if key in none_keys:
none_keys.remove(key)
log_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
try:
self.redis14.rpush(f'{self.site_name}_{log_time}_asin_detail_is_none', *none_keys)
except: except:
pass pass
self.reuests_para_val.send_kafka(items=item, topic=self.topic_detail_month) _response_text_var = None
print(asin, 'rank 排名:', item['rank']) if item["buy_box_seller_type"] == 4 and item['page_inventory'] == 3 and item['variat_num'] > 0 and \
if item['rank'] is not None and item['rank'] < 9000: items["asin_variation_list"]:
# requests_num 代表不同类型url请求返回的源码。 self.request_total_count_list.append(4)
if _response_text_var: # 请求asin 出现缺货,拿变体asin进行请求 try:
requests_num = 2 if asin != items["asin_variation_list"][0][0]:
response_gzip = self.compress_string(_response_text_var) _to_asin = items["asin_variation_list"][0][0]
elif _response_text: # 发现有变体。导入asin没有标记。重新请求第二次请求 elif len(items["asin_variation_list"]) > 1:
requests_num = 1 _to_asin = items["asin_variation_list"][1][0]
response_gzip = self.compress_string(_response_text)
else: else:
requests_num = 0 # 第一次请求返回源码 _to_asin = item['parentAsin']
response_gzip = self.compress_string(response) _url = self.site_url + 'dp/' + _to_asin + "?th=1&psc=1"
html_data = f'{self.site_name}|-||=|-|=||-|{asin}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{requests_num}' print('请求asin 出现缺货,拿变体asin进行请求:', _url)
self.reuests_para_val.send_kafka(html_data=html_data, topic=self.topic_asin_html) _response_text_var = self.reuests_para_val.requests_amazon(headers=headers,
scraper_url=_url, sess=None)
_to_items = ParseAsinUs(resp=_response_text_var, asin=asin, month=self.month_,
date_info=date_info,
site_name=self.site_name).xpath_html()
if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4:
item["buy_box_seller_type"] = _to_items["buy_box_seller_type"]
if item['account_name'] is None:
item["account_name"] = _to_items["account_name"]
if item['seller_id'] is None:
item["seller_id"] = _to_items["seller_id"]
if item['seller_json'] is None:
item["seller_json"] = _to_items["seller_json"]
except Exception as e:
print('请求asin 第二次请求 报错:', f"\n{traceback.format_exc()}")
if item['volume'] is None and volume_str != 'null':
item['volume'] = volume_str
if item['weight_str'] is None and weight_str != 'null':
item['weight_str'] = weight_str
new_date_hour = datetime.now().strftime("%Y-%m-%d:%H")
num = self.reuests_para_val.get_hour(new_date_hour)
self.hour_total_count_list.append(num)
print("+++++++++++item['buy_sales']::", item['buy_sales'])
if not item['title'] or not item['img_url']:
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 img_url 是否包含无效值
img_url_invalid = False
if item['img_url'] and len(item['img_url'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['img_url']:
img_url_invalid = True
break
if img_url_invalid:
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 title 是否包含无效值
title_invalid = False
if item['title'] and len(item['title'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['title']:
title_invalid = True
break
if title_invalid:
self.asin_not_sure_list.append(asin)
continue
print('itemitem:::', item)
# 上架时间 排名 重量 底部信息 如果都为None 重新抓取
if item["launch_time"] is None and item["rank"] is None and item['weight'] is None and item[
'product_detail_json'] is None and len(items['div_id_list']) < 1:
print('上架时间 排名 重量 底部信息 如果都为None 重新抓取:::', asin)
print(ingress, '上架时间 排名 重量 底部信息 如果都为None ::', '#' * 80)
self.requests_error_asin_list.append(asin)
continue
if (self.reuests_para_val.check_contain_chinese(item['title'])) or (
self.reuests_para_val.check_contain_chinese(item['category'])):
self.asin_not_sure_list.append(asin)
continue
if items["buyBox_list"]:
self.buyBox_list.extend(items["buyBox_list"])
if items["buyBoxname_asin_list"]:
self.buyBoxname_asin_list.extend(items["buyBoxname_asin_list"])
if items["bs_category_asin_list_pg"]:
self.bs_category_asin_list_pg.extend(items["bs_category_asin_list_pg"])
if items["asin_variation_list"]:
item['variat_list'] = json.dumps(items["asin_variation_list"]) # 变体
else:
item['variat_list'] = None
item['asin_vartion_list'] = items["asin_variation_list"]
if items["all_img_video_list"]:
item['img_list'] = json.dumps(items["all_img_video_list"])
else: else:
if 'Click the button below to continue shopping' in response: item['img_list'] = None
self.requests_error_asin_list.append(query[0])
if item['img_list'] is None:
item['img_list'] = json.dumps([])
self.item_queue.put(item)
# 获取字段值为None的字段名称写入redis进行统计
none_keys = [key for key, value in item.items() if
(value is None) or (value == -1 and key == 'price') or (
value == 0 and key in ['weight', 'total_comments', 'rating'])]
for key in ['parentAsin', 'week', 'all_best_sellers_herf', 'best_sellers_rank', 'seller_id',
'account_url', 'product_json', 'product_detail_json', 'review_ai_text', 'lob_asin_json',
'sp_initial_seen_asins_json', 'sp_4stars_initial_seen_asins_json',
'sp_delivery_initial_seen_asins_json', 'compare_similar_asin_json',
'customer_reviews_json', 'together_asin_json', 'min_match_asin_json',
'product_description', 'variat_num', 'qa_num', 'asin_vartion_list', 'review_label_json',
'seller_json', 'current_asin', 'five_six_val', 'best_sellers_herf',
'bundles_this_asins_json'
]:
if key in none_keys:
none_keys.remove(key)
log_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
try:
self.redis14.rpush(f'{self.site_name}_{log_time}_asin_detail_is_none', *none_keys)
except:
pass
self.reuests_para_val.send_kafka(items=item, topic=self.topic_detail_month)
print(asin, 'rank 排名:', item['rank'])
if item['rank'] is not None and item['rank'] < 9000:
# requests_num 代表不同类型url请求返回的源码。
if _response_text_var: # 请求asin 出现缺货,拿变体asin进行请求
requests_num = 2
response_gzip = self.compress_string(_response_text_var)
elif _response_text: # 发现有变体。导入asin没有标记。重新请求第二次请求
requests_num = 1
response_gzip = self.compress_string(_response_text)
else: else:
print('状态13', asin) requests_num = 0 # 第一次请求返回源码
self.asin_not_div_id_dp_list.append(asin) response_gzip = self.compress_string(response)
continue html_data = f'{self.site_name}|-||=|-|=||-|{asin}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{requests_num}'
self.reuests_para_val.send_kafka(html_data=html_data, topic=self.topic_asin_html)
else: else:
print(f"当前线程-已完成-爬取-跳出循环") if 'Click the button below to continue shopping' in response:
break print(ingress, 'Click the button below to continue shopping ::', '#' * 80)
self.requests_error_asin_list.append(query[0])
else:
print('状态13', asin)
self.asin_not_div_id_dp_list.append(asin)
continue
# 压缩字符串 # 压缩字符串
def compress_string(self, input_string): def compress_string(self, input_string):
...@@ -474,7 +488,7 @@ class async_asin_pg(): ...@@ -474,7 +488,7 @@ class async_asin_pg():
# site_name=self.site_name).xpath_html() # site_name=self.site_name).xpath_html()
# print(items) # print(items)
asin_list = self.save_asin_detail.read_db_data() asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B0FV8W9T52|2025-01|1|1|null|null'] # asin_list = ['B0FM433BGV|2025-01|1|1|null|null']
if asin_list: if asin_list:
for asin in asin_list: for asin in asin_list:
self.queries_asin_queue.put(asin) self.queries_asin_queue.put(asin)
...@@ -484,7 +498,7 @@ class async_asin_pg(): ...@@ -484,7 +498,7 @@ class async_asin_pg():
for ck in cookies_dict.values(): for ck in cookies_dict.values():
self.cookies_queue.put(ck) self.cookies_queue.put(ck)
html_thread = [] html_thread = []
for i in range(25): for i in range(20):
thread2 = threading.Thread(target=self.get_asin) thread2 = threading.Thread(target=self.get_asin)
thread2.start() thread2.start()
html_thread.append(thread2) html_thread.append(thread2)
...@@ -557,4 +571,4 @@ class async_asin_pg(): ...@@ -557,4 +571,4 @@ class async_asin_pg():
pass pass
# if __name__ == '__main__': # if __name__ == '__main__':
# async_asin_pg(month=12, spider_int=1, week=14,site_name='us').run() # async_asin_pg(month='02', spider_int=1, week=14,site_name='us').run()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment