Commit 4825673f by Peng

解决多个任务对操作队列的并发读取问题,避免某个线程被挂起;并通过 AI 对整体代码进行优化,改进多线程相互争夺资源时的控制。

parent 0d70b338
......@@ -2,14 +2,14 @@ import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from amazon_params import py_ja3
# from amazon_params import py_ja3
from amazon_save_db.save_asin_detail_pg import Save_asin_detail
from utils.asin_parse import ParseAsinUs
from queue import Queue
from amazon_params import py_ja3
from queue import Queue, Empty
import time
import re
from lxml import etree
import requests
import urllib3
import threading
from func_timeout.exceptions import FunctionTimedOut
......@@ -17,7 +17,8 @@ import traceback
from datetime import datetime
import gzip
import json
# from curl_cffi import requests as curl
from curl_cffi import requests
# import requests as requests2
from kafka.errors import KafkaTimeoutError
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings()
......@@ -42,6 +43,7 @@ class async_asin_pg():
self.asin_not_div_id_dp_list = [] # 返回html没有包含div @id=dp,状态13
self.asin_list_update = [] # 3
self.cookies_queue = Queue() # cookie队列
self.cookie_refill_lock = threading.Lock() # cookie重填锁
self.item_queue = Queue() # 存储 item 详情数据队列
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.buyBox_list = [] # 卖家名称 url 列表
......@@ -70,356 +72,368 @@ class async_asin_pg():
self.topic_detail_month = f'{self.site_name}_asin_detail_month_2026_{self.month_}'
self.topic_asin_html = f'asin_html_2026_{self.month_}'
self.asin_video_list = []
# 修复:sess 改为类成员变量,只 mount 一次
self.sess = requests.Session()
self.sess.mount(self.site_url, py_ja3.DESAdapter())
def get_asin(self):
while True:
if not self.queries_asin_queue.empty():
querys = self.queries_asin_queue.get()
try:
querys = self.queries_asin_queue.get_nowait()
except Empty:
print(f"当前线程-已完成-爬取-跳出循环")
break
with self.cookie_refill_lock:
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
# 获取组装cookie
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
query = str(querys).split('|')
is_variat = query[2]
asin = query[0]
date_info = query[1]
data_type_asin = int(query[3])
volume_str = query[4]
weight_str = query[5]
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None)
headers["cookie"] = cookie_str
self.month_ = date_info.split('-')[1]
if self.headers_num_int > 20: # 亚马逊出现超过20次ip已经被封锁。退出抓取切换ip。
break
if is_variat == '1':
scraper_url = self.site_url + 'dp/' + query[0] + "?th=1&psc=1"
else:
scraper_url = self.site_url + 'dp/' + query[0] + '?th=1'
self.request_total_count_list.append(4)
print('scraper_url::', scraper_url)
try:
resp = self.sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
self.yzm_err_total_list.append(1)
self.headers_num_int += 1
self.requests_error_asin_list.append(query[0])
continue
except Exception as e:
self.asin_request_errp_total_list.append(2)
print("请求错误错误: ", e)
if 'Received response with content-encoding: gzip' in str(e):
self.asin_not_found_list.append(asin)
else:
self.requests_error_asin_list.append(query[0])
# 获取组装cookie
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
query = str(querys).split('|')
is_variat = query[2]
asin = query[0]
date_info = query[1]
data_type_asin = int(query[3])
volume_str = query[4]
weight_str = query[5]
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None)
headers["cookie"] = cookie_str
self.month_ = date_info.split('-')[1]
if self.headers_num_int > 20: # 亚马逊出现超过20次ip已经被封锁。退出抓取切换ip。
break
if is_variat == '1':
scraper_url = self.site_url + 'dp/' + query[0] + "?th=1&psc=1"
else:
scraper_url = self.site_url + 'dp/' + query[0] + '?th=1'
self.request_total_count_list.append(4)
print('scraper_url::', scraper_url)
try:
# sess = requests2.Session()
# sess.mount(self.site_url, py_ja3.DESAdapter())
# resp = requests.get(scraper_url, headers=headers,
# timeout=20)
resp = requests.get(scraper_url, headers=headers,
timeout=30, verify=False, impersonate="chrome")
# with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码::','#' * 80)
self.yzm_err_total_list.append(1)
self.headers_num_int += 1
self.requests_error_asin_list.append(query[0])
continue
response_url = resp.url
response = resp.text
response_s = etree.HTML(response)
self.success_asin_total_list.append(3)
if self.reuests_para_val.check_amazon_not_page(response):
except Exception as e:
self.asin_request_errp_total_list.append(2)
print("请求错误错误: ", e)
if 'Received response with content-encoding: gzip' in str(e):
self.asin_not_found_list.append(asin)
continue
if self.reuests_para_val.check_amazon_page(response, response_url):
self.asin_not_redirect_list.append(asin)
continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin):
self.asin_not_redirect_list.append(asin)
continue
# 获取邮编
try:
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
except Exception as e:
self.asin_not_response_list.append(asin)
continue
try:
ingress = ingress[0].strip()
except:
ingress = None
print(ingress, ' 打印 邮编 ', resp.url)
if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
try:
cookie_ubid_main_id = re.findall(r'ubid-main=(.*?);', cookie_str)[0]
except:
cookie_ubid_main_id = re.findall(r'session-id=(.*?);', cookie_str)[0]
for cookie_key_value in self.cookie_dict_delete_id.items():
if cookie_ubid_main_id in cookie_key_value[1]:
self.delete_cookies_list.append(cookie_key_value[0])
self.requests_error_asin_list.append(asin)
continue
div_dp = response_s.xpath('//div[@id="dp"]')
if div_dp:
# 解析resp=_response_text, asin=asin
items = ParseAsinUs(resp=response, asin=asin, month=self.month_, date_info=date_info,
site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'week': items["week"], 'month': items["month"],
'title': items["title"],
'img_url': items["img_url"],
'rating': items["rating"],
'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"],
'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"], 'data_type': data_type_asin,
'sp_num': items["sp_num"], 'describe': items["describe"], 'date_info': date_info,
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'spider_int': self.spider_int, 'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json'],
'review_json_list':items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
}
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val']
else:
print('2233请求错误错误::', '#' * 80)
self.requests_error_asin_list.append(query[0])
continue
response_url = resp.url
response = resp.text
response_s = etree.HTML(response)
self.success_asin_total_list.append(3)
if self.reuests_para_val.check_amazon_not_page(response):
self.asin_not_found_list.append(asin)
continue
if self.reuests_para_val.check_amazon_page(response, response_url):
self.asin_not_redirect_list.append(asin)
continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin):
self.asin_not_redirect_list.append(asin)
continue
# 获取邮编
try:
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
except Exception as e:
self.asin_not_response_list.append(asin)
continue
try:
ingress = ingress[0].strip()
except:
ingress = None
print(ingress, ' 打印 邮编 ', resp.url)
if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
ubid_list = re.findall(r'ubid-main=(.*?);', cookie_str)
if ubid_list:
cookie_ubid_main_id = ubid_list[0]
else:
item['five_six_val'] = None
# 第二次请求
_response_text = None
if item['variat_num'] > 0 and is_variat == '0':
self.request_total_count_list.append(4)
_url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
print('第二次请求:', _url)
try:
_response_text = None
_response_text = self.reuests_para_val.requests_amazon(headers=headers, scraper_url=_url)
if _response_text:
_items = ParseAsinUs(resp=_response_text, asin=asin, month=self.month_,
date_info=date_info,
site_name=self.site_name).xpath_html()
if _items["volume"] and item['volume'] is None:
item['volume'] = _items["volume"]
if _items['result_list_json'] and item['result_list_json'] is None:
item['result_list_json'] = _items["result_list_json"]
if _items["weight_str"] and item['weight_str'] is None:
item['weight_str'] = _items["weight_str"]
if _items["weight"] and item['weight'] is None:
item['weight'] = _items["weight"]
if item["rank"] is None:
item["rank"] = _items["rank"]
if item["launch_time"] is None:
item["launch_time"] = _items["launch_time"]
if item['product_description'] is None:
item['product_description'] = _items["product_description"]
if item["price"] is None:
item["price"] = _items["price"]
elif item["price"] < 1:
item["price"] = _items["price"]
if item["buy_sales"] is None:
item["buy_sales"] = _items["buySales"]
if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4:
item["buy_box_seller_type"] = _items["buy_box_seller_type"]
if item['page_inventory'] == 0 or item['page_inventory'] == 3:
item["page_inventory"] = _items["page_inventory"]
if item['account_name'] is None:
item["account_name"] = _items["account_name"]
if item['seller_id'] is None:
item["seller_id"] = _items["seller_id"]
if item['seller_json'] is None:
item["seller_json"] = _items["seller_json"]
if item['five_star'] is None:
item['five_star'] = _items["five_star"]
if item['four_star'] is None:
item['four_star'] = _items["four_star"]
# 修复:删除重复的 four_star 检查
if item['two_star'] is None:
item['two_star'] = _items["two_star"]
if item['one_star'] is None:
item['one_star'] = _items["one_star"]
if item['low_star'] is None:
item['low_star'] = _items["low_star"]
if item['category'] is None:
item['category'] = _items["category"]
if item['node_id'] is None:
item['node_id'] = _items["node_id"]
if item['review_json_list'] is None:
item['review_json_list'] = _items["review_json_list"]
if item['fbm_delivery_price'] is None:
item['fbm_delivery_price'] = _items["fbm_delivery_price"]
if item['review_ai_text'] is None:
item['review_ai_text'] = _items["review_ai_text"]
except:
pass
_response_text_var = None
if item["buy_box_seller_type"] == 4 and item['page_inventory'] == 3 and item['variat_num'] > 0 and \
items["asin_variation_list"]:
self.request_total_count_list.append(4)
try:
if asin != items["asin_variation_list"][0][0]:
_to_asin = items["asin_variation_list"][0][0]
elif len(items["asin_variation_list"]) > 1:
_to_asin = items["asin_variation_list"][1][0]
else:
_to_asin = item['parentAsin']
_url = self.site_url + 'dp/' + _to_asin + "?th=1&psc=1"
print('请求asin 出现缺货,拿变体asin进行请求:', _url)
_response_text_var = self.reuests_para_val.requests_amazon(headers=headers,
scraper_url=_url)
_to_items = ParseAsinUs(resp=_response_text_var, asin=asin, month=self.month_,
date_info=date_info,
site_name=self.site_name).xpath_html()
session_list = re.findall(r'session-id=(.*?);', cookie_str)
cookie_ubid_main_id = session_list[0] if session_list else None
for cookie_key_value in self.cookie_dict_delete_id.items():
if cookie_ubid_main_id in cookie_key_value[1]:
self.delete_cookies_list.append(cookie_key_value[0])
print(ingress,'邮编 错误 ::', '#' * 80)
self.requests_error_asin_list.append(asin)
continue
else:
self.requests_error_asin_list.append(asin)
continue
div_dp = response_s.xpath('//div[@id="dp"]')
if div_dp:
items = ParseAsinUs(resp=response, asin=asin, month=self.month_, date_info=date_info,
site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'week': items["week"], 'month': items["month"],
'title': items["title"],
'img_url': items["img_url"],
'rating': items["rating"],
'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"],
'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"], 'data_type': data_type_asin,
'sp_num': items["sp_num"], 'describe': items["describe"], 'date_info': date_info,
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'spider_int': self.spider_int, 'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json'],
'review_json_list':items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
}
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val']
else:
item['five_six_val'] = None
# 第二次请求
_response_text = None
if item['variat_num'] > 0 and is_variat == '0':
self.request_total_count_list.append(4)
_url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
print('第二次请求:', _url)
try:
_response_text = None
_response_text = self.reuests_para_val.requests_amazon(headers=headers, scraper_url=_url, sess=None)
if _response_text:
_items = ParseAsinUs(resp=_response_text, asin=asin, month=self.month_,
date_info=date_info,
site_name=self.site_name).xpath_html()
if _items["volume"] and item['volume'] is None:
item['volume'] = _items["volume"]
if _items['result_list_json'] and item['result_list_json'] is None:
item['result_list_json'] = _items["result_list_json"]
if _items["weight_str"] and item['weight_str'] is None:
item['weight_str'] = _items["weight_str"]
if _items["weight"] and item['weight'] is None:
item['weight'] = _items["weight"]
if item["rank"] is None:
item["rank"] = _items["rank"]
if item["launch_time"] is None:
item["launch_time"] = _items["launch_time"]
if item['product_description'] is None:
item['product_description'] = _items["product_description"]
if item["price"] is None:
item["price"] = _items["price"]
elif item["price"] < 1:
item["price"] = _items["price"]
if item["buy_sales"] is None:
item["buy_sales"] = _items["buySales"]
if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4:
item["buy_box_seller_type"] = _to_items["buy_box_seller_type"]
item["buy_box_seller_type"] = _items["buy_box_seller_type"]
if item['page_inventory'] == 0 or item['page_inventory'] == 3:
item["page_inventory"] = _items["page_inventory"]
if item['account_name'] is None:
item["account_name"] = _to_items["account_name"]
item["account_name"] = _items["account_name"]
if item['seller_id'] is None:
item["seller_id"] = _to_items["seller_id"]
item["seller_id"] = _items["seller_id"]
if item['seller_json'] is None:
item["seller_json"] = _to_items["seller_json"]
except Exception as e:
print('请求asin 第二次请求 报错:', f"\n{traceback.format_exc()}")
if item['volume'] is None and volume_str != 'null':
item['volume'] = volume_str
if item['weight_str'] is None and weight_str != 'null':
item['weight_str'] = weight_str
new_date_hour = datetime.now().strftime("%Y-%m-%d:%H")
num = self.reuests_para_val.get_hour(new_date_hour)
self.hour_total_count_list.append(num)
print("+++++++++++item['buy_sales']::", item['buy_sales'])
if not item['title'] or not item['img_url']:
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 img_url 是否包含无效值
img_url_invalid = False
if item['img_url'] and len(item['img_url'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['img_url']:
img_url_invalid = True
break
if img_url_invalid:
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 title 是否包含无效值
title_invalid = False
if item['title'] and len(item['title'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['title']:
title_invalid = True
break
if title_invalid:
self.asin_not_sure_list.append(asin)
continue
print('itemitem:::', item)
# 上架时间 排名 重量 底部信息 如果都为None 重新抓取
if item["launch_time"] is None and item["rank"] is None and item['weight'] is None and item[
'product_detail_json'] is None and len(items['div_id_list']) < 1:
print('上架时间 排名 重量 底部信息 如果都为None 重新抓取:::', asin)
self.requests_error_asin_list.append(asin)
continue
if (self.reuests_para_val.check_contain_chinese(item['title'])) or (
self.reuests_para_val.check_contain_chinese(item['category'])):
self.asin_not_sure_list.append(asin)
continue
if items["buyBox_list"]:
self.buyBox_list.extend(items["buyBox_list"])
if items["buyBoxname_asin_list"]:
self.buyBoxname_asin_list.extend(items["buyBoxname_asin_list"])
if items["bs_category_asin_list_pg"]:
self.bs_category_asin_list_pg.extend(items["bs_category_asin_list_pg"])
if items["asin_variation_list"]:
item['variat_list'] = json.dumps(items["asin_variation_list"]) # 变体
else:
item['variat_list'] = None
item['asin_vartion_list'] = items["asin_variation_list"]
if items["all_img_video_list"]:
item['img_list'] = json.dumps(items["all_img_video_list"])
else:
item['img_list'] = None
if item['img_list'] is None:
item['img_list'] = []
self.item_queue.put(item)
# 获取字段值为None的字段名称写入redis进行统计
none_keys = [key for key, value in item.items() if
(value is None) or (value == -1 and key == 'price') or (
value == 0 and key in ['weight', 'total_comments', 'rating'])]
for key in ['parentAsin', 'week', 'all_best_sellers_herf', 'best_sellers_rank', 'seller_id',
'account_url', 'product_json', 'product_detail_json', 'review_ai_text', 'lob_asin_json',
'sp_initial_seen_asins_json', 'sp_4stars_initial_seen_asins_json',
'sp_delivery_initial_seen_asins_json', 'compare_similar_asin_json',
'customer_reviews_json', 'together_asin_json', 'min_match_asin_json',
'product_description', 'variat_num', 'qa_num', 'asin_vartion_list', 'review_label_json',
'seller_json', 'current_asin', 'five_six_val', 'best_sellers_herf',
'bundles_this_asins_json'
]:
if key in none_keys:
none_keys.remove(key)
log_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
try:
self.redis14.rpush(f'{self.site_name}_{log_time}_asin_detail_is_none', *none_keys)
item["seller_json"] = _items["seller_json"]
if item['five_star'] is None:
item['five_star'] = _items["five_star"]
if item['four_star'] is None:
item['four_star'] = _items["four_star"]
if item['three_star'] is None:
item['three_star'] = _items["three_star"]
if item['two_star'] is None:
item['two_star'] = _items["two_star"]
if item['one_star'] is None:
item['one_star'] = _items["one_star"]
if item['low_star'] is None:
item['low_star'] = _items["low_star"]
if item['category'] is None:
item['category'] = _items["category"]
if item['node_id'] is None:
item['node_id'] = _items["node_id"]
if item['review_json_list'] is None:
item['review_json_list'] = _items["review_json_list"]
if item['fbm_delivery_price'] is None:
item['fbm_delivery_price'] = _items["fbm_delivery_price"]
if item['review_ai_text'] is None:
item['review_ai_text'] = _items["review_ai_text"]
except:
pass
self.reuests_para_val.send_kafka(items=item, topic=self.topic_detail_month)
print(asin, 'rank 排名:', item['rank'])
if item['rank'] is not None and item['rank'] < 9000:
# requests_num 代表不同类型url请求返回的源码。
if _response_text_var: # 请求asin 出现缺货,拿变体asin进行请求
requests_num = 2
response_gzip = self.compress_string(_response_text_var)
elif _response_text: # 发现有变体。导入asin没有标记。重新请求第二次请求
requests_num = 1
response_gzip = self.compress_string(_response_text)
_response_text_var = None
if item["buy_box_seller_type"] == 4 and item['page_inventory'] == 3 and item['variat_num'] > 0 and \
items["asin_variation_list"]:
self.request_total_count_list.append(4)
try:
if asin != items["asin_variation_list"][0][0]:
_to_asin = items["asin_variation_list"][0][0]
elif len(items["asin_variation_list"]) > 1:
_to_asin = items["asin_variation_list"][1][0]
else:
requests_num = 0 # 第一次请求返回源码
response_gzip = self.compress_string(response)
html_data = f'{self.site_name}|-||=|-|=||-|{asin}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{requests_num}'
self.reuests_para_val.send_kafka(html_data=html_data, topic=self.topic_asin_html)
_to_asin = item['parentAsin']
_url = self.site_url + 'dp/' + _to_asin + "?th=1&psc=1"
print('请求asin 出现缺货,拿变体asin进行请求:', _url)
_response_text_var = self.reuests_para_val.requests_amazon(headers=headers,
scraper_url=_url, sess=None)
_to_items = ParseAsinUs(resp=_response_text_var, asin=asin, month=self.month_,
date_info=date_info,
site_name=self.site_name).xpath_html()
if item['buy_box_seller_type'] is None or item['buy_box_seller_type'] == 4:
item["buy_box_seller_type"] = _to_items["buy_box_seller_type"]
if item['account_name'] is None:
item["account_name"] = _to_items["account_name"]
if item['seller_id'] is None:
item["seller_id"] = _to_items["seller_id"]
if item['seller_json'] is None:
item["seller_json"] = _to_items["seller_json"]
except Exception as e:
print('请求asin 第二次请求 报错:', f"\n{traceback.format_exc()}")
if item['volume'] is None and volume_str != 'null':
item['volume'] = volume_str
if item['weight_str'] is None and weight_str != 'null':
item['weight_str'] = weight_str
new_date_hour = datetime.now().strftime("%Y-%m-%d:%H")
num = self.reuests_para_val.get_hour(new_date_hour)
self.hour_total_count_list.append(num)
print("+++++++++++item['buy_sales']::", item['buy_sales'])
if not item['title'] or not item['img_url']:
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 img_url 是否包含无效值
img_url_invalid = False
if item['img_url'] and len(item['img_url'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['img_url']:
img_url_invalid = True
break
if img_url_invalid:
self.asin_not_sure_list.append(asin)
continue
# 修复:检查 title 是否包含无效值
title_invalid = False
if item['title'] and len(item['title'].strip()) > 2:
for key in ['None', 'null', 'none']:
if key in item['title']:
title_invalid = True
break
if title_invalid:
self.asin_not_sure_list.append(asin)
continue
print('itemitem:::', item)
# 上架时间 排名 重量 底部信息 如果都为None 重新抓取
if item["launch_time"] is None and item["rank"] is None and item['weight'] is None and item[
'product_detail_json'] is None and len(items['div_id_list']) < 1:
print('上架时间 排名 重量 底部信息 如果都为None 重新抓取:::', asin)
print(ingress, '上架时间 排名 重量 底部信息 如果都为None ::', '#' * 80)
self.requests_error_asin_list.append(asin)
continue
if (self.reuests_para_val.check_contain_chinese(item['title'])) or (
self.reuests_para_val.check_contain_chinese(item['category'])):
self.asin_not_sure_list.append(asin)
continue
if items["buyBox_list"]:
self.buyBox_list.extend(items["buyBox_list"])
if items["buyBoxname_asin_list"]:
self.buyBoxname_asin_list.extend(items["buyBoxname_asin_list"])
if items["bs_category_asin_list_pg"]:
self.bs_category_asin_list_pg.extend(items["bs_category_asin_list_pg"])
if items["asin_variation_list"]:
item['variat_list'] = json.dumps(items["asin_variation_list"]) # 变体
else:
item['variat_list'] = None
item['asin_vartion_list'] = items["asin_variation_list"]
if items["all_img_video_list"]:
item['img_list'] = json.dumps(items["all_img_video_list"])
else:
if 'Click the button below to continue shopping' in response:
self.requests_error_asin_list.append(query[0])
item['img_list'] = None
if item['img_list'] is None:
item['img_list'] = json.dumps([])
self.item_queue.put(item)
# 获取字段值为None的字段名称写入redis进行统计
none_keys = [key for key, value in item.items() if
(value is None) or (value == -1 and key == 'price') or (
value == 0 and key in ['weight', 'total_comments', 'rating'])]
for key in ['parentAsin', 'week', 'all_best_sellers_herf', 'best_sellers_rank', 'seller_id',
'account_url', 'product_json', 'product_detail_json', 'review_ai_text', 'lob_asin_json',
'sp_initial_seen_asins_json', 'sp_4stars_initial_seen_asins_json',
'sp_delivery_initial_seen_asins_json', 'compare_similar_asin_json',
'customer_reviews_json', 'together_asin_json', 'min_match_asin_json',
'product_description', 'variat_num', 'qa_num', 'asin_vartion_list', 'review_label_json',
'seller_json', 'current_asin', 'five_six_val', 'best_sellers_herf',
'bundles_this_asins_json'
]:
if key in none_keys:
none_keys.remove(key)
log_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
try:
self.redis14.rpush(f'{self.site_name}_{log_time}_asin_detail_is_none', *none_keys)
except:
pass
self.reuests_para_val.send_kafka(items=item, topic=self.topic_detail_month)
print(asin, 'rank 排名:', item['rank'])
if item['rank'] is not None and item['rank'] < 9000:
# requests_num 代表不同类型url请求返回的源码。
if _response_text_var: # 请求asin 出现缺货,拿变体asin进行请求
requests_num = 2
response_gzip = self.compress_string(_response_text_var)
elif _response_text: # 发现有变体。导入asin没有标记。重新请求第二次请求
requests_num = 1
response_gzip = self.compress_string(_response_text)
else:
print('状态13', asin)
self.asin_not_div_id_dp_list.append(asin)
continue
requests_num = 0 # 第一次请求返回源码
response_gzip = self.compress_string(response)
html_data = f'{self.site_name}|-||=|-|=||-|{asin}|-||=|-|=||-|{response_gzip}|-||=|-|=||-|{new_date}|-||=|-|=||-|{requests_num}'
self.reuests_para_val.send_kafka(html_data=html_data, topic=self.topic_asin_html)
else:
print(f"当前线程-已完成-爬取-跳出循环")
break
if 'Click the button below to continue shopping' in response:
print(ingress, 'Click the button below to continue shopping ::', '#' * 80)
self.requests_error_asin_list.append(query[0])
else:
print('状态13', asin)
self.asin_not_div_id_dp_list.append(asin)
continue
# 压缩字符串
def compress_string(self, input_string):
......@@ -474,7 +488,7 @@ class async_asin_pg():
# site_name=self.site_name).xpath_html()
# print(items)
asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B0FV8W9T52|2025-01|1|1|null|null']
# asin_list = ['B0FM433BGV|2025-01|1|1|null|null']
if asin_list:
for asin in asin_list:
self.queries_asin_queue.put(asin)
......@@ -484,7 +498,7 @@ class async_asin_pg():
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
html_thread = []
for i in range(25):
for i in range(20):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
......@@ -557,4 +571,4 @@ class async_asin_pg():
pass
# if __name__ == '__main__':
# async_asin_pg(month=12, spider_int=1, week=14,site_name='us').run()
# async_asin_pg(month='02', spider_int=1, week=14,site_name='us').run()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment