Commit 409ca255 by Peng

no message

parent 4825673f
......@@ -3,24 +3,20 @@ import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.db_connect import BaseUtils
from amazon_params import py_ja3
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
from utils.requests_param import Requests_param_val
from queue import Queue
from queue import Queue, Empty
import time
import random
from lxml import etree
import json
from curl_cffi import requests
import requests as requests2
sess = requests2.Session()
import traceback
import pandas as pd
import threading
import urllib3
import re
import uuid
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings()
......@@ -78,9 +74,12 @@ class async_account_name_products(BaseUtils):
def get_product(self, t_num):
while True:
time.sleep(0.3)
if self.queries_asin_queue.empty() == False:
querys = self.queries_asin_queue.get()
try:
querys = self.queries_asin_queue.get_nowait()
except Empty:
print(f"当前线程-{t_num} 已完成-爬取-跳出循环")
break
else:
thread_num = len(threading.enumerate())
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
......@@ -96,10 +95,8 @@ class async_account_name_products(BaseUtils):
headers["cookie"] = cookie_str
try:
print(self.headers_num_int, '请求url: ', scraper_url)
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
resp.close()
resp = requests.get(scraper_url, headers=headers,
timeout=10, verify=False, impersonate="chrome")
if self.reuests_para_val.check_amazon_yzm(resp):
print(f"{self.site_name} 站点 + 使用代理ip出现验证码:{scraper_url}")
time.sleep(random.uniform(1.5, 5.5))
......@@ -138,7 +135,7 @@ class async_account_name_products(BaseUtils):
continue
try:
ingress = ingress[0].strip()
except:
except Exception:
ingress = None
print("获取邮编錯誤:")
print(ingress, '邮编 ')
......@@ -147,7 +144,7 @@ class async_account_name_products(BaseUtils):
"Cina" in ingress):
try:
cookie_ubid_main_id = re.findall(r'ubid-main=(.*?);', cookie_str)[0]
except:
except Exception:
cookie_ubid_main_id = re.findall(r'session-id=(.*?);', cookie_str)[0]
for cookie_key_value in self.cookie_dict_delete_id.items():
if cookie_ubid_main_id in cookie_key_value[1]:
......@@ -162,10 +159,11 @@ class async_account_name_products(BaseUtils):
continue
# 获取产品总数
results_span_list = response_s.xpath(
'//span[contains(text(),"results")]/text()|//div[@class="a-section a-spacing-small a-spacing-top-small"]//span/text()')
'//span[contains(text(),"result")]/text()|//div[@class="a-section a-spacing-small a-spacing-top-small"]//span/text()')
results_list = []
if len(results_span_list) > 0:
ele_text = results_span_list[0].replace(".", "").replace(",", "").replace("\xa0", "")
print('results_span_list:::', results_span_list)
ele_a = re.findall("\d+-\d+", ele_text)
if len(ele_a) == 0:
ele_a = re.findall("\d+–\d+", ele_text)
......@@ -181,7 +179,7 @@ class async_account_name_products(BaseUtils):
results_int = 1
num_int = int(results_int)
self.seller_account_num_list.append((seller_id, num_int))
except:
except Exception:
num_int = 0
print("店铺 产品总数:", seller_id, account_name, num_int)
products_asin_link_list = response_s.xpath(
......@@ -194,13 +192,12 @@ class async_account_name_products(BaseUtils):
# 获取 asin 位置
asin_href_list = response_s.xpath(f"//div[@data-asin='{products_asin}']//a/@href")
if len(asin_href_list) > 0:
asin_href_list = response_s.xpath(f"//div[@data-asin='{products_asin}']//a/@href")
asin_href_join = ''.join(asin_href_list)
row_num_lsit = re.findall(fr"{products_asin}/ref=sr_1_(\d+)\?", asin_href_join)
try:
row_num = row_num_lsit[0] if row_num_lsit else 0
row_num_int = int(row_num)
except:
except Exception:
row_num_int = 0
else:
row_num_int = 0
......@@ -228,7 +225,9 @@ class async_account_name_products(BaseUtils):
f'//div[@data-asin="{products_asin}"]//span[contains(@class,"a-size-base s-")]//text()')
if len(asin_review) == 0:
asin_review = response_s.xpath(
f'//div[@data-asin="{products_asin}"]//span[@class="a-color-link"]//text()')
f'//div[@data-asin="{products_asin}"]//span[@class="a-color-link"]//text()|//div[@data-asin="{products_asin}"]//span[contains(@class,"-normal-weight-text")]/text()')
print('asin_priceasin_review::::',asin_review)
total_comments = None
if asin_review:
number_of_reviews = asin_review[0].strip().replace(')', '').replace('(', '')
......@@ -259,9 +258,9 @@ class async_account_name_products(BaseUtils):
elif self.site_name == 'de':
if "Sternebewertungen" in number_of_reviews:
total_comments = \
re.findall(r"(.*) Sternebewertungen", number_of_reviews[0])[0]
elif "Sternebewertung" in number_of_reviews[0]:
total_comments = re.findall(r"(.*) Sternebewertung", number_of_reviews[0])[
re.findall(r"(.*) Sternebewertungen", number_of_reviews)[0]
elif "Sternebewertung" in number_of_reviews:
total_comments = re.findall(r"(.*) Sternebewertung", number_of_reviews)[
0]
else:
total_comments = number_of_reviews.replace('\xa0', '').strip()
......@@ -271,6 +270,9 @@ class async_account_name_products(BaseUtils):
else:
total_comments = number_of_reviews
if total_comments:
if 'K' in total_comments or 'k' in total_comments:
num = float(total_comments.replace('K', '').replace('k', '').replace(',', '').replace('\xa0', '').strip())
total_comments = str(int(num * 1000))
reviews = total_comments.replace('.', '').replace(',', '').replace('\xa0',
'').strip()
try:
......@@ -278,7 +280,7 @@ class async_account_name_products(BaseUtils):
pass
else:
reviews = 0
except:
except Exception:
reviews = 0
else:
reviews = None
......@@ -316,6 +318,7 @@ class async_account_name_products(BaseUtils):
try:
asin_price = response_s.xpath(
f'//div[@data-asin="{products_asin}"]//span[@class="a-offscreen"]//text()')
print('asin_price::::',asin_price)
if asin_price:
if self.site_name == 'us':
prices = asin_price[0].replace("$", "").replace("£", "").replace("€",
......@@ -335,7 +338,7 @@ class async_account_name_products(BaseUtils):
try:
rating = rating.replace(',', '.')
rating = round(float(rating), 2)
except:
except Exception:
rating = 0
if price:
try:
......@@ -344,7 +347,7 @@ class async_account_name_products(BaseUtils):
price = None
if price:
price = round(float(price), 2)
except:
except Exception:
price = None
buy_data_list = response_s.xpath(
f"//div[@data-asin='{products_asin}']//span[contains(text(),'bought')]/text()") # 月销
......@@ -360,9 +363,6 @@ class async_account_name_products(BaseUtils):
else:
if len(products_asin_link_list) == 0:
self.asin_not_sure_list.append(seller_id)
else:
print(f"当前线程-{t_num} 已完成-爬取-跳出循环")
break
def init_list(self):
print("=======清空变量==========")
......@@ -422,7 +422,7 @@ class async_account_name_products(BaseUtils):
into_workflow_progress = f"INSERT INTO workflow_progress (page, date_info, site_name, date_type, is_end, status_val, status, table_name) VALUES ('店铺产品', '{self.year_month.replace('_', '-')}', '{self.site_name}', 'month', '否', 3, '抓取结束','{self.site_name}_asin_detail_product');"
cursor_us.execute(into_workflow_progress)
connect_us.commit()
except:
except Exception:
into_workflow_progress = f"update workflow_progress set status_val=3,status='抓取结束' where page='店铺产品' and date_info='{self.year_month.replace('_', '-')}' and site_name='{self.site_name}' and date_type='month'"
print(into_workflow_progress)
cursor_us.execute(into_workflow_progress)
......@@ -436,7 +436,7 @@ class async_account_name_products(BaseUtils):
}
try:
requests.post(url=url, data=data, timeout=15)
except:
except Exception:
pass
cursor_us.close()
connect_us.close()
......@@ -475,21 +475,21 @@ class async_account_name_products(BaseUtils):
self.engine = self.mysql_connect()
sql_read = f'SELECT account_name, id, seller_id FROM {self.db_seller_account_syn} WHERE product_state=1 LIMIT {self.read_size} for update;'
print(sql_read)
self.df_read = self.engine.read_sql(sql_read)
if self.df_read.shape[0] == 0:
self.df_read = self.engine.read_then_update(
select_sql=sql_read,
update_table=f"{self.db_seller_account_syn}",
set_values={"product_state": 2},
where_keys=["id"],
)
if self.df_read.shape[0] > 0:
asin_list = list(self.df_read.account_name + '|-|' + self.df_read.seller_id)
return asin_list
elif self.df_read.shape[0] == 0:
self.stop_item_queue = False
return []
with self.engine.begin() as conn:
self.index_tuple = tuple(self.df_read['id'])
if len(self.index_tuple) == 1:
sql_update = f"""UPDATE {self.db_seller_account_syn} a set product_state=2 where a.id in ({self.index_tuple[0]})"""
else:
sql_update = f"""UPDATE {self.db_seller_account_syn} a set product_state=2 where a.id in {self.index_tuple}"""
conn.execute(sql_update)
asin_list = list(self.df_read.account_name + '|-|' + self.df_read.seller_id)
return asin_list
except Exception as e:
print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")
time.sleep(5)
continue
def save_data(self):
......@@ -557,10 +557,12 @@ class async_account_name_products(BaseUtils):
def db_change_state(self, state=2):
if state == 1:
self.db_change_state_common(state=state, account_name_list=self.requests_error_asin_list)
if state == 3:
elif state == 3:
self.db_change_state_common(state=state, account_name_list=self.account_name_list_update)
elif state == 4:
self.db_change_state_common(state=state, account_name_list=self.asin_not_found_list)
elif state == 5:
self.db_change_state_common(state=state, account_name_list=self.asin_not_seller_id)
elif state == 6:
self.db_change_state_common(state=state, account_name_list=self.asin_not_sure_list)
......@@ -583,4 +585,7 @@ class async_account_name_products(BaseUtils):
break
except Exception as e:
print(f"更改{self.db_seller_account_syn}表的state={state}出错", e, f"\n{traceback.format_exc()}")
continue
\ No newline at end of file
continue
# Script entry point: crawl seller-account product listings for the US Amazon
# site, fetching rows in batches of 20 with no proxy; `week=5` selects the
# crawl period (presumably week-of-month — TODO confirm against the class).
if __name__ == '__main__':
    async_account_name_products('us', read_size=20, proxy_name=None, week=5).run()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment