Commit c8524b22 by Peng

no message

parent 4418209b
......@@ -3,15 +3,11 @@ import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.db_connect import BaseUtils
from amazon_params.params import DB_CONN_DICT
import math
from secure_db_client import get_remote_engine
import pandas as pd
import time
import pymysql
import requests
# import numpy as np
#
# from scipy.optimize import curve_fit
import math
"""计算销量,均值差"""
......@@ -59,19 +55,19 @@ class CalculateMean(BaseUtils):
sql_6 = f"""
SELECT * from {self.site_name}_one_category WHERE id in ( select max(id) from {self.site_name}_one_category where `year_month`='2025_5' and orders=0 and rank>50000 GROUP BY `name`)
UNION
select * from {self.site_name}_one_category where `year_month`='2025_6' and rank<=50000
select * from {self.site_name}_one_category where `year_month`='2025_8' and rank<=50000
"""
print('查询原始表6:', sql_6)
self.df_sum_6 = pd.read_sql(sql_6, con=self.engine)
self.df_sum_6 = self.engine.read_sql(sql_6)
# ---- July ----
sql_7 = f"""
SELECT * from {self.site_name}_one_category WHERE id in ( select max(id) from {self.site_name}_one_category where `year_month`='2025_5' and orders=0 and rank>50000 GROUP BY `name`)
UNION
select * from {self.site_name}_one_category where `year_month`='2025_7' and rank<=50000
select * from {self.site_name}_one_category where `year_month`='2025_9' and rank<=50000
"""
print('查询原始表7:', sql_7)
self.df_sum_7 = pd.read_sql(sql_7, con=self.engine)
self.df_sum_7 = self.engine.read_sql(sql_7)
# after concatenation, rely on keep='last' to keep the July rows
self.df_sum = pd.concat([self.df_sum_6, self.df_sum_7], ignore_index=True)
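# Illustrative sketch only (the dedup itself lives outside this hunk): after the concat
# above, a drop_duplicates(keep='last') keeps the later month's row whenever both months
# contain the same entry. The subset columns below are an assumption, not this job's code:
#   df = pd.concat([df_earlier, df_later], ignore_index=True)
#   df = df.drop_duplicates(subset=['name', 'rank'], keep='last')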
......@@ -86,62 +82,12 @@ class CalculateMean(BaseUtils):
self.cate_list = list(set(self.df_sum.name))
sql_select = f"SELECT `year_month` from selection.week_20_to_30 WHERE `week`={int(self.week)} and `year`={self.year}"
print(sql_select, 'sql_select:')
df = pd.read_sql(sql_select, con=self.engine)
df = self.engine.read_sql(sql_select)
self.year_month = list(df['year_month'])[0] if list(df['year_month']) else ''
print("self.year_month:", self.year_month)
time.sleep(2)
self.handle_data()
# def handle_data(self,max_rank=1_000_000, step=1):
# records = []
# for cate in self.cate_list:
# dfk = (self.df_sum[self.df_sum.name == cate]
# [['rank', 'orders']]
# .drop_duplicates()
# .query('orders>0')
# .sort_values('rank'))
# if len(dfk) < 3: continue
#
# # 1) build log(rank) and log(orders)
# lr = np.log(dfk['rank'].values)
# lo = np.log(dfk['orders'].values)
# # 2) quadratic polynomial expansion X = [1, lr, lr^2]
# X = np.vstack([np.ones_like(lr), lr, lr ** 2]).T
# # 3) solve the least-squares problem: coef = (X^T X)^-1 X^T lo
# coef = np.linalg.lstsq(X, lo, rcond=None)[0]
#
# # 4) predict over full_range with this polynomial
# full = np.arange(dfk['rank'].min(), max_rank + 1, step)
# lf = np.log(full)
# log_pred = coef[0] + coef[1] * lf + coef[2] * (lf ** 2)
# orders_pred = np.exp(log_pred)
# cutoff_idx = np.argmax(orders_pred <= 30)
# # if orders_pred never drops below min_orders, cutoff_idx will be 0
# # but then orders_pred[0] must be >= min_orders, so check:
# if orders_pred[cutoff_idx] >= 30:
# # no point in the array falls below the threshold, keep everything
# last = len(full)
# else:
# # values drop below min_orders starting at cutoff_idx, so truncate just before it
# last = cutoff_idx
# full = full[:last]
# orders_pred = orders_pred[:last]
#
# # 5. assemble the output DataFrame
# dfout = pd.DataFrame({
# 'name': cate,
# 'rank': full,
# 'orders': orders_pred
# })
# # compute average daily sales with rounding
# dfout['orders_day'] = (dfout['orders'] / 30).round(0).astype(int)
# dfout['year_month'] = self.year_month
# dfout['week'] = self.week
#
# records.append(dfout)
#
# self.df_repeat = pd.concat(records, ignore_index=True)
def handle_data(self):  # old code
print(len(self.cate_list))
......@@ -195,12 +141,12 @@ class CalculateMean(BaseUtils):
sql = f"select en_name as name, category_id from {self.site_name}_bs_category where 1 = 1 and nodes_num = 2 group by en_name, category_id"
print('sql',sql)
df_en_name = pd.read_sql(sql, con=self.engine)
df_en_name = self.engine.read_sql(sql)
# use merge to check whether the two frames' name columns match
self.df_repeat = pd.merge(self.df_repeat, df_en_name, on='name', how='left')
self.df_repeat = self.df_repeat.loc[self.df_repeat.orders >= 30]  # keep rows whose monthly orders are >= 30
self.df_repeat.drop_duplicates(['name', 'rank', 'orders'], inplace=True)  # deduplicate
self.df_repeat.to_sql(f"{self.site_name}_one_category_report", con=self.engine, if_exists="append", index=False)
self.engine.to_sql(self.df_repeat,f"{self.site_name}_one_category_report",if_exists="append")
def run(self):
self.db_read_data()
......@@ -208,21 +154,20 @@ class CalculateMean(BaseUtils):
self.db_save_data()
def sendMessage(self, week, site_name):
db = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'],
user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'],
database='selection', charset="utf8mb4")
cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
engine_us_mysql = get_remote_engine(
site_name='us', # -> database "selection"
db_type="mysql", # -> 服务端 alias "mysql"
)
with engine_us_mysql.begin() as conn:
time_strftime = time.strftime("%Y-%m-%d %X", time.localtime())
update_workflow_progress = f"update workflow_progress set status_val=3,status='抓取结束' where page='ASIN销量' and date_info='2025-{week}' and site_name='{site_name}' and date_type='week'"
print(update_workflow_progress)
cursor.execute(update_workflow_progress)
db.commit()
cursor.close()
db.close()
conn.execute(update_workflow_progress)
url = 'http://47.112.96.71:8082/selection/sendMessage'
data = {
'account': 'pengyanbing,fangxingjun,wangrui4',
'account': 'pengyanbing,fangxingjun',
'title': f"{site_name} 站点类目销量统计",
'content': str(self.week) + f' 周 {site_name}站点类目销量计算 已结束,请确认下一步流程!时间:' + time_strftime
}
......@@ -240,12 +185,12 @@ if __name__ == '__main__':
handle_obj_us = CalculateMean(site_name='us', year=2025, week=week)
handle_obj_us.run()
handle_obj_us.sendMessage(week, site_name='us')
#handle_obj_uk = CalculateMean(site_name='uk', year=2025, week=week)
#handle_obj_uk.run()
# handle_obj_uk = CalculateMean(site_name='uk', year=2025, week=week)
# handle_obj_uk.run()
# handle_obj_uk.sendMessage(week, site_name='uk')
#handle_obj_de = CalculateMean(site_name='de', year=2025, week=week)
#handle_obj_de.run()
#handle_obj_de.sendMessage(week, site_name='de')
# handle_obj_de = CalculateMean(site_name='de', year=2025, week=week)
# handle_obj_de.run()
# handle_obj_de.sendMessage(week, site_name='de')
# handle_obj_fr = CalculateMean(site_name='fr', year=2025, week=week)
# handle_obj_fr.run()
# handle_obj_fr.sendMessage(week, site_name='fr')
......
import datetime
import json
import os
import random
import re
import sys
import time
import traceback
import pandas as pd
from lxml import etree
from secure_db_client import get_remote_engine
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
syn_state = False
click_product_name_list = []
# Category analysis
class dow_category_Product():
def __init__(self, site):
self.site_name = site
self.click_product_name_list = []
self.update_cagetory_state = False
def mysql_connect(self, site='us'):
if site == 'us':
db = 'selection'
else:
db = f'selection_{site}'
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database=db,
query={"charset": "utf8mb4"}
self.engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
db_type='mysql',  # -> server-side alias "mysql"
)
self.engine_mysql = create_engine(
url)
url_us = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database='selection',
query={"charset": "utf8mb4"}
self.engine_us_mysql = get_remote_engine(
site_name='us', # -> database "selection"
db_type='mysql',  # -> server-side alias "mysql"
)
self.engine_pg = get_remote_engine(
site_name=site, # -> database "selection"
db_type='postgresql_15_outer',  # -> server-side PostgreSQL alias
)
self.engine_us_mysql = create_engine(
url_us)
self.engine_pg = create_engine(
f"postgresql+psycopg2://postgres:F9kL2sXe81rZq@113.100.143.162:5432/{db}",
encoding='utf-8')
self.num = 0
week = time.strftime("%W")
yaer = time.strftime('%Y', time.localtime(time.time()))
......@@ -77,7 +58,7 @@ class dow_category_Product():
os.system(f'start Chrome {params_} --remote-debugging-port={port}')
chrome_options = Options()
# disable image loading
chrome_options.add_argument('--blink-settings=imagesEnabled=false') # 这样可以;;;;激动
chrome_options.add_argument('--blink-settings=imagesEnabled=false')
chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}")
driver = webdriver.Chrome(r'chromedriver.exe', options=chrome_options)
# headless mode
......@@ -88,40 +69,19 @@ class dow_category_Product():
chrome_options.add_argument('--no-sandbox')
# use /tmp instead of /dev/shm (avoids running out of shared memory)
chrome_options.add_argument('--disable-dev-shm-usage')
# other resource-saving options worth trying
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-application-cache')
chrome_options.add_argument('--disable-background-timer-throttling')
chrome_options.add_argument('--disable-backgrounding-occluded-windows')
chrome_options.add_argument('--disable-renderer-backgrounding')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# shut down most rendering paths entirely
chrome_options.add_argument('--disable-gpu-compositing')
chrome_options.add_argument('--disable-2d-canvas-clip-aa')
chrome_options.add_argument('--disable-2d-canvas-cpu-path')
chrome_options.add_argument('--disable-accelerated-2d-canvas')
opts = chrome_options
# limit the number of renderer processes
opts.add_argument('--renderer-process-limit=1')
# limit the number of utility processes
opts.add_argument('--utility-process-limit=1')
# disable Site Isolation
opts.add_argument('--disable-site-isolation-trials')
self.get_category(site, driver)
def get_category(self, site, driver):
for i in range(2):
try:
driver.get('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?')
time.sleep(random.uniform(8, 25.25))
time.sleep(random.uniform(8, 20.25))
driver.get('https://sellercentral.amazon.com/selection/category-insights')
time.sleep(random.uniform(8, 25.25))
time.sleep(random.uniform(8, 20.25))
break
except:
time.sleep(5)
time.sleep(1)
time.sleep(5)
if site == 'us':
driver.execute_script(
'document.querySelector("#ATVPDKIKX0DER > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()')
......@@ -131,7 +91,7 @@ class dow_category_Product():
elif site == 'de':
driver.execute_script(
'document.querySelector("#A1PA6795UKMFR9 > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()')
time.sleep(random.uniform(8, 15.25))
time.sleep(random.uniform(5, 10.25))
html = etree.HTML(driver.page_source)
self.save_category(html)
print(333333333333333333333333)
......@@ -157,31 +117,35 @@ class dow_category_Product():
num = 0
for Category in Category_list:
self.cilik_site(driver)
product_nums = 0
click_product_name_list=[]
print(Category, ' 22222222222222222222222222222222222222')
if self.update_cagetory_state:
self.click_product_name_list = []
try:
num += 1
Category_name = Category
# _Category = Category.replace('&', '\\\&')
print("Category_name 名称 11111", Category)
driver.execute_script(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""")
time.sleep(2)
time.sleep(1)
html = etree.HTML(driver.page_source)
Product_Type_list = html.xpath(
'//h2[contains(text(),"Product Type")]/following-sibling::div/div')
product_nums = 0
for Product_Type in Product_Type_list:
time.sleep(0.5)
try:
save_Category_list = []
Product_name = Product_Type.xpath('./@id')
print(product_nums, "Product_name3222222222::", Product_name[0].upper())
if Product_name[0] in self.click_product_name_list:
print(product_nums, "已经抓取::", Product_name[0].upper())
continue
driver.execute_script("localStorage.clear();") # 清除本地存储
time.sleep(0.5)
# driver.execute_script("sessionStorage.clear();") # 清除会话存储
time.sleep(0.5)
driver.execute_script(
"caches.keys().then(function(names) { for (let name of names) { caches.delete(name); } });")
driver.execute_script("window.performance.clearResourceTimings();")
time.sleep(1)
time.sleep(0.5)
# assuming a driver instance already exists
# enable the heap profiler first
driver.execute_cdp_cmd('HeapProfiler.enable', {})
......@@ -189,15 +153,9 @@ class dow_category_Product():
driver.execute_cdp_cmd('HeapProfiler.collectGarbage', {})
# optionally disable it again afterwards
driver.execute_cdp_cmd('HeapProfiler.disable', {})
try:
time.sleep(2)
save_Category_list = []
Product_name = Product_Type.xpath('./@id')
print("Product_name3222222222::", Product_name[0].upper())
# print('click_product_name_list::', click_product_name_list)
# if Product_name[0] in click_product_name_list:
# continue
# click_product_name_list.append(Product_name[0])
time.sleep(0.5)
self.click_product_name_list.append(Product_name[0])
self.update_cagetory_state = False
driver.execute_script(f"document.querySelector('#{Product_name[0]} > kat-radiobutton').click()")
time.sleep(2)
html = etree.HTML(driver.page_source)
......@@ -213,14 +171,19 @@ class dow_category_Product():
Keyword = html.xpath(f"//div[@id='{Keyword_id[0]}']/kat-radiobutton/@label")
print('Keyword', Keyword)
driver.find_element(By.XPATH, f'//kat-radiobutton[@value="{Keyword_id[0]}"]').click()
time.sleep(2.3)
time.sleep(2)
html_1 = etree.HTML(driver.page_source)
most_popular_list = html_1.xpath(
"//div[@class='most-popular-keywords-container']/kat-list//li")
if most_popular_list:
for most_popular in most_popular_list:
most_popular_keyword = most_popular.xpath('.//div[2]/text()')[0]
most_popular_b_nums = most_popular.xpath('.//div/b/text()')[0]
most_keyword_list = most_popular.xpath('.//div[2]/text()')
print(most_keyword_list, 1111)
most_popular_keyword = most_keyword_list[0] if most_keyword_list else None
most_popular_b_nums_list = most_popular.xpath('.//div/b/text()')
print(most_popular_b_nums_list, 2222)
most_popular_b_nums = most_popular_b_nums_list[
0] if most_popular_b_nums_list else None
most_popular_dict = {"most_popular_keywords": most_popular_keyword,
'most_popular_search_nums': most_popular_b_nums}
most_popular_keyword_list.append(most_popular_dict)
......@@ -318,10 +281,12 @@ class dow_category_Product():
pattern = r'\£([\d.]+)'
elif self.site_name == 'de':
pattern = r'\€([\d.]+)'
else:
pattern = ''
# use findall to collect every matching amount
matches_list = re.findall(pattern, big_text_Advertisement)
ad_spend = matches_list[0]
majority_spend = matches_list[1]
ad_spend = matches_list[0] if matches_list else None
majority_spend = matches_list[1] if matches_list else None
else:
ad_spend = 0
majority_spend = 0
......@@ -331,6 +296,7 @@ class dow_category_Product():
print('原始数据')
print([big_text_sller, big_text_brand, big_text_asin, big_text_new_asin,
big_text_per_asin, big_text_Advertisement, big_text_star])
print(Category, ' 22222222222222222222222222222222222222')
# brands
if big_text_brand:
if 'K' in big_text_brand:
......@@ -382,46 +348,84 @@ class dow_category_Product():
print('转成int')
print([big_brand_int, big_asin_int,
big_new_asin_int, big_per_asin_int])
# top_data_json = self.new_top_grossing(driver, 'Top')
# time.sleep(1)
# news_data_json = self.new_top_grossing(driver, 'News')
# time.sleep(1)
top_data_json = self.new_top_grossing(driver, 'Top')
top_data_dict = json.loads(top_data_json)
if top_data_dict.get('products_aggregate_sales'):
_top_data_dict = self.parse_input('top', top_data_dict)
else:
_top_data_dict = self.parse_input('top', None)
top_sales_amount = _top_data_dict['top_sales_amount']
top_sales_volume = _top_data_dict['top_sales_volume']
top_search_ratio = _top_data_dict['top_search_ratio']
top_return_ratio = _top_data_dict['top_return_ratio']
top_adv_spend = _top_data_dict['top_adv_spend']
top_majority_spend = _top_data_dict['top_majority_spend']
print('top_majority_spend', top_majority_spend)
news_data_json = self.new_top_grossing(driver, 'News')
news_data_dict = json.loads(news_data_json)
if news_data_dict.get('products_aggregate_sales'):
_news_data_dict = self.parse_input('news', news_data_dict)
else:
_news_data_dict = self.parse_input('news', None)
news_sales_amount = _news_data_dict['news_sales_amount']
news_sales_volume = _news_data_dict['news_sales_volume']
news_search_ratio = _news_data_dict['news_search_ratio']
news_return_ratio = _news_data_dict['news_return_ratio']
news_adv_spend = _news_data_dict['news_adv_spend']
news_majority_spend = _news_data_dict['news_majority_spend']
print('news_majority_spend', news_majority_spend)
save_Category_list.append(
[Category_name, Product_name[0], Keyword[0], float(search_ratio),
float(product_average), float(return_ratio), float(return_product_average),
self.y_w, big_text_sller, big_text_brand, big_text_asin, big_text_new_asin,
big_text_per_asin, big_text_Advertisement, big_text_star, big_brand_int,
big_asin_int, big_new_asin_int, big_per_asin_int, five_star, three_star, two_star,
one_star, ad_spend, majority_spend, most_popular_json_dict, reasons_returns_json
])
one_star, ad_spend, majority_spend, most_popular_json_dict, reasons_returns_json,
top_data_json, news_data_json, top_sales_amount, top_sales_volume,
top_search_ratio,
top_return_ratio, top_adv_spend, top_majority_spend, news_sales_amount,
news_sales_volume,
news_search_ratio, news_return_ratio, news_adv_spend, news_majority_spend])
print('数据:', save_Category_list)
except:
print('============ 下标。超出 。 ==========')
continue
except Exception as e:
print('============ 下标。超出 。 ==========', e)
print('存储数据长度:', len(save_Category_list))
while True:
try:
if save_Category_list:
with self.engine_mysql.begin() as conn_mysql:
for i in save_Category_list:
dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'"
print('删除删除mysql:', dele_sql)
conn_mysql.execute(dele_sql)
df = pd.DataFrame(data=save_Category_list,
columns=['category', "product_type", "item_type_keyword",
"search_ratio", "product_average", "return_ratio",
"return_product_average", "year_week", 'sellers',
'new_brands',
'asin', 'new_asin', 'per_asin', 'advertisement_spend',
'star_ratings', 'new_brands_int', 'asin_int',
'new_asin_int', 'per_asin_int', 'five_star',
'three_star', 'two_star', 'one_star', 'ad_spend',
'majority_spend', 'most_popular_keywords_item',
'reasons_returns_json'
])
df.to_sql(f'{site}_aba_profit_category_insights', con=self.engine_mysql,
if_exists="append", index=False)
print('存储成功 mysql')
# with self.engine_mysql.begin() as conn_mysql:
# for i in save_Category_list:
# dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'"
# print('删除删除mysql:', dele_sql)
# conn_mysql.execute(dele_sql)
# df = pd.DataFrame(data=save_Category_list,
# columns=['category', "product_type", "item_type_keyword",
# "search_ratio", "product_average", "return_ratio",
# "return_product_average", "year_week", 'sellers',
# 'new_brands',
# 'asin', 'new_asin', 'per_asin', 'advertisement_spend',
# 'star_ratings', 'new_brands_int', 'asin_int',
# 'new_asin_int', 'per_asin_int', 'five_star',
# 'three_star', 'two_star', 'one_star', 'ad_spend',
# 'majority_spend', 'most_popular_keywords_item',
# 'reasons_returns_json', 'top_data_json',
# 'news_data_json',
# 'top_sales_amount', 'top_sales_volume',
# 'top_search_ratio',
# 'top_return_ratio', 'top_adv_spend',
# 'top_majority_spend',
# 'news_sales_amount',
# 'news_sales_volume',
# 'news_search_ratio', 'news_return_ratio',
# 'news_adv_spend',
# 'news_majority_spend'
# ])
# self.engine_mysql.to_sql(df, f'{site}_aba_profit_category_insights',
# if_exists="append")
# print('存储成功 mysql')
with self.engine_pg.begin() as conn_pg:
for i in save_Category_list:
dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'"
......@@ -437,9 +441,18 @@ class dow_category_Product():
'new_asin_int', 'per_asin_int', 'five_star',
'three_star', 'two_star', 'one_star', 'ad_spend',
'majority_spend', 'most_popular_keywords_item',
'reasons_returns_json'])
df.to_sql(f'{site}_aba_profit_category_insights', con=self.engine_pg,
if_exists="append", index=False)
'reasons_returns_json', 'top_data_json',
'news_data_json', 'top_sales_amount', 'top_sales_volume',
'top_search_ratio',
'top_return_ratio', 'top_adv_spend',
'top_majority_spend',
'news_sales_amount',
'news_sales_volume',
'news_search_ratio', 'news_return_ratio',
'news_adv_spend',
'news_majority_spend'])
self.engine_pg.to_sql(df, f'{site}_aba_profit_category_insights',
if_exists="append")
print(save_Category_list)
print('存储成功 pg')
break
......@@ -451,23 +464,31 @@ class dow_category_Product():
print('============ 产品分类 下标。超出 。无数据 ==========', f"\n{traceback.format_exc()}")
time.sleep(2)
continue
# product_nums+=1
# if product_nums>10:
# time.sleep(2)
# print(product_nums, 'product_nums 重新启动 浏览器,')
# driver.close()
# driver.quit()
# time.sleep(2)
# product_nums = 0
# self.run()
product_nums += 1
if product_nums > 12:
product_nums = 0
print(product_nums, 'product_nums 重新启动 浏览器,')
product_nums = 0
driver.close()
driver.quit()
time.sleep(1)
self.run()
except Exception as e:
print(e, '执行错误')
time.sleep(random.uniform(10, 20))
self.reboot_driver(driver, site) # 重启刷新
while True:
try:
updated_at = datetime.datetime.now().strftime("%m-%d %H:%M:%S")
with self.engine_pg.begin() as conn:
update_sql = f"update seller_category_insights_syn set state =3 where category='{Category}'"
print('更新update_sql:', update_sql)
conn.execute(update_sql)
self.update_cagetory_state = True
break
except Exception as e:
print(e,'修改状态3报错')
time.sleep(20)
if num > 1:
driver.close()
driver.quit()
......@@ -479,7 +500,48 @@ class dow_category_Product():
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val',
'table_name', 'date_type', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_progress', con=self.engine_us_mysql, if_exists='append', index=False)
self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append')
def safe_get(self, lst, idx, default=None):
return lst[idx] if 0 <= idx < len(lst) else default
def parse_input(self, type, input):
sales_amount = 0
sales_volume = 0
search_ratio = 0
return_ratio = 0
adv_spend = 0
majority_spend = 0
if input:
products_aggregate_sales = input.get('products_aggregate_sales', [])[0]
if products_aggregate_sales:
split = products_aggregate_sales.split("|")
sales_amount_str = self.safe_get(split, 1, '').partition("$")[-1]
sales_volume_str = self.safe_get(re.findall(r'\d+', self.safe_get(split, 2, '')), 0, "0")
if len(sales_amount_str) > 0:
sales_amount = float(sales_amount_str.strip().replace(",", ""))
sales_volume = float(sales_volume_str)
pass
search_ratio = float(input['search_ratio'] or -1)
return_ratio = float(input['return_ratio'] or -1)
big_text_Advertisement = input['big_text_Advertisement']
if big_text_Advertisement:
split = big_text_Advertisement.split("|-|")
adv_spend_str = self.safe_get(split, 0, '').partition("$")[-1]
majority_spend_str = self.safe_get(split, 1, '').partition("$")[-1]
adv_spend = (float(adv_spend_str.strip()) if adv_spend_str != '' else 0)
majority_spend = (float(majority_spend_str.strip()) if majority_spend_str != '' else 0)
return {
f"{type}_sales_amount": sales_amount,
f"{type}_sales_volume": sales_volume,
f"{type}_search_ratio": search_ratio,
f"{type}_return_ratio": return_ratio,
f"{type}_adv_spend": adv_spend,
f"{type}_majority_spend": majority_spend
}
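# Rough usage sketch for parse_input; the input strings below are assumptions inferred
# from the "|", "|-|" and "$" splitting above, not a confirmed upstream format:
#   parse_input('top', {
#       'products_aggregate_sales': ['... | $1,234.56 | 789 products sold'],
#       'search_ratio': '0.5', 'return_ratio': '0.1',
#       'big_text_Advertisement': '$12.30|-|$4.50'})
#   -> {'top_sales_amount': 1234.56, 'top_sales_volume': 789.0, 'top_search_ratio': 0.5,
#       'top_return_ratio': 0.1, 'top_adv_spend': 12.3, 'top_majority_spend': 4.5}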
def analysis_top_Newly_html(self, driver):
html_top = etree.HTML(driver.page_source)
......@@ -572,7 +634,7 @@ class dow_category_Product():
'most_popular_json_dict': most_popular_keyword_list, 'search_ratio': search_ratio,
'return_ratio': return_ratio,
'big_text_Advertisement': big_text_Advertisement, 'big_text_star': big_text_star}
print('data_dict')
print('data_dict',data_dict)
return json.dumps(data_dict)
def new_top_grossing(self, driver, click_type):
......@@ -607,7 +669,7 @@ class dow_category_Product():
break
except Exception as e:
print('reboot_driver详细报错')
print(traceback.format_exc(e))
print(traceback.format_exc())
driver.close()
driver.quit()
time.sleep(5)
......@@ -623,8 +685,9 @@ class dow_category_Product():
print('接着上次中断的继续')
self.mysql_connect(site=self.site_name)
select_sql = 'select category from seller_category_insights_syn where state =1'
df = pd.read_sql(select_sql, con=self.engine_pg)
df = self.engine_pg.read_sql(select_sql)
category_list = list(df.category)
print(category_list)
if category_list:
return category_list
else:
......@@ -635,8 +698,8 @@ class dow_category_Product():
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val',
'table_name', 'date_type', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_progress', con=self.engine_us_mysql, if_exists='append',
index=False)
self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append'
)
def save_category(self, html):
Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div')
......
import json
import random
import os
import re
import time
import traceback
from datetime import datetime
import pandas as pd
import redis
from lxml import html
import requests
from lxml import etree
from playwright.sync_api import sync_playwright
from secure_db_client import get_remote_engine
......@@ -12,23 +14,75 @@ from secure_db_client import get_remote_engine
def mysql_connect():
engine_us_mysql = get_remote_engine(
site_name='us', # -> database "selection"
db_type='mysql',  # -> server-side alias "mysql"
db_type='postgresql_15_outer',  # -> server-side PostgreSQL alias
)
return engine_us_mysql
def parse_list(s: str):
# 把 "[a, b, c]" 这样的值转成 ["a","b","c"]
return [t.strip() for t in s.strip().strip('[]').split(',') if t.strip()]
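# e.g. parse_list("[id1, id2, id3]") -> ["id1", "id2", "id3"]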
def run(asin_list):
print('asin_list:::',asin_list)
print('asin_list:::',len(asin_list))
def clean_text(node):
# skip normalize-space(); normalize whitespace in Python instead
return " ".join("".join(node.itertext()).split())
def extract_visible_headers(doc):
print('extract_visible_headers')
# read the list of hidden column ids
hidden_val = doc.xpath('//input[@id="hiddenColumnTitles"]/@value')
hidden_ids = set(parse_list(hidden_val[0])) if hidden_val else set()
headers = []
seen = set()
# only scan the header row, take th elements in document order, and skip hidden columns
for th in doc.xpath('//tr[@id="head-row"]/th[@id]'):
col_id = th.get('id')
if col_id in seen:
continue
seen.add(col_id)
if col_id in hidden_ids:
continue
# extra defence: also skip columns hidden via inline style or class name
style = (th.get('style') or '').replace(' ', '').lower()
classes = (th.get('class') or '')
if 'display:none' in style or 'a-hidden' in classes.split():
continue
label = clean_text(th)
if label:
headers.append((col_id, label))
return headers
def extract_rows(doc, headers):
rows = []
# data rows on the page all carry the mt-row class
for tr in doc.xpath('//table[contains(@class,"mt-table")]//tr[contains(@class,"mt-row")]'):
row = {}
for col_id, label in headers:
# data cells reference their column id via the data-column attribute
td = tr.xpath('.//td[@data-column=$c]', c=col_id)
value = clean_text(td[0]) if td else ""
row[label] = value
# treat the row as valid as long as any visible column has a value
if any(row.values()):
rows.append(row)
return rows
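# These two helpers are applied together to each captured table response further down,
# e.g. tree = etree.HTML(html_content); rows = extract_rows(tree, extract_visible_headers(tree)).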
def run_spider(asin_list):
print('asin_list:::', asin_list)
print('asin_list:::', len(asin_list))
if asin_list:
try:
pr_name = "chrome.exe"
os.system('%s%s' % ("taskkill /F /IM ", pr_name))
except Exception as e:
print("强制关闭chrome.exe失败:", e)
# initialization
with sync_playwright() as _playwright:
# _playwright.chromium.launch_persistent_context
browser = _playwright.chromium.launch_persistent_context(
context = _playwright.chromium.launch_persistent_context(
# local Chrome user-data (cache) directory
user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data",
user_data_dir=r"C:\Users\admin\AppData\Local\Google\Chrome\User Data",
# path to the local Chrome executable
executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
executable_path=r"C:\Users\admin\AppData\Local\Google\Chrome\Application\chrome.exe",
# must be enabled in order to download files this way; defaults to False
accept_downloads=True,
# run with a visible window (not headless)
......@@ -37,121 +91,180 @@ def run(asin_list):
locale='en-GB',
ignore_https_errors=True,
no_viewport=True,
slow_mo=10,
# bypass automation detection
args=['--disable-blink-features=AutomationControlled', '--remote-debugging-port=9222']
slow_mo=10
)
page = browser.new_page()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page.add_init_script(js)
page.evaluate_handle('''() =>{ window.chrome = { runtime: {}, }; }''')
page.evaluate_handle(
'''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
# spoof browser parameters
page.locator("body").click()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page.add_init_script(js)
page = context.pages[0] if context.pages else context.new_page()
print('打开浏览器请求asin:')
page = browser.new_page()
# page.goto('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?')
# page.wait_for_timeout(1500)
def intercept_request(request):
try:
page.goto('https://sellercentral.amazon.com')
time.sleep(random.uniform(2, 5))
except:
save_asin_var_data(asin_list[0], json.dumps({"content": "网络有问题 登录账号失败。远程账号电脑检查"}), '失败')
for asin in asin_list:
time.sleep(random.uniform(1, 3))
if "populate" in request.url:
print('request.url::', request.url)
resp = request.response().text()
html_data_list.append(resp)
except Exception as e:
print("获取响应失败:", e, f"\n{traceback.format_exc()}")
# try:
page.goto(
'https://sellercentral.amazon.com/listing/varwiz?ref_=xx_swlang_head_xx&mons_sel_locale=en_US&languageSwitched=1')
page.wait_for_timeout(1500)
print('page.url::', page.url)
for id_asin in asin_list:
print("id_asin::", id_asin)
id_asin_list = id_asin.split('|-|')
asin = id_asin_list[0]
id = int(id_asin_list[1])
print('开始抓取::', asin, 'id::', id)
error = None
if 'signin' in page.url:
save_asin_var_data(asin, [], '失败', 'us', [], [], '账号电脑退出登录', id)
semd_ms(asin)
time.sleep(120)
return
html_data_list = []
# page.reload()  # refresh the page
page.wait_for_timeout(1000)
page.locator('//*[@id="varwizard_accordion"]/div[1]/div/div[1]/h5/a/i').click()
page.wait_for_timeout(1000)
page.locator('//*[@id="varwiz-search-text"]').fill(f'{asin}')
page.wait_for_timeout(5000)
page.on("requestfinished", intercept_request)
page.locator('//*[@id="a-autoid-0"]/span/input').click() # 点击
page.wait_for_timeout(15000)
# request the target URL
items_list = []
parent_asin_list = []
var_asin_list = []
print(f'请求asin {asin} data_list::', len(html_data_list))
for html_content in html_data_list:
if (
'variation family is not supported on this' in html_content and 'ERROR' in html_content) or (
'you searched for is not' in page.content()):
try:
print('请求asin', asin)
url = f"https://sellercentral.amazon.com/listing/varwiz/search?searchText={asin}"
print('url:', url)
page.goto(url)
time.sleep(random.uniform(3, 8))
print()
print(page.content())
html_string = page.content()
time.sleep(0.5)
if 'The ASIN you searched for is not part of any variation' not in html_string:
doc = html.fromstring(html_string)
# take the text content of the first <pre> (inner HTML tags are stripped automatically)
pre_nodes = doc.xpath('//pre')
if not pre_nodes:
raise ValueError("找不到 <pre> 节点")
pre_text = pre_nodes[0].text_content().strip()
# try parsing directly (works when the <pre> contains the whole JSON payload)
data_json = json.loads(pre_text)
print(data_json) # dict / list
print('获取完成', asin)
save_asin_var_data(asin, data_json, '成功')
error = re.findall(r'message =(.*?);', html_content)[0]
except:
error = "The ASIN you searched for is not part of any variation family"
print('errorerror::', error)
save_asin_var_data(asin, [], '失败', 'us', [], [], error, id)
break
tree = etree.HTML(html_content)
visible_headers = extract_visible_headers(tree)
rows = extract_rows(tree, visible_headers)
if rows:
for i in rows:
print(i, '233333333333')
Parentage = i.get('Parentage')
if Parentage == 'parent':
parent_asin = i.get('ASIN')
parent_asin_list.append(parent_asin)
if Parentage == 'child':
var_asin = i.get('ASIN')
var_asin_list.append(var_asin)
items_list.extend(rows)
if items_list:
print('items_listitems_list::', len(items_list))
items_list_json = json.dumps(items_list)
save_asin_var_data(asin, items_list_json, '成功', 'us', parent_asin_list, var_asin_list, None, id)
html_data_list = []
# except Exception as e:
# print(asin, '报错::', e)
# semd_ms(asin)
# save_asin_var_data(asin, [], '失败', 'us', [], [], None, id)
else:
print('没有该asin,', asin)
save_asin_var_data(asin, json.dumps(
{"content": "The ASIN you searched for is not part of any variation family"}), '成功')
except Exception as e:
print('报错,‘23232323232323232323', e)
save_asin_var_data(asin, json.dumps({"content": "下载失败。远程账号电脑检查"}), '失败')
continue
def semd_ms(asin):
try:
url = 'http://47.112.96.71:8082/selection/sendMessage'
data = {
'content': '下载 变体数据失败。远程账号电脑 HM 299 421 380',
'title': f'账号电脑 {asin} 变体数据 失败',
'account': 'pengyanbing'
}
print(data)
y = requests.post(url=url, data=data, timeout=15)
print(y.content.decode('gbk'))
except:
pass
def redis_get_asin():
asin_list = []
random_key_list = []
redis_client = redis.Redis(host='113.100.143.162', port=6379, db=10, password='fG7#vT6kQ1pX')
def mysql_get_asin():
while True:
try:
print('轮询redis 查询,')
for i in range(10):
# 随机获取一个key
random_key = redis_client.randomkey()
if random_key:
random_key_list.append(random_key)
# 获取该key对应的value
value = redis_client.get(random_key)
value = value.decode('utf-8')
print('redis取出asin: ', value)
if value not in asin_list:
asin_list.append(value)
print('轮询 mysql 查询:', datetime.now().strftime("%m-%d %H:%M:%S"))
engine_us_mysql = mysql_connect()
spider_state_sql = """select asin,id from asin_variation_family_log where status = '未开始' and length(asin)=10 limit 20 """
print('spider_state_sql:', spider_state_sql)
df_asin = engine_us_mysql.read_sql(spider_state_sql)
if not df_asin.empty:
update_time = int(time.time())
with engine_us_mysql.begin() as conn:
index_tuple = tuple(df_asin['id'])
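# A single-element tuple renders as "(123,)" with a trailing comma, which is invalid in a
# SQL "in (...)" clause, so the one-id case is formatted explicitly below.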
if len(index_tuple) == 1:
sql_update = f"""UPDATE asin_variation_family_log a set status='爬取中',update_time='{update_time}' where a.id in ({index_tuple[0]})"""
else:
break
if asin_list:
_asin_lis = list(set(asin_list))
print("_asin_lis:::",_asin_lis, )
sql_update = f"""UPDATE asin_variation_family_log a set status='爬取中',update_time='{update_time}' where a.id in {index_tuple}"""
print('UPDATE_sql:', sql_update)
conn.execute(sql_update)
_asin_lis = list(df_asin.asin + '|-|' + df_asin.id.astype("U"))
print("_asin_lis:::", _asin_lis, )
print("_asin_lis::: len ", len(_asin_lis))
run(_asin_lis) # 传递asin 列表
asin_list = []
for _key in random_key_list:
print(' 删除redis的asin:', _key)
redis_client.delete(_key) # 删除redis的asin
random_key_list = []
else:
run_spider(_asin_lis)  # pass the asin list
time.sleep(3)
continue
# redis_client.close() 关闭redis
# break
except Exception as e:
print('查询redis报错', e)
redis_client.close()
redis_client = redis.Redis(host='192.168.10.224', port=6379, db=10, password='fG7#vT6kQ1pX')
time.sleep(5)
continue
print('查询 mysql_get_asin 报错::', e, f"\n{traceback.format_exc()}")
def save_asin_var_data(asin, data_json, spider_value, site_name, parent_asin_list, var_asin_list, error, id):
if parent_asin_list:
parent_asin = ','.join(parent_asin_list)
else:
parent_asin = ""
if len(var_asin_list) == 0:
var_asin_list = "'{}'"
else:
var_asin_list = "'" + '{' + ','.join(var_asin_list) + '}' + "'"
def save_asin_var_data(asin, data_json, spider_value):
if data_json:
data_json = data_json.replace('%', '%%').replace("'", "").replace("\'", "")
if error:
error = error.replace('%', '%%').replace("'", "").replace('"', '').replace("\'", "")
while True:
try:
engine_us_mysql = mysql_connect()
workflow_everyday_list = [[asin, data_json, spider_value]]
print('存储数据:', len(workflow_everyday_list))
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['asin', 'asin_var_data', 'spider_value'])
engine_us_mysql.to_sql(df_seller_asin_account, 'us_asin_var_info')
update_time = int(time.time())
print(f'更新 {asin} 数据:')
with engine_us_mysql.begin() as conn:
if error is None:
sql = f"""
UPDATE asin_variation_family_log
SET variation_family='{data_json}', status='{spider_value}', update_time='{update_time}' ,parent_asin='{parent_asin}',variation_asin={var_asin_list}
WHERE id={id} AND site_name='{site_name}'
"""
print('成功',sql)
elif error == '账号电脑退出登录':
sql = f"""
UPDATE asin_variation_family_log SET status='未开始' WHERE id={id} AND site_name='{site_name}'
"""
else:
sql = f"""
UPDATE asin_variation_family_log
SET variation_family='{data_json}', status='{spider_value}', update_time='{update_time}' ,parent_asin='{parent_asin}',variation_asin={var_asin_list},err_msg='{error}'
WHERE id={id} AND site_name='{site_name}'
"""
print('error is not None:: ', sql)
conn.execute(sql)
print(asin, '更新成功')
break
except Exception as e:
print('存储数据报错:', e)
time.sleep(5)
if __name__ == '__main__':
redis_get_asin()
mysql_get_asin()
from playwright.sync_api import sync_playwright
from sqlalchemy import create_engine
from secure_db_client import get_remote_engine
import pandas as pd
import urllib.parse
import json
import traceback
import time
from sqlalchemy.engine import URL
'Opportunity Explorer: download BSR category data'
......@@ -17,32 +16,16 @@ class One688LoginSpider(object):
yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{month}"
self.mysql_connect()
def mysql_connect(self):
if self.site == 'us':
db = 'selection'
else:
db = f'selection_{self.site}'
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_db": db,
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database=db,
query={"charset": "utf8mb4"}
self.engine_us_mysql = get_remote_engine(
site_name='us', # -> database "selection"
db_type='mysql',  # -> server-side alias "mysql"
)
self.engine_pg = get_remote_engine(
site_name=self.site, # -> database "selection"
db_type='postgresql_15_outer',  # -> server-side PostgreSQL alias
)
self.engine_us_mysql = create_engine( url)
self.engine_pg = create_engine(
f"postgresql+psycopg2://postgres:F9kL2sXe81rZq@113.100.143.162:5432/{db}",
encoding='utf-8')
return self.engine_us_mysql
def print_request_finished(self, request):
......@@ -59,7 +42,7 @@ class One688LoginSpider(object):
def select_category_json(self):
sql = 'SELECT category_json,id FROM seller_product_opportunity_syn where state=1'
engine_mysql = self.mysql_connect()
df_category_json = pd.read_sql(sql, con=engine_mysql)
df_category_json = engine_mysql.read_sql(sql)
category_data_list = list(df_category_json['category_json'] + '|=|=|' + df_category_json['id'].astype("U"))
data_list = []
for i in category_data_list:
......@@ -132,8 +115,7 @@ class One688LoginSpider(object):
'minimum_price', 'maximum_price', 'avg_price',
'return_rate_t360', 'search_volume_growth_t360',
'site', 'date_info', 'search_term'])
df_category_data.to_sql('seller_product_opportunity', con=self.engine_pg, if_exists='append',
index=False)
self.engine_pg.to_sql(df_category_data,'seller_product_opportunity', if_exists='append')
print('存储成功:', len(category_data_list))
with self.engine_us_mysql.begin() as conn:
sql_update = f"update seller_product_opportunity_syn set state=3 where id={int(data[1])};"
......@@ -155,10 +137,10 @@ class One688LoginSpider(object):
[self.site, self.y_w, '商机探测器抓取完成', 3, 'seller_product_opportunity', 'month',
'商机探测器', '是']]
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val',
'table_name', 'report_date', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_everyday', con=self.engine_us_mysql, if_exists='append',
index=False)
columns=['site_name', 'report_date', 'status', 'status_val',
'table_name', 'date_type', 'page', 'is_end'])
self.engine_us_mysql.to_sql(df_seller_asin_account,'workflow_everyday', if_exists='append'
)
def crawl(self, url):
self.page.on("requestfinished", self.print_request_finished)
......
# import sys
# import os
#
# sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
# from threading_spider.db_connectivity import connect_db
# import traceback
# import random
# from lxml import etree
# import json
# # import requests
# from curl_cffi import requests
# import time
# import pymysql
# from urllib.parse import quote
#
# """获取 junglescout bsr分类排名 销量"""
#
# rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700,
# 1800,
# 1900,
# 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700,
# 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500,
# 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300,
# 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100,
# 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000,
# 18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000,
# 85000, 90000, 95000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000, 550000,
# 600000, 650000, 700000, 750000, 800000, 850000, 900000, 950000, 1000000]
#
#
# def send_mes(site_name):
#
# month = time.strftime("%m")
# year = time.strftime("%Y")
# _year_month = f'{year}_{int(month)}'
# db_class = connect_db(site_name)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# update_sql_state = f"""
# UPDATE {site_name}_one_category
# SET STATE=4
# WHERE `name` IN (
# SELECT `name` FROM (
# SELECT `name` FROM {site_name}_one_category WHERE rank=1 AND orders=0 AND `year_month`="{_year_month}"
# ) AS temp_table
# );
# """
# cursor.execute(update_sql_state)
# db.commit()
#
# def get_jl_rank(db_base):
# month = time.strftime("%m")
# year = time.strftime("%Y")
# year_month = f'{year}_{int(month)}'
# while True:
# try:
# db_class = connect_db(db_base)
# print(db_base)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# category_name_sql_select = f"select distinct `name` , orders,category_first_id from {db_base}_one_category where rank=1 and `year_month` ='{year_month}'"
# print(category_name_sql_select)
# cursor.execute(category_name_sql_select)
# category_name_list = cursor.fetchall()
# print(category_name_list,'3444444444')
# update_sql_state1 = f"""UPDATE {db_base}_one_category SET STATE= 1"""
# cursor.execute(update_sql_state1)
# db.commit()
# break
# except:
# time.sleep(20)
# url = "https://www.junglescout.cn/sales-estimator/?gspk=OTQ0&gsxid=joW2dg6ZmAJA&utm_medium=944&utm_source=affiliate"
# h = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'www.junglescout.cn',
# 'Pragma': 'no-cache',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': 'none',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
# }
# while True:
# try:
#
# resp_html = requests.get(url, impersonate="chrome", headers=h,
# timeout=25, verify=False)
# if resp_html.status_code == 200 or resp_html.status_code == 201:
# break
# else:
# print(resp_html.text)
# print("请求首页报错----重新请求 ")
# time.sleep(2)
# continue
# except:
# time.sleep(5)
# continue
# html = etree.HTML(resp_html.text)
# name_lsit = html.xpath(
# f'//table/tbody/tr[4]/td[2]/div/ul/li[@class="category {db_base}-available"]/span[1]/text()')
# print('获取 js 分类名称:', name_lsit)
# for name in name_lsit:
# if db_base == 'fr':
# name = name.replace(' & ', ' et ')
# if name == 'Camera & Photo':
# name = 'Camera & Photo Products'
# while True:
# try:
# select_name_sql = f"select id from {db_base}_one_category where name='{name}'"
# cursor.execute(select_name_sql)
# name_list = cursor.fetchall()
# if name_list:
# pass
# else:
# print('junglescout 新增分类:', name)
# insert_sql = f'insert into {db_base}_one_category (name, rank, orders)values (%s,%s,%s)'
# cursor.execute(insert_sql, (name, 1, 1))
# db.commit()
# break
# except:
# time.sleep(15)
#
# print(category_name_list)
# for category_name in category_name_list:
# print("历史销量:", category_name['name'], category_name['orders'], category_name['category_first_id'])
# name_rnak_list = []
# Handmade_Products_list = []
# sales = 31
# for i in rank_list:
# if sales == 0:
# break
# token_num = 0
# while True:
# try:
# print(i)
# if db_base == 'fr':
# c_name = category_name['name'].replace(' et ', ' & ')
# else:
# c_name = category_name['name']
# if c_name == 'Camera & Photo Products':
# c_name = 'Camera & Photo'
# num = random.randint(115, 126)
# print(num)
# f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{num}.0.0.0 Safari/537.36'
# headers = {
# "Upgrade-Insecure-Requests": "1",
# "User-Agent": f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{num}.0.0.0 Safari/537.36',
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
# "Sec-Fetch-Site": "none",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-User": "?1",
# "Sec-Fetch-Dest": "document",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "zh-CN,zh;q=0.9",
# "Pragma": "no-cache",
# "Cache-Control": "no-cache"
# }
# params = (
# ('rank', f'{i}'),
# ('category', f'{c_name}'),
# ('store', f'{db_base}'),
# )
# for num in range(2):
# try:
# session = requests.Session()
# session.headers = headers
# session.get('https://d2ogs1k0ty8acr.cloudfront.net/sales', headers=headers,
# params=params, verify=False, timeout=10,impersonate="chrome")
# response = session.get('https://d2ogs1k0ty8acr.cloudfront.net/sales', headers=headers,
# params=params, verify=False, timeout=10,impersonate="chrome")
# print(response.url)
# print(response.text)
# break
# except Exception as e:
# print(e)
# time.sleep(6)
# continue
# response = json.loads(response.text)
# data = response['data']
# if data['sales']:
# sales = int(data['sales'])
# name_rnak_list.append(
# (category_name['name'], i, int(data['sales']), category_name['category_first_id'],
# year_month))
# if category_name['name'] == "Musical Instruments":
# Handmade_Products_list.append(
# ('Handmade Products', i, int(data['sales']), category_name['category_first_id'],
# year_month))
# token_num = 0
# if category_name['orders'] == int(data['sales']):
# print("销量不变")
# sales = 0
# break
# else:
# name_rnak_list.append(
# (category_name['name'], i, 0, category_name['category_first_id'], year_month))
# if category_name['name'] == "Musical Instruments":
# Handmade_Products_list.append(
# ('Handmade Products', i, 0, category_name['category_first_id'], year_month))
# sales = 0
# except Exception as e:
# print('错误', e, f"\n{traceback.format_exc()}")
# time.sleep(random.uniform(20, 60.5))
# token_num += 1
# continue
# time.sleep(random.uniform(15, 40.75))
# break
# print(name_rnak_list)
# while True:
# try:
# db.ping(reconnect=True)
# inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,category_first_id,`year_month`) values (%s, %s, %s, %s, %s)"
# print(inset_sql)
# cursor.executemany(inset_sql, name_rnak_list)
# db.commit()
# break
# except:
# db_class = connect_db(db_base)
# print(db_base)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# time.sleep(20)
# if category_name['name'] == "Musical Instruments" and db_base == 'us':
# while True:
# try:
# inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,category_first_id,`year_month`) values (%s, %s, %s, %s, %s)"
# print(inset_sql)
# cursor.executemany(inset_sql, Handmade_Products_list)
# db.commit()
# break
# except:
# db_class = connect_db(db_base)
# print(db_base)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# time.sleep(20)
# update_sql_state = f"""
# UPDATE {db_base}_one_category
# SET STATE=4
# WHERE `name` IN (
# SELECT `name` FROM (
# SELECT `name` FROM {db_base}_one_category WHERE rank=1 AND orders=0 AND `year_month`="{year_month}"
# ) AS temp_table
# );
# """
# cursor.execute(update_sql_state)
# db.commit()
# cursor.close()
# db.close()
#
#
# if __name__ == '__main__':
# site = 'us'
# get_jl_rank(site)
# send_mes(site)
import sys
import os
......@@ -271,7 +8,7 @@ from lxml import etree
from threading_spider.db_connectivity import connect_db
import json
import time
import pymysql
from utils.secure_db_client import get_remote_engine
import random
......@@ -292,34 +29,54 @@ def get_cid():
print(data_category)
save_site_category(json.loads(data_category))
def mysql_connect(site='us'):
engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
db_type='mysql',  # -> server-side alias "mysql"
)
return engine_mysql
def db_cursor_connect_update(sql, site):
for i in range(3):
try:
engine_us_mysql = mysql_connect(site=site)
print('更新sql:', sql)
with engine_us_mysql.begin() as conn:
conn.execute(sql)
break
except:
print(site, 'db_cursor_connect 报错:', sql)
def db_cursor_connect_msyql_read(site,select_state1_sql):
for i in range(3):
try:
engine_mysql = mysql_connect(site=site)
df = engine_mysql.read_sql(select_state1_sql)
return df
except Exception as e:
import traceback
traceback.print_exc()  # print the full stack trace to the terminal
print(e, 'db_cursor_connect_msyql_read 报错:', select_state1_sql)
def junglescout_spider(db_base):
month = time.strftime("%m")
year = time.strftime("%Y")
year_month = f'{year}_{int(month)}'
db_class = connect_db(db_base)
cursor_mysql_db, connect_mysql_db = db_class.mysql_db() # mysql
cursor_us, connect_us = db_class.us_mysql_db()  # us site
cursor_us = connect_us.cursor(cursor=pymysql.cursors.DictCursor)  # return rows as dicts
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select)
cursor_us.execute(category_name_sql_select)
category_name_list = cursor_us.fetchall()
print('category_name_list:',category_name_list)
for category_name in category_name_list:
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
db_class_us = connect_db('us')
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
print(category_name['name'], category_name['c_id'])
category_name_list_df = db_cursor_connect_msyql_read('us',category_name_sql_select)
print(category_name_list_df)
category_name_list = list(category_name_list_df['name']+'|-|==|'+category_name_list_df['c_id'])
for category_name_id in category_name_list:
print(category_name_id, '2323232323')
c_name = category_name_id.split('|-|==|')[0]
c_id = category_name_id.split('|-|==|')[1]
print(c_name, c_id)
name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
db_cursor_connect_update(up_sql,'us')
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600,
1700,
......@@ -354,16 +111,16 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754561346; HMACCOUNT=F022519658636435; _ga=GA1.1.1814436837.1754561346; MEIQIA_TRACK_ID=30xFWMfHnmUko4gRxOqdrJNPOcY; MEIQIA_VISIT_ID=30xFWMuukIH8mg0Y3QtIVUHsOlv; ecookie=6fLTD5dFES0wy5bJ_CN; 5d6b3550f67a0d98a3f2=300e7c0221464bf96a29eee60c456f00; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=395de153390ead6f40f8e8ab7a472e28; _gcl_au=1.1.174716114.1754561346.1679700600.1754561355.1754561355; current_guest=qtmXt8RNChOi_250807-180618; rank-login-user=55981645716j2gNzbXWw3NxEgY4QumA2+nJmFK4cRNMQNZD9W4ScvveWtruw9iXoAChaMVh09V; rank-login-user-info="eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjU1OTgxNjQ1NzE2ajJnTnpiWFd3M054RWdZNFF1bUEyK25KbUZLNGNSTk1RTlpEOVc0U2N2dmVXdHJ1dzlpWG9BQ2hhTVZoMDlWIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJOQlhkbHU4eTlSZXdwS2doOWpJVzJBIiwiaWF0IjoxNzU0NTYxMzU1LCJleHAiOjE3NTQ2NDc3NTUsIm5iZiI6MTc1NDU2MTI5NSwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.csgALwYW8BkMpPMNB_LWfTmx9J4lYpLbqZW95ikqbz02AjJLMkoR8SmYfs_l5Y8_kr91FN-mMNG0_uG6LlMZg1_I_OWTX1GIVEqixiM4LnXO31VMK3yPCTEdFAUNosLKmXaLBsAkyovg82onGSOX3Sp8yy3QzCwREZc0TEVAONK7vBp0fFheyZNwejzyBfw_b7NPkFkTfvwzZo25QaHJyfkh0hxYgwtoDPSS_FmKrkpyh_zjbk7QIpJhY98k3ElI2OjdeAcE0ublxLemPI8GCwvj_V26Ob3mJ0WnvwyM5e2XBdCXF3Tn1OjOWvNP_fFr9TKDHguKLfZZzLOIO9gmkQ; ao_lo_to_n="55981645716j2gNzbXWw3NxEgY4QumA0yFbZjZZBIPjXVnHzCoK/gvYEiwOtDSpCKptN3oC6H7pg4af19gw3X2vJfRDGlTzgAJp8Uby054LbQLjTr8OLk="; rank-guest-user=6598164571W4w7830gIdYfyJ4dBpV8rZZnQ5nxne/EL2NJNXxISww1iqfwc5k9B1MBi3+pbbvB; _ga_38NCVF2XST=GS2.1.s1754561346$o1$g1$t1754561361$j45$l0$h2087898121; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754561362; _ga_CN0F80S6GL=GS2.1.s1754561347$o1$g1$t1754561362$j45$l0$h0; JSESSIONID=012AF629221AF9FF44705008C9CE11D7',
'Cookie':'_ga=GA1.1.460823715.1761964155; _gcl_au=1.1.1179274784.1761964155; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1761964155; HMACCOUNT=B4AF3F9F7A9D6EC1; ecookie=Dze8cmDaWQgl62by_CN; cefb279b040e878c5f29=3df3541e6a5558a1721e067eca0b7599; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=34rGDtW8dkHrHYKAd1YqneyES16; MEIQIA_VISIT_ID=34rGDvmHZCXsyI9TcknmSs0VUgF; current_guest=Q5ta0ho0plze_251101-107638; _gaf_fp=71220b6380421e1c3114927822a0491d; rank-guest-user=4415412671XrN9Zk+EL9uIxING7/uXAkz1zoQytfQ4xehrp1wmpmp0tq0CKPMciyLt+xiapPpr; rank-login-user=4415412671XrN9Zk+EL9uIxING7/uXAtdtFXnuDWfcyj/blj6W2ZWpWUeF9+7WsIFXBV6TrXmy; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjQ0MTU0MTI2NzFYck45WmsrRUw5dUl4SU5HNy91WEF0ZHRGWG51RFdmY3lqL2JsajZXMlpXcFdVZUY5KzdXc0lGWEJWNlRyWG15In0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJfUWNIdGFKc1I3Xy04czRXcUF4UFpnIiwiaWF0IjoxNzYyMDg3NTQ0LCJleHAiOjE3NjIxNzM5NDQsIm5iZiI6MTc2MjA4NzQ4NCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.bA22TL7V1Ojva0xFsPl_1b---9IabSoJXdkWLxhspamEfSx8eLf-sv2VZz6fNqLbZI_ZXb9nBfdCbM0S2yzvElDeC9laJWi6Y_Cz5ywZvWPkkSl5Wmjal5Nso33UeoMffiBkjDkwIN6uIk-726zea76m7xrJmjQbN2wet_fzW04U4RbYPfCIam0eEvXQjhMAuYPoihIcF-LocsQ3Qr-m3xVaWD6CxxTC30rt4ZfD63kRGjrVa2RfgqVeBVS5nMwBF0PWEYgRUN2mB9jyDfnG472TNfxLhXIGPUTaoMtnaxQoRtbcENuapbpIZCpCruq1SuMNdqK3oxtdnUij6yiXEA; ao_lo_to_n="4415412671XrN9Zk+EL9uIxING7/uXAmjk9eVYRzsag6V6ttkMQIH2Lh3Ah2vwuQRDfzmyINXazLLen51hoAgbtysMQkarAmDtVJPvrGJg/tasB7+3bQc="; JSESSIONID=2FD41936F77140471FC8EC556826B071; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1762087559; _ga_CN0F80S6GL=GS2.1.s1762087538$o2$g1$t1762087559$j39$l0$h0; _ga_38NCVF2XST=GS2.1.s1762087538$o2$g1$t1762087565$j33$l0$h205427331',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": "US",
"cid": category_name['c_id'], # 分类id
"station": "DE",
"cid": c_id, # 分类id
"bsr": f"{i}" # 排名
}
print(category_name['name'], '请求参数 data::', data)
print(c_name, '请求参数 data::', data)
for i1 in range(3):
try:
response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300)
......@@ -383,60 +140,40 @@ def junglescout_spider(db_base):
                # no sales estimate returned, break out of the rank loop
break
if est == 0.0:
print(f"{category_name['name']} 排名{i}:销量 0,跳出循环。")
print(f"{c_name} 排名{i}:销量 0,跳出循环。")
break
            # 2) since values like 0.3 or 0.99 never occur, any remaining est is >= 1
print(type(est))
print('获取数据:', category_name['name'], i, est, year_month)
print('获取数据:', c_name, i, est, year_month)
sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month))
time.sleep(random.uniform(20, 65.75))
name_rnak_list.append((c_name, i, sales, year_month))
time.sleep(random.uniform(20, 45.75))
# break
for i in range(4):
try:
inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
print(inset_sql)
cursor_mysql_db.executemany(inset_sql, name_rnak_list)
db.commit()
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{category_name['c_id']}'"
engine_db_msyql = mysql_connect(site=db_base)
with engine_db_msyql.begin() as conn:
conn.execute(
f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
name_rnak_list)
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
db_cursor_connect_update(up_sql,'us')
break
except Exception as e:
print('存储失败:', e)
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20)
print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(90, 200.5))
def save_site_category(site_bsr_dict=None):
db_class = connect_db('us')
cursor_us_mysql_db, db = db_class.us_mysql_db() # mysql
for i in site_bsr_dict.keys():
print(i)
delete_sql = f'DELETE from all_site_category where `name` ="{i}"' # 删除旧分类
print(delete_sql)
cursor_us_mysql_db.execute(delete_sql)
db.commit()
site_category_list = site_bsr_dict[i]
for site_category in site_category_list:
insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
cursor_us_mysql_db.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
db.commit()
db_class = connect_db(i)
cursor_site_mysql_db, db = db_class.mysql_db() # mysql
time.sleep(random.uniform(50, 120.5))
def run():
# get_cid()
junglescout_spider('us')
junglescout_spider('de')
if __name__ == '__main__':
run()
\ No newline at end of file
......@@ -670,6 +670,9 @@ class nsr_catgory(BaseUtils):
en_name_id_list.append(en_name_id[0])
id_tuple = tuple(en_name_id_list)
print(len(id_tuple))
if len(id_tuple) == 1:
update_sql = f"""UPDATE {self.site_name}_new_releases set one_category_id={id[0]} where id in ('{id_tuple[0]}')"""
else:
update_sql = f'update {self.site_name}_new_releases set one_category_id={id[0]} where id in {id_tuple}'
self.db_cursor_connect_update(update_sql, self.site_name)
except:
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import curl_cffi
from lxml import etree
# from threading_spider.db_connectivity import connect_db
from DrissionPage import ChromiumPage, ChromiumOptions
from DrissionPage.common import Keys
import json
import time
from utils.secure_db_client import get_remote_engine
import random
num_list = []
# # Fetch the BSR top-level category names and category ids for every site and store them in the us-site table
def get_cid():
url = 'https://www.sellersprite.com/v2/tools/sales-estimator'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}
resp = curl_cffi.get(url, headers=headers, impersonate="chrome")
html = etree.HTML(resp.text)
data_category = html.xpath("//script[@id='data-category']/text()")[0]
print(data_category)
def mysql_connect(site='us'):
engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
        db_type='mysql',  # -> server-side alias "mysql"
)
return engine_mysql
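# A minimal, illustrative sketch (never called) of the three operations this module assumes the
# secure_db_client remote engine provides: read_sql() for SELECTs, begin()/execute() for writes,
# and to_sql() for appending a DataFrame. The SQL text and the backup table name are hypothetical.
def _remote_engine_usage_sketch():
    engine = mysql_connect(site='us')
    df = engine.read_sql("select `name`, c_id from all_site_category where state = 1")  # query -> DataFrame
    with engine.begin() as conn:  # transactional write
        conn.execute("UPDATE all_site_category set state=2 WHERE c_id='123'")
    engine.to_sql(df, 'all_site_category_backup', if_exists='append')  # bulk append (hypothetical table)
    return df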
def db_cursor_connect_update(sql, site):
for i in range(3):
try:
engine_us_mysql = mysql_connect(site=site)
print('更新sql:', sql)
with engine_us_mysql.begin() as conn:
conn.execute(sql)
break
except:
print(site, 'db_cursor_connect 报错:', sql)
def db_cursor_connect_msyql_read(site, select_state1_sql):
for i in range(3):
try:
engine_mysql = mysql_connect(site=site)
df = engine_mysql.read_sql(select_state1_sql)
return df
except Exception as e:
import traceback
            traceback.print_exc()  # print the full stack trace to the terminal
print(e, 'db_cursor_connect_msyql_read 报错:', select_state1_sql)
def sellersprite_spider(db_base):
month = time.strftime("%m")
year = time.strftime("%Y")
year_month = f'{year}_{int(month)}'
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select)
category_name_list_df = db_cursor_connect_msyql_read('us', category_name_sql_select)
print(category_name_list_df)
category_name_list = list(category_name_list_df['name'] + '|-|==|' + category_name_list_df['c_id'])
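    # The '|-|==|' delimiter above only packs each (name, c_id) pair into one string so it can be
    # split back apart inside the loop below. A delimiter-free sketch of the same iteration
    # (assuming the same two DataFrame columns) would be:
    #     for c_name, c_id in zip(category_name_list_df['name'], category_name_list_df['c_id']):
    #         ...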
    cookies_dict = sellersprite_login(num=1)  # automated browser login; num selects which account in user_list to use
for category_name_id in category_name_list:
print(category_name_id, '2323232323')
c_name = category_name_id.split('|-|==|')[0]
c_id = category_name_id.split('|-|==|')[1]
print(c_name, c_id)
name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
db_cursor_connect_update(up_sql, 'us')
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600,
1700,
1800,
1900,
2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500,
3600,
3700,
3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300,
5400,
5500,
5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100,
7200,
7300,
7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900,
9000,
9100,
9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
17000,
18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000
]
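        # Equivalent, programmatic form of the hand-written rank_list above (same values):
        #     rank_list = [1, 10, 30, 50] + list(range(100, 10001, 100)) \
        #                 + list(range(11000, 20001, 1000)) + list(range(25000, 50001, 5000))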
for i in rank_list:
headers = {
"Referer": "https://www.sellersprite.com/v2/tools/sales-estimator",
"Origin": "https://www.sellersprite.com",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": db_base.upper(),
"cid": c_id, # 分类id
"bsr": f"{i}" # 排名
}
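            # The parsing below assumes bsr.json answers with JSON shaped roughly like
            #     {"code": ..., "message": ..., "data": {"estMonSales": <monthly sales estimate>}}
            # (inferred from the response['code'] / response['message'] / data['estMonSales'] reads).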
print(c_name, '请求参数 data::', data)
for i1 in range(3):
try:
response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome", timeout=300,
cookies=cookies_dict)
print(response.url)
# print(response.text)
response = json.loads(response.text)
break
except:
time.sleep(random.uniform(15, 30.75))
try:
response_data = response['data']
print('code::', response['code'])
print('message::', response['message'])
print('estMonSales::', response_data['estMonSales'])
est = response_data.get('estMonSales')
if est is None:
                    # no sales estimate returned, break out of the rank loop
break
if est == 0.0:
print(f"{c_name} 排名{i}:销量 0,跳出循环。")
break
print(type(est))
print('获取数据:', c_name, i, est, year_month)
sales = int(est)
name_rnak_list.append((c_name, i, sales, year_month))
time.sleep(random.uniform(20, 45.75))
# break
except Exception as e:
print(e,5555555)
                time.sleep(10)  # parsing failed: the account is probably banned; re-run the automated login with the next account
cookies_dict = sellersprite_login(num=0)
for i in range(4):
try:
inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
print(inset_sql)
engine_db_msyql = mysql_connect(site=db_base)
with engine_db_msyql.begin() as conn:
conn.execute(
f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
name_rnak_list)
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
print('更新状态:', up_sql)
db_cursor_connect_update(up_sql, 'us')
break
except Exception as e:
print('存储失败:', e)
time.sleep(20)
print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(50, 120.5))
def sellersprite_login(num=2):
global num_list
num_list.append(num)
print('num_list',num_list)
if len(num_list) > 2:
num = 2
    if len(num_list) > 4:
        num = 1
    if len(num_list) > 5:
        print('sleeping')
        num_list = []
        num = 0
        time.sleep(14400)
user_list = [['18307967347', 'Aa123456.'], ['qq16531218653@163.com', 'qq16531218653'], ['15368051270', '123456']]
print('登录账号:', user_list[num])
    # Configure the Chrome browser (remote-debugging port 9333)
chrome_options = ChromiumOptions()
chrome_options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
    chrome_options.set_local_port(9333)  # set Chrome's remote-debugging port
page_chrome = ChromiumPage(addr_or_opts=chrome_options)
print(f"Chrome 浏览器运行在端口: {9333}")
page_chrome.get("https://www.sellersprite.com/cn/w/user/login")
page_chrome.set.window.max()
page_chrome.set.cookies.clear()
time.sleep(random.randint(1, 3))
page_chrome.refresh()
    # wait for the initial page load
time.sleep(random.randint(1, 3))
page_chrome.get("https://www.sellersprite.com/cn/w/user/login")
time.sleep(random.randint(6, 10))
export_orders = page_chrome.ele('xpath://a[text()="账号登录"]', timeout=10)
export_orders.click()
print('点击账号登录')
time.sleep(random.randint(5, 10))
email_input = page_chrome.ele('xpath://div[@id="form_signin_password"]//input[@name="email"]')
    email_input.clear()  # clear any pre-filled content
    email_input.input(user_list[num][0])  # type the account / email
print("已输入账号到邮箱输入框")
time.sleep(random.randint(5, 10))
password_input = page_chrome.ele('xpath://div[@id="form_signin_password"]//input[@type="password"]')
    password_input.clear()  # clear any pre-filled content
password_input.input(user_list[num][1])
time.sleep(random.randint(5, 10))
page_chrome.actions.type(Keys.ENTER)
time.sleep(random.randint(5, 10))
page_chrome.get('https://www.sellersprite.com/v2/tools/sales-estimator')
time.sleep(random.randint(5, 10))
original_cookies_list = page_chrome.cookies()
    # convert the cookies list into a name -> value dict
original_cookie_dict = {cookie['name']: cookie['value'] for cookie in original_cookies_list}
print('original_cookie_dict::', original_cookie_dict)
page_chrome.close()
return original_cookie_dict
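# Usage sketch (mirrors sellersprite_spider above): the dict returned here is passed straight to
# curl_cffi as the request cookies, e.g.
#     cookies_dict = sellersprite_login(num=1)
#     curl_cffi.post(url, headers=headers, data=data, impersonate="chrome", cookies=cookies_dict)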
def run():
    for i in ['uk', 'de']:
sellersprite_spider(i)
if __name__ == '__main__':
run()
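# State convention observed in the all_site_category SQL above (per site + c_id):
#   1 = pending      -> selected by sellersprite_spider
#   2 = in progress  -> set just before a category's rank list is scraped
#   3 = done         -> set after the rank/sales rows are inserted into {site}_one_category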
# import sys
# import os
#
# sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
# import curl_cffi
# from lxml import etree
# import json
# import time
# import random
# from utils.secure_db_client import get_remote_engine
#
#
# def db_engine(site_name, db_type):
#
# """
# "mysql": "mysql", # 阿里云mysql
# "postgresql_14": "postgresql_14", # pg14爬虫库-内网
# "postgresql_14_outer": "postgresql_14_outer", # pg14爬虫库-外网
# "postgresql_15": "postgresql_15", # pg15正式库-内网
# "postgresql_15_outer": "postgresql_15_outer", # pg15正式库-外网
# "postgresql_cluster": "postgresql_cluster", # pg集群-内网
# "postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
# "doris": "doris", # doris集群-内网
# """
# engine = get_remote_engine(
# site_name=site_name, # -> database "selection"
# db_type=db_type, # -> 服务端 alias "mysql"
# )
# return engine
#
# def db_cursor_connect_update(sql):
# for i in range(3):
# try:
# engine_us_mysql = get_remote_engine('us','mysql')
# print('执行sql:', sql)
# with engine_us_mysql.begin() as conn:
# conn.execute(sql)
# break
# except:
# print('db_cursor_connect 报错:', sql)
# def get_cid():
# # 获取所有站点的bsr 大类名称 和 分类id,存储到us站点
# url = 'https://www.sellersprite.com/v2/tools/sales-estimator'
# headers = {
#
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
# "Accept-Encoding": "gzip, deflate, br, zstd",
# "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
# "Cache-Control": "no-cache",
# "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
# }
# resp = curl_cffi.get(url, headers=headers, impersonate="chrome")
# html = etree.HTML(resp.text)
# data_category = html.xpath("//script[@id='data-category']/text()")[0]
# print(data_category)
# save_site_category(json.loads(data_category))
#
#
# def junglescout_spider(db_base):
# month = time.strftime("%m")
# year = time.strftime("%Y")
# year_month = f'{year}_{int(month)}'
# category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
# print(category_name_sql_select)
# _engine = db_engine('us','mysql')
# df_ = _engine.read_sql(category_name_sql_select)
# category_name_list = list(df_.name + '|==|--|'+df_.c_id)
# print('category_name_list:',category_name_list)
# for category_name_cid in category_name_list:
# category_name = category_name_cid.split('|==|--|')[0]
# c_id = category_name_cid.split('|==|--|')[1]
# print(category_name, c_id)
# name_rnak_list = []
# up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
# print('更新状态:', up_sql)
# db_cursor_connect_update(up_sql)
# rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
# 1600,
# 1700,
# 1800,
# 1900,
# 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500,
# 3600,
# 3700,
# 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300,
# 5400,
# 5500,
# 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100,
# 7200,
# 7300,
# 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900,
# 9000,
# 9100,
# 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
# 17000,
# 18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000
# ]
# for i in rank_list:
# headers = {
# "Referer": "https://www.sellersprite.com/v2/tools/sales-estimator",
# "Origin":"https://www.sellersprite.com",
# "Accept": "application/json, text/javascript, */*; q=0.01",
# "Accept-Encoding": "gzip, deflate, br, zstd",
# "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
# "Cache-Control": "no-cache",
# 'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754980411; HMACCOUNT=470336FBA166E984; _ga=GA1.1.369458666.1754980412; _gcl_au=1.1.264141837.1754980412; MEIQIA_TRACK_ID=31AwurWIdtN2Ob4FQ2QW7kcpB0x; MEIQIA_VISIT_ID=31AwuwdPS1Vv4z7uSPqpc0Gj1ce; ecookie=0oXdazBemDJwQj8M_CN; 5d6b3550f67a0d98a3f2=300e7c0221464bf96a29eee60c456f00; _fp=65dbbe41a37f8f9fbe702eba96328267; current_guest=WEMjtTq1tsth_250812-144558; _gaf_fp=59df4b0c2b58ce924ed353a3d4aff048; rank-guest-user=6308305571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9ligCErogWOBlL8kpVxO2TpkM; rank-login-user=6308305571KK6FnhfedvWg9tSSyk3xj2GRIc/8HSm4vuPYVHI5vKIJMH4fZg1mmDOwxfDtgQRc; rank-login-user-info=eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjYzMDgzMDU1NzFLSzZGbmhmZWR2V2c5dFNTeWszeGoyR1JJYy84SFNtNHZ1UFlWSEk1dktJSk1INGZaZzFtbURPd3hmRHRnUVJjIn0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJ4RTZsWDJPbDdhWEphOVRjZVdfbER3IiwiaWF0IjoxNzU0OTgwNDM2LCJleHAiOjE3NTUwNjY4MzYsIm5iZiI6MTc1NDk4MDM3Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.ijVJw3SbLMEwdgwRIK-WviTFwmhYpp5_Kme31vu9V1HdiBLuyNjCwO97G6c5Pjlpio4jJjVdmaWXjB0rgAfpZA0XG3eui7bENUjqeOnAlBseIVfPBUjDWp9vhgV8uGGtpkD5I8wBAu26KSyE6tLuE4ML3mGMaWuNCPc9NpiLszyIaGmp0FkCU761-PmV0K9mNDKPQmUSlCOqHsNn5mscdfWfIaUaYdlAKEh0ojJoIOrDnFK3hLy9qWQ4W2ChgI8HQUU4Y48QwZCXsTn8fvZ6cNstRNuTjeT2Iq16hzdpWyuVzBgFe2YkwRYBdvi4plyPZkauZtb6RRkoi6oNA1qCXA; ao_lo_to_n=6308305571KK6FnhfedvWg9tSSyk3xjw9sOVkTMvBIw9HsBCFNE4fph+5i+ZmhNt9gtZL7px6VG01jHqLt1SVP6xii5hleGt0VSRpt/FdjNQpaxROiyf0=; JSESSIONID=57A66ED8AB79C6F6E5B12EC4414C0E00; _ga_38NCVF2XST=GS2.1.s1754980411$o1$g1$t1754980455$j16$l0$h797116043; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754980455; _ga_CN0F80S6GL=GS2.1.s1754980412$o1$g1$t1754980455$j17$l0$h0',
# "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# }
# url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
# data = {
# "station": "UK",
# "cid": c_id, # 分类id
# "bsr": f"{i}" # 排名
# }
# print(category_name, '请求参数 data::', data)
# for i1 in range(3):
# try:
# response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300)
# print(response.url)
# # print(response.text)
# response = json.loads(response.text)
# break
# except:
# time.sleep(random.uniform(15, 35.75))
#
# response_data = response['data']
# print('code::', response['code'])
# print('message::', response['message'])
# print('estMonSales::', response_data['estMonSales'])
# est = response_data.get('estMonSales')
# if est is None:
# # 没拿到数据,跳出循环
# break
# if est == 0.0:
# print(f"{category_name} 排名{i}:销量 0,跳出循环。")
# break
#
# # 2) 既然不会有 0.3、0.99 这种情况,剩下的 est 都是 ≥ 1
# print(type(est))
# print('获取数据:', category_name, i, est, year_month)
# sales = int(est)
# name_rnak_list.append((category_name, i, sales, year_month))
# time.sleep(random.uniform(30, 95.75))
# # break
# for i in range(4):
# try:
# inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
# print(inset_sql)
# engine_mysql = db_engine(db_base, 'mysql')
# with engine_mysql.begin() as conn:
# conn.execute(
# f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
# name_rnak_list)
# up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
# print('更新状态:', up_sql)
# db_cursor_connect_update(up_sql)
# break
# except Exception as e:
# print('存储失败:',e)
# _engine = db_engine('us','mysql')
#
# print('当前完成。获取下一个分类销量')
# time.sleep(random.uniform(120, 240.5))
#
#
# def save_site_category(site_bsr_dict=None):
# engine_mysql = db_engine('us', 'mysql')
# for i in site_bsr_dict.keys():
# print(i)
# delete_sql = f'DELETE from all_site_category where `name` ="{i}"' # 删除旧分类
# db_cursor_connect_update(delete_sql)
# site_category_list = site_bsr_dict[i]
# with engine_mysql.begin() as conn:
# for site_category in site_category_list:
# insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
# conn.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
# def run():
# # get_cid()
# junglescout_spider('uk')
#
#
# if __name__ == '__main__':
# run()
import sys
import os
......@@ -7,7 +196,7 @@ from lxml import etree
from threading_spider.db_connectivity import connect_db
import json
import time
import pymysql
from utils.secure_db_client import get_remote_engine
import random
......@@ -26,36 +215,55 @@ def get_cid():
html = etree.HTML(resp.text)
data_category = html.xpath("//script[@id='data-category']/text()")[0]
print(data_category)
save_site_category(json.loads(data_category))
def mysql_connect(site='us'):
engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
db_type='mysql', # -> 服务端 alias "mysql"
)
return engine_mysql
def db_cursor_connect_update(sql, site):
for i in range(3):
try:
engine_us_mysql = mysql_connect(site=site)
print('更新sql:', sql)
with engine_us_mysql.begin() as conn:
conn.execute(sql)
break
except:
print(site, 'db_cursor_connect 报错:', sql)
def db_cursor_connect_msyql_read(site,select_state1_sql):
for i in range(3):
try:
engine_mysql = mysql_connect(site=site)
df = engine_mysql.read_sql(select_state1_sql)
return df
except Exception as e:
import traceback
traceback.print_exc() # ★ 打印完整栈到终端
print(e, 'db_cursor_connect_msyql_read 报错:', select_state1_sql)
def junglescout_spider(db_base):
month = time.strftime("%m")
year = time.strftime("%Y")
year_month = f'{year}_{int(month)}'
db_class = connect_db(db_base)
cursor_mysql_db, connect_mysql_db = db_class.mysql_db() # mysql
cursor_us, connect_us = db_class.us_mysql_db() # us站点
    cursor_us = connect_us.cursor(cursor=pymysql.cursors.DictCursor)  # return rows as dicts
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select)
cursor_us.execute(category_name_sql_select)
category_name_list = cursor_us.fetchall()
print('category_name_list:',category_name_list)
for category_name in category_name_list:
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
db_class_us = connect_db('us')
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
print(category_name['name'], category_name['c_id'])
category_name_list_df = db_cursor_connect_msyql_read('us',category_name_sql_select)
print(category_name_list_df)
category_name_list = list(category_name_list_df['name']+'|-|==|'+category_name_list_df['c_id'])
for category_name_id in category_name_list:
print(category_name_id, '2323232323')
c_name = category_name_id.split('|-|==|')[0]
c_id = category_name_id.split('|-|==|')[1]
print(c_name, c_id)
name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
db_cursor_connect_update(up_sql,'us')
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600,
1700,
......@@ -76,6 +284,11 @@ def junglescout_spider(db_base):
9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
17000,
18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000
# 55000, 60000, 65000, 70000, 75000,
# 80000,
# 85000, 90000, 95000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000,
# 550000,
# 600000, 650000, 700000, 750000, 800000, 850000, 900000, 950000, 1000000
]
for i in rank_list:
headers = {
......@@ -85,16 +298,16 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; a40ac813159995d028ba=3d9b7c15f5787387e62acd734f598f23; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751973053,1752031904,1752460043,1752653436; HMACCOUNT=800EBCCFB4C6BBFB; rank-guest-user=8301172571YFpPM/DhYDVQzRAgRu7tcQTFTi48nSnOk/TNMkep2gdtR77QXyNfDPmFlYbdSsdL; rank-login-user=8301172571YFpPM/DhYDVQzRAgRu7tcWqD2KCbe1WiKcOarfxTCdls3AJ9lNFy+VA8a+RTm195; rank-login-user-info=eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjgzMDExNzI1NzFZRnBQTS9EaFlEVlF6UkFnUnU3dGNXcUQyS0NiZTFXaUtjT2FyZnhUQ2RsczNBSjlsTkZ5K1ZBOGErUlRtMTk1In0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJLcVRRV2RPbVNNcjlKTU1qYTdXRjFRIiwiaWF0IjoxNzUyNjUzNDM4LCJleHAiOjE3NTI3Mzk4MzgsIm5iZiI6MTc1MjY1MzM3OCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.caY2QxTbtUVg7CQXvNJcmVo1YU0TGy3AD01dIddF76PHjYbbFh5a8zZAdAXnAKM1wNcs39d1MM8Wa-uoXHiitqDlCZsWyF9aXzco9L4wn-yU4xlMYsf7VoquZI6bxaMT2TNeX6vgQBod-NeXHYFpZQWdrH5sfZHQypkpRINb_o1QwaWvZrjufj1UwYdiypryBxTDyCuLfD4djU0PLMRXvifY6Ef86VNjAlsY8gFqDdHiVLixR2GWGdKRtoG74Ak5DX2eMDT6ak-OMrWYOaikthxIXiqdADTq2tvUCmjO4pE0oYnWhSEx9-UABo7jxJ0v_Af8B6AVu7ccC0NUUvWBMA; ao_lo_to_n=8301172571YFpPM/DhYDVQzRAgRu7tca/7vKUOAtDW4w4LhsAzrvlsqk8xCK+opMY27DGtrDKlwUwhqg///+C6QOw12iRKNIq9mCOV5+ORmOA+PwqisF4=; _gaf_fp=0f3f9e0c791b5513d38aa715d0624aab; _gcl_au=1.1.420472597.1749119222.448034571.1752653439.1752653439; JSESSIONID=0F617D64E2FD6DD92F3BB10935E3C846; _ga_38NCVF2XST=GS2.1.s1752653436$o51$g1$t1752653450$j46$l0$h366949276; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752653451; _ga_CN0F80S6GL=GS2.1.s1752653437$o50$g1$t1752653451$j46$l0$h0',
'Cookie': '_gcl_au=1.1.1447326361.1758264089; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1758264089; HMACCOUNT=7F9DD13A227E1D2B; _ga=GA1.1.991212207.1758264089; MEIQIA_TRACK_ID=32uIa4AhuxRLTcHXxm0PIk6Neqn; MEIQIA_VISIT_ID=32uIa1sK6QRdxkjk2DRJqfS7UaG; ecookie=qmC7o7xcw0S1xZD9_CN; d0c5b9780b50a33ad822=dc2936bc8106c9a8ee1714818e7e7a72; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=8dc236c6b0cc21a83974c129be61cfc2; current_guest=BWQNYACe6Zku_250919-145796; rank-login-user=6961238571i+0e7rddchJDh/F/NHVdW5HCX9QRmjEXsHOplEmuEXlp0Qwv/G3CXc4Z8WBR8qa8; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjY5NjEyMzg1NzFpKzBlN3JkZGNoSkRoL0YvTkhWZFc1SENYOVFSbWpFWHNIT3BsRW11RVhscDBRd3YvRzNDWGM0WjhXQlI4cWE4In0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJhYnA1OTJtVWFjeElMQm96TkRVTEFBIiwiaWF0IjoxNzU4MjY0MDk2LCJleHAiOjE3NTgzNTA0OTYsIm5iZiI6MTc1ODI2NDAzNiwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Q4Sne6pW_Lhytp1g9sR99PbRNN7BcE5azPHptecj7JIKFyRhFNJ7ZuwDnXW755Cs64JwS16Ra2R67pb1WhuxFoFFP6MBNZIql3jMnpBUO33LKBWbHkN3x5r4OXHCjCM6BvhhAyDWftUHnG-QeF-zTuQDfgVPiWSxEPnJj0pBvpDSOAYWgSs7G3J7pWE1Mz5nL4m7VhsKyIpQ0oT3zv5zgPOfXq6CaB0Ud5LjBJ9ADpNlQOi2-7hT05lQoZRgzce8Irx8jrla4icnWsBSYEUyTZSlNEf1pZVly4aK_txRfgYj5PjOyph4axuvZGq1am3wcEiD8MRBNz-ksEspXCeW0Q; ao_lo_to_n="6961238571i+0e7rddchJDh/F/NHVdW+To037jtP43UpoX84C25sG3pqka4v1jufgWxEucssJpM3EjpAFpLcWIRm7MR3R6a8lu71Loi14HqnMls77u9v8="; rank-guest-user=6961238571i+0e7rddchJDh/F/NHVdW6XmQntBFYjoP/KdgG5QRJAelN+6DbB+BnVatmY+ztdl; JSESSIONID=0FEB643072E7552D57ED5DEC085D20D8; _ga_38NCVF2XST=GS2.1.s1758264089$o1$g1$t1758264103$j46$l0$h1039295416; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1758264103; _ga_CN0F80S6GL=GS2.1.s1758264090$o1$g1$t1758264103$j47$l0$h0',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": "UK",
"cid": category_name['c_id'], # 分类id
"station": "DE",
"cid": c_id, # 分类id
"bsr": f"{i}" # 排名
}
print(category_name['name'], '请求参数 data::', data)
print(c_name, '请求参数 data::', data)
for i1 in range(3):
try:
response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300)
......@@ -103,7 +316,7 @@ def junglescout_spider(db_base):
response = json.loads(response.text)
break
except:
time.sleep(random.uniform(15, 35.75))
time.sleep(random.uniform(15, 30.75))
response_data = response['data']
print('code::', response['code'])
......@@ -114,61 +327,39 @@ def junglescout_spider(db_base):
                # no sales estimate returned, break out of the rank loop
break
if est == 0.0:
print(f"{category_name['name']} 排名{i}:销量 0,跳出循环。")
print(f"{c_name} 排名{i}:销量 0,跳出循环。")
break
            # 2) since values like 0.3 or 0.99 never occur, any remaining est is >= 1
print(type(est))
print('获取数据:', category_name['name'], i, est, year_month)
print('获取数据:', c_name, i, est, year_month)
sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month))
time.sleep(random.uniform(30, 95.75))
name_rnak_list.append((c_name, i, sales, year_month))
time.sleep(random.uniform(20, 65.75))
# break
for i in range(4):
try:
inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
print(inset_sql)
cursor_mysql_db.executemany(inset_sql, name_rnak_list)
db.commit()
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{category_name['c_id']}'"
engine_db_msyql = mysql_connect(site=db_base)
with engine_db_msyql.begin() as conn:
conn.execute(
f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
name_rnak_list)
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
db_cursor_connect_update(up_sql,'us')
break
except Exception as e:
print('存储失败:',e)
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20)
print('存储失败:', e)
time.sleep(20)
print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(120, 240.5))
def save_site_category(site_bsr_dict=None):
db_class = connect_db('us')
cursor_us_mysql_db, db = db_class.us_mysql_db() # mysql
for i in site_bsr_dict.keys():
print(i)
delete_sql = f'DELETE from all_site_category where `name` ="{i}"' # 删除旧分类
print(delete_sql)
cursor_us_mysql_db.execute(delete_sql)
db.commit()
site_category_list = site_bsr_dict[i]
for site_category in site_category_list:
insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
cursor_us_mysql_db.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
db.commit()
db_class = connect_db(i)
cursor_site_mysql_db, db = db_class.mysql_db() # mysql
time.sleep(random.uniform(90, 200.5))
def run():
# get_cid()
junglescout_spider('uk')
junglescout_spider('de')
if __name__ == '__main__':
run()
\ No newline at end of file
......@@ -408,11 +408,12 @@ class async_account_name_products(BaseUtils):
try:
with self.engine_pg6.begin() as conn:
                    # query the user-favorited asins
sql_read_asin = f'SELECT id, data_id, end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 1 ORDER BY id FETCH FIRST {self.read_size} ROWS ONLY FOR UPDATE;'
sql_read_asin = f'SELECT id, data_id, end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 1 ORDER BY id FOR UPDATE'
print('查詢收藏asin:', sql_read_asin)
b = conn.execute(sql_read_asin)
self.df_read_asin = pd.DataFrame(b, columns=['id', 'data_id', 'end_time'])
self.df_read_asin = self.engine_pg6.read_sql(sql_read_asin)
if self.df_read_asin.shape[0] !=0:
self.index_tuple_asin = tuple(self.df_read_asin['id'])
print('self.index_tuple_asin::',len(self.index_tuple_asin))
if self.index_tuple_asin:
if len(self.index_tuple_asin) == 1:
sql_update = f"""UPDATE {self.db_user_collection_syn} b set state=2 where b.id in ({self.index_tuple_asin[0]})"""
......@@ -428,12 +429,14 @@ class async_account_name_products(BaseUtils):
user_asin_list = user_asin.split('|-|')
self.user_asin_list.append(user_asin_list)
print(self.user_asin_list)
print('存储 收藏asin')
self.save_asin_syn()
                    # query the user-favorited stores
sql_read = f'SELECT id, product_url,data_id,end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 2 ORDER BY id FETCH FIRST {self.read_size} ROWS ONLY FOR UPDATE;'
print('查询收藏店铺:', sql_read)
a = conn.execute(sql_read)
self.df_read = pd.DataFrame(a, columns=['id', 'product_url', 'data_id', 'end_time'])
# a = conn.execute(sql_read)
self.df_read = self.engine_pg6.read_sql(sql_read)
# self.df_read = pd.DataFrame(a, columns=['id', 'product_url', 'data_id', 'end_time'])
if self.df_read.shape[0] == 0:
self.stop_item_queue = False
return []
......@@ -473,9 +476,7 @@ class async_account_name_products(BaseUtils):
else:
sql_DELETE = f"""DELETE FROM {self.site_name}_user_seller_collections where account_id in {tuple(self.account_name_list_update)}"""
conn.execute(sql_DELETE)
df_asin_variation.to_sql(f"{self.site_name}_user_seller_collections", con=self.engine_pg6,
if_exists='append',
index=False)
self.engine_pg6.to_sql(df_asin_variation,f"{self.site_name}_user_seller_collections",if_exists='append')
self.asin_detail_list = []
break
except Exception as e:
......
......@@ -351,8 +351,8 @@ else:
# redis
REDIS_CONN = {
"redis_host": "113.100.143.162",
"redis_port": 6379,
"redis_pwd": "fG7#vT6kQ1pX",
"redis_port": 54372,
"redis_pwd": "N8#rTp2Xz!Lk6@Vw9qHs4&Yb1Fm0Cj3",
"redis_db": 14
}
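# A minimal sketch of building a client from this mapping with redis-py, matching the key names
# above (db 14 is the configured index):
#     import redis
#     r = redis.Redis(host=REDIS_CONN["redis_host"], port=REDIS_CONN["redis_port"],
#                     password=REDIS_CONN["redis_pwd"], db=REDIS_CONN["redis_db"])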
......
import sys
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from func_timeout import func_set_timeout
......@@ -29,11 +29,12 @@ class Save_asin_detail(BaseUtils):
self.init_db_names()
self.cols = self.reuests_para_val.db_column(site_name)
self.redis_client = self.redis_db()
def init_db_names(self):
self.engine = self.mysql_connect()
        self.engine_pg = self.pg_connect()  # when variants change, the variant table is written via self.engine
        self.kafuka_producer = self.kafuka_connect()  # Kafka connection
        self.kafuka_producer_str = self.kafuka_connect(acks=True,connections_max_idle_ms=300000)  # Kafka connection
        self.kafuka_producer_str = self.kafuka_connect(acks=True, connections_max_idle_ms=300000)  # Kafka connection
        self.redis_db14 = self.redis_db()  # redis connection
self.db_syn = self.site_name + '_all_syn_st_month_2025'
self.db_seller_account_syn = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_seller_account_syn'][2:] + '_distinct'
......@@ -90,7 +91,7 @@ class Save_asin_detail(BaseUtils):
self.df_read.drop_duplicates(['asin'], inplace=True)
if self.df_read.shape[0] > 0:
self.index_tuple = tuple(self.df_read['id'])
print(self.index_tuple,'self.index_tuplself.index_tuplself.index_tupl')
print(self.index_tuple, 'self.index_tuplself.index_tuplself.index_tupl')
            # fill null values with defaults
self.df_read['volume'].fillna('null', inplace=True)
self.df_read['weight_str'].fillna('null', inplace=True)
......@@ -220,7 +221,7 @@ class Save_asin_detail(BaseUtils):
print(f'存储pg:{self.site_name}_asin_detail_month_{report_info}')
# df.to_csv(r'2025-7-30_srs_search_term_asin.csv', index=False)
self.engine_pg.to_sql(df,f"{self.site_name}_asin_detail_month_{report_info}",
self.engine_pg.to_sql(df, f"{self.site_name}_asin_detail_month_{report_info}",
if_exists='append')
break
except Exception as e:
......@@ -258,6 +259,27 @@ class Save_asin_detail(BaseUtils):
self.db_change_state(state=13, asin_list=asin_not_div_id_dp_list)
@func_set_timeout(240)
def save_asin_not_buysales(self, asin_buySales_list):
while True:
try:
if is_internet_available():
pass
else:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
print('错误月销的asin:', asin_buySales_list)
print('错误月销的asin:', len(asin_buySales_list))
df_asin_ = pd.DataFrame(data=asin_buySales_list, columns=['asin', 'buysales', 'date_info'])
self.engine_pg.to_sql(df_asin_, f'{self.site_name}_asin_detail_2025_not_buysales', if_exists='append')
break
except Exception as e:
print("存储 _asin_detail_2025_not_buysales 文本 数据错误", e)
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
time.sleep(random.uniform(10, 20.5))
continue
@func_set_timeout(240)
def save_bs_category_asin_detail(self, bs_category_asin_list_pg):
        # store the asin BSR text data
while True:
......@@ -278,7 +300,8 @@ class Save_asin_detail(BaseUtils):
if df_asin_bsr_pg.shape[0] > 0:
date_info_ = list(df_asin_bsr_pg.date_info)[0].replace('-', '_')
print(f'{self.site_name}_bs_category_asin_detail_month_{date_info_}')
self.engine_pg.to_sql(df_asin_bsr_pg,f'{self.site_name}_bs_category_asin_detail_month_{date_info_}',
self.engine_pg.to_sql(df_asin_bsr_pg,
f'{self.site_name}_bs_category_asin_detail_month_{date_info_}',
if_exists='append')
bs_category_asin_list_pg = []
break
......@@ -340,7 +363,7 @@ class Save_asin_detail(BaseUtils):
else:
sql_delete = f"delete from {self.db_seller_asin_account} where asin in {tuple(set(df_seller_asin_account.asin))};"
conn.execute(sql_delete)
self.engine.to_sql(df_seller_asin_account,self.db_seller_asin_account,
self.engine.to_sql(df_seller_asin_account, self.db_seller_asin_account,
if_exists='append')
buyBoxname_asin_list = []
break
......@@ -412,7 +435,7 @@ class Save_asin_detail(BaseUtils):
sql_delete = f"delete from {self.site_name}_all_syn_st_asin where asin in {tuple(set(df_asin.asin))};"
conn.execute(sql_delete)
df_asin['state'] = state
self.engine_pg.to_sql(df_asin,f'{self.site_name}_all_syn_st_asin',if_exists='append')
self.engine_pg.to_sql(df_asin, f'{self.site_name}_all_syn_st_asin', if_exists='append')
break
except Exception as e:
self.engine = self.mysql_connect()
......@@ -422,6 +445,5 @@ class Save_asin_detail(BaseUtils):
f"\n{traceback.format_exc()}")
continue
if __name__ == '__main__':
Save_asin_detail()
# if __name__ == '__main__':
# Save_asin_detail()
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import datetime
import json
import os
import random
import re
import socket
import time
import traceback
from sqlalchemy import create_engine
from random import randint
from time import sleep
import numpy as np
import pandas as pd
import pydub
import redis
import requests
from sqlalchemy import text
from lxml import etree
import os
import socket
from secure_db_client import get_remote_engine
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import re
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.common.exceptions import WebDriverException, TimeoutException
import random
import pydub
from selenium.webdriver.support.wait import WebDriverWait
from speech_recognition import Recognizer, AudioFile
from time import sleep
from random import randint
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import datetime
import redis
class H10():
def __init__(self):
        with open('config_json', 'r', encoding='utf-8') as f:
params_data_json = f.read()
self.params_data_dict = json.loads(params_data_json)
self.db_syn = 'all_h10_syn'
self.site_name = 'us'
self.site_name_csv = 'us'
self.sku_list = []
self.err_asin_list = []
self.err_asins_adv_list = []
self.asin_state_5_list = []
self.sku_state = False
self.ip = self.get_ip_address()
self.useremail_state = True
self.redis_db()
# self.mysql_inv()
def redis_db(self):
self.redis_db1 = redis.Redis(host='120.79.147.190', port=6379, password='Vm5vQH4ydFXh', db=0)
self.redis_db1 = redis.Redis(host=self.params_data_dict['redis_host'], port=self.params_data_dict['redis_port'],
password=self.params_data_dict['redis_pwd'], db=self.params_data_dict['redis_db'])
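    # Judging by the keys read above, the local 'config_json' file is assumed to look roughly like:
    #     {"redis_host": "...", "redis_port": 6379, "redis_pwd": "...", "redis_db": 0}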
def get_token(self):
while True:
try:
print('redis 获取token')
val = self.redis_db1.hget('thirdParty:token:inventory', 'disanfang')
self.val_str = val.decode('utf-8')
print(self.val_str)
break
except:
except Exception as e:
self.redis_db()
print('redis 获取token 报错')
print('redis 获取token 报错', e)
time.sleep(20)
def get_ip_address(self):
......@@ -62,7 +66,8 @@ class H10():
s.connect(('baidu.com', 0))
ip = s.getsockname()[0]
# You are viewing a demo of Cerebro
user_pw_dict = {'192.168.10.244': [r'C:\Users\win10-244\Downloads', 'YSWGHF422023@outlook.com', 'soundasia422023@'],
user_pw_dict = {
'192.168.10.244': [r'C:\Users\win10-244\Downloads', 'YSWGHF422023@outlook.com', 'soundasia422023@'],
'192.168.10.245': [r'C:\Users\win10-245\Downloads', 'CherryY2023@outlook.com', '20230322Yy@'],
'192.168.10.246': [r'C:\Users\win10-246\Downloads', 'H10961961@outlook.com', 'soundasia961961@'],
'192.168.10.247': [r'C:\Users\win10-247\Downloads', 'X18756082657@outlook.com', 'Zyx13075039897@'],
......@@ -79,67 +84,14 @@ class H10():
else:
return []
def mysql_inv(self):
nums = 0
while True:
nums += 1
try:
self.engine_adv = create_engine(
'mysql+pymysql://chenjianyun:Cjy8751_07@rm-wz956fk600d89g2g7uo.mysql.rds.aliyuncs.com:3306/inventory?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
def mysql_connect(self, site='us'):
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_db": "selection",
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
nums = 0
while True:
nums += 1
try:
db = f'selection'
self.engine_us = create_engine(
f'mysql+pymysql://{DB_CONN_DICT["mysql_user"]}:' + f'{DB_CONN_DICT["mysql_pwd"]}@{DB_CONN_DICT["mysql_host"]}:{DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
nums = 0
while True:
nums += 1
try:
self.engine_us = get_remote_engine('us', 'mysql')
if self.site_name == 'us' or self.site_name == 'mx':
db = 'selection'
else:
db = f'selection_{site}'
self.engine = create_engine(
f'mysql+pymysql://{DB_CONN_DICT["mysql_user"]}:' + f'{DB_CONN_DICT["mysql_pwd"]}@{DB_CONN_DICT["mysql_host"]}:{DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
self.site_name = 'us'
self.engine = get_remote_engine(self.site_name, 'mysql')
def web_drver(self):
# port = 9222
# params_ = ""
# params_ = "--blink-settings=imagesEnabled=false"
# os.system(f'start Chrome {params_} --remote-debugging-port={port}')
chrome_options = Options()
        # disable image loading
        # chrome_options.add_argument('--blink-settings=imagesEnabled=false')
        # chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}")  # attach to a locally launched browser
        # run the driver in headless mode
        # chrome_options.add_argument('-headless')
        # disable the GPU (optional)
chrome_options.add_argument('-disable-gpu')
chrome_options.add_argument("--disable-notifications")
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
......@@ -313,10 +265,10 @@ class H10():
# https://members.helium10.com/user/signin?re=L2NlcmVicm8=
self.driver.get('https://members.helium10.com/user/signin')
sleep(randint(2, 4))
search_box = self.driver.find_element_by_id('loginform-email')
search_box = self.driver.find_element(By.ID, 'loginform-email')
search_box.send_keys(self.email_name)
sleep(randint(1, 2))
search_box = self.driver.find_element_by_id('loginform-password')
search_box = self.driver.find_element(By.ID, 'loginform-password')
search_box.send_keys(self.pw)
sleep(randint(1, 2))
try:
......@@ -453,7 +405,7 @@ class H10():
if asin not in self.err_asin_list and self.useremail_state:
print('cerebro界面', self.site_name_url)
self.driver.get(f'https://members.helium10.com/cerebro?accountId={self.account_id}')
time.sleep(10)
time.sleep(8)
if 'You are viewing a demo of Cerebro' in self.driver.page_source:
print(self.email_name, '账号过期')
self.driver.refresh()
......@@ -496,15 +448,15 @@ class H10():
try:
self.driver.execute_script(
f"""document.querySelector("img[loading='lazy']").click()""")
time.sleep(1)
time.sleep(1.5)
except:
self.driver.execute_script(
f"""document.querySelector("img[alt='{alt}']").click()""")
time.sleep(1)
time.sleep(1.5)
self.verify()
                            # switch the marketplace site
self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""")
time.sleep(2)
time.sleep(1.5)
                            # enter the asin
print('输入asin', asin)
if ',' in asin:
......@@ -520,7 +472,7 @@ class H10():
                            # click "Get Keywords"
time.sleep(1)
self.driver.execute_script('document.querySelector("#CerebroSearchButtons > button").click()')
time.sleep(3)
time.sleep(2)
html = self.driver.page_source
if 'You have reached the limit of the uses' in html:
self.useremail_state = False
......@@ -540,11 +492,11 @@ class H10():
self.verify()
time.sleep(2)
try:
if 'searched this product before' in html:
if 'searched this product before' in html or '先前已搜索过此产品' in html:
print('33333333333')
self.driver.execute_script(
"""document.querySelector("button[data-testid='runnewsearch']").click()""")
sleep(randint(20, 35))
sleep(randint(10, 35))
except:
print('点击 run 报错')
......@@ -568,16 +520,11 @@ class H10():
elif 'errorCodes.undefined' in html:
continue
self.verify()
resp = etree.HTML(html)
try:
div_class = resp.xpath(
'//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class')
except:
time.sleep(2.5)
resp = etree.HTML(html)
try:
div_class = resp.xpath(
'//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class')
'''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
except:
print('报错22222222222222')
if asinstype:
......@@ -597,7 +544,7 @@ class H10():
html1 = self.driver.page_source
resp1 = etree.HTML(html1)
span_class = resp1.xpath(
'//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class')[0]
'//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class|//span[contains(text(),"已分析的产品")]/parent::div/following-sibling::div/@class')[0]
                                # select the Amazon's Choice option (parameter 1)
self.driver.execute_script(
f"""document.querySelector("div[class='{span_class}']").click()""")
......@@ -641,12 +588,10 @@ class H10():
def read_db_data(self, sku):
while True:
try:
if self.read_product_sku(sku):
with self.engine_us.begin() as conn:
sql_read = f"SELECT asin, id,site,sku FROM {self.db_syn} WHERE STATE = 1 and site='{self.site_url}' and sku='{sku}' limit 10 FOR UPDATE;"
print(sql_read)
a = conn.execute(sql_read)
self.df_read = pd.DataFrame(a, columns=['asin', 'id', 'site', 'sku'])
self.df_read = self.engine_us.read_sql(sql_read)
self.df_read.drop_duplicates(['asin'], inplace=True)
if self.df_read.shape[0] == 0:
print('*********** asin 数据抓取 完毕 *****************')
......@@ -663,9 +608,6 @@ class H10():
self.site_name_url = list(self.df_read.site)[0]
self.sku = list(self.df_read.sku)[0]
return asin_list
else:
self.asin_state_5_list.append(sku)
return []
except Exception as e:
print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")
time.sleep(10)
......@@ -675,40 +617,26 @@ class H10():
def read_db_sku(self):
while True:
try:
sql = f"""SELECT DISTINCT sku,token from all_h10_syn where site='{self.site_url}' and state = 1 """
df = pd.read_sql(sql, con=self.engine_us)
sql = f"""SELECT DISTINCT sku,token from all_h10_syn where site='{self.site_url}' and state =1"""
print(sql, '2323324dd')
df = self.engine_us.read_sql(sql)
if not df.empty:
self.sku_data_list = list(df.sku + '|-|' + df.token)
print(self.sku_data_list)
else:
self.sku_data_list = []
break
except:
print('读取sku 失败0')
except Exception as e:
print('读取sku 失败0', e)
time.sleep(30)
self.mysql_connect()
continue
def read_product_sku(self, sku):
# for i in range(5):
# try:
# sql = f"select id from product_audit where product_sku ='{sku}' and product_audit_status = '5-1' ;"
# print(sql)
# df = pd.read_sql(sql, con=self.engine_adv)
# id_list = list(df.id)
# print('检查是否手动下载:', id_list)
# if id_list:
# print('id_list::', id_list)
# return False
# else:
# print('可以')
# return True
# except:
# self.mysql_inv()
return True
def read_db_asin(self):
while True:
try:
sql_read = f"SELECT sku, site, GROUP_CONCAT(asin SEPARATOR ',') AS asin_list FROM {self.db_syn} WHERE site = '{self.site_url}' and sku='{self.sku}';"
df = pd.read_sql(sql_read, con=self.engine_us)
df = self.engine_us.read_sql(sql_read)
sku_list = list(df.sku)
if sku_list:
data_list = list(df.sku + '|' + df.site + '|' + df.asin_list)
......@@ -742,23 +670,6 @@ class H10():
self.mysql_connect()
continue
def update_sku_syn(self):
if self.asin_state_5_list:
while True:
try:
with self.engine_us.begin() as conn:
if len(self.asin_state_5_list) == 1:
sql_update = f"update {self.db_syn} set state=5 where sku in ('{self.asin_state_5_list[0]}')"
else:
sql_update = f"update {self.db_syn} set state=5 where sku in {tuple(self.asin_state_5_list)}"
conn.execute(sql_update)
self.asin_state_5_list = []
break
except Exception as e:
print("update_sku_syn", e, f"\n{traceback.format_exc()}")
self.mysql_connect()
continue
def del_file(self, path_data, asin_type):
try:
            for i in os.listdir(path_data):  # os.listdir(path_data) returns a list of the relative paths of everything under the directory
......@@ -778,18 +689,22 @@ class H10():
except:
print(path_data, '删除111111111')
def read_files(self, path, asin):
columns_to_include = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
'Sponsored ASINs',
'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
def if_csv_path(self, file_path):
try:
with open(file_path, 'r', encoding='utf-8') as f:
f.read()
f.close()
return True
except:
print('文件路径不存在')
return False
def read_files(self, path, asin):
time_strftime = time.strftime("%Y-%m-%d", time.localtime())
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv'
df = pd.read_csv(file_path, usecols=columns_to_include)
state = self.if_csv_path(file_path)
print('读取文件11111::', file_path)
except:
try:
if state == False:
                # get today's date
                current_date = datetime.date.today()
                # compute the previous day's date
......@@ -797,48 +712,53 @@ class H10():
                # format the previous day's date as a string
previous_date_str = previous_date.strftime("%Y-%m-%d")
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{previous_date_str}.csv'
df = pd.read_csv(file_path, usecols=columns_to_include)
print('读取文件2222222::', file_path)
except:
state = self.if_csv_path(file_path)
if state == False:
self.driver.refresh()
time.sleep(5)
print('重新下载文件:', asin,path)
print('重新下载文件:', asin, path)
self.webdrvier_html(asin, None)
time.sleep(5)
time_strftime = time.strftime("%Y-%m-%d", time.localtime())
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv'
df = pd.read_csv(file_path, usecols=columns_to_include)
print('读取文件333333::', file_path)
return df
columns = pd.read_csv(file_path, nrows=0).columns.tolist()
def sava_data(self, path):
print('self.err_asin_list::', self.err_asin_list)
df_asin_data_list = []
for asin in self.asin_list:
print(asin, '333333333333333', self.err_asin_list)
if asin not in self.err_asin_list:
df = self.read_files(path, asin)
# columns_to_include = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
# 'Sponsored ASINs',
# 'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
# 'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
# try:
# time_strftime = time.strftime("%Y-%m-%d", time.localtime())
# file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv'
# print('file_path1111', file_path)
# df = pd.read_csv(file_path, usecols=columns_to_include)
# print('读取文件::', file_path)
# except:
# # 获取当前日期
# current_date = datetime.date.today()
# # 计算前一天日期
# previous_date = current_date - datetime.timedelta(days=1)
# # 格式化前一天日期为字符串
# previous_date_str = previous_date.strftime("%Y-%m-%d")
# file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{previous_date_str}.csv'
# print('file_path22222222222', file_path)
# df = pd.read_csv(file_path, usecols=columns_to_include)
# print('读取文件::', file_path)
def contains_chinese(text):
return bool(re.search(r'[\u4e00-\u9fff]', text))
is_chinese_header = any(contains_chinese(col) for col in columns)
if is_chinese_header:
print("表头是中文")
columns_to_include_zh = ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
'广告推广ASIN 数',
'竞品数', 'CPR', '标题密度', '亚马逊推荐', '自然',
'亚马逊推荐排名', '广告排名', '自然排名']
df = pd.read_csv(file_path, usecols=columns_to_include_zh)
# 中文 -> 英文映射
df.rename(columns={
'关键词词组': 'keyword',
'Cerebro IQ 得分': 'cerebro_iq_score',
'搜索量': 'search_volume',
'搜索量趋势': 'search_volume_trend',
'广告推广ASIN 数': 'sponsored_asins',
'竞品数': 'competing_product',
'CPR': 'cpr',
'标题密度': 'title_desity',
'亚马逊推荐': 'amazon_recommended',
'自然': 'organic',
'亚马逊推荐排名': 'amazon_recommended_rank',
'广告排名': 'sponsored_rank',
'自然排名': 'organic_rank'
}, inplace=True)
else:
print("表头是英文")
columns_to_include_en = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
'Sponsored ASINs',
'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
df = pd.read_csv(file_path, usecols=columns_to_include_en)
df.rename(columns={
'Keyword Phrase': 'keyword',
'Cerebro IQ Score': 'cerebro_iq_score',
......@@ -854,6 +774,15 @@ class H10():
'Sponsored Rank': 'sponsored_rank',
'Organic Rank': 'organic_rank'
}, inplace=True)
return df
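    # Sketch of an alternative to the bilingual if/else above: keep one Chinese->English and one
    # English->English rename map and pick between them from the file header (zh_map / en_map are
    # hypothetical names, contains_chinese is the nested helper defined above):
    #     header = pd.read_csv(file_path, nrows=0).columns
    #     rename_map = zh_map if any(contains_chinese(c) for c in header) else en_map
    #     df = pd.read_csv(file_path, usecols=list(rename_map)).rename(columns=rename_map)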
def sava_data(self, path):
print('self.err_asin_list::', self.err_asin_list)
df_asin_data_list = []
for asin in self.asin_list:
print(asin, '333333333333333', self.err_asin_list)
if asin not in self.err_asin_list:
df = self.read_files(path, asin)
df['asin'] = asin
df['sku'] = self.sku
df_asin_data_list.append(df)
......@@ -885,7 +814,6 @@ class H10():
print('调接口更新数据:', data)
sku_data_list = data.split('|')
sku = sku_data_list[0]
if self.read_product_sku(sku):
self.sku_state = False
if sku not in self.sku_list:
self.sku_list.append(sku)
......@@ -899,7 +827,7 @@ class H10():
}
url = f'http://120.79.147.190:8080/soundasia_selection/updateKeyWords/selWords?site={site}&asins={asins}'
print(url)
for i in range(5):
for i in range(20):
try:
resp = requests.get(url, timeout=30, headers=headers).json()
self.data = {"sku": self.sku,
......@@ -910,9 +838,9 @@ class H10():
break
except Exception as e:
print("请求java 接口报错:", e, f"\n{traceback.format_exc()}")
time.sleep(3)
time.sleep(105)
continue
print('type_resp::',type(resp))
print('type_resp::', type(resp))
            # core: core keywords
            core_list = resp['result']['core'].split('\n')
            # updown: long-tail keywords
......@@ -929,9 +857,10 @@ class H10():
sql_delete_bsr = f"delete from product_audit where product_sku in ('{tuple(df_save.product_sku)[0]}') and site='{self.site_url}';"
else:
sql_delete_bsr = f"delete from product_audit where product_sku in {tuple(set(df_save.product_sku))} and site='{self.site_url}';"
print('sql_delete_bsr', sql_delete_bsr)
conn.execute(sql_delete_bsr)
df_save.to_sql("product_audit", con=self.engine_us,
if_exists='append', index=False)
self.engine_us.to_sql(df_save, "product_audit",
if_exists='append')
except Exception as e:
print("save_competition:", e, f"\n{traceback.format_exc()}")
print('存储优质词报错。重连数据库')
......@@ -941,9 +870,6 @@ class H10():
print(f'存储 core核心词 updown长尾词 报错, \n{e, traceback.format_exc()}')
self.mysql_connect()
time.sleep(5)
else:
self.asin_state_5_list.append(sku)
self.sku_state = True
    # helper that returns the minimum of the three rank columns, ignoring values that are 0
def get_min(self, row):
......@@ -953,13 +879,11 @@ class H10():
def save_competition(self, path, asin_list, site_url, site):
print('self.err_asins_adv_list;;', self.err_asins_adv_list)
if asin_list[0] not in ''.join(self.err_asins_adv_list):
try:
time_strftime = time.strftime("%Y-%m-%d", time.localtime())
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
print('file_pathsave_competition1111111', file_path)
df = pd.read_csv(file_path)
except:
            # get today's date
state = self.if_csv_path(file_path)
if state==False:
current_date = datetime.date.today()
                # compute the previous day's date
previous_date = current_date - datetime.timedelta(days=1)
......@@ -967,8 +891,20 @@ class H10():
previous_date_str = previous_date.strftime("%Y-%m-%d")
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{previous_date_str}.csv'
print('file_pathsave_competition2222', file_path)
df = pd.read_csv(file_path)
            # build a dict that maps the original column names to the new ones
columns = pd.read_csv(file_path, nrows=0).columns.tolist()
def contains_chinese(text):
return bool(re.search(r'[\u4e00-\u9fff]', text))
is_chinese_header = any(contains_chinese(col) for col in columns)
if is_chinese_header:
print("save_competition 表头是中文")
column_mapping = {
'关键词词组': 'keyword_phrase',
'自然排名': 'rank0',
}
else:
print("save_competition 表头是英文")
column_mapping = {
'Keyword Phrase': 'keyword_phrase',
'Position (Rank)': 'rank0',
......@@ -976,6 +912,7 @@ class H10():
            # use the asins in asin_list as column names and rename them rank1, rank2, rank3, ...
for i, asin in enumerate(asin_list[1:], start=1):
column_mapping[asin] = f'rank{i}'
df = pd.read_csv(file_path)
df.rename(columns=column_mapping, inplace=True)
            # keep only the rank* columns
rank_columns = [col for col in df.columns if col.startswith('rank')]
......@@ -992,6 +929,8 @@ class H10():
print(new_df.columns)
new_df.replace({np.nan: None}, inplace=True) # 将 NaN 替换为 None
# print(' 低竞争 优质词')
# "join() 里存在 None。 用 astype(str) 清洗。"
new_df['keyword_phrase'] = new_df['keyword_phrase'].astype(str)
competition_phrase_keywords = '&&&'.join(list(new_df['keyword_phrase']))
rank_list = list(new_df['rank'])
competition_phrase_rank = ','.join(str(x) for x in rank_list)
......@@ -1004,18 +943,11 @@ class H10():
try:
if new_df.shape[0] > 0:
with self.engine_us.begin() as conn:
if len(set(new_df.keyword_phrase)) == 1:
sql_delete = f"""delete from adv_low_competition_phrase where site = '{site_url}' and sku='{self.sku}' and keyword_phrase in ("{tuple(new_df.keyword_phrase)[0]}");"""
sql_delete = f"DELETE FROM adv_low_competition_phrase WHERE site = '{site_url}' AND sku = '{self.sku}' ;"
print("sql_delete:", sql_delete)
conn.execute(sql_delete)
else:
sql_delete = text(
"DELETE FROM adv_low_competition_phrase WHERE site = :site_url AND sku = :sku AND keyword_phrase IN :keyword_phrases;"
self.engine_us.to_sql(new_df, "adv_low_competition_phrase", if_exists='append'
)
print("sql_delete:", sql_delete)
conn.execute(sql_delete, site_url=site_url, sku=self.sku,
keyword_phrases=tuple(set(new_df['keyword_phrase'].tolist())))
new_df.to_sql("adv_low_competition_phrase", con=self.engine_us, if_exists='append',
index=False)
except Exception as e:
print("save_competition:", e, f"\n{traceback.format_exc()}")
print('存储优质词报错。重连数据库')
......@@ -1026,16 +958,17 @@ class H10():
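# Push the collected H10 data for this sku to the remote selection API; retry the POST up to 5 times
# with a 10-second pause between attempts and stop on the first successful response.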
def requests_updateSkuByAsinH10Data_api(self, data):
print('调用接口:', self.data)
#
# url = 'http://120.24.90.10:80/api/ComprehensiveProject/updateSkuByAsinH10Data'
url = 'https://xcu.yswg.com.cn/api/ComprehensiveProject/updateSkuByAsinH10Data'
url = 'https://xcu.yswg.com.cn/api/selections/updateSkuByAsinH10Data'
for i in range(5):
try:
res = requests.post(url, json=data, timeout=30)
print(res.status_code)
print(res.text)
print(res.json())
except:
pass
print(res.json(), 1111112222)
break
except Exception as e:
print(e, 'requests_updateSkuByAsinH10Data_api 报错 2323232323')
time.sleep(10)
def send_ms(self, ms):
if self.useremail_state == False:
......@@ -1064,8 +997,8 @@ class H10():
else:
path = r'C:\Users\ASUS\Downloads'
print('当前路径:', path)
self.email_name = 'H10961961@outlook.com'
self.pw = 'soundasia961961@'
self.email_name = 'yashengweige678@outlook.com'
self.pw = '987654321yswg@' # 'yashengweige678@outlook.com', '987654321yswg@'
self.web_drver()
while True:
self.data = {}
......@@ -1137,15 +1070,13 @@ class H10():
self.del_file(path, self.asin_list[0])
# set status to 4
self.db_change_state_common(self.err_asin_list, 4)
# set status to 5
self.update_sku_syn()
else:
self.db_change_state_common(self.asin_list, 1)
time.sleep(3600)
self.useremail_state = True
break
else:
self.update_sku_syn()
self.mysql_connect(site)
time.sleep(randint(20, 50))
new_date = datetime.datetime.now().strftime("%H")
......@@ -1158,5 +1089,6 @@ class H10():
if new_date == '08':
self.driver.refresh()
if __name__ == '__main__':
H10().run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.secure_db_client import get_remote_engine
from amazon_spider.VPS_IP import pppoe_ip
from amazon_params import py_ja3
from utils.asin_parse import ParseAsinUs
from utils.requests_param import Requests_param_val
from queue import Queue
from lxml import etree
import requests
import urllib3
from datetime import datetime
import json
import pandas as pd
import threading
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
import ast
class ai_async_asin_pg():
def __init__(self):
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.item_queue = Queue() # 存储 item 详情数据队列
self.pg_connect()
self.sp_asin_queue = Queue()
self.spider_state = None
self.update_ai_asin_analyze_log_list = []
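# Pick the Kafka topic suffix: during the first 10 days of a month keep writing to last month's
# asin_html topic (January stays January), afterwards switch to the current month.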
month = time.strftime("%m")
day = time.strftime("%d")
if int(day) > 10:
_month = int(month)
else:
if int(month) > 1:
_month = int(month) - 1 # 上个月
else:
_month = int(month)
if _month < 10:
_month = str(f'0{_month}')
self.topic_asin_html = f'asin_html_2025_{str(_month)}'
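# get_params: resolve the host/base url for the requested marketplace and pre-fill the cookie queue
# with 168 cookies.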
def get_params(self, site_name='us'):
self.site_name = site_name # 站点
self.reuests_para_val = Requests_param_val(site_name=self.site_name)
self.cookies_queue = Queue() # cookie队列
self.cookie_dict_delete_id = {}
# return the host and homepage url for the given site
self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie(num=168)
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
def pg_connect(self):
self.engine_pg = get_remote_engine(
site_name='us', # -> database "selection"
db_type='postgresql_15_outer', # -> server-side connection alias
)
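# get_asin: worker loop. Pull "asin|-|task_id|-|site|-|module" tasks from the queues, fetch the
# product page through a JA3-patched requests session, requeue the task on captcha, request error,
# or bad zip code, parse the page with ParseAsinUs, queue the item dict, and push the raw HTML to Kafka.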
def get_asin(self):
while True:
if self.queries_asin_queue.empty() == False and self.spider_state is None:
asin_queu = self.queries_asin_queue.get()
elif self.sp_asin_queue.empty() == False:
self.spider_state = '竞品asin'
print('执行竞品asin 抓取')
asin_queu = self.sp_asin_queue.get()
else:
break
print('::asin_queu::: ', asin_queu)
queu_list = asin_queu.split('|-|')
print('queu_list:::', queu_list)
asin = queu_list[0]
task_id = queu_list[1]
site_name = queu_list[2]
module = queu_list[3]
if module == 'Amazon:asin':
sub_step = 'Amazon:asin:竞品'
elif module == 'Amazon:asinList':
sub_step = 'Amazon:asinList:详情'
else:
sub_step = None
self.get_params(site_name=site_name)
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
# build the cookie string from the cookie queue
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None)
headers["cookie"] = cookie_str
scraper_url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
print('scraper_url::', scraper_url)
try:
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码,。asin---> ', asin)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
continue
except Exception as e:
print("请求错误错误: 。asin---> ", asin, '错误:', e)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
continue
response_url = resp.url
response = resp.text
response_s = etree.HTML(response)
if self.reuests_para_val.check_amazon_not_page(response): # asin delisted, status 4, "Listen Now" page
continue
if self.reuests_para_val.check_amazon_page(response, response_url): # check whether this is a normal product page
continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin): # check whether the request was redirected
continue
# get the delivery zip code shown on the page
try:
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
print(ingress, ' 打印 邮编 ', resp.url)
except Exception as e:
print('asin 不是正常页面', asin)
continue
try:
ingress = ingress[0].strip()
except:
ingress = None
if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
continue
div_dp = response_s.xpath('//div[@id="dp"]')
if div_dp:
items = ParseAsinUs(resp=response, asin=asin, site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'task_id': task_id, 'url': scraper_url, 'sub_step': sub_step,
'title': items["title"], 'img_url': items["img_url"],
'rating': items["rating"], 'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"], 'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"],
'sp_num': items["sp_num"], 'describe': items["describe"],
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'], 'review_json_list': items['review_json_list']
}
print(item)
# a = None
# if result_list_json and module == 'Amazon:asin' and self.spider_state is None:
# is_sp_asin_state = None
# result_list_dict = json.loads(result_list_json)
# print(asin, '判断是否有竞品asin')
# for result_dict in result_list_dict:
# # Based on your recent shopping trends # Frequently purchased items with fast delivery
# # Customers who viewed this item also viewed # Brand in this category on Amazon
# sp_type = 'Based on your recent shopping trends'
# if result_dict.get(sp_type):
# print(asin, '找到有竞品asin。 数量:', len(result_dict[sp_type]))
# for i in result_dict[sp_type]:
# sp_asin = i + '|-|' + task_id + '|-|' + site_name + '|-|' + module
# self.sp_asin_queue.put(sp_asin)
# is_sp_asin_state = 111
# a = 1
# if is_sp_asin_state is None:
# print('没有找到竞品asin')
# self.item_queue.put(item)
# # self.save_data()
# # self.update_ai_asin_analyze_log([int(task_id)], '成功')
# a = 1
self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
# if self.spider_state == '竞品asin':
# self.item_queue.put(item)
# a = 1
#
# if module == 'Amazon:asinList':
# self.item_queue.put(item)
# a = 1
# if a is None:
# self.item_queue.put(item)
else:
print('asin 商品 异常')
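# update_ai_asin_analyze_log: retry the spider_status UPDATE for every task_id until the
# PostgreSQL transaction succeeds.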
def update_ai_asin_analyze_log(self, task_id_list, status):
if task_id_list:
task_id_list = list(set(task_id_list))
while True:
try:
with self.engine_pg.begin() as conn:
for task_id in task_id_list:
sql_update = f"""UPDATE ai_asin_analyze_log a set spider_status='{status}' where a.task_id = {task_id}"""
print('UPDATE_sql:', sql_update)
conn.execute(sql_update)
break
except Exception as e:
print('更新 ai_asin_analyze_log 失败', e)
time.sleep(5)
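# save_data: drain item_queue, mark the related task_ids as successful, serialize each item dict
# to JSON, and append the rows to ai_asin_analyze_spider (retrying the insert on errors).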
def save_data(self):
self.pg_connect()
items_data_list = []
update_time = int(time.time())
task_id_list = []
while True:
if self.item_queue.empty() == False:
items = self.item_queue.get()
unique_key = self.site_name + ':' + items['asin']
items_data_list.append(
[int(items['task_id']), items['url'], items['sub_step'], '爬取成功', items, update_time, unique_key])
task_id_list.append(int(items['task_id']))
else:
break
if task_id_list:
self.update_ai_asin_analyze_log(task_id_list, '成功')
while True:
try:
print('存储数据, 数量', len(items_data_list))
if items_data_list:
print(len(items_data_list))
df_asin_detail = pd.DataFrame(data=items_data_list,
columns=['task_id', 'url', 'sub_step', 'status', 'html_json',
'create_time',
'unique_key'])
df_asin_detail['html_json'] = df_asin_detail['html_json'].apply(
lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (dict, list)) else x
)
self.engine_pg.to_sql(df_asin_detail, 'ai_asin_analyze_spider', if_exists='append')
break
except Exception as e:
print('存储报错::', e)
time.sleep(10)
else:
print('save_data 存储数据, 数量', len(items_data_list))
def task(self):
result = 1 + 1
print("执行结果:", result)
def read_ai_asin(self):
time_ip_num = 0
while True:
try:
time_ip_num += 1
self.pg_connect()
for module in ['Amazon:asin','Amazon:asinList']:
if module == 'Amazon:asin':
# pass
sql = f"SELECT elem->>'boyris' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module='{module}' and spider_status='未开始' for update;"
else:
sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and spider_status='未开始' for update;"""
# sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and task_id=39 for update;"""
print(sql)
df_read = self.engine_pg.read_then_update(
select_sql=sql,
update_table='ai_asin_analyze_log',
set_values={"spider_status": '爬取中'}, # 把库存清零
where_keys=["task_id"], # WHERE sku = :sku
)
print(f'开始 {module} 任务:', sql)
if not df_read.empty:
if module == 'Amazon:asin':
_asin_list = ast.literal_eval(df_read['asin'][0])
asin_id_list = []
for _asin in _asin_list:
asin_data_list = list(
_asin + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + module)
asin_id_list.extend(asin_data_list)
else:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + module)
print(asin_id_list)
for asin_id in asin_id_list:
print(asin_id)
self.queries_asin_queue.put(asin_id)
html_thread = []
for i in range(5):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
for t2 in html_thread:
t2.join()
self.save_data()
time.sleep(5)
if 10 <= datetime.now().hour < 22:
if time_ip_num > 60:
pppoe_ip()
time_ip_num = 0
time.sleep(5)
# break
except Exception as e:
print('查询报错:', e)
# break
if __name__ == '__main__':
ai_async_asin_pg().read_ai_asin()
......@@ -3,17 +3,16 @@ import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from secure_db_client import get_remote_engine
import traceback
from curl_cffi import requests
from utils.db_connect import BaseUtils
import re
from lxml import etree
os.environ['NO_PROXY'] = 'amazon.com'
import json
from urllib.parse import urlparse
import datetime
import time
class Amazon_reviewer():
......@@ -111,6 +110,35 @@ class Amazon_reviewer():
"review_data_img": review_img}
print(items)
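# pg_get_asin: poll ai_asin_analyze_spider for rows whose status is still "未开始" (20 at a time),
# mark them as being crawled, and hand the "asin|-|task_id" strings to run_spider.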
def pg_get_asin(self):
while True:
try:
print('轮询 mysql 查询:', datetime.datetime.now().strftime("%m-%d %H:%M:%S")) # this file uses "import datetime", so the call is fully qualified
engine_pg = self.pg_connect()
spider_state_sql = """select asin,task_id from ai_asin_analyze_spider where status = '未开始' limit 20 """
print('spider_state_sql:', spider_state_sql)
df_asin = engine_pg.read_sql(spider_state_sql)
if not df_asin.empty:
update_time = int(time.time())
with engine_pg.begin() as conn:
index_tuple = tuple(df_asin['task_id'])
if len(index_tuple) == 1:
sql_update = f"""UPDATE ai_asin_analyze_spider a set status='爬取中',update_time='{update_time}' where a.task_id in ({index_tuple[0]})"""
else:
sql_update = f"""UPDATE ai_asin_analyze_spider a set status='爬取中',update_time='{update_time}' where a.task_id in {index_tuple}"""
print('UPDATE_sql:', sql_update)
conn.execute(sql_update)
_asin_lis = list(df_asin.asin + '|-|' + df_asin.task_id.astype("U"))
print("_asin_lis:::", _asin_lis, )
print("_asin_lis::: len ", len(_asin_lis))
run_spider(_asin_lis) # hand the asin list to the spider
time.sleep(3)
# break
except Exception as e:
print('查询 mysql_get_asin 报错::', e, f"\n{traceback.format_exc()}")
def run(self):
self.redis_db()
self.get_asin_reviewer()
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.secure_db_client import get_remote_engine
from amazon_params import py_ja3
from utils.asin_parse import ParseAsinUs
from amazon_spider.VPS_IP import pppoe_ip
from utils.requests_param import Requests_param_val
from queue import Queue
from lxml import etree
import requests
import urllib3
import pandas as pd
from datetime import datetime
import json
import threading
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
class ai_async_asin_pg():
def __init__(self, site_name='us'):
self.site_name = site_name
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.item_queue = Queue() # 存储 item 详情数据队列
self.pg_connect()
self.sp_asin_queue = Queue()
self.spider_state = None
self.update_ai_asin_analyze_log_list = []
month = time.strftime("%m")
day = time.strftime("%d")
if int(day)>10:
_month = int(month)
else:
if int(month) > 1:
_month = int(month) - 1 # 上个月
else:
_month = int(month)
if _month < 10:
_month = str(f'0{_month}')
self.topic_asin_html = f'asin_html_2025_{str(_month)}'
def get_params(self):
# 站点
self.reuests_para_val = Requests_param_val(site_name=self.site_name)
self.cookies_queue = Queue() # cookie队列
self.cookie_dict_delete_id = {}
# return the host and homepage url for the given site
self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie(num=168)
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
def pg_connect(self):
self.engine_pg = get_remote_engine(
site_name=self.site_name, # -> database "selection"
db_type='postgresql_15_outer', # -> server-side connection alias
)
return self.engine_pg
def get_asin(self):
while True:
if self.queries_asin_queue.empty() == False and self.spider_state is None:
asin_queu = self.queries_asin_queue.get()
elif self.sp_asin_queue.empty() == False:
self.spider_state = '竞品asin'
print('执行竞品asin 抓取')
asin_queu = self.sp_asin_queue.get()
else:
break
# ['B09658Q5RP|-|82|-|us|-|6248', 'B0CSPVS7JL|-|82|-|us|-|6249']
print('::asin_queu::: ', asin_queu)
queu_list = asin_queu.split('|-|')
print('queu_list:::', queu_list)
asin = queu_list[0]
task_id = queu_list[1]
site_name = queu_list[2]
id_str = queu_list[3]
sub_step = queu_list[4]
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
# build the cookie string from the cookie queue
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None)
headers["cookie"] = cookie_str
scraper_url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
print('scraper_url::', scraper_url)
try:
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码,。asin---> ', asin)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
continue
except Exception as e:
print("请求错误错误: 。asin---> ", asin, '错误:', e)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
continue
response_url = resp.url
response = resp.text
response_s = etree.HTML(response)
if self.reuests_para_val.check_amazon_not_page(response): # asin delisted, status 4, "Listen Now" page
continue
if self.reuests_para_val.check_amazon_page(response, response_url): # check whether this is a normal product page
continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin): # check whether the request was redirected
continue
# get the delivery zip code shown on the page
try:
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
print(ingress, ' 打印 邮编 ', resp.url)
except Exception as e:
print('asin 不是正常页面', asin)
continue
try:
ingress = ingress[0].strip()
except:
ingress = None
if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
continue
div_dp = response_s.xpath('//div[@id="dp"]')
if div_dp:
items = ParseAsinUs(resp=response, asin=asin, site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'task_id': task_id, 'id': id_str,'sub_step':sub_step, 'url': scraper_url,
'title': items["title"], 'img_url': items["img_url"],
'rating': items["rating"], 'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"], 'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"],
'sp_num': items["sp_num"], 'describe': items["describe"],
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'],
'review_json_list': items['review_json_list']
}
print(item)
self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
else:
print('asin 商品 异常')
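# save_data: drain item_queue, delete the previously claimed ai_asin_analyze_spider rows by id,
# then re-insert them with the parsed html_json payload and a "成功" status.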
def save_data(self):
self.pg_connect()
items_data_list = []
id_list = []
while True:
if self.item_queue.empty() == False:
items = self.item_queue.get()
items_data_list.append([int(items['task_id']), items['asin'], items['site_name'], '成功', items, int(items['id']),items['sub_step']])
id_list.append(int(items['id']))
else:
break
if items_data_list:
while True:
try:
self.pg_connect()
print('存储数据, 数量', len(items_data_list))
with self.engine_pg.begin() as conn:
if len(set(id_list)) == 1:
sql_delete = f"delete from ai_asin_analyze_spider where id in ({tuple(id_list)[0]});"
else:
sql_delete = f"delete from ai_asin_analyze_spider where id in {tuple(set(id_list))};"
print('删除:',sql_delete)
conn.execute(sql_delete)
print(len(items_data_list))
df_asin_detail = pd.DataFrame(data=items_data_list,
columns=['task_id', 'unique_key', 'site_name', 'status', 'html_json',
'id','sub_step'])
df_asin_detail['html_json'] = df_asin_detail['html_json'].apply(
lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (dict, list)) else x
)
self.engine_pg.to_sql(df_asin_detail, 'ai_asin_analyze_spider', if_exists='append')
break
except Exception as e:
print('存储报错::', e)
self.pg_connect()
time.sleep(10)
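# init_list: reset all per-batch state (queues, result lists, cookie pool) and close the Kafka
# producer so the next batch starts clean.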
def init_list(self):
print("=======清空变量==========")
self.asin_not_found_list = [] # 4
self.asin_not_sure_list = [] # 6
self.asin_not_foot_list = [] # 7
self.asin_not_foot2_list = [] # 8
self.asin_not_buyBox_list = [] # 9
self.asin_not_response_list = [] # 10
self.asin_not_redirect_list = [] # 12
self.asin_not_div_id_dp_list = [] # 13 返回html没有包含div @id=dp,状态13
self.requests_error_asin_list = [] # 1
self.asin_list_update = [] # 3
self.item_queue = Queue() # 存储 item 详情数据队列
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.buyBox_list = [] # 卖家名称 url 列表
self.asin_detail_list = [] # 存储asin 详情的列表
self.buyBoxname_asin_list = [] # asin 卖家的列表
self.delete_cookies_list = [] # 存储出现中国邮编的cookie
self.star_list = []
self.add_cart_asin_list = [] # 存储绑定购买的asin
self.asin_brand_list = []
self.bs_category_asin_list = []
self.bs_category_asin_list_pg = []
self.reuests_para_val.kafuka_producer_str.close(timeout=10)
self.asin_video_list = []
self.cookies_queue = Queue() # cookie队列
self.item_queue = Queue() # 存储 item 详情数据队列
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.buyBox_list = [] # 卖家名称 url 列表
self.asin_detail_list = [] # 存储asin 详情的列表
self.buyBoxname_asin_list = [] # asin 卖家的列表item
self.delete_cookies_list = [] # 存储出现中国邮编的cookie
self.cookie_dict_delete_id = {}
self.star_list = [] # 存储星级百分比
self.add_cart_asin_list = [] # 存储 绑定购买的asin
self.asin_brand_list = [] # 存储asin 对应 的品牌
self.bs_category_asin_list = [] # 存储 asin 详情 bsr 文本类目
self.bs_category_asin_list_pg = [] # 存储 asin 详情 bsr 文本类目
# 验证码 1
self.yzm_err_total_list = []
# 异常 2
self.asin_request_errp_total_list = []
# 成功 3
self.success_asin_total_list = []
# 每小时
self.hour_total_count_list = []
# 总请求 4
self.request_total_count_list = []
def run_ai_asin(self, asin_id_list):
self.get_params()
print(asin_id_list)
for asin_id in asin_id_list:
print(asin_id)
self.queries_asin_queue.put(asin_id)
html_thread = []
for i in range(5):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
for t2 in html_thread:
t2.join()
self.save_data()
self.init_list()
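# select_asin: per-site polling entry point. Claim "未开始" rows with sub_step
# 'AsinInfoRepository:详情' via read_then_update, build "asin|-|task_id|-|site|-|id|-|sub_step"
# tasks, run them through run_ai_asin, and rotate the PPPoE IP every ~180 iterations during the day.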
def select_asin():
time_ip_num = 0
while True:
try:
time_ip_num += 1
for site in ['us', 'de', 'uk']:
select_sql = f"""select id, site_name, task_id, unique_key as asin,sub_step from ai_asin_analyze_spider where sub_step = 'AsinInfoRepository:详情' and status = '未开始' and site_name='{site}' order by task_id"""
print('select_sql::', select_sql)
engine_pg15 = ai_async_asin_pg(site_name='us').pg_connect()
df_read = engine_pg15.read_then_update(
select_sql=select_sql,
update_table='ai_asin_analyze_spider',
set_values={"status": '爬取中'}, # 把库存清零
where_keys=["id", "site_name"], # WHERE sku = :sku
)
if not df_read.empty:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + df_read.id.astype("U") + '|-|' + df_read.sub_step)
print(asin_id_list)
ai_async_asin_pg(site_name=site).run_ai_asin(asin_id_list)
time.sleep(5)
if 10 <= datetime.now().hour < 22:
if time_ip_num > 180:
pppoe_ip()
time_ip_num = 0
time.sleep(5)
except Exception as e:
print(e,2333333)
time.sleep(5)
if __name__ == '__main__':
select_asin()