Commit c8524b22 by Peng

no message

parent 4418209b
@@ -3,15 +3,11 @@ import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.db_connect import BaseUtils from utils.db_connect import BaseUtils
from amazon_params.params import DB_CONN_DICT from secure_db_client import get_remote_engine
import math
import pandas as pd import pandas as pd
import time import time
import pymysql
import requests import requests
# import numpy as np import math
#
# from scipy.optimize import curve_fit
"""计算销量,均值差""" """计算销量,均值差"""
@@ -59,19 +55,19 @@ class CalculateMean(BaseUtils):
sql_6 = f""" sql_6 = f"""
SELECT * from {self.site_name}_one_category WHERE id in ( select max(id) from {self.site_name}_one_category where `year_month`='2025_5' and orders=0 and rank>50000 GROUP BY `name`) SELECT * from {self.site_name}_one_category WHERE id in ( select max(id) from {self.site_name}_one_category where `year_month`='2025_5' and orders=0 and rank>50000 GROUP BY `name`)
UNION UNION
select * from {self.site_name}_one_category where `year_month`='2025_6' and rank<=50000 select * from {self.site_name}_one_category where `year_month`='2025_8' and rank<=50000
""" """
print('查询原始表6:', sql_6) print('查询原始表6:', sql_6)
self.df_sum_6 = pd.read_sql(sql_6, con=self.engine) self.df_sum_6 = self.engine.read_sql(sql_6)
# ---- Month 7 ----
sql_7 = f""" sql_7 = f"""
SELECT * from {self.site_name}_one_category WHERE id in ( select max(id) from {self.site_name}_one_category where `year_month`='2025_5' and orders=0 and rank>50000 GROUP BY `name`) SELECT * from {self.site_name}_one_category WHERE id in ( select max(id) from {self.site_name}_one_category where `year_month`='2025_5' and orders=0 and rank>50000 GROUP BY `name`)
UNION UNION
select * from {self.site_name}_one_category where `year_month`='2025_7' and rank<=50000 select * from {self.site_name}_one_category where `year_month`='2025_9' and rank<=50000
""" """
print('查询原始表7:', sql_7) print('查询原始表7:', sql_7)
self.df_sum_7 = pd.read_sql(sql_7, con=self.engine) self.df_sum_7 = self.engine.read_sql(sql_7)
# After concatenation, simply rely on keep='last' to keep the month-7 rows
self.df_sum = pd.concat([self.df_sum_6, self.df_sum_7], ignore_index=True) self.df_sum = pd.concat([self.df_sum_6, self.df_sum_7], ignore_index=True)
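# NOTE (editor, assumption): the drop_duplicates(..., keep='last') step referenced by the comment
# above sits outside this hunk; because df_sum_7 is concatenated after df_sum_6, keep='last'
# retains the later month's row whenever the same (name, rank) pair appears in both frames,
# e.g. pd.concat([df6, df7]).drop_duplicates(['name', 'rank'], keep='last').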
@@ -86,62 +82,12 @@ class CalculateMean(BaseUtils):
self.cate_list = list(set(self.df_sum.name)) self.cate_list = list(set(self.df_sum.name))
sql_select = f"SELECT `year_month` from selection.week_20_to_30 WHERE `week`={int(self.week)} and `year`={self.year}" sql_select = f"SELECT `year_month` from selection.week_20_to_30 WHERE `week`={int(self.week)} and `year`={self.year}"
print(sql_select, 'sql_select:') print(sql_select, 'sql_select:')
df = pd.read_sql(sql_select, con=self.engine) df = self.engine.read_sql(sql_select)
self.year_month = list(df['year_month'])[0] if list(df['year_month']) else '' self.year_month = list(df['year_month'])[0] if list(df['year_month']) else ''
print("self.year_month:", self.year_month) print("self.year_month:", self.year_month)
time.sleep(2) time.sleep(2)
self.handle_data() self.handle_data()
# def handle_data(self,max_rank=1_000_000, step=1):
# records = []
# for cate in self.cate_list:
# dfk = (self.df_sum[self.df_sum.name == cate]
# [['rank', 'orders']]
# .drop_duplicates()
# .query('orders>0')
# .sort_values('rank'))
# if len(dfk) < 3: continue
#
# # 1) 构造 log(rank), log(orders)
# lr = np.log(dfk['rank'].values)
# lo = np.log(dfk['orders'].values)
# # 2) 二次多项式扩展 X = [1, lr, lr^2]
# X = np.vstack([np.ones_like(lr), lr, lr ** 2]).T
# # 3) 求解最小二乘: coef = (X^T X)^-1 X^T lo
# coef = np.linalg.lstsq(X, lo, rcond=None)[0]
#
# # 4) 用这个多项式预测 full_range
# full = np.arange(dfk['rank'].min(), max_rank + 1, step)
# lf = np.log(full)
# log_pred = coef[0] + coef[1] * lf + coef[2] * (lf ** 2)
# orders_pred = np.exp(log_pred)
# cutoff_idx = np.argmax(orders_pred <= 30)
# # 如果从未出现 orders_pred < min_orders,cutoff_idx 会是 0
# # 但此时 orders_pred[0] 一定 >= min_orders,所以要检查:
# if orders_pred[cutoff_idx] >= 30:
# # 数组中没有小于阈值的点,保留全部
# last = len(full)
# else:
# # 在 cutoff_idx 处开始 <min_orders,就截断到它之前
# last = cutoff_idx
# full = full[:last]
# orders_pred = orders_pred[:last]
#
# # 5. 组装输出 DataFrame
# dfout = pd.DataFrame({
# 'name': cate,
# 'rank': full,
# 'orders': orders_pred
# })
# # 用四舍五入计算日均销量
# dfout['orders_day'] = (dfout['orders'] / 30).round(0).astype(int)
# dfout['year_month'] = self.year_month
# dfout['week'] = self.week
#
# records.append(dfout)
#
# records.append(dfout)
# self.df_repeat = pd.concat(records, ignore_index=True)
def handle_data(self):  # legacy code path
print(len(self.cate_list)) print(len(self.cate_list))
@@ -195,12 +141,12 @@ class CalculateMean(BaseUtils):
sql = f"select en_name as name, category_id from {self.site_name}_bs_category where 1 = 1 and nodes_num = 2 group by en_name, category_id" sql = f"select en_name as name, category_id from {self.site_name}_bs_category where 1 = 1 and nodes_num = 2 group by en_name, category_id"
print('sql',sql) print('sql',sql)
df_en_name = pd.read_sql(sql, con=self.engine) df_en_name = self.engine.read_sql(sql)
# Use merge to check whether the name columns of the two frames match
self.df_repeat = pd.merge(self.df_repeat, df_en_name, on='name', how='left') self.df_repeat = pd.merge(self.df_repeat, df_en_name, on='name', how='left')
self.df_repeat = self.df_repeat.loc[self.df_repeat.orders >= 30] # 保留大于0的 排名月销 self.df_repeat = self.df_repeat.loc[self.df_repeat.orders >= 30] # 保留大于0的 排名月销
self.df_repeat.drop_duplicates(['name', 'rank','orders'], inplace=True) # 去重 self.df_repeat.drop_duplicates(['name', 'rank','orders'], inplace=True) # 去重
self.df_repeat.to_sql(f"{self.site_name}_one_category_report", con=self.engine, if_exists="append", index=False) self.engine.to_sql(self.df_repeat,f"{self.site_name}_one_category_report",if_exists="append")
def run(self): def run(self):
self.db_read_data() self.db_read_data()
@@ -208,21 +154,20 @@ class CalculateMean(BaseUtils):
self.db_save_data() self.db_save_data()
def sendMessage(self, week, site_name): def sendMessage(self, week, site_name):
db = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], engine_us_mysql = get_remote_engine(
user=DB_CONN_DICT['mysql_user'], site_name='us', # -> database "selection"
password=DB_CONN_DICT['mysql_pwd'], db_type="mysql", # -> 服务端 alias "mysql"
database='selection', charset="utf8mb4") )
cursor = db.cursor(cursor=pymysql.cursors.DictCursor) with engine_us_mysql.begin() as conn:
time_strftime = time.strftime("%Y-%m-%d %X", time.localtime()) time_strftime = time.strftime("%Y-%m-%d %X", time.localtime())
update_workflow_progress = f"update workflow_progress set status_val=3,status='抓取结束' where page='ASIN销量' and date_info='2025-{week}' and site_name='{site_name}' and date_type='week'" update_workflow_progress = f"update workflow_progress set status_val=3,status='抓取结束' where page='ASIN销量' and date_info='2025-{week}' and site_name='{site_name}' and date_type='week'"
print(update_workflow_progress) print(update_workflow_progress)
cursor.execute(update_workflow_progress) conn.execute(update_workflow_progress)
db.commit()
cursor.close()
db.close()
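# NOTE (editor, assumption): engine_us_mysql.begin() is assumed to behave like SQLAlchemy's
# Engine.begin(), committing the transaction and releasing the connection when the with-block
# exits, which is why the old cursor.execute()/db.commit()/cursor.close()/db.close() sequence
# is dropped in the new code.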
url = 'http://47.112.96.71:8082/selection/sendMessage' url = 'http://47.112.96.71:8082/selection/sendMessage'
data = { data = {
'account': 'pengyanbing,fangxingjun,wangrui4', 'account': 'pengyanbing,fangxingjun',
'title': f"{site_name} 站点类目销量统计", 'title': f"{site_name} 站点类目销量统计",
'content': str(self.week) + f' 周 {site_name}站点类目销量计算 已结束,请确认下一步流程!时间:' + time_strftime 'content': str(self.week) + f' 周 {site_name}站点类目销量计算 已结束,请确认下一步流程!时间:' + time_strftime
} }
@@ -240,12 +185,12 @@ if __name__ == '__main__':
handle_obj_us = CalculateMean(site_name='us', year=2025, week=week) handle_obj_us = CalculateMean(site_name='us', year=2025, week=week)
handle_obj_us.run() handle_obj_us.run()
handle_obj_us.sendMessage(week, site_name='us') handle_obj_us.sendMessage(week, site_name='us')
#handle_obj_uk = CalculateMean(site_name='uk', year=2025, week=week) # handle_obj_uk = CalculateMean(site_name='uk', year=2025, week=week)
#handle_obj_uk.run() # handle_obj_uk.run()
# handle_obj_uk.sendMessage(week, site_name='uk') # handle_obj_uk.sendMessage(week, site_name='uk')
#handle_obj_de = CalculateMean(site_name='de', year=2025, week=week) # handle_obj_de = CalculateMean(site_name='de', year=2025, week=week)
#handle_obj_de.run() # handle_obj_de.run()
#handle_obj_de.sendMessage(week, site_name='de') # handle_obj_de.sendMessage(week, site_name='de')
# handle_obj_fr = CalculateMean(site_name='fr', year=2025, week=week) # handle_obj_fr = CalculateMean(site_name='fr', year=2025, week=week)
# handle_obj_fr.run() # handle_obj_fr.run()
# handle_obj_fr.sendMessage(week, site_name='fr') # handle_obj_fr.sendMessage(week, site_name='fr')
...
import datetime
import json
import os import os
import random import random
import re
import sys
import time
import traceback
import pandas as pd import pandas as pd
from lxml import etree
from secure_db_client import get_remote_engine
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from sqlalchemy import create_engine
from lxml import etree
import re
import time
import traceback
import datetime
import json
import sys
from sqlalchemy.engine import URL
syn_state = False syn_state = False
click_product_name_list = []
# Category analysis
class dow_category_Product(): class dow_category_Product():
def __init__(self, site): def __init__(self, site):
self.site_name = site self.site_name = site
self.click_product_name_list = []
self.update_cagetory_state = False
def mysql_connect(self, site='us'): def mysql_connect(self, site='us'):
if site == 'us': self.engine_mysql = get_remote_engine(
db = 'selection' site_name=site, # -> database "selection"
else: db_type='mysql', # -> 服务端 alias "mysql"
db = f'selection_{site}'
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database=db,
query={"charset": "utf8mb4"}
) )
self.engine_mysql = create_engine(
url) self.engine_us_mysql = get_remote_engine(
url_us = URL.create( site_name='us', # -> database "selection"
drivername="mysql+pymysql", db_type='mysql', # -> 服务端 alias "mysql"
username=DB_CONN_DICT["mysql_user"], )
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"], self.engine_pg = get_remote_engine(
port=int(DB_CONN_DICT["mysql_port"]), site_name=site, # -> database "selection"
database='selection', db_type='postgresql_15_outer', # -> server-side alias "postgresql_15_outer"
query={"charset": "utf8mb4"}
) )
self.engine_us_mysql = create_engine(
url_us)
self.engine_pg = create_engine(
f"postgresql+psycopg2://postgres:F9kL2sXe81rZq@113.100.143.162:5432/{db}",
encoding='utf-8')
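# NOTE (editor, assumption): judging from the call sites in this commit, get_remote_engine(site_name=..., db_type=...)
# returns a wrapper exposing read_sql(sql), to_sql(df, table, if_exists=...) and begin(); the hard-coded
# MySQL/PostgreSQL credentials above are replaced by server-side aliases resolved inside secure_db_client,
# whose implementation is not shown in this diff.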
self.num = 0 self.num = 0
week = time.strftime("%W") week = time.strftime("%W")
yaer = time.strftime('%Y', time.localtime(time.time())) yaer = time.strftime('%Y', time.localtime(time.time()))
@@ -77,7 +58,7 @@ class dow_category_Product():
os.system(f'start Chrome {params_} --remote-debugging-port={port}') os.system(f'start Chrome {params_} --remote-debugging-port={port}')
chrome_options = Options() chrome_options = Options()
# Disable image loading
chrome_options.add_argument('--blink-settings=imagesEnabled=false') # 这样可以;;;;激动 chrome_options.add_argument('--blink-settings=imagesEnabled=false')
chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}") chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}")
driver = webdriver.Chrome(r'chromedriver.exe', options=chrome_options) driver = webdriver.Chrome(r'chromedriver.exe', options=chrome_options)
# Headless mode
@@ -88,40 +69,19 @@ class dow_category_Product():
chrome_options.add_argument('--no-sandbox')
# Use /tmp instead of /dev/shm (avoids running out of shared memory)
chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-dev-shm-usage')
# 其他可以尝试的降配项
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-application-cache')
chrome_options.add_argument('--disable-background-timer-throttling')
chrome_options.add_argument('--disable-backgrounding-occluded-windows')
chrome_options.add_argument('--disable-renderer-backgrounding')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# 彻底关掉大部分渲染路径
chrome_options.add_argument('--disable-gpu-compositing')
chrome_options.add_argument('--disable-2d-canvas-clip-aa')
chrome_options.add_argument('--disable-2d-canvas-cpu-path')
chrome_options.add_argument('--disable-accelerated-2d-canvas')
opts = chrome_options
# 限制渲染进程数
opts.add_argument('--renderer-process-limit=1')
# 限制实用进程数(utility)
opts.add_argument('--utility-process-limit=1')
# 关闭 Site Isolation
opts.add_argument('--disable-site-isolation-trials')
self.get_category(site, driver) self.get_category(site, driver)
def get_category(self, site, driver): def get_category(self, site, driver):
for i in range(2): for i in range(2):
try: try:
driver.get('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?') driver.get('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?')
time.sleep(random.uniform(8, 25.25)) time.sleep(random.uniform(8, 20.25))
driver.get('https://sellercentral.amazon.com/selection/category-insights') driver.get('https://sellercentral.amazon.com/selection/category-insights')
time.sleep(random.uniform(8, 25.25)) time.sleep(random.uniform(8, 20.25))
break break
except: except:
time.sleep(5) time.sleep(5)
time.sleep(1) time.sleep(5)
if site == 'us': if site == 'us':
driver.execute_script( driver.execute_script(
'document.querySelector("#ATVPDKIKX0DER > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()') 'document.querySelector("#ATVPDKIKX0DER > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()')
@@ -131,7 +91,7 @@ class dow_category_Product():
elif site == 'de': elif site == 'de':
driver.execute_script( driver.execute_script(
'document.querySelector("#A1PA6795UKMFR9 > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()') 'document.querySelector("#A1PA6795UKMFR9 > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()')
time.sleep(random.uniform(8, 15.25)) time.sleep(random.uniform(5, 10.25))
html = etree.HTML(driver.page_source) html = etree.HTML(driver.page_source)
self.save_category(html) self.save_category(html)
print(333333333333333333333333) print(333333333333333333333333)
@@ -157,31 +117,35 @@ class dow_category_Product():
num = 0 num = 0
for Category in Category_list: for Category in Category_list:
self.cilik_site(driver) self.cilik_site(driver)
product_nums = 0
click_product_name_list=[]
print(Category, ' 22222222222222222222222222222222222222') print(Category, ' 22222222222222222222222222222222222222')
if self.update_cagetory_state:
self.click_product_name_list = []
try: try:
num += 1 num += 1
Category_name = Category Category_name = Category
# _Category = Category.replace('&', '\\\&') # _Category = Category.replace('&', '\\\&')
print("Category_name 名称 11111", Category) print("Category_name 名称 11111", Category)
driver.execute_script(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""") driver.execute_script(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""")
time.sleep(2) time.sleep(1)
html = etree.HTML(driver.page_source) html = etree.HTML(driver.page_source)
Product_Type_list = html.xpath( Product_Type_list = html.xpath(
'//h2[contains(text(),"Product Type")]/following-sibling::div/div') '//h2[contains(text(),"Product Type")]/following-sibling::div/div')
product_nums = 0 product_nums = 0
for Product_Type in Product_Type_list: for Product_Type in Product_Type_list:
time.sleep(0.5) try:
save_Category_list = []
Product_name = Product_Type.xpath('./@id')
print(product_nums, "Product_name3222222222::", Product_name[0].upper())
if Product_name[0] in self.click_product_name_list:
print(product_nums, "已经抓取::", Product_name[0].upper())
continue
driver.execute_script("localStorage.clear();") # 清除本地存储 driver.execute_script("localStorage.clear();") # 清除本地存储
time.sleep(0.5) time.sleep(0.5)
# driver.execute_script("sessionStorage.clear();") # 清除会话存储
time.sleep(0.5)
driver.execute_script( driver.execute_script(
"caches.keys().then(function(names) { for (let name of names) { caches.delete(name); } });") "caches.keys().then(function(names) { for (let name of names) { caches.delete(name); } });")
driver.execute_script("window.performance.clearResourceTimings();") driver.execute_script("window.performance.clearResourceTimings();")
time.sleep(1) time.sleep(0.5)
# Assumes a driver instance already exists
# First, enable the heap profiler
driver.execute_cdp_cmd('HeapProfiler.enable', {}) driver.execute_cdp_cmd('HeapProfiler.enable', {})
@@ -189,15 +153,9 @@ class dow_category_Product():
driver.execute_cdp_cmd('HeapProfiler.collectGarbage', {}) driver.execute_cdp_cmd('HeapProfiler.collectGarbage', {})
# Finally, optionally disable it again
driver.execute_cdp_cmd('HeapProfiler.disable', {}) driver.execute_cdp_cmd('HeapProfiler.disable', {})
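# NOTE (editor): the HeapProfiler.enable -> collectGarbage -> disable CDP sequence forces a
# garbage-collection pass in the renderer between product types, presumably to keep Chrome's
# memory footprint bounded during the long scraping loop.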
try: time.sleep(0.5)
time.sleep(2) self.click_product_name_list.append(Product_name[0])
save_Category_list = [] self.update_cagetory_state = False
Product_name = Product_Type.xpath('./@id')
print("Product_name3222222222::", Product_name[0].upper())
# print('click_product_name_list::', click_product_name_list)
# if Product_name[0] in click_product_name_list:
# continue
# click_product_name_list.append(Product_name[0])
driver.execute_script(f"document.querySelector('#{Product_name[0]} > kat-radiobutton').click()") driver.execute_script(f"document.querySelector('#{Product_name[0]} > kat-radiobutton').click()")
time.sleep(2) time.sleep(2)
html = etree.HTML(driver.page_source) html = etree.HTML(driver.page_source)
@@ -213,14 +171,19 @@ class dow_category_Product():
Keyword = html.xpath(f"//div[@id='{Keyword_id[0]}']/kat-radiobutton/@label") Keyword = html.xpath(f"//div[@id='{Keyword_id[0]}']/kat-radiobutton/@label")
print('Keyword', Keyword) print('Keyword', Keyword)
driver.find_element(By.XPATH, f'//kat-radiobutton[@value="{Keyword_id[0]}"]').click() driver.find_element(By.XPATH, f'//kat-radiobutton[@value="{Keyword_id[0]}"]').click()
time.sleep(2.3) time.sleep(2)
html_1 = etree.HTML(driver.page_source) html_1 = etree.HTML(driver.page_source)
most_popular_list = html_1.xpath( most_popular_list = html_1.xpath(
"//div[@class='most-popular-keywords-container']/kat-list//li") "//div[@class='most-popular-keywords-container']/kat-list//li")
if most_popular_list: if most_popular_list:
for most_popular in most_popular_list: for most_popular in most_popular_list:
most_popular_keyword = most_popular.xpath('.//div[2]/text()')[0] most_keyword_list = most_popular.xpath('.//div[2]/text()')
most_popular_b_nums = most_popular.xpath('.//div/b/text()')[0] print(most_keyword_list, 1111)
most_popular_keyword = most_keyword_list[0] if most_keyword_list else None
most_popular_b_nums_list = most_popular.xpath('.//div/b/text()')
print(most_popular_b_nums_list, 2222)
most_popular_b_nums = most_popular_b_nums_list[
0] if most_popular_b_nums_list else None
most_popular_dict = {"most_popular_keywords": most_popular_keyword, most_popular_dict = {"most_popular_keywords": most_popular_keyword,
'most_popular_search_nums': most_popular_b_nums} 'most_popular_search_nums': most_popular_b_nums}
most_popular_keyword_list.append(most_popular_dict) most_popular_keyword_list.append(most_popular_dict)
@@ -318,10 +281,12 @@ class dow_category_Product():
pattern = r'\£([\d.]+)' pattern = r'\£([\d.]+)'
elif self.site_name == 'de': elif self.site_name == 'de':
pattern = r'\€([\d.]+)' pattern = r'\€([\d.]+)'
else:
pattern = ''
# Use findall to extract every matching amount
matches_list = re.findall(pattern, big_text_Advertisement) matches_list = re.findall(pattern, big_text_Advertisement)
ad_spend = matches_list[0] ad_spend = matches_list[0] if matches_list else None
majority_spend = matches_list[1] majority_spend = matches_list[1] if len(matches_list) > 1 else None
else: else:
ad_spend = 0 ad_spend = 0
majority_spend = 0 majority_spend = 0
@@ -331,6 +296,7 @@ class dow_category_Product():
print('原始数据') print('原始数据')
print([big_text_sller, big_text_brand, big_text_asin, big_text_new_asin, print([big_text_sller, big_text_brand, big_text_asin, big_text_new_asin,
big_text_per_asin, big_text_Advertisement, big_text_star]) big_text_per_asin, big_text_Advertisement, big_text_star])
print(Category, ' 22222222222222222222222222222222222222')
# 品牌 # 品牌
if big_text_brand: if big_text_brand:
if 'K' in big_text_brand: if 'K' in big_text_brand:
@@ -382,46 +348,84 @@ class dow_category_Product():
print('转成int') print('转成int')
print([big_brand_int, big_asin_int, print([big_brand_int, big_asin_int,
big_new_asin_int, big_per_asin_int]) big_new_asin_int, big_per_asin_int])
# top_data_json = self.new_top_grossing(driver, 'Top')
# time.sleep(1) top_data_json = self.new_top_grossing(driver, 'Top')
# news_data_json = self.new_top_grossing(driver, 'News')
# time.sleep(1) top_data_dict = json.loads(top_data_json)
if top_data_dict.get('products_aggregate_sales'):
_top_data_dict = self.parse_input('top', top_data_dict)
else:
_top_data_dict = self.parse_input('top', None)
top_sales_amount = _top_data_dict['top_sales_amount']
top_sales_volume = _top_data_dict['top_sales_volume']
top_search_ratio = _top_data_dict['top_search_ratio']
top_return_ratio = _top_data_dict['top_return_ratio']
top_adv_spend = _top_data_dict['top_adv_spend']
top_majority_spend = _top_data_dict['top_majority_spend']
print('top_majority_spend', top_majority_spend)
news_data_json = self.new_top_grossing(driver, 'News')
news_data_dict = json.loads(news_data_json)
if news_data_dict.get('products_aggregate_sales'):
_news_data_dict = self.parse_input('news', news_data_dict)
else:
_news_data_dict = self.parse_input('news', None)
news_sales_amount = _news_data_dict['news_sales_amount']
news_sales_volume = _news_data_dict['news_sales_volume']
news_search_ratio = _news_data_dict['news_search_ratio']
news_return_ratio = _news_data_dict['news_return_ratio']
news_adv_spend = _news_data_dict['news_adv_spend']
news_majority_spend = _news_data_dict['news_majority_spend']
print('news_majority_spend', news_majority_spend)
save_Category_list.append( save_Category_list.append(
[Category_name, Product_name[0], Keyword[0], float(search_ratio), [Category_name, Product_name[0], Keyword[0], float(search_ratio),
float(product_average), float(return_ratio), float(return_product_average), float(product_average), float(return_ratio), float(return_product_average),
self.y_w, big_text_sller, big_text_brand, big_text_asin, big_text_new_asin, self.y_w, big_text_sller, big_text_brand, big_text_asin, big_text_new_asin,
big_text_per_asin, big_text_Advertisement, big_text_star, big_brand_int, big_text_per_asin, big_text_Advertisement, big_text_star, big_brand_int,
big_asin_int, big_new_asin_int, big_per_asin_int, five_star, three_star, two_star, big_asin_int, big_new_asin_int, big_per_asin_int, five_star, three_star, two_star,
one_star, ad_spend, majority_spend, most_popular_json_dict, reasons_returns_json one_star, ad_spend, majority_spend, most_popular_json_dict, reasons_returns_json,
]) top_data_json, news_data_json, top_sales_amount, top_sales_volume,
top_search_ratio,
top_return_ratio, top_adv_spend, top_majority_spend, news_sales_amount,
news_sales_volume,
news_search_ratio, news_return_ratio, news_adv_spend, news_majority_spend])
print('数据:', save_Category_list) print('数据:', save_Category_list)
except: except Exception as e:
print('============ 下标。超出 。 ==========') print('============ 下标。超出 。 ==========', e)
continue
print('存储数据长度:', len(save_Category_list)) print('存储数据长度:', len(save_Category_list))
while True: while True:
try: try:
if save_Category_list: if save_Category_list:
with self.engine_mysql.begin() as conn_mysql: # with self.engine_mysql.begin() as conn_mysql:
for i in save_Category_list: # for i in save_Category_list:
dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'" # dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'"
print('删除删除mysql:', dele_sql) # print('删除删除mysql:', dele_sql)
conn_mysql.execute(dele_sql) # conn_mysql.execute(dele_sql)
df = pd.DataFrame(data=save_Category_list, # df = pd.DataFrame(data=save_Category_list,
columns=['category', "product_type", "item_type_keyword", # columns=['category', "product_type", "item_type_keyword",
"search_ratio", "product_average", "return_ratio", # "search_ratio", "product_average", "return_ratio",
"return_product_average", "year_week", 'sellers', # "return_product_average", "year_week", 'sellers',
'new_brands', # 'new_brands',
'asin', 'new_asin', 'per_asin', 'advertisement_spend', # 'asin', 'new_asin', 'per_asin', 'advertisement_spend',
'star_ratings', 'new_brands_int', 'asin_int', # 'star_ratings', 'new_brands_int', 'asin_int',
'new_asin_int', 'per_asin_int', 'five_star', # 'new_asin_int', 'per_asin_int', 'five_star',
'three_star', 'two_star', 'one_star', 'ad_spend', # 'three_star', 'two_star', 'one_star', 'ad_spend',
'majority_spend', 'most_popular_keywords_item', # 'majority_spend', 'most_popular_keywords_item',
'reasons_returns_json' # 'reasons_returns_json', 'top_data_json',
]) # 'news_data_json',
df.to_sql(f'{site}_aba_profit_category_insights', con=self.engine_mysql, # 'top_sales_amount', 'top_sales_volume',
if_exists="append", index=False) # 'top_search_ratio',
print('存储成功 mysql') # 'top_return_ratio', 'top_adv_spend',
# 'top_majority_spend',
# 'news_sales_amount',
# 'news_sales_volume',
# 'news_search_ratio', 'news_return_ratio',
# 'news_adv_spend',
# 'news_majority_spend'
# ])
# self.engine_mysql.to_sql(df, f'{site}_aba_profit_category_insights',
# if_exists="append")
# print('存储成功 mysql')
with self.engine_pg.begin() as conn_pg: with self.engine_pg.begin() as conn_pg:
for i in save_Category_list: for i in save_Category_list:
dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'" dele_sql = f"DELETE from {site}_aba_profit_category_insights where category='{i[0]}' and product_type='{i[1]}' and item_type_keyword='{i[2]}' and year_week='{self.y_w}'"
@@ -437,9 +441,18 @@ class dow_category_Product():
'new_asin_int', 'per_asin_int', 'five_star', 'new_asin_int', 'per_asin_int', 'five_star',
'three_star', 'two_star', 'one_star', 'ad_spend', 'three_star', 'two_star', 'one_star', 'ad_spend',
'majority_spend', 'most_popular_keywords_item', 'majority_spend', 'most_popular_keywords_item',
'reasons_returns_json']) 'reasons_returns_json', 'top_data_json',
df.to_sql(f'{site}_aba_profit_category_insights', con=self.engine_pg, 'news_data_json', 'top_sales_amount', 'top_sales_volume',
if_exists="append", index=False) 'top_search_ratio',
'top_return_ratio', 'top_adv_spend',
'top_majority_spend',
'news_sales_amount',
'news_sales_volume',
'news_search_ratio', 'news_return_ratio',
'news_adv_spend',
'news_majority_spend'])
self.engine_pg.to_sql(df, f'{site}_aba_profit_category_insights',
if_exists="append")
print(save_Category_list) print(save_Category_list)
print('存储成功 pg') print('存储成功 pg')
break break
@@ -451,23 +464,31 @@ class dow_category_Product():
print('============ 产品分类 下标。超出 。无数据 ==========', f"\n{traceback.format_exc()}") print('============ 产品分类 下标。超出 。无数据 ==========', f"\n{traceback.format_exc()}")
time.sleep(2) time.sleep(2)
continue continue
# product_nums+=1 product_nums += 1
# if product_nums>10: if product_nums > 12:
# time.sleep(2) product_nums = 0
# print(product_nums, 'product_nums 重新启动 浏览器,') print(product_nums, 'product_nums 重新启动 浏览器,')
# driver.close() product_nums = 0
# driver.quit() driver.close()
# time.sleep(2) driver.quit()
# product_nums = 0 time.sleep(1)
# self.run() self.run()
except Exception as e: except Exception as e:
print(e, '执行错误') print(e, '执行错误')
time.sleep(random.uniform(10, 20)) time.sleep(random.uniform(10, 20))
self.reboot_driver(driver, site) # 重启刷新 self.reboot_driver(driver, site) # 重启刷新
while True:
try:
updated_at = datetime.datetime.now().strftime("%m-%d %H:%M:%S")
with self.engine_pg.begin() as conn: with self.engine_pg.begin() as conn:
update_sql = f"update seller_category_insights_syn set state =3 where category='{Category}'" update_sql = f"update seller_category_insights_syn set state =3 where category='{Category}'"
print('更新update_sql:', update_sql) print('更新update_sql:', update_sql)
conn.execute(update_sql) conn.execute(update_sql)
self.update_cagetory_state = True
break
except Exception as e:
print(e,'修改状态3报错')
time.sleep(20)
if num > 1: if num > 1:
driver.close() driver.close()
driver.quit() driver.quit()
@@ -479,7 +500,48 @@ class dow_category_Product():
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val', columns=['site_name', 'date_info', 'status', 'status_val',
'table_name', 'date_type', 'page', 'is_end']) 'table_name', 'date_type', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_progress', con=self.engine_us_mysql, if_exists='append', index=False) self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append')
def safe_get(self, lst, idx, default=None):
return lst[idx] if 0 <= idx < len(lst) else default
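# NOTE (editor): safe_get is a bounds-checked list lookup, e.g. self.safe_get(['a', 'b'], 1) -> 'b'
# and self.safe_get(['a', 'b'], 5) -> None (the supplied default).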
def parse_input(self, type, input):
sales_amount = 0
sales_volume = 0
search_ratio = 0
return_ratio = 0
adv_spend = 0
majority_spend = 0
if input:
products_aggregate_sales = input.get('products_aggregate_sales', [])[0]
if products_aggregate_sales:
split = products_aggregate_sales.split("|")
sales_amount_str = self.safe_get(split, 1, '').partition("$")[-1]
sales_volume_str = self.safe_get(re.findall(r'\d+', self.safe_get(split, 2, '')), 0, "0")
if len(sales_amount_str) > 0:
sales_amount = float(sales_amount_str.strip().replace(",", ""))
sales_volume = float(sales_volume_str)
pass
search_ratio = float(input['search_ratio'] or -1)
return_ratio = float(input['return_ratio'] or -1)
big_text_Advertisement = input['big_text_Advertisement']
if big_text_Advertisement:
split = big_text_Advertisement.split("|-|")
adv_spend_str = self.safe_get(split, 0, '').partition("$")[-1]
majority_spend_str = self.safe_get(split, 1, '').partition("$")[-1]
adv_spend = (float(adv_spend_str.strip()) if adv_spend_str != '' else 0)
majority_spend = (float(majority_spend_str.strip()) if majority_spend_str != '' else 0)
return {
f"{type}_sales_amount": sales_amount,
f"{type}_sales_volume": sales_volume,
f"{type}_search_ratio": search_ratio,
f"{type}_return_ratio": return_ratio,
f"{type}_adv_spend": adv_spend,
f"{type}_majority_spend": majority_spend
}
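# NOTE (editor, assumption): parse_input expects input['products_aggregate_sales'][0] to be a
# pipe-delimited string whose second field carries a "$" amount and whose third field contains the
# unit count, e.g. "Sales|$12,345.67|890 units" -> sales_amount=12345.67, sales_volume=890.0;
# the exact on-page format is not visible in this diff.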
def analysis_top_Newly_html(self, driver): def analysis_top_Newly_html(self, driver):
html_top = etree.HTML(driver.page_source) html_top = etree.HTML(driver.page_source)
@@ -572,7 +634,7 @@ class dow_category_Product():
'most_popular_json_dict': most_popular_keyword_list, 'search_ratio': search_ratio, 'most_popular_json_dict': most_popular_keyword_list, 'search_ratio': search_ratio,
'return_ratio': return_ratio, 'return_ratio': return_ratio,
'big_text_Advertisement': big_text_Advertisement, 'big_text_star': big_text_star} 'big_text_Advertisement': big_text_Advertisement, 'big_text_star': big_text_star}
print('data_dict') print('data_dict',data_dict)
return json.dumps(data_dict) return json.dumps(data_dict)
def new_top_grossing(self, driver, click_type): def new_top_grossing(self, driver, click_type):
@@ -607,7 +669,7 @@ class dow_category_Product():
break break
except Exception as e: except Exception as e:
print('reboot_driver详细报错') print('reboot_driver详细报错')
print(traceback.format_exc(e)) print(traceback.format_exc())
driver.close() driver.close()
driver.quit() driver.quit()
time.sleep(5) time.sleep(5)
@@ -623,8 +685,9 @@ class dow_category_Product():
print('接着上次中断的继续') print('接着上次中断的继续')
self.mysql_connect(site=self.site_name) self.mysql_connect(site=self.site_name)
select_sql = 'select category from seller_category_insights_syn where state =1' select_sql = 'select category from seller_category_insights_syn where state =1'
df = pd.read_sql(select_sql, con=self.engine_pg) df = self.engine_pg.read_sql(select_sql)
category_list = list(df.category) category_list = list(df.category)
print(category_list)
if category_list: if category_list:
return category_list return category_list
else: else:
@@ -635,8 +698,8 @@ class dow_category_Product():
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val', columns=['site_name', 'date_info', 'status', 'status_val',
'table_name', 'date_type', 'page', 'is_end']) 'table_name', 'date_type', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_progress', con=self.engine_us_mysql, if_exists='append', self.engine_us_mysql.to_sql(df_seller_asin_account, 'workflow_progress', if_exists='append'
index=False) )
def save_category(self, html): def save_category(self, html):
Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div') Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div')
...
import json import json
import random import os
import re
import time import time
import traceback
from datetime import datetime
import pandas as pd import requests
import redis from lxml import etree
from lxml import html
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from secure_db_client import get_remote_engine from secure_db_client import get_remote_engine
@@ -12,23 +14,75 @@ from secure_db_client import get_remote_engine
def mysql_connect(): def mysql_connect():
engine_us_mysql = get_remote_engine( engine_us_mysql = get_remote_engine(
site_name='us', # -> database "selection" site_name='us', # -> database "selection"
db_type='mysql', # -> server-side alias "mysql" db_type='postgresql_15_outer', # -> server-side alias "postgresql_15_outer"
) )
return engine_us_mysql return engine_us_mysql
def parse_list(s: str):
# Convert a value like "[a, b, c]" into ["a", "b", "c"]
return [t.strip() for t in s.strip().strip('[]').split(',') if t.strip()]
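# NOTE (editor): e.g. parse_list("[colA, colB, colC]") -> ["colA", "colB", "colC"];
# empty or whitespace-only entries are dropped.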
def run(asin_list): def clean_text(node):
print('asin_list:::',asin_list) # Do not use normalize-space(); regularise whitespace in Python instead
print('asin_list:::',len(asin_list)) return " ".join("".join(node.itertext()).split())
def extract_visible_headers(doc):
print('extract_visible_headers')
# Read the list of hidden column ids
hidden_val = doc.xpath('//input[@id="hiddenColumnTitles"]/@value')
hidden_ids = set(parse_list(hidden_val[0])) if hidden_val else set()
headers = []
seen = set()
# Only look at the header row: take th elements in document order and exclude hidden columns
for th in doc.xpath('//tr[@id="head-row"]/th[@id]'):
col_id = th.get('id')
if col_id in seen:
continue
seen.add(col_id)
if col_id in hidden_ids:
continue
# Extra defence: also skip columns hidden via inline style or class name
style = (th.get('style') or '').replace(' ', '').lower()
classes = (th.get('class') or '')
if 'display:none' in style or 'a-hidden' in classes.split():
continue
label = clean_text(th)
if label:
headers.append((col_id, label))
return headers
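# NOTE (editor): returns (column_id, visible_label) pairs in on-page order, skipping any column
# listed in the hidden #hiddenColumnTitles input or hidden via display:none / the a-hidden class.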
def extract_rows(doc, headers):
rows = []
# Data rows on the page all carry the mt-row class
for tr in doc.xpath('//table[contains(@class,"mt-table")]//tr[contains(@class,"mt-row")]'):
row = {}
for col_id, label in headers:
# Data cells use data-column to reference the matching column id
td = tr.xpath('.//td[@data-column=$c]', c=col_id)
value = clean_text(td[0]) if td else ""
row[label] = value
# Treat the row as valid data as long as any visible column has a value
if any(row.values()):
rows.append(row)
return rows
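# NOTE (editor): typical usage is headers = extract_visible_headers(doc); rows = extract_rows(doc, headers),
# giving one dict per mt-row keyed by the visible column label; rows with no visible values are dropped.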
def run_spider(asin_list):
print('asin_list:::', asin_list)
print('asin_list:::', len(asin_list))
if asin_list: if asin_list:
try:
pr_name = "chrome.exe"
os.system('%s%s' % ("taskkill /F /IM ", pr_name))
except Exception as e:
print("强制关闭chrome.exe失败:", e)
# Initialisation
with sync_playwright() as _playwright: with sync_playwright() as _playwright:
# _playwright.chromium.launch_persistent_context context = _playwright.chromium.launch_persistent_context(
browser = _playwright.chromium.launch_persistent_context(
# Path to the local Chrome user-data (cache) directory
user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data", user_data_dir=r"C:\Users\admin\AppData\Local\Google\Chrome\User Data",
# Path to the local Chrome executable
executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe", executable_path=r"C:\Users\admin\AppData\Local\Google\Chrome\Application\chrome.exe",
# Must be enabled for file downloads to work this way (default is False)
accept_downloads=True, accept_downloads=True,
# Not headless: run with a visible browser window
@@ -37,121 +91,180 @@ def run(asin_list):
locale='en-GB', locale='en-GB',
ignore_https_errors=True, ignore_https_errors=True,
no_viewport=True, no_viewport=True,
slow_mo=10, slow_mo=10
# 跳过检测
args=['--disable-blink-features=AutomationControlled', '--remote-debugging-port=9222']
) )
page = context.pages[0] if context.pages else context.new_page()
page = browser.new_page()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page.add_init_script(js)
page.evaluate_handle('''() =>{ window.chrome = { runtime: {}, }; }''')
page.evaluate_handle(
'''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
# 模拟浏览器参数
page.locator("body").click()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page.add_init_script(js)
print('打开浏览器请求asin:') print('打开浏览器请求asin:')
page = browser.new_page()
# page.goto('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?')
# page.wait_for_timeout(1500)
def intercept_request(request):
try: try:
page.goto('https://sellercentral.amazon.com') if "populate" in request.url:
time.sleep(random.uniform(2, 5)) print('request.url::', request.url)
except: resp = request.response().text()
save_asin_var_data(asin_list[0], json.dumps({"content": "网络有问题 登录账号失败。远程账号电脑检查"}), '失败') html_data_list.append(resp)
for asin in asin_list: except Exception as e:
time.sleep(random.uniform(1, 3)) print("获取响应失败:", e, f"\n{traceback.format_exc()}")
# try:
page.goto(
'https://sellercentral.amazon.com/listing/varwiz?ref_=xx_swlang_head_xx&mons_sel_locale=en_US&languageSwitched=1')
page.wait_for_timeout(1500)
print('page.url::', page.url)
for id_asin in asin_list:
print("id_asin::", id_asin)
id_asin_list = id_asin.split('|-|')
asin = id_asin_list[0]
id = int(id_asin_list[1])
print('开始抓取::', asin, 'id::', id)
error = None
if 'signin' in page.url:
save_asin_var_data(asin, [], '失败', 'us', [], [], '账号电脑退出登录', id)
semd_ms(asin)
time.sleep(120)
return
html_data_list = []
# page.reload() # 刷新页面
page.wait_for_timeout(1000)
page.locator('//*[@id="varwizard_accordion"]/div[1]/div/div[1]/h5/a/i').click()
page.wait_for_timeout(1000)
page.locator('//*[@id="varwiz-search-text"]').fill(f'{asin}')
page.wait_for_timeout(5000)
page.on("requestfinished", intercept_request)
page.locator('//*[@id="a-autoid-0"]/span/input').click() # 点击
page.wait_for_timeout(15000)
# 请求 指定的 URL
items_list = []
parent_asin_list = []
var_asin_list = []
print(f'请求asin {asin} data_list::', len(html_data_list))
for html_content in html_data_list:
if (
'variation family is not supported on this' in html_content and 'ERROR' in html_content) or (
'you searched for is not' in page.content()):
try: try:
print('请求asin', asin) error = re.findall(r'message =(.*?);', html_content)[0]
url = f"https://sellercentral.amazon.com/listing/varwiz/search?searchText={asin}" except:
print('url:', url) error = "The ASIN you searched for is not part of any variation family"
page.goto(url) print('errorerror::', error)
time.sleep(random.uniform(3, 8)) save_asin_var_data(asin, [], '失败', 'us', [], [], error, id)
print() break
print(page.content()) tree = etree.HTML(html_content)
html_string = page.content() visible_headers = extract_visible_headers(tree)
time.sleep(0.5) rows = extract_rows(tree, visible_headers)
if 'The ASIN you searched for is not part of any variation' not in html_string: if rows:
doc = html.fromstring(html_string) for i in rows:
# 取第一个 <pre> 的文本内容(会自动去掉标签内 HTML) print(i, '233333333333')
pre_nodes = doc.xpath('//pre') Parentage = i.get('Parentage')
if not pre_nodes: if Parentage == 'parent':
raise ValueError("找不到 <pre> 节点") parent_asin = i.get('ASIN')
pre_text = pre_nodes[0].text_content().strip() parent_asin_list.append(parent_asin)
# 直接尝试解析(适用于 <pre> 里就是整段 JSON 的情况) if Parentage == 'child':
data_json = json.loads(pre_text) var_asin = i.get('ASIN')
print(data_json) # dict / list var_asin_list.append(var_asin)
print('获取完成', asin) items_list.extend(rows)
save_asin_var_data(asin, data_json, '成功') if items_list:
print('items_listitems_list::', len(items_list))
items_list_json = json.dumps(items_list)
save_asin_var_data(asin, items_list_json, '成功', 'us', parent_asin_list, var_asin_list, None, id)
html_data_list = []
# except Exception as e:
# print(asin, '报错::', e)
# semd_ms(asin)
# save_asin_var_data(asin, [], '失败', 'us', [], [], None, id)
else:
print('没有该asin,', asin) def semd_ms(asin):
save_asin_var_data(asin, json.dumps( try:
{"content": "The ASIN you searched for is not part of any variation family"}), '成功') url = 'http://47.112.96.71:8082/selection/sendMessage'
except Exception as e: data = {
print('报错,‘23232323232323232323', e) 'content': '下载 变体数据失败。远程账号电脑 HM 299 421 380',
save_asin_var_data(asin, json.dumps({"content": "下载失败。远程账号电脑检查"}), '失败') 'title': f'账号电脑 {asin} 变体数据 失败',
continue 'account': 'pengyanbing'
}
print(data)
y = requests.post(url=url, data=data, timeout=15)
print(y.content.decode('gbk'))
except:
pass
def redis_get_asin(): def mysql_get_asin():
asin_list = []
random_key_list = []
redis_client = redis.Redis(host='113.100.143.162', port=6379, db=10, password='fG7#vT6kQ1pX')
while True: while True:
try: try:
print('轮询redis 查询,') print('轮询 mysql 查询:', datetime.now().strftime("%m-%d %H:%M:%S"))
for i in range(10): engine_us_mysql = mysql_connect()
# 随机获取一个key spider_state_sql = """select asin,id from asin_variation_family_log where status = '未开始' and length(asin)=10 limit 20 """
random_key = redis_client.randomkey() print('spider_state_sql:', spider_state_sql)
if random_key: df_asin = engine_us_mysql.read_sql(spider_state_sql)
random_key_list.append(random_key) if not df_asin.empty:
# 获取该key对应的value update_time = int(time.time())
value = redis_client.get(random_key) with engine_us_mysql.begin() as conn:
value = value.decode('utf-8') index_tuple = tuple(df_asin['id'])
print('redis取出asin: ', value) if len(index_tuple) == 1:
if value not in asin_list: sql_update = f"""UPDATE asin_variation_family_log a set status='爬取中',update_time='{update_time}' where a.id in ({index_tuple[0]})"""
asin_list.append(value)
else: else:
break sql_update = f"""UPDATE asin_variation_family_log a set status='爬取中',update_time='{update_time}' where a.id in {index_tuple}"""
print('UPDATE_sql:', sql_update)
if asin_list: conn.execute(sql_update)
_asin_lis = list(set(asin_list)) _asin_lis = list(df_asin.asin + '|-|' + df_asin.id.astype("U"))
print("_asin_lis:::",_asin_lis, ) print("_asin_lis:::", _asin_lis, )
print("_asin_lis::: len ", len(_asin_lis)) print("_asin_lis::: len ", len(_asin_lis))
run(_asin_lis) # 传递asin 列表 run_spider(_asin_lis) # 传递asin 列表
asin_list = []
for _key in random_key_list:
print(' 删除redis的asin:', _key)
redis_client.delete(_key) # 删除redis的asin
random_key_list = []
else:
time.sleep(3) time.sleep(3)
continue # break
# redis_client.close() 关闭redis
except Exception as e: except Exception as e:
print('查询redis报错', e) print('查询 mysql_get_asin 报错::', e, f"\n{traceback.format_exc()}")
redis_client.close()
redis_client = redis.Redis(host='192.168.10.224', port=6379, db=10, password='fG7#vT6kQ1pX')
time.sleep(5)
continue
def save_asin_var_data(asin, data_json, spider_value, site_name, parent_asin_list, var_asin_list, error, id):
if parent_asin_list:
parent_asin = ','.join(parent_asin_list)
else:
parent_asin = ""
if len(var_asin_list) == 0:
var_asin_list = "'{}'"
else:
var_asin_list = "'" + '{' + ','.join(var_asin_list) + '}' + "'"
def save_asin_var_data(asin, data_json, spider_value): if data_json:
data_json = data_json.replace('%', '%%').replace("'", "").replace("\'", "")
if error:
error = error.replace('%', '%%').replace("'", "").replace('"', '').replace("\'", "")
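# NOTE (editor): percent signs are doubled and quotes stripped because these values are interpolated
# directly into the f-string UPDATE statements below rather than passed as bound parameters.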
while True:
try:
engine_us_mysql = mysql_connect() engine_us_mysql = mysql_connect()
workflow_everyday_list = [[asin, data_json, spider_value]] update_time = int(time.time())
print('存储数据:', len(workflow_everyday_list)) print(f'更新 {asin} 数据:')
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, with engine_us_mysql.begin() as conn:
columns=['asin', 'asin_var_data', 'spider_value']) if error is None:
engine_us_mysql.to_sql(df_seller_asin_account, 'us_asin_var_info') sql = f"""
UPDATE asin_variation_family_log
SET variation_family='{data_json}', status='{spider_value}', update_time='{update_time}' ,parent_asin='{parent_asin}',variation_asin={var_asin_list}
WHERE id={id} AND site_name='{site_name}'
"""
print('成功',sql)
elif error == '账号电脑退出登录':
sql = f"""
UPDATE asin_variation_family_log SET status='未开始' WHERE id={id} AND site_name='{site_name}'
"""
else:
sql = f"""
UPDATE asin_variation_family_log
SET variation_family='{data_json}', status='{spider_value}', update_time='{update_time}' ,parent_asin='{parent_asin}',variation_asin={var_asin_list},err_msg='{error}'
WHERE id={id} AND site_name='{site_name}'
"""
print('error is not None:: ', sql)
conn.execute(sql)
print(asin, '更新成功')
break
except Exception as e:
print('存储数据报错:', e)
time.sleep(5)
if __name__ == '__main__': if __name__ == '__main__':
redis_get_asin() mysql_get_asin()
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from sqlalchemy import create_engine from secure_db_client import get_remote_engine
import pandas as pd import pandas as pd
import urllib.parse import urllib.parse
import json import json
import traceback import traceback
import time import time
from sqlalchemy.engine import URL
'Product Opportunity Explorer: download BSR category data'
@@ -17,32 +16,16 @@ class One688LoginSpider(object):
yaer = time.strftime('%Y', time.localtime(time.time())) yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{month}" self.y_w = f"{yaer}-{month}"
self.mysql_connect() self.mysql_connect()
def mysql_connect(self): def mysql_connect(self):
if self.site == 'us': self.engine_us_mysql = get_remote_engine(
db = 'selection' site_name='us', # -> database "selection"
else: db_type='mysql', # -> 服务端 alias "mysql"
db = f'selection_{self.site}' )
DB_CONN_DICT = { self.engine_pg = get_remote_engine(
"mysql_port": 3306, site_name=self.site, # -> database "selection"
"mysql_db": db, db_type='postgresql_15_outer', # -> 服务端 alias "mysql"
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database=db,
query={"charset": "utf8mb4"}
) )
self.engine_us_mysql = create_engine( url)
self.engine_pg = create_engine(
f"postgresql+psycopg2://postgres:F9kL2sXe81rZq@113.100.143.162:5432/{db}",
encoding='utf-8')
return self.engine_us_mysql return self.engine_us_mysql
def print_request_finished(self, request): def print_request_finished(self, request):
@@ -59,7 +42,7 @@ class One688LoginSpider(object):
def select_category_json(self): def select_category_json(self):
sql = 'SELECT category_json,id FROM seller_product_opportunity_syn where state=1' sql = 'SELECT category_json,id FROM seller_product_opportunity_syn where state=1'
engine_mysql = self.mysql_connect() engine_mysql = self.mysql_connect()
df_category_json = pd.read_sql(sql, con=engine_mysql) df_category_json = engine_mysql.read_sql(sql)
category_data_list = list(df_category_json['category_json'] + '|=|=|' + df_category_json['id'].astype("U")) category_data_list = list(df_category_json['category_json'] + '|=|=|' + df_category_json['id'].astype("U"))
data_list = [] data_list = []
for i in category_data_list: for i in category_data_list:
@@ -132,8 +115,7 @@ class One688LoginSpider(object):
'minimum_price', 'maximum_price', 'avg_price', 'minimum_price', 'maximum_price', 'avg_price',
'return_rate_t360', 'search_volume_growth_t360', 'return_rate_t360', 'search_volume_growth_t360',
'site', 'date_info', 'search_term']) 'site', 'date_info', 'search_term'])
df_category_data.to_sql('seller_product_opportunity', con=self.engine_pg, if_exists='append', self.engine_pg.to_sql(df_category_data,'seller_product_opportunity', if_exists='append')
index=False)
print('存储成功:', len(category_data_list)) print('存储成功:', len(category_data_list))
with self.engine_us_mysql.begin() as conn: with self.engine_us_mysql.begin() as conn:
sql_update = f"update seller_product_opportunity_syn set state=3 where id={int(data[1])};" sql_update = f"update seller_product_opportunity_syn set state=3 where id={int(data[1])};"
@@ -155,10 +137,10 @@ class One688LoginSpider(object):
[self.site, self.y_w, '商机探测器抓取完成', 3, 'seller_product_opportunity', 'month', [self.site, self.y_w, '商机探测器抓取完成', 3, 'seller_product_opportunity', 'month',
'商机探测器', '是']] '商机探测器', '是']]
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list, df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val', columns=['site_name', 'report_date', 'status', 'status_val',
'table_name', 'report_date', 'page', 'is_end']) 'table_name', 'date_type', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_everyday', con=self.engine_us_mysql, if_exists='append', self.engine_us_mysql.to_sql(df_seller_asin_account,'workflow_everyday', if_exists='append'
index=False) )
def crawl(self, url): def crawl(self, url):
self.page.on("requestfinished", self.print_request_finished) self.page.on("requestfinished", self.print_request_finished)
...
# import sys
# import os
#
# sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
# from threading_spider.db_connectivity import connect_db
# import traceback
# import random
# from lxml import etree
# import json
# # import requests
# from curl_cffi import requests
# import time
# import pymysql
# from urllib.parse import quote
#
# """获取 junglescout bsr分类排名 销量"""
#
# rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700,
# 1800,
# 1900,
# 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700,
# 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500,
# 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300,
# 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100,
# 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000,
# 18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000,
# 85000, 90000, 95000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000, 550000,
# 600000, 650000, 700000, 750000, 800000, 850000, 900000, 950000, 1000000]
#
#
# def send_mes(site_name):
#
# month = time.strftime("%m")
# year = time.strftime("%Y")
# _year_month = f'{year}_{int(month)}'
# db_class = connect_db(site_name)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# update_sql_state = f"""
# UPDATE {site_name}_one_category
# SET STATE=4
# WHERE `name` IN (
# SELECT `name` FROM (
# SELECT `name` FROM {site_name}_one_category WHERE rank=1 AND orders=0 AND `year_month`="{_year_month}"
# ) AS temp_table
# );
# """
# cursor.execute(update_sql_state)
# db.commit()
#
# def get_jl_rank(db_base):
# month = time.strftime("%m")
# year = time.strftime("%Y")
# year_month = f'{year}_{int(month)}'
# while True:
# try:
# db_class = connect_db(db_base)
# print(db_base)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# category_name_sql_select = f"select distinct `name` , orders,category_first_id from {db_base}_one_category where rank=1 and `year_month` ='{year_month}'"
# print(category_name_sql_select)
# cursor.execute(category_name_sql_select)
# category_name_list = cursor.fetchall()
# print(category_name_list,'3444444444')
# update_sql_state1 = f"""UPDATE {db_base}_one_category SET STATE= 1"""
# cursor.execute(update_sql_state1)
# db.commit()
# break
# except:
# time.sleep(20)
# url = "https://www.junglescout.cn/sales-estimator/?gspk=OTQ0&gsxid=joW2dg6ZmAJA&utm_medium=944&utm_source=affiliate"
# h = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'Accept-Encoding': 'gzip, deflate, br',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'Host': 'www.junglescout.cn',
# 'Pragma': 'no-cache',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"',
# 'Sec-Fetch-Dest': 'document',
# 'Sec-Fetch-Mode': 'navigate',
# 'Sec-Fetch-Site': 'none',
# 'Sec-Fetch-User': '?1',
# 'Upgrade-Insecure-Requests': '1',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
# }
# while True:
# try:
#
# resp_html = requests.get(url, impersonate="chrome", headers=h,
# timeout=25, verify=False)
# if resp_html.status_code == 200 or resp_html.status_code == 201:
# break
# else:
# print(resp_html.text)
# print("请求首页报错----重新请求 ")
# time.sleep(2)
# continue
# except:
# time.sleep(5)
# continue
# html = etree.HTML(resp_html.text)
# name_lsit = html.xpath(
# f'//table/tbody/tr[4]/td[2]/div/ul/li[@class="category {db_base}-available"]/span[1]/text()')
# print('获取 js 分类名称:', name_lsit)
# for name in name_lsit:
# if db_base == 'fr':
# name = name.replace(' & ', ' et ')
# if name == 'Camera & Photo':
# name = 'Camera & Photo Products'
# while True:
# try:
# select_name_sql = f"select id from {db_base}_one_category where name='{name}'"
# cursor.execute(select_name_sql)
# name_list = cursor.fetchall()
# if name_list:
# pass
# else:
# print('junglescout 新增分类:', name)
# insert_sql = f'insert into {db_base}_one_category (name, rank, orders)values (%s,%s,%s)'
# cursor.execute(insert_sql, (name, 1, 1))
# db.commit()
# break
# except:
# time.sleep(15)
#
# print(category_name_list)
# for category_name in category_name_list:
# print("历史销量:", category_name['name'], category_name['orders'], category_name['category_first_id'])
# name_rnak_list = []
# Handmade_Products_list = []
# sales = 31
# for i in rank_list:
# if sales == 0:
# break
# token_num = 0
# while True:
# try:
# print(i)
# if db_base == 'fr':
# c_name = category_name['name'].replace(' et ', ' & ')
# else:
# c_name = category_name['name']
# if c_name == 'Camera & Photo Products':
# c_name = 'Camera & Photo'
# num = random.randint(115, 126)
# print(num)
# f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{num}.0.0.0 Safari/537.36'
# headers = {
# "Upgrade-Insecure-Requests": "1",
# "User-Agent": f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{num}.0.0.0 Safari/537.36',
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
# "Sec-Fetch-Site": "none",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-User": "?1",
# "Sec-Fetch-Dest": "document",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "zh-CN,zh;q=0.9",
# "Pragma": "no-cache",
# "Cache-Control": "no-cache"
# }
# params = (
# ('rank', f'{i}'),
# ('category', f'{c_name}'),
# ('store', f'{db_base}'),
# )
# for num in range(2):
# try:
# session = requests.Session()
# session.headers = headers
# session.get('https://d2ogs1k0ty8acr.cloudfront.net/sales', headers=headers,
# params=params, verify=False, timeout=10,impersonate="chrome")
# response = session.get('https://d2ogs1k0ty8acr.cloudfront.net/sales', headers=headers,
# params=params, verify=False, timeout=10,impersonate="chrome")
# print(response.url)
# print(response.text)
# break
# except Exception as e:
# print(e)
# time.sleep(6)
# continue
# response = json.loads(response.text)
# data = response['data']
# if data['sales']:
# sales = int(data['sales'])
# name_rnak_list.append(
# (category_name['name'], i, int(data['sales']), category_name['category_first_id'],
# year_month))
# if category_name['name'] == "Musical Instruments":
# Handmade_Products_list.append(
# ('Handmade Products', i, int(data['sales']), category_name['category_first_id'],
# year_month))
# token_num = 0
# if category_name['orders'] == int(data['sales']):
# print("销量不变")
# sales = 0
# break
# else:
# name_rnak_list.append(
# (category_name['name'], i, 0, category_name['category_first_id'], year_month))
# if category_name['name'] == "Musical Instruments":
# Handmade_Products_list.append(
# ('Handmade Products', i, 0, category_name['category_first_id'], year_month))
# sales = 0
# except Exception as e:
# print('错误', e, f"\n{traceback.format_exc()}")
# time.sleep(random.uniform(20, 60.5))
# token_num += 1
# continue
# time.sleep(random.uniform(15, 40.75))
# break
# print(name_rnak_list)
# while True:
# try:
# db.ping(reconnect=True)
# inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,category_first_id,`year_month`) values (%s, %s, %s, %s, %s)"
# print(inset_sql)
# cursor.executemany(inset_sql, name_rnak_list)
# db.commit()
# break
# except:
# db_class = connect_db(db_base)
# print(db_base)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# time.sleep(20)
# if category_name['name'] == "Musical Instruments" and db_base == 'us':
# while True:
# try:
# inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,category_first_id,`year_month`) values (%s, %s, %s, %s, %s)"
# print(inset_sql)
# cursor.executemany(inset_sql, Handmade_Products_list)
# db.commit()
# break
# except:
# db_class = connect_db(db_base)
# print(db_base)
# cursor_mysql_db, db = db_class.mysql_db() # mysql
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
# time.sleep(20)
# update_sql_state = f"""
# UPDATE {db_base}_one_category
# SET STATE=4
# WHERE `name` IN (
# SELECT `name` FROM (
# SELECT `name` FROM {db_base}_one_category WHERE rank=1 AND orders=0 AND `year_month`="{year_month}"
# ) AS temp_table
# );
# """
# cursor.execute(update_sql_state)
# db.commit()
# cursor.close()
# db.close()
#
#
# if __name__ == '__main__':
# site = 'us'
# get_jl_rank(site)
# send_mes(site)
import sys import sys
import os import os
...@@ -271,7 +8,7 @@ from lxml import etree ...@@ -271,7 +8,7 @@ from lxml import etree
from threading_spider.db_connectivity import connect_db from threading_spider.db_connectivity import connect_db
import json import json
import time import time
import pymysql from utils.secure_db_client import get_remote_engine
import random import random
...@@ -292,34 +29,54 @@ def get_cid(): ...@@ -292,34 +29,54 @@ def get_cid():
print(data_category) print(data_category)
save_site_category(json.loads(data_category)) save_site_category(json.loads(data_category))
def mysql_connect(site='us'):
engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
db_type='mysql', # -> 服务端 alias "mysql"
)
return engine_mysql
def db_cursor_connect_update(sql, site):
for i in range(3):
try:
engine_us_mysql = mysql_connect(site=site)
print('更新sql:', sql)
with engine_us_mysql.begin() as conn:
conn.execute(sql)
break
        except Exception as e:
            print(site, 'db_cursor_connect 报错:', e, sql)
def db_cursor_connect_msyql_read(site,select_state1_sql):
for i in range(3):
try:
engine_mysql = mysql_connect(site=site)
df = engine_mysql.read_sql(select_state1_sql)
return df
except Exception as e:
import traceback
traceback.print_exc() # ★ 打印完整栈到终端
print(e, 'db_cursor_connect_msyql_read 报错:', select_state1_sql)
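# A safer variant of the two retry helpers above is sketched here for reference (illustrative,
# assuming the same mysql_connect() interface): it logs the attempt number, sleeps between
# attempts, and returns None explicitly once all retries are exhausted.
# def read_sql_with_retry(site, sql, retries=3, wait=10):
#     for attempt in range(retries):
#         try:
#             return mysql_connect(site=site).read_sql(sql)
#         except Exception as exc:
#             print(f'read_sql_with_retry attempt {attempt + 1} failed:', exc)
#             time.sleep(wait)
#     return None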
def junglescout_spider(db_base): def junglescout_spider(db_base):
month = time.strftime("%m") month = time.strftime("%m")
year = time.strftime("%Y") year = time.strftime("%Y")
year_month = f'{year}_{int(month)}' year_month = f'{year}_{int(month)}'
db_class = connect_db(db_base)
cursor_mysql_db, connect_mysql_db = db_class.mysql_db() # mysql
cursor_us, connect_us = db_class.us_mysql_db() # us站点
cursor_us = connect_us.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1" category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select) print(category_name_sql_select)
cursor_us.execute(category_name_sql_select) category_name_list_df = db_cursor_connect_msyql_read('us',category_name_sql_select)
category_name_list = cursor_us.fetchall() print(category_name_list_df)
print('category_name_list:',category_name_list)
for category_name in category_name_list: category_name_list = list(category_name_list_df['name']+'|-|==|'+category_name_list_df['c_id'])
db_class = connect_db(db_base) for category_name_id in category_name_list:
print(db_base) print(category_name_id, '2323232323')
cursor_mysql_db, db = db_class.mysql_db() # mysql c_name = category_name_id.split('|-|==|')[0]
db_class_us = connect_db('us') c_id = category_name_id.split('|-|==|')[1]
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql print(c_name, c_id)
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
print(category_name['name'], category_name['c_id'])
name_rnak_list = [] name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'" up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
print('更新状态:', up_sql) db_cursor_connect_update(up_sql,'us')
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600, 1600,
1700, 1700,
...@@ -354,16 +111,16 @@ def junglescout_spider(db_base): ...@@ -354,16 +111,16 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8", "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754561346; HMACCOUNT=F022519658636435; _ga=GA1.1.1814436837.1754561346; MEIQIA_TRACK_ID=30xFWMfHnmUko4gRxOqdrJNPOcY; MEIQIA_VISIT_ID=30xFWMuukIH8mg0Y3QtIVUHsOlv; ecookie=6fLTD5dFES0wy5bJ_CN; 5d6b3550f67a0d98a3f2=300e7c0221464bf96a29eee60c456f00; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=395de153390ead6f40f8e8ab7a472e28; _gcl_au=1.1.174716114.1754561346.1679700600.1754561355.1754561355; current_guest=qtmXt8RNChOi_250807-180618; rank-login-user=55981645716j2gNzbXWw3NxEgY4QumA2+nJmFK4cRNMQNZD9W4ScvveWtruw9iXoAChaMVh09V; rank-login-user-info="eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjU1OTgxNjQ1NzE2ajJnTnpiWFd3M054RWdZNFF1bUEyK25KbUZLNGNSTk1RTlpEOVc0U2N2dmVXdHJ1dzlpWG9BQ2hhTVZoMDlWIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJOQlhkbHU4eTlSZXdwS2doOWpJVzJBIiwiaWF0IjoxNzU0NTYxMzU1LCJleHAiOjE3NTQ2NDc3NTUsIm5iZiI6MTc1NDU2MTI5NSwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.csgALwYW8BkMpPMNB_LWfTmx9J4lYpLbqZW95ikqbz02AjJLMkoR8SmYfs_l5Y8_kr91FN-mMNG0_uG6LlMZg1_I_OWTX1GIVEqixiM4LnXO31VMK3yPCTEdFAUNosLKmXaLBsAkyovg82onGSOX3Sp8yy3QzCwREZc0TEVAONK7vBp0fFheyZNwejzyBfw_b7NPkFkTfvwzZo25QaHJyfkh0hxYgwtoDPSS_FmKrkpyh_zjbk7QIpJhY98k3ElI2OjdeAcE0ublxLemPI8GCwvj_V26Ob3mJ0WnvwyM5e2XBdCXF3Tn1OjOWvNP_fFr9TKDHguKLfZZzLOIO9gmkQ; ao_lo_to_n="55981645716j2gNzbXWw3NxEgY4QumA0yFbZjZZBIPjXVnHzCoK/gvYEiwOtDSpCKptN3oC6H7pg4af19gw3X2vJfRDGlTzgAJp8Uby054LbQLjTr8OLk="; rank-guest-user=6598164571W4w7830gIdYfyJ4dBpV8rZZnQ5nxne/EL2NJNXxISww1iqfwc5k9B1MBi3+pbbvB; _ga_38NCVF2XST=GS2.1.s1754561346$o1$g1$t1754561361$j45$l0$h2087898121; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754561362; _ga_CN0F80S6GL=GS2.1.s1754561347$o1$g1$t1754561362$j45$l0$h0; JSESSIONID=012AF629221AF9FF44705008C9CE11D7', 'Cookie':'_ga=GA1.1.460823715.1761964155; _gcl_au=1.1.1179274784.1761964155; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1761964155; HMACCOUNT=B4AF3F9F7A9D6EC1; ecookie=Dze8cmDaWQgl62by_CN; cefb279b040e878c5f29=3df3541e6a5558a1721e067eca0b7599; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=34rGDtW8dkHrHYKAd1YqneyES16; MEIQIA_VISIT_ID=34rGDvmHZCXsyI9TcknmSs0VUgF; current_guest=Q5ta0ho0plze_251101-107638; _gaf_fp=71220b6380421e1c3114927822a0491d; rank-guest-user=4415412671XrN9Zk+EL9uIxING7/uXAkz1zoQytfQ4xehrp1wmpmp0tq0CKPMciyLt+xiapPpr; rank-login-user=4415412671XrN9Zk+EL9uIxING7/uXAtdtFXnuDWfcyj/blj6W2ZWpWUeF9+7WsIFXBV6TrXmy; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjQ0MTU0MTI2NzFYck45WmsrRUw5dUl4SU5HNy91WEF0ZHRGWG51RFdmY3lqL2JsajZXMlpXcFdVZUY5KzdXc0lGWEJWNlRyWG15In0="; 
Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJfUWNIdGFKc1I3Xy04czRXcUF4UFpnIiwiaWF0IjoxNzYyMDg3NTQ0LCJleHAiOjE3NjIxNzM5NDQsIm5iZiI6MTc2MjA4NzQ4NCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.bA22TL7V1Ojva0xFsPl_1b---9IabSoJXdkWLxhspamEfSx8eLf-sv2VZz6fNqLbZI_ZXb9nBfdCbM0S2yzvElDeC9laJWi6Y_Cz5ywZvWPkkSl5Wmjal5Nso33UeoMffiBkjDkwIN6uIk-726zea76m7xrJmjQbN2wet_fzW04U4RbYPfCIam0eEvXQjhMAuYPoihIcF-LocsQ3Qr-m3xVaWD6CxxTC30rt4ZfD63kRGjrVa2RfgqVeBVS5nMwBF0PWEYgRUN2mB9jyDfnG472TNfxLhXIGPUTaoMtnaxQoRtbcENuapbpIZCpCruq1SuMNdqK3oxtdnUij6yiXEA; ao_lo_to_n="4415412671XrN9Zk+EL9uIxING7/uXAmjk9eVYRzsag6V6ttkMQIH2Lh3Ah2vwuQRDfzmyINXazLLen51hoAgbtysMQkarAmDtVJPvrGJg/tasB7+3bQc="; JSESSIONID=2FD41936F77140471FC8EC556826B071; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1762087559; _ga_CN0F80S6GL=GS2.1.s1762087538$o2$g1$t1762087559$j39$l0$h0; _ga_38NCVF2XST=GS2.1.s1762087538$o2$g1$t1762087565$j33$l0$h205427331',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
} }
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json" url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = { data = {
"station": "US", "station": "DE",
"cid": category_name['c_id'], # 分类id "cid": c_id, # 分类id
"bsr": f"{i}" # 排名 "bsr": f"{i}" # 排名
} }
print(category_name['name'], '请求参数 data::', data) print(c_name, '请求参数 data::', data)
for i1 in range(3): for i1 in range(3):
try: try:
response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300) response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300)
...@@ -383,60 +140,40 @@ def junglescout_spider(db_base): ...@@ -383,60 +140,40 @@ def junglescout_spider(db_base):
# 没拿到数据,跳出循环 # 没拿到数据,跳出循环
break break
if est == 0.0: if est == 0.0:
print(f"{category_name['name']} 排名{i}:销量 0,跳出循环。") print(f"{c_name} 排名{i}:销量 0,跳出循环。")
break break
# 2) 既然不会有 0.3、0.99 这种情况,剩下的 est 都是 ≥ 1 # 2) 既然不会有 0.3、0.99 这种情况,剩下的 est 都是 ≥ 1
print(type(est)) print(type(est))
print('获取数据:', category_name['name'], i, est, year_month) print('获取数据:', c_name, i, est, year_month)
sales = int(est) sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month)) name_rnak_list.append((c_name, i, sales, year_month))
time.sleep(random.uniform(20, 65.75)) time.sleep(random.uniform(20, 45.75))
# break # break
for i in range(4): for i in range(4):
try: try:
inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)" inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
print(inset_sql) print(inset_sql)
cursor_mysql_db.executemany(inset_sql, name_rnak_list) engine_db_msyql = mysql_connect(site=db_base)
db.commit() with engine_db_msyql.begin() as conn:
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{category_name['c_id']}'" conn.execute(
f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
name_rnak_list)
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
print('更新状态:', up_sql) print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql) db_cursor_connect_update(up_sql,'us')
db_us.commit()
break break
except Exception as e: except Exception as e:
print('存储失败:', e) print('存储失败:', e)
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20) time.sleep(20)
print('当前完成。获取下一个分类销量') print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(90, 200.5)) time.sleep(random.uniform(50, 120.5))
def save_site_category(site_bsr_dict=None):
db_class = connect_db('us')
cursor_us_mysql_db, db = db_class.us_mysql_db() # mysql
for i in site_bsr_dict.keys():
print(i)
delete_sql = f'DELETE from all_site_category where `name` ="{i}"' # 删除旧分类
print(delete_sql)
cursor_us_mysql_db.execute(delete_sql)
db.commit()
site_category_list = site_bsr_dict[i]
for site_category in site_category_list:
insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
cursor_us_mysql_db.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
db.commit()
db_class = connect_db(i)
cursor_site_mysql_db, db = db_class.mysql_db() # mysql
def run(): def run():
# get_cid() junglescout_spider('de')
junglescout_spider('us')
if __name__ == '__main__': if __name__ == '__main__':
run() run()
\ No newline at end of file
...@@ -670,6 +670,9 @@ class nsr_catgory(BaseUtils): ...@@ -670,6 +670,9 @@ class nsr_catgory(BaseUtils):
en_name_id_list.append(en_name_id[0]) en_name_id_list.append(en_name_id[0])
id_tuple = tuple(en_name_id_list) id_tuple = tuple(en_name_id_list)
print(len(id_tuple)) print(len(id_tuple))
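                        # a one-element tuple would render as ('x',) with a trailing comma, which is not
                        # valid SQL inside "IN (...)", so the single-id case is formatted separately below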
if len(id_tuple) == 1:
update_sql = f"""UPDATE {self.site_name}_new_releases set one_category_id={id[0]} where id in ('{id_tuple[0]}')"""
else:
update_sql = f'update {self.site_name}_new_releases set one_category_id={id[0]} where id in {id_tuple}' update_sql = f'update {self.site_name}_new_releases set one_category_id={id[0]} where id in {id_tuple}'
self.db_cursor_connect_update(update_sql, self.site_name) self.db_cursor_connect_update(update_sql, self.site_name)
except: except:
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import curl_cffi
from lxml import etree
# from threading_spider.db_connectivity import connect_db
from DrissionPage import ChromiumPage, ChromiumOptions
from DrissionPage.common import Keys
import json
import time
from utils.secure_db_client import get_remote_engine
import random
num_list = []
# Fetch every site's BSR top-level category names and category ids, and store them in the us-site table
def get_cid():
url = 'https://www.sellersprite.com/v2/tools/sales-estimator'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}
resp = curl_cffi.get(url, headers=headers, impersonate="chrome")
html = etree.HTML(resp.text)
data_category = html.xpath("//script[@id='data-category']/text()")[0]
print(data_category)
def mysql_connect(site='us'):
engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
db_type='mysql', # -> 服务端 alias "mysql"
)
return engine_mysql
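# Note on the remote engine (descriptive, based on how it is used in this module and its siblings):
# the object returned by get_remote_engine is expected to expose read_sql(sql) returning a DataFrame,
# begin() as a transaction context manager whose connection accepts execute(sql, params), and
# to_sql(df, table_name, if_exists=...) for bulk writes.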
def db_cursor_connect_update(sql, site):
for i in range(3):
try:
engine_us_mysql = mysql_connect(site=site)
print('更新sql:', sql)
with engine_us_mysql.begin() as conn:
conn.execute(sql)
break
        except Exception as e:
            print(site, 'db_cursor_connect 报错:', e, sql)
def db_cursor_connect_msyql_read(site, select_state1_sql):
for i in range(3):
try:
engine_mysql = mysql_connect(site=site)
df = engine_mysql.read_sql(select_state1_sql)
return df
except Exception as e:
import traceback
traceback.print_exc() # ★ 打印完整栈到终端
print(e, 'db_cursor_connect_msyql_read 报错:', select_state1_sql)
def sellersprite_spider(db_base):
month = time.strftime("%m")
year = time.strftime("%Y")
year_month = f'{year}_{int(month)}'
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select)
category_name_list_df = db_cursor_connect_msyql_read('us', category_name_sql_select)
print(category_name_list_df)
category_name_list = list(category_name_list_df['name'] + '|-|==|' + category_name_list_df['c_id'])
    cookies_dict = sellersprite_login(num=1)  # log in through browser automation; num selects which account in user_list to use
for category_name_id in category_name_list:
print(category_name_id, '2323232323')
c_name = category_name_id.split('|-|==|')[0]
c_id = category_name_id.split('|-|==|')[1]
print(c_name, c_id)
name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
db_cursor_connect_update(up_sql, 'us')
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600,
1700,
1800,
1900,
2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500,
3600,
3700,
3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300,
5400,
5500,
5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100,
7200,
7300,
7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900,
9000,
9100,
9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
17000,
18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000
]
for i in rank_list:
headers = {
"Referer": "https://www.sellersprite.com/v2/tools/sales-estimator",
"Origin": "https://www.sellersprite.com",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": db_base.upper(),
"cid": c_id, # 分类id
"bsr": f"{i}" # 排名
}
print(c_name, '请求参数 data::', data)
for i1 in range(3):
try:
response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome", timeout=300,
cookies=cookies_dict)
print(response.url)
# print(response.text)
response = json.loads(response.text)
break
except:
time.sleep(random.uniform(15, 30.75))
try:
response_data = response['data']
print('code::', response['code'])
print('message::', response['message'])
print('estMonSales::', response_data['estMonSales'])
est = response_data.get('estMonSales')
if est is None:
# 没拿到数据,跳出循环
break
if est == 0.0:
print(f"{c_name} 排名{i}:销量 0,跳出循环。")
break
print(type(est))
print('获取数据:', c_name, i, est, year_month)
sales = int(est)
name_rnak_list.append((c_name, i, sales, year_month))
time.sleep(random.uniform(20, 45.75))
# break
except Exception as e:
                print('failed to parse the sales response:', e)
                time.sleep(10)  # the request/parse failed, so the current account is probably blocked: switch to the next login account
                cookies_dict = sellersprite_login(num=0)
        for attempt in range(4):
            try:
                insert_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
                print(insert_sql)
                engine_db_mysql = mysql_connect(site=db_base)
                with engine_db_mysql.begin() as conn:
                    conn.execute(insert_sql, name_rnak_list)
                up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
                print('更新状态:', up_sql)
                db_cursor_connect_update(up_sql, 'us')
                break
            except Exception as e:
                print('存储失败:', e)
                time.sleep(20)
print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(50, 120.5))
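# sellersprite_login (below) drives a local Chrome instance through DrissionPage to complete the
# SellerSprite login, then exports the browser cookies as a plain dict; sellersprite_spider passes
# that dict to its curl_cffi POSTs so the bsr.json requests are authenticated. The global num_list
# acts as a crude account-rotation counter: after repeated calls it switches accounts and, once all
# of them have been tried, sleeps four hours before starting over.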
def sellersprite_login(num=2):
global num_list
num_list.append(num)
print('num_list',num_list)
if len(num_list) > 2:
num = 2
if len(num_list)>4:
num = 1
if len(num_list)>5:
print('睡眠')
num_list = []
num = 0
time.sleep(14400)
user_list = [['18307967347', 'Aa123456.'], ['qq16531218653@163.com', 'qq16531218653'], ['15368051270', '123456']]
print('登录账号:', user_list[num])
    # Configure the Chrome browser: DrissionPage attaches to the local debugging port 9333
    chrome_options = ChromiumOptions()
    chrome_options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
    chrome_options.set_local_port(9333)  # set the Chrome debugging port
page_chrome = ChromiumPage(addr_or_opts=chrome_options)
print(f"Chrome 浏览器运行在端口: {9333}")
page_chrome.get("https://www.sellersprite.com/cn/w/user/login")
page_chrome.set.window.max()
page_chrome.set.cookies.clear()
time.sleep(random.randint(1, 3))
page_chrome.refresh()
# 等待页面初始加载
time.sleep(random.randint(1, 3))
page_chrome.get("https://www.sellersprite.com/cn/w/user/login")
time.sleep(random.randint(6, 10))
export_orders = page_chrome.ele('xpath://a[text()="账号登录"]', timeout=10)
export_orders.click()
print('点击账号登录')
time.sleep(random.randint(5, 10))
email_input = page_chrome.ele('xpath://div[@id="form_signin_password"]//input[@name="email"]')
    email_input.clear()  # clear any pre-filled text
    email_input.input(user_list[num][0])  # type the login account (email / phone)
print("已输入账号到邮箱输入框")
time.sleep(random.randint(5, 10))
password_input = page_chrome.ele('xpath://div[@id="form_signin_password"]//input[@type="password"]')
password_input.clear() # 清除任何预填充的内容
password_input.input(user_list[num][1])
time.sleep(random.randint(5, 10))
page_chrome.actions.type(Keys.ENTER)
time.sleep(random.randint(5, 10))
page_chrome.get('https://www.sellersprite.com/v2/tools/sales-estimator')
time.sleep(random.randint(5, 10))
original_cookies_list = page_chrome.cookies()
# 将 cookies 列表转换为字典
original_cookie_dict = {cookie['name']: cookie['value'] for cookie in original_cookies_list}
print('original_cookie_dict::', original_cookie_dict)
page_chrome.close()
return original_cookie_dict
def run():
    for site in ['uk', 'de']:
        sellersprite_spider(site)
if __name__ == '__main__':
run()
# import sys
# import os
#
# sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
# import curl_cffi
# from lxml import etree
# import json
# import time
# import random
# from utils.secure_db_client import get_remote_engine
#
#
# def db_engine(site_name, db_type):
#
# """
# "mysql": "mysql", # 阿里云mysql
# "postgresql_14": "postgresql_14", # pg14爬虫库-内网
# "postgresql_14_outer": "postgresql_14_outer", # pg14爬虫库-外网
# "postgresql_15": "postgresql_15", # pg15正式库-内网
# "postgresql_15_outer": "postgresql_15_outer", # pg15正式库-外网
# "postgresql_cluster": "postgresql_cluster", # pg集群-内网
# "postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
# "doris": "doris", # doris集群-内网
# """
# engine = get_remote_engine(
# site_name=site_name, # -> database "selection"
# db_type=db_type, # -> 服务端 alias "mysql"
# )
# return engine
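# # Usage sketch (illustrative, assuming the aliases listed above are configured on the server):
# #     engine = db_engine('us', 'postgresql_14')
# #     df = engine.read_sql('SELECT 1')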
#
# def db_cursor_connect_update(sql):
# for i in range(3):
# try:
# engine_us_mysql = get_remote_engine('us','mysql')
# print('执行sql:', sql)
# with engine_us_mysql.begin() as conn:
# conn.execute(sql)
# break
# except:
# print('db_cursor_connect 报错:', sql)
# def get_cid():
# # 获取所有站点的bsr 大类名称 和 分类id,存储到us站点
# url = 'https://www.sellersprite.com/v2/tools/sales-estimator'
# headers = {
#
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
# "Accept-Encoding": "gzip, deflate, br, zstd",
# "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
# "Cache-Control": "no-cache",
# "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
# }
# resp = curl_cffi.get(url, headers=headers, impersonate="chrome")
# html = etree.HTML(resp.text)
# data_category = html.xpath("//script[@id='data-category']/text()")[0]
# print(data_category)
# save_site_category(json.loads(data_category))
#
#
# def junglescout_spider(db_base):
# month = time.strftime("%m")
# year = time.strftime("%Y")
# year_month = f'{year}_{int(month)}'
# category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
# print(category_name_sql_select)
# _engine = db_engine('us','mysql')
# df_ = _engine.read_sql(category_name_sql_select)
# category_name_list = list(df_.name + '|==|--|'+df_.c_id)
# print('category_name_list:',category_name_list)
# for category_name_cid in category_name_list:
# category_name = category_name_cid.split('|==|--|')[0]
# c_id = category_name_cid.split('|==|--|')[1]
# print(category_name, c_id)
# name_rnak_list = []
# up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
# print('更新状态:', up_sql)
# db_cursor_connect_update(up_sql)
# rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
# 1600,
# 1700,
# 1800,
# 1900,
# 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500,
# 3600,
# 3700,
# 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300,
# 5400,
# 5500,
# 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100,
# 7200,
# 7300,
# 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900,
# 9000,
# 9100,
# 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
# 17000,
# 18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000
# ]
# for i in rank_list:
# headers = {
# "Referer": "https://www.sellersprite.com/v2/tools/sales-estimator",
# "Origin":"https://www.sellersprite.com",
# "Accept": "application/json, text/javascript, */*; q=0.01",
# "Accept-Encoding": "gzip, deflate, br, zstd",
# "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
# "Cache-Control": "no-cache",
# 'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754980411; HMACCOUNT=470336FBA166E984; _ga=GA1.1.369458666.1754980412; _gcl_au=1.1.264141837.1754980412; MEIQIA_TRACK_ID=31AwurWIdtN2Ob4FQ2QW7kcpB0x; MEIQIA_VISIT_ID=31AwuwdPS1Vv4z7uSPqpc0Gj1ce; ecookie=0oXdazBemDJwQj8M_CN; 5d6b3550f67a0d98a3f2=300e7c0221464bf96a29eee60c456f00; _fp=65dbbe41a37f8f9fbe702eba96328267; current_guest=WEMjtTq1tsth_250812-144558; _gaf_fp=59df4b0c2b58ce924ed353a3d4aff048; rank-guest-user=6308305571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9ligCErogWOBlL8kpVxO2TpkM; rank-login-user=6308305571KK6FnhfedvWg9tSSyk3xj2GRIc/8HSm4vuPYVHI5vKIJMH4fZg1mmDOwxfDtgQRc; rank-login-user-info=eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjYzMDgzMDU1NzFLSzZGbmhmZWR2V2c5dFNTeWszeGoyR1JJYy84SFNtNHZ1UFlWSEk1dktJSk1INGZaZzFtbURPd3hmRHRnUVJjIn0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJ4RTZsWDJPbDdhWEphOVRjZVdfbER3IiwiaWF0IjoxNzU0OTgwNDM2LCJleHAiOjE3NTUwNjY4MzYsIm5iZiI6MTc1NDk4MDM3Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.ijVJw3SbLMEwdgwRIK-WviTFwmhYpp5_Kme31vu9V1HdiBLuyNjCwO97G6c5Pjlpio4jJjVdmaWXjB0rgAfpZA0XG3eui7bENUjqeOnAlBseIVfPBUjDWp9vhgV8uGGtpkD5I8wBAu26KSyE6tLuE4ML3mGMaWuNCPc9NpiLszyIaGmp0FkCU761-PmV0K9mNDKPQmUSlCOqHsNn5mscdfWfIaUaYdlAKEh0ojJoIOrDnFK3hLy9qWQ4W2ChgI8HQUU4Y48QwZCXsTn8fvZ6cNstRNuTjeT2Iq16hzdpWyuVzBgFe2YkwRYBdvi4plyPZkauZtb6RRkoi6oNA1qCXA; ao_lo_to_n=6308305571KK6FnhfedvWg9tSSyk3xjw9sOVkTMvBIw9HsBCFNE4fph+5i+ZmhNt9gtZL7px6VG01jHqLt1SVP6xii5hleGt0VSRpt/FdjNQpaxROiyf0=; JSESSIONID=57A66ED8AB79C6F6E5B12EC4414C0E00; _ga_38NCVF2XST=GS2.1.s1754980411$o1$g1$t1754980455$j16$l0$h797116043; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754980455; _ga_CN0F80S6GL=GS2.1.s1754980412$o1$g1$t1754980455$j17$l0$h0',
# "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# }
# url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
# data = {
# "station": "UK",
# "cid": c_id, # 分类id
# "bsr": f"{i}" # 排名
# }
# print(category_name, '请求参数 data::', data)
# for i1 in range(3):
# try:
# response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300)
# print(response.url)
# # print(response.text)
# response = json.loads(response.text)
# break
# except:
# time.sleep(random.uniform(15, 35.75))
#
# response_data = response['data']
# print('code::', response['code'])
# print('message::', response['message'])
# print('estMonSales::', response_data['estMonSales'])
# est = response_data.get('estMonSales')
# if est is None:
# # 没拿到数据,跳出循环
# break
# if est == 0.0:
# print(f"{category_name} 排名{i}:销量 0,跳出循环。")
# break
#
# # 2) 既然不会有 0.3、0.99 这种情况,剩下的 est 都是 ≥ 1
# print(type(est))
# print('获取数据:', category_name, i, est, year_month)
# sales = int(est)
# name_rnak_list.append((category_name, i, sales, year_month))
# time.sleep(random.uniform(30, 95.75))
# # break
# for i in range(4):
# try:
# inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
# print(inset_sql)
# engine_mysql = db_engine(db_base, 'mysql')
# with engine_mysql.begin() as conn:
# conn.execute(
# f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
# name_rnak_list)
# up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
# print('更新状态:', up_sql)
# db_cursor_connect_update(up_sql)
# break
# except Exception as e:
# print('存储失败:',e)
# _engine = db_engine('us','mysql')
#
# print('当前完成。获取下一个分类销量')
# time.sleep(random.uniform(120, 240.5))
#
#
# def save_site_category(site_bsr_dict=None):
# engine_mysql = db_engine('us', 'mysql')
# for i in site_bsr_dict.keys():
# print(i)
# delete_sql = f'DELETE from all_site_category where `name` ="{i}"' # 删除旧分类
# db_cursor_connect_update(delete_sql)
# site_category_list = site_bsr_dict[i]
# with engine_mysql.begin() as conn:
# for site_category in site_category_list:
# insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
# conn.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
# def run():
# # get_cid()
# junglescout_spider('uk')
#
#
# if __name__ == '__main__':
# run()
import sys import sys
import os import os
...@@ -7,7 +196,7 @@ from lxml import etree ...@@ -7,7 +196,7 @@ from lxml import etree
from threading_spider.db_connectivity import connect_db from threading_spider.db_connectivity import connect_db
import json import json
import time import time
import pymysql from utils.secure_db_client import get_remote_engine
import random import random
...@@ -26,36 +215,55 @@ def get_cid(): ...@@ -26,36 +215,55 @@ def get_cid():
html = etree.HTML(resp.text) html = etree.HTML(resp.text)
data_category = html.xpath("//script[@id='data-category']/text()")[0] data_category = html.xpath("//script[@id='data-category']/text()")[0]
print(data_category) print(data_category)
save_site_category(json.loads(data_category))
def mysql_connect(site='us'):
engine_mysql = get_remote_engine(
site_name=site, # -> database "selection"
db_type='mysql', # -> 服务端 alias "mysql"
)
return engine_mysql
def db_cursor_connect_update(sql, site):
for i in range(3):
try:
engine_us_mysql = mysql_connect(site=site)
print('更新sql:', sql)
with engine_us_mysql.begin() as conn:
conn.execute(sql)
break
        except Exception as e:
            print(site, 'db_cursor_connect 报错:', e, sql)
def db_cursor_connect_msyql_read(site,select_state1_sql):
for i in range(3):
try:
engine_mysql = mysql_connect(site=site)
df = engine_mysql.read_sql(select_state1_sql)
return df
except Exception as e:
import traceback
traceback.print_exc() # ★ 打印完整栈到终端
print(e, 'db_cursor_connect_msyql_read 报错:', select_state1_sql)
def junglescout_spider(db_base): def junglescout_spider(db_base):
month = time.strftime("%m") month = time.strftime("%m")
year = time.strftime("%Y") year = time.strftime("%Y")
year_month = f'{year}_{int(month)}' year_month = f'{year}_{int(month)}'
db_class = connect_db(db_base)
cursor_mysql_db, connect_mysql_db = db_class.mysql_db() # mysql
cursor_us, connect_us = db_class.us_mysql_db() # us站点
cursor_us = connect_us.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1" category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select) print(category_name_sql_select)
cursor_us.execute(category_name_sql_select) category_name_list_df = db_cursor_connect_msyql_read('us',category_name_sql_select)
category_name_list = cursor_us.fetchall() print(category_name_list_df)
print('category_name_list:',category_name_list)
for category_name in category_name_list: category_name_list = list(category_name_list_df['name']+'|-|==|'+category_name_list_df['c_id'])
db_class = connect_db(db_base) for category_name_id in category_name_list:
print(db_base) print(category_name_id, '2323232323')
cursor_mysql_db, db = db_class.mysql_db() # mysql c_name = category_name_id.split('|-|==|')[0]
db_class_us = connect_db('us') c_id = category_name_id.split('|-|==|')[1]
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql print(c_name, c_id)
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
print(category_name['name'], category_name['c_id'])
name_rnak_list = [] name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'" up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{c_id}'"
print('更新状态:', up_sql) db_cursor_connect_update(up_sql,'us')
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600, 1600,
1700, 1700,
...@@ -76,6 +284,11 @@ def junglescout_spider(db_base): ...@@ -76,6 +284,11 @@ def junglescout_spider(db_base):
9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 11000, 12000, 13000, 14000, 15000, 16000,
17000, 17000,
18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000 18000, 19000, 20000, 25000, 30000, 35000, 40000, 45000, 50000
# 55000, 60000, 65000, 70000, 75000,
# 80000,
# 85000, 90000, 95000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000,
# 550000,
# 600000, 650000, 700000, 750000, 800000, 850000, 900000, 950000, 1000000
] ]
for i in rank_list: for i in rank_list:
headers = { headers = {
...@@ -85,16 +298,16 @@ def junglescout_spider(db_base): ...@@ -85,16 +298,16 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8", "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; a40ac813159995d028ba=3d9b7c15f5787387e62acd734f598f23; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751973053,1752031904,1752460043,1752653436; HMACCOUNT=800EBCCFB4C6BBFB; rank-guest-user=8301172571YFpPM/DhYDVQzRAgRu7tcQTFTi48nSnOk/TNMkep2gdtR77QXyNfDPmFlYbdSsdL; rank-login-user=8301172571YFpPM/DhYDVQzRAgRu7tcWqD2KCbe1WiKcOarfxTCdls3AJ9lNFy+VA8a+RTm195; rank-login-user-info=eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjgzMDExNzI1NzFZRnBQTS9EaFlEVlF6UkFnUnU3dGNXcUQyS0NiZTFXaUtjT2FyZnhUQ2RsczNBSjlsTkZ5K1ZBOGErUlRtMTk1In0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJLcVRRV2RPbVNNcjlKTU1qYTdXRjFRIiwiaWF0IjoxNzUyNjUzNDM4LCJleHAiOjE3NTI3Mzk4MzgsIm5iZiI6MTc1MjY1MzM3OCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.caY2QxTbtUVg7CQXvNJcmVo1YU0TGy3AD01dIddF76PHjYbbFh5a8zZAdAXnAKM1wNcs39d1MM8Wa-uoXHiitqDlCZsWyF9aXzco9L4wn-yU4xlMYsf7VoquZI6bxaMT2TNeX6vgQBod-NeXHYFpZQWdrH5sfZHQypkpRINb_o1QwaWvZrjufj1UwYdiypryBxTDyCuLfD4djU0PLMRXvifY6Ef86VNjAlsY8gFqDdHiVLixR2GWGdKRtoG74Ak5DX2eMDT6ak-OMrWYOaikthxIXiqdADTq2tvUCmjO4pE0oYnWhSEx9-UABo7jxJ0v_Af8B6AVu7ccC0NUUvWBMA; ao_lo_to_n=8301172571YFpPM/DhYDVQzRAgRu7tca/7vKUOAtDW4w4LhsAzrvlsqk8xCK+opMY27DGtrDKlwUwhqg///+C6QOw12iRKNIq9mCOV5+ORmOA+PwqisF4=; _gaf_fp=0f3f9e0c791b5513d38aa715d0624aab; _gcl_au=1.1.420472597.1749119222.448034571.1752653439.1752653439; JSESSIONID=0F617D64E2FD6DD92F3BB10935E3C846; _ga_38NCVF2XST=GS2.1.s1752653436$o51$g1$t1752653450$j46$l0$h366949276; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752653451; _ga_CN0F80S6GL=GS2.1.s1752653437$o50$g1$t1752653451$j46$l0$h0', 'Cookie': '_gcl_au=1.1.1447326361.1758264089; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1758264089; HMACCOUNT=7F9DD13A227E1D2B; _ga=GA1.1.991212207.1758264089; MEIQIA_TRACK_ID=32uIa4AhuxRLTcHXxm0PIk6Neqn; MEIQIA_VISIT_ID=32uIa1sK6QRdxkjk2DRJqfS7UaG; ecookie=qmC7o7xcw0S1xZD9_CN; d0c5b9780b50a33ad822=dc2936bc8106c9a8ee1714818e7e7a72; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=8dc236c6b0cc21a83974c129be61cfc2; current_guest=BWQNYACe6Zku_250919-145796; rank-login-user=6961238571i+0e7rddchJDh/F/NHVdW5HCX9QRmjEXsHOplEmuEXlp0Qwv/G3CXc4Z8WBR8qa8; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjY5NjEyMzg1NzFpKzBlN3JkZGNoSkRoL0YvTkhWZFc1SENYOVFSbWpFWHNIT3BsRW11RVhscDBRd3YvRzNDWGM0WjhXQlI4cWE4In0="; 
Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJhYnA1OTJtVWFjeElMQm96TkRVTEFBIiwiaWF0IjoxNzU4MjY0MDk2LCJleHAiOjE3NTgzNTA0OTYsIm5iZiI6MTc1ODI2NDAzNiwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Q4Sne6pW_Lhytp1g9sR99PbRNN7BcE5azPHptecj7JIKFyRhFNJ7ZuwDnXW755Cs64JwS16Ra2R67pb1WhuxFoFFP6MBNZIql3jMnpBUO33LKBWbHkN3x5r4OXHCjCM6BvhhAyDWftUHnG-QeF-zTuQDfgVPiWSxEPnJj0pBvpDSOAYWgSs7G3J7pWE1Mz5nL4m7VhsKyIpQ0oT3zv5zgPOfXq6CaB0Ud5LjBJ9ADpNlQOi2-7hT05lQoZRgzce8Irx8jrla4icnWsBSYEUyTZSlNEf1pZVly4aK_txRfgYj5PjOyph4axuvZGq1am3wcEiD8MRBNz-ksEspXCeW0Q; ao_lo_to_n="6961238571i+0e7rddchJDh/F/NHVdW+To037jtP43UpoX84C25sG3pqka4v1jufgWxEucssJpM3EjpAFpLcWIRm7MR3R6a8lu71Loi14HqnMls77u9v8="; rank-guest-user=6961238571i+0e7rddchJDh/F/NHVdW6XmQntBFYjoP/KdgG5QRJAelN+6DbB+BnVatmY+ztdl; JSESSIONID=0FEB643072E7552D57ED5DEC085D20D8; _ga_38NCVF2XST=GS2.1.s1758264089$o1$g1$t1758264103$j46$l0$h1039295416; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1758264103; _ga_CN0F80S6GL=GS2.1.s1758264090$o1$g1$t1758264103$j47$l0$h0',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
} }
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json" url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = { data = {
"station": "UK", "station": "DE",
"cid": category_name['c_id'], # 分类id "cid": c_id, # 分类id
"bsr": f"{i}" # 排名 "bsr": f"{i}" # 排名
} }
print(category_name['name'], '请求参数 data::', data) print(c_name, '请求参数 data::', data)
for i1 in range(3): for i1 in range(3):
try: try:
response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300) response = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome",timeout=300)
...@@ -103,7 +316,7 @@ def junglescout_spider(db_base): ...@@ -103,7 +316,7 @@ def junglescout_spider(db_base):
response = json.loads(response.text) response = json.loads(response.text)
break break
except: except:
time.sleep(random.uniform(15, 35.75)) time.sleep(random.uniform(15, 30.75))
response_data = response['data'] response_data = response['data']
print('code::', response['code']) print('code::', response['code'])
...@@ -114,61 +327,39 @@ def junglescout_spider(db_base): ...@@ -114,61 +327,39 @@ def junglescout_spider(db_base):
# 没拿到数据,跳出循环 # 没拿到数据,跳出循环
break break
if est == 0.0: if est == 0.0:
print(f"{category_name['name']} 排名{i}:销量 0,跳出循环。") print(f"{c_name} 排名{i}:销量 0,跳出循环。")
break break
# 2) 既然不会有 0.3、0.99 这种情况,剩下的 est 都是 ≥ 1 # 2) 既然不会有 0.3、0.99 这种情况,剩下的 est 都是 ≥ 1
print(type(est)) print(type(est))
print('获取数据:', category_name['name'], i, est, year_month) print('获取数据:', c_name, i, est, year_month)
sales = int(est) sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month)) name_rnak_list.append((c_name, i, sales, year_month))
time.sleep(random.uniform(30, 95.75)) time.sleep(random.uniform(20, 65.75))
# break # break
for i in range(4): for i in range(4):
try: try:
inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)" inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
print(inset_sql) print(inset_sql)
cursor_mysql_db.executemany(inset_sql, name_rnak_list) engine_db_msyql = mysql_connect(site=db_base)
db.commit() with engine_db_msyql.begin() as conn:
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{category_name['c_id']}'" conn.execute(
f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)",
name_rnak_list)
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{c_id}'"
print('更新状态:', up_sql) print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql) db_cursor_connect_update(up_sql,'us')
db_us.commit()
break break
except Exception as e: except Exception as e:
print('存储失败:',e) print('存储失败:', e)
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20)
time.sleep(20)
print('当前完成。获取下一个分类销量') print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(120, 240.5)) time.sleep(random.uniform(90, 200.5))
def save_site_category(site_bsr_dict=None):
db_class = connect_db('us')
cursor_us_mysql_db, db = db_class.us_mysql_db() # mysql
for i in site_bsr_dict.keys():
print(i)
delete_sql = f'DELETE from all_site_category where `name` ="{i}"' # 删除旧分类
print(delete_sql)
cursor_us_mysql_db.execute(delete_sql)
db.commit()
site_category_list = site_bsr_dict[i]
for site_category in site_category_list:
insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
cursor_us_mysql_db.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
db.commit()
db_class = connect_db(i)
cursor_site_mysql_db, db = db_class.mysql_db() # mysql
def run(): def run():
# get_cid() junglescout_spider('de')
junglescout_spider('uk')
if __name__ == '__main__': if __name__ == '__main__':
run() run()
\ No newline at end of file
...@@ -408,11 +408,12 @@ class async_account_name_products(BaseUtils): ...@@ -408,11 +408,12 @@ class async_account_name_products(BaseUtils):
try: try:
with self.engine_pg6.begin() as conn: with self.engine_pg6.begin() as conn:
# 查詢收藏asin # 查詢收藏asin
sql_read_asin = f'SELECT id, data_id, end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 1 ORDER BY id FETCH FIRST {self.read_size} ROWS ONLY FOR UPDATE;' sql_read_asin = f'SELECT id, data_id, end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 1 ORDER BY id FOR UPDATE'
print('查詢收藏asin:', sql_read_asin) print('查詢收藏asin:', sql_read_asin)
b = conn.execute(sql_read_asin) self.df_read_asin = self.engine_pg6.read_sql(sql_read_asin)
self.df_read_asin = pd.DataFrame(b, columns=['id', 'data_id', 'end_time']) if self.df_read_asin.shape[0] !=0:
self.index_tuple_asin = tuple(self.df_read_asin['id']) self.index_tuple_asin = tuple(self.df_read_asin['id'])
print('self.index_tuple_asin::',len(self.index_tuple_asin))
if self.index_tuple_asin: if self.index_tuple_asin:
if len(self.index_tuple_asin) == 1: if len(self.index_tuple_asin) == 1:
sql_update = f"""UPDATE {self.db_user_collection_syn} b set state=2 where b.id in ({self.index_tuple_asin[0]})""" sql_update = f"""UPDATE {self.db_user_collection_syn} b set state=2 where b.id in ({self.index_tuple_asin[0]})"""
...@@ -428,12 +429,14 @@ class async_account_name_products(BaseUtils): ...@@ -428,12 +429,14 @@ class async_account_name_products(BaseUtils):
user_asin_list = user_asin.split('|-|') user_asin_list = user_asin.split('|-|')
self.user_asin_list.append(user_asin_list) self.user_asin_list.append(user_asin_list)
print(self.user_asin_list) print(self.user_asin_list)
print('存储 收藏asin')
self.save_asin_syn() self.save_asin_syn()
# 查询收藏店铺 # 查询收藏店铺
sql_read = f'SELECT id, product_url,data_id,end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 2 ORDER BY id FETCH FIRST {self.read_size} ROWS ONLY FOR UPDATE;' sql_read = f'SELECT id, product_url,data_id,end_time FROM {self.db_user_collection_syn} WHERE now() >= crawling_time and now() <= end_time and state = 1 and data_type = 2 ORDER BY id FETCH FIRST {self.read_size} ROWS ONLY FOR UPDATE;'
print('查询收藏店铺:', sql_read) print('查询收藏店铺:', sql_read)
a = conn.execute(sql_read) # a = conn.execute(sql_read)
self.df_read = pd.DataFrame(a, columns=['id', 'product_url', 'data_id', 'end_time']) self.df_read = self.engine_pg6.read_sql(sql_read)
# self.df_read = pd.DataFrame(a, columns=['id', 'product_url', 'data_id', 'end_time'])
if self.df_read.shape[0] == 0: if self.df_read.shape[0] == 0:
self.stop_item_queue = False self.stop_item_queue = False
return [] return []
...@@ -473,9 +476,7 @@ class async_account_name_products(BaseUtils): ...@@ -473,9 +476,7 @@ class async_account_name_products(BaseUtils):
else: else:
sql_DELETE = f"""DELETE FROM {self.site_name}_user_seller_collections where account_id in {tuple(self.account_name_list_update)}""" sql_DELETE = f"""DELETE FROM {self.site_name}_user_seller_collections where account_id in {tuple(self.account_name_list_update)}"""
conn.execute(sql_DELETE) conn.execute(sql_DELETE)
df_asin_variation.to_sql(f"{self.site_name}_user_seller_collections", con=self.engine_pg6, self.engine_pg6.to_sql(df_asin_variation,f"{self.site_name}_user_seller_collections",if_exists='append')
if_exists='append',
index=False)
self.asin_detail_list = [] self.asin_detail_list = []
break break
except Exception as e: except Exception as e:
......
...@@ -351,8 +351,8 @@ else: ...@@ -351,8 +351,8 @@ else:
# redis # redis
REDIS_CONN = { REDIS_CONN = {
"redis_host": "113.100.143.162", "redis_host": "113.100.143.162",
"redis_port": 6379, "redis_port": 54372,
"redis_pwd": "fG7#vT6kQ1pX", "redis_pwd": "N8#rTp2Xz!Lk6@Vw9qHs4&Yb1Fm0Cj3",
"redis_db": 14 "redis_db": 14
} }
......
import sys
import os import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录 sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from func_timeout import func_set_timeout from func_timeout import func_set_timeout
...@@ -29,11 +29,12 @@ class Save_asin_detail(BaseUtils): ...@@ -29,11 +29,12 @@ class Save_asin_detail(BaseUtils):
self.init_db_names() self.init_db_names()
self.cols = self.reuests_para_val.db_column(site_name) self.cols = self.reuests_para_val.db_column(site_name)
self.redis_client = self.redis_db() self.redis_client = self.redis_db()
def init_db_names(self): def init_db_names(self):
self.engine = self.mysql_connect() self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect() # 更改变体 时 存储 变体表 使用 self.engine self.engine_pg = self.pg_connect() # 更改变体 时 存储 变体表 使用 self.engine
self.kafuka_producer = self.kafuka_connect() # 卡夫卡连接 self.kafuka_producer = self.kafuka_connect() # 卡夫卡连接
self.kafuka_producer_str = self.kafuka_connect(acks=True,connections_max_idle_ms=300000) # 卡夫卡连接 self.kafuka_producer_str = self.kafuka_connect(acks=True, connections_max_idle_ms=300000) # 卡夫卡连接
self.redis_db14 = self.redis_db() # redis 链接 self.redis_db14 = self.redis_db() # redis 链接
self.db_syn = self.site_name + '_all_syn_st_month_2025' self.db_syn = self.site_name + '_all_syn_st_month_2025'
self.db_seller_account_syn = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_seller_account_syn'][2:] + '_distinct' self.db_seller_account_syn = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_seller_account_syn'][2:] + '_distinct'
...@@ -90,7 +91,7 @@ class Save_asin_detail(BaseUtils): ...@@ -90,7 +91,7 @@ class Save_asin_detail(BaseUtils):
self.df_read.drop_duplicates(['asin'], inplace=True) self.df_read.drop_duplicates(['asin'], inplace=True)
if self.df_read.shape[0] > 0: if self.df_read.shape[0] > 0:
self.index_tuple = tuple(self.df_read['id']) self.index_tuple = tuple(self.df_read['id'])
print(self.index_tuple,'self.index_tuplself.index_tuplself.index_tupl') print(self.index_tuple, 'self.index_tuplself.index_tuplself.index_tupl')
# 使用默认值填充空值 # 使用默认值填充空值
self.df_read['volume'].fillna('null', inplace=True) self.df_read['volume'].fillna('null', inplace=True)
self.df_read['weight_str'].fillna('null', inplace=True) self.df_read['weight_str'].fillna('null', inplace=True)
...@@ -220,7 +221,7 @@ class Save_asin_detail(BaseUtils): ...@@ -220,7 +221,7 @@ class Save_asin_detail(BaseUtils):
print(f'存储pg:{self.site_name}_asin_detail_month_{report_info}') print(f'存储pg:{self.site_name}_asin_detail_month_{report_info}')
# df.to_csv(r'2025-7-30_srs_search_term_asin.csv', index=False) # df.to_csv(r'2025-7-30_srs_search_term_asin.csv', index=False)
self.engine_pg.to_sql(df,f"{self.site_name}_asin_detail_month_{report_info}", self.engine_pg.to_sql(df, f"{self.site_name}_asin_detail_month_{report_info}",
if_exists='append') if_exists='append')
break break
except Exception as e: except Exception as e:
...@@ -258,6 +259,27 @@ class Save_asin_detail(BaseUtils): ...@@ -258,6 +259,27 @@ class Save_asin_detail(BaseUtils):
self.db_change_state(state=13, asin_list=asin_not_div_id_dp_list) self.db_change_state(state=13, asin_list=asin_not_div_id_dp_list)
@func_set_timeout(240) @func_set_timeout(240)
def save_asin_not_buysales(self, asin_buySales_list):
while True:
try:
                if not is_internet_available():
                    self.engine = self.mysql_connect()
                    self.engine_pg = self.pg_connect()
print('错误月销的asin:', asin_buySales_list)
print('错误月销的asin:', len(asin_buySales_list))
df_asin_ = pd.DataFrame(data=asin_buySales_list, columns=['asin', 'buysales', 'date_info'])
self.engine_pg.to_sql(df_asin_, f'{self.site_name}_asin_detail_2025_not_buysales', if_exists='append')
break
except Exception as e:
print("存储 _asin_detail_2025_not_buysales 文本 数据错误", e)
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
time.sleep(random.uniform(10, 20.5))
continue
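    # Usage sketch (illustrative): callers are expected to batch (asin, buysales, date_info) tuples
    # and flush them in one call, for example:
    #     self.save_asin_not_buysales([('B0EXAMPLE01', '1K+ bought in past month', '2025-08-01')])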
@func_set_timeout(240)
def save_bs_category_asin_detail(self, bs_category_asin_list_pg): def save_bs_category_asin_detail(self, bs_category_asin_list_pg):
# 存储 asin bsr 文本 # 存储 asin bsr 文本
while True: while True:
...@@ -278,7 +300,8 @@ class Save_asin_detail(BaseUtils): ...@@ -278,7 +300,8 @@ class Save_asin_detail(BaseUtils):
if df_asin_bsr_pg.shape[0] > 0: if df_asin_bsr_pg.shape[0] > 0:
date_info_ = list(df_asin_bsr_pg.date_info)[0].replace('-', '_') date_info_ = list(df_asin_bsr_pg.date_info)[0].replace('-', '_')
print(f'{self.site_name}_bs_category_asin_detail_month_{date_info_}') print(f'{self.site_name}_bs_category_asin_detail_month_{date_info_}')
self.engine_pg.to_sql(df_asin_bsr_pg,f'{self.site_name}_bs_category_asin_detail_month_{date_info_}', self.engine_pg.to_sql(df_asin_bsr_pg,
f'{self.site_name}_bs_category_asin_detail_month_{date_info_}',
if_exists='append') if_exists='append')
bs_category_asin_list_pg = [] bs_category_asin_list_pg = []
break break
...@@ -340,7 +363,7 @@ class Save_asin_detail(BaseUtils): ...@@ -340,7 +363,7 @@ class Save_asin_detail(BaseUtils):
else: else:
sql_delete = f"delete from {self.db_seller_asin_account} where asin in {tuple(set(df_seller_asin_account.asin))};" sql_delete = f"delete from {self.db_seller_asin_account} where asin in {tuple(set(df_seller_asin_account.asin))};"
conn.execute(sql_delete) conn.execute(sql_delete)
self.engine.to_sql(df_seller_asin_account,self.db_seller_asin_account, self.engine.to_sql(df_seller_asin_account, self.db_seller_asin_account,
if_exists='append') if_exists='append')
buyBoxname_asin_list = [] buyBoxname_asin_list = []
break break
...@@ -412,7 +435,7 @@ class Save_asin_detail(BaseUtils): ...@@ -412,7 +435,7 @@ class Save_asin_detail(BaseUtils):
sql_delete = f"delete from {self.site_name}_all_syn_st_asin where asin in {tuple(set(df_asin.asin))};" sql_delete = f"delete from {self.site_name}_all_syn_st_asin where asin in {tuple(set(df_asin.asin))};"
conn.execute(sql_delete) conn.execute(sql_delete)
df_asin['state'] = state df_asin['state'] = state
self.engine_pg.to_sql(df_asin,f'{self.site_name}_all_syn_st_asin',if_exists='append') self.engine_pg.to_sql(df_asin, f'{self.site_name}_all_syn_st_asin', if_exists='append')
break break
except Exception as e: except Exception as e:
self.engine = self.mysql_connect() self.engine = self.mysql_connect()
...@@ -422,6 +445,5 @@ class Save_asin_detail(BaseUtils): ...@@ -422,6 +445,5 @@ class Save_asin_detail(BaseUtils):
f"\n{traceback.format_exc()}") f"\n{traceback.format_exc()}")
continue continue
# if __name__ == '__main__':
if __name__ == '__main__': # Save_asin_detail()
Save_asin_detail()
import pandas as pd import datetime
from selenium import webdriver import json
from selenium.webdriver.chrome.options import Options import os
from selenium.webdriver.common.by import By import random
from selenium.webdriver.support import expected_conditions as EC import re
import socket
import time import time
import traceback import traceback
from sqlalchemy import create_engine from random import randint
from time import sleep
import numpy as np
import pandas as pd
import pydub
import redis
import requests import requests
from sqlalchemy import text
from lxml import etree from lxml import etree
import os from secure_db_client import get_remote_engine
import socket from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.keys import Keys
import re from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.common.exceptions import WebDriverException, TimeoutException from selenium.webdriver.support.wait import WebDriverWait
import random
import pydub
from speech_recognition import Recognizer, AudioFile from speech_recognition import Recognizer, AudioFile
from time import sleep
from random import randint
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import datetime
import redis
class H10(): class H10():
def __init__(self): def __init__(self):
with open('config_json', 'r', encoding='utf-8') as f:
params_data_json = f.read()
self.params_data_dict = json.loads(params_data_json)
self.db_syn = 'all_h10_syn' self.db_syn = 'all_h10_syn'
self.site_name = 'us' self.site_name = 'us'
self.site_name_csv = 'us' self.site_name_csv = 'us'
self.sku_list = [] self.sku_list = []
self.err_asin_list = [] self.err_asin_list = []
self.err_asins_adv_list = [] self.err_asins_adv_list = []
self.asin_state_5_list = []
self.sku_state = False self.sku_state = False
self.ip = self.get_ip_address() self.ip = self.get_ip_address()
self.useremail_state = True self.useremail_state = True
self.redis_db() self.redis_db()
# self.mysql_inv()
def redis_db(self): def redis_db(self):
self.redis_db1 = redis.Redis(host='120.79.147.190', port=6379, password='Vm5vQH4ydFXh', db=0) self.redis_db1 = redis.Redis(host=self.params_data_dict['redis_host'], port=self.params_data_dict['redis_port'],
password=self.params_data_dict['redis_pwd'], db=self.params_data_dict['redis_db'])
def get_token(self): def get_token(self):
while True: while True:
try: try:
print('redis 获取token')
val = self.redis_db1.hget('thirdParty:token:inventory', 'disanfang') val = self.redis_db1.hget('thirdParty:token:inventory', 'disanfang')
self.val_str = val.decode('utf-8') self.val_str = val.decode('utf-8')
print(self.val_str) print(self.val_str)
break break
except: except Exception as e:
self.redis_db() self.redis_db()
print('redis 获取token 报错') print('redis 获取token 报错', e)
time.sleep(20) time.sleep(20)
def get_ip_address(self): def get_ip_address(self):
...@@ -62,7 +66,8 @@ class H10(): ...@@ -62,7 +66,8 @@ class H10():
s.connect(('baidu.com', 0)) s.connect(('baidu.com', 0))
ip = s.getsockname()[0] ip = s.getsockname()[0]
# You are viewing a demo of Cerebro # You are viewing a demo of Cerebro
user_pw_dict = {'192.168.10.244': [r'C:\Users\win10-244\Downloads', 'YSWGHF422023@outlook.com', 'soundasia422023@'], user_pw_dict = {
'192.168.10.244': [r'C:\Users\win10-244\Downloads', 'YSWGHF422023@outlook.com', 'soundasia422023@'],
'192.168.10.245': [r'C:\Users\win10-245\Downloads', 'CherryY2023@outlook.com', '20230322Yy@'], '192.168.10.245': [r'C:\Users\win10-245\Downloads', 'CherryY2023@outlook.com', '20230322Yy@'],
'192.168.10.246': [r'C:\Users\win10-246\Downloads', 'H10961961@outlook.com', 'soundasia961961@'], '192.168.10.246': [r'C:\Users\win10-246\Downloads', 'H10961961@outlook.com', 'soundasia961961@'],
'192.168.10.247': [r'C:\Users\win10-247\Downloads', 'X18756082657@outlook.com', 'Zyx13075039897@'], '192.168.10.247': [r'C:\Users\win10-247\Downloads', 'X18756082657@outlook.com', 'Zyx13075039897@'],
...@@ -79,67 +84,14 @@ class H10(): ...@@ -79,67 +84,14 @@ class H10():
else: else:
return [] return []
def mysql_inv(self):
nums = 0
while True:
nums += 1
try:
self.engine_adv = create_engine(
'mysql+pymysql://chenjianyun:Cjy8751_07@rm-wz956fk600d89g2g7uo.mysql.rds.aliyuncs.com:3306/inventory?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
def mysql_connect(self, site='us'): def mysql_connect(self, site='us'):
DB_CONN_DICT = { self.engine_us = get_remote_engine('us', 'mysql')
"mysql_port": 3306,
"mysql_db": "selection",
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
nums = 0
while True:
nums += 1
try:
db = f'selection'
self.engine_us = create_engine(
f'mysql+pymysql://{DB_CONN_DICT["mysql_user"]}:' + f'{DB_CONN_DICT["mysql_pwd"]}@{DB_CONN_DICT["mysql_host"]}:{DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
nums = 0
while True:
nums += 1
try:
if self.site_name == 'us' or self.site_name == 'mx': if self.site_name == 'us' or self.site_name == 'mx':
db = 'selection' self.site_name = 'us'
else: self.engine = get_remote_engine(self.site_name, 'mysql')
db = f'selection_{site}'
self.engine = create_engine(
f'mysql+pymysql://{DB_CONN_DICT["mysql_user"]}:' + f'{DB_CONN_DICT["mysql_pwd"]}@{DB_CONN_DICT["mysql_host"]}:{DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
def web_drver(self): def web_drver(self):
# port = 9222
# params_ = ""
# params_ = "--blink-settings=imagesEnabled=false"
# os.system(f'start Chrome {params_} --remote-debugging-port={port}')
chrome_options = Options() chrome_options = Options()
# 禁止加载图片
# chrome_options.add_argument('--blink-settings=imagesEnabled=false')
# chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}") # 打开调用本地浏览器
# 设置driver以无头浏览的模式运行
# chrome_options.add_argument('-headless')
# 禁用GPU(可选)
chrome_options.add_argument('-disable-gpu') chrome_options.add_argument('-disable-gpu')
chrome_options.add_argument("--disable-notifications") chrome_options.add_argument("--disable-notifications")
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"]) chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
...@@ -313,10 +265,10 @@ class H10(): ...@@ -313,10 +265,10 @@ class H10():
# https://members.helium10.com/user/signin?re=L2NlcmVicm8= # https://members.helium10.com/user/signin?re=L2NlcmVicm8=
self.driver.get('https://members.helium10.com/user/signin') self.driver.get('https://members.helium10.com/user/signin')
sleep(randint(2, 4)) sleep(randint(2, 4))
search_box = self.driver.find_element_by_id('loginform-email') search_box = self.driver.find_element(By.ID, 'loginform-email')
search_box.send_keys(self.email_name) search_box.send_keys(self.email_name)
sleep(randint(1, 2)) sleep(randint(1, 2))
search_box = self.driver.find_element_by_id('loginform-password') search_box = self.driver.find_element(By.ID, 'loginform-password')
search_box.send_keys(self.pw) search_box.send_keys(self.pw)
sleep(randint(1, 2)) sleep(randint(1, 2))
try: try:
...@@ -453,7 +405,7 @@ class H10(): ...@@ -453,7 +405,7 @@ class H10():
if asin not in self.err_asin_list and self.useremail_state: if asin not in self.err_asin_list and self.useremail_state:
print('cerebro界面', self.site_name_url) print('cerebro界面', self.site_name_url)
self.driver.get(f'https://members.helium10.com/cerebro?accountId={self.account_id}') self.driver.get(f'https://members.helium10.com/cerebro?accountId={self.account_id}')
time.sleep(10) time.sleep(8)
if 'You are viewing a demo of Cerebro' in self.driver.page_source: if 'You are viewing a demo of Cerebro' in self.driver.page_source:
print(self.email_name, '账号过期') print(self.email_name, '账号过期')
self.driver.refresh() self.driver.refresh()
...@@ -496,15 +448,15 @@ class H10(): ...@@ -496,15 +448,15 @@ class H10():
try: try:
self.driver.execute_script( self.driver.execute_script(
f"""document.querySelector("img[loading='lazy']").click()""") f"""document.querySelector("img[loading='lazy']").click()""")
time.sleep(1) time.sleep(1.5)
except: except:
self.driver.execute_script( self.driver.execute_script(
f"""document.querySelector("img[alt='{alt}']").click()""") f"""document.querySelector("img[alt='{alt}']").click()""")
time.sleep(1) time.sleep(1.5)
self.verify() self.verify()
# 切换站点 # 切换站点
self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""") self.driver.execute_script(f"""document.querySelector("div[data-value='{host}']").click()""")
time.sleep(2) time.sleep(1.5)
# 输入asin # 输入asin
print('输入asin', asin) print('输入asin', asin)
if ',' in asin: if ',' in asin:
...@@ -520,7 +472,7 @@ class H10(): ...@@ -520,7 +472,7 @@ class H10():
# 点击 get keyword # 点击 get keyword
time.sleep(1) time.sleep(1)
self.driver.execute_script('document.querySelector("#CerebroSearchButtons > button").click()') self.driver.execute_script('document.querySelector("#CerebroSearchButtons > button").click()')
time.sleep(3) time.sleep(2)
html = self.driver.page_source html = self.driver.page_source
if 'You have reached the limit of the uses' in html: if 'You have reached the limit of the uses' in html:
self.useremail_state = False self.useremail_state = False
...@@ -540,11 +492,11 @@ class H10(): ...@@ -540,11 +492,11 @@ class H10():
self.verify() self.verify()
time.sleep(2) time.sleep(2)
try: try:
if 'searched this product before' in html: if 'searched this product before' in html or '先前已搜索过此产品' in html:
print('33333333333') print('33333333333')
self.driver.execute_script( self.driver.execute_script(
"""document.querySelector("button[data-testid='runnewsearch']").click()""") """document.querySelector("button[data-testid='runnewsearch']").click()""")
sleep(randint(20, 35)) sleep(randint(10, 35))
except: except:
print('点击 run 报错') print('点击 run 报错')
...@@ -568,16 +520,11 @@ class H10(): ...@@ -568,16 +520,11 @@ class H10():
elif 'errorCodes.undefined' in html: elif 'errorCodes.undefined' in html:
continue continue
self.verify() self.verify()
resp = etree.HTML(html)
try:
div_class = resp.xpath(
'//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class')
except:
time.sleep(2.5) time.sleep(2.5)
resp = etree.HTML(html)
try: try:
div_class = resp.xpath( div_class = resp.xpath(
'//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class') '''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class''')
except: except:
print('报错22222222222222') print('报错22222222222222')
if asinstype: if asinstype:
...@@ -597,7 +544,7 @@ class H10(): ...@@ -597,7 +544,7 @@ class H10():
html1 = self.driver.page_source html1 = self.driver.page_source
resp1 = etree.HTML(html1) resp1 = etree.HTML(html1)
span_class = resp1.xpath( span_class = resp1.xpath(
'//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class')[0] '//span[contains(text(),"Analyzed product")]/parent::div/following-sibling::div/@class|//span[contains(text(),"已分析的产品")]/parent::div/following-sibling::div/@class')[0]
# 选择亚马逊精选参数1 # 选择亚马逊精选参数1
self.driver.execute_script( self.driver.execute_script(
f"""document.querySelector("div[class='{span_class}']").click()""") f"""document.querySelector("div[class='{span_class}']").click()""")
...@@ -641,12 +588,10 @@ class H10(): ...@@ -641,12 +588,10 @@ class H10():
def read_db_data(self, sku): def read_db_data(self, sku):
while True: while True:
try: try:
if self.read_product_sku(sku):
with self.engine_us.begin() as conn: with self.engine_us.begin() as conn:
sql_read = f"SELECT asin, id,site,sku FROM {self.db_syn} WHERE STATE = 1 and site='{self.site_url}' and sku='{sku}' limit 10 FOR UPDATE;" sql_read = f"SELECT asin, id,site,sku FROM {self.db_syn} WHERE STATE = 1 and site='{self.site_url}' and sku='{sku}' limit 10 FOR UPDATE;"
print(sql_read) print(sql_read)
a = conn.execute(sql_read) self.df_read = self.engine_us.read_sql(sql_read)
self.df_read = pd.DataFrame(a, columns=['asin', 'id', 'site', 'sku'])
self.df_read.drop_duplicates(['asin'], inplace=True) self.df_read.drop_duplicates(['asin'], inplace=True)
if self.df_read.shape[0] == 0: if self.df_read.shape[0] == 0:
print('*********** asin 数据抓取 完毕 *****************') print('*********** asin 数据抓取 完毕 *****************')
...@@ -663,9 +608,6 @@ class H10(): ...@@ -663,9 +608,6 @@ class H10():
self.site_name_url = list(self.df_read.site)[0] self.site_name_url = list(self.df_read.site)[0]
self.sku = list(self.df_read.sku)[0] self.sku = list(self.df_read.sku)[0]
return asin_list return asin_list
else:
self.asin_state_5_list.append(sku)
return []
except Exception as e: except Exception as e:
print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}") print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")
time.sleep(10) time.sleep(10)
...@@ -675,40 +617,26 @@ class H10(): ...@@ -675,40 +617,26 @@ class H10():
def read_db_sku(self): def read_db_sku(self):
while True: while True:
try: try:
sql = f"""SELECT DISTINCT sku,token from all_h10_syn where site='{self.site_url}' and state = 1 """ sql = f"""SELECT DISTINCT sku,token from all_h10_syn where site='{self.site_url}' and state =1"""
df = pd.read_sql(sql, con=self.engine_us) print(sql, '2323324dd')
df = self.engine_us.read_sql(sql)
if not df.empty:
self.sku_data_list = list(df.sku + '|-|' + df.token) self.sku_data_list = list(df.sku + '|-|' + df.token)
print(self.sku_data_list) print(self.sku_data_list)
else:
self.sku_data_list = []
break break
except: except Exception as e:
print('读取sku 失败0') print('读取sku 失败0', e)
time.sleep(30) time.sleep(30)
self.mysql_connect() self.mysql_connect()
continue continue
def read_product_sku(self, sku):
# for i in range(5):
# try:
# sql = f"select id from product_audit where product_sku ='{sku}' and product_audit_status = '5-1' ;"
# print(sql)
# df = pd.read_sql(sql, con=self.engine_adv)
# id_list = list(df.id)
# print('检查是否手动下载:', id_list)
# if id_list:
# print('id_list::', id_list)
# return False
# else:
# print('可以')
# return True
# except:
# self.mysql_inv()
return True
def read_db_asin(self): def read_db_asin(self):
while True: while True:
try: try:
sql_read = f"SELECT sku, site, GROUP_CONCAT(asin SEPARATOR ',') AS asin_list FROM {self.db_syn} WHERE site = '{self.site_url}' and sku='{self.sku}';" sql_read = f"SELECT sku, site, GROUP_CONCAT(asin SEPARATOR ',') AS asin_list FROM {self.db_syn} WHERE site = '{self.site_url}' and sku='{self.sku}';"
df = pd.read_sql(sql_read, con=self.engine_us) df = self.engine_us.read_sql(sql_read)
sku_list = list(df.sku) sku_list = list(df.sku)
if sku_list: if sku_list:
data_list = list(df.sku + '|' + df.site + '|' + df.asin_list) data_list = list(df.sku + '|' + df.site + '|' + df.asin_list)
...@@ -742,23 +670,6 @@ class H10(): ...@@ -742,23 +670,6 @@ class H10():
self.mysql_connect() self.mysql_connect()
continue continue
def update_sku_syn(self):
if self.asin_state_5_list:
while True:
try:
with self.engine_us.begin() as conn:
if len(self.asin_state_5_list) == 1:
sql_update = f"update {self.db_syn} set state=5 where sku in ('{self.asin_state_5_list[0]}')"
else:
sql_update = f"update {self.db_syn} set state=5 where sku in {tuple(self.asin_state_5_list)}"
conn.execute(sql_update)
self.asin_state_5_list = []
break
except Exception as e:
print("update_sku_syn", e, f"\n{traceback.format_exc()}")
self.mysql_connect()
continue
def del_file(self, path_data, asin_type): def del_file(self, path_data, asin_type):
try: try:
for i in os.listdir(path_data): # os.listdir(path_data)#返回一个列表,里面是当前目录下面的所有东西的相对路径 for i in os.listdir(path_data): # os.listdir(path_data)#返回一个列表,里面是当前目录下面的所有东西的相对路径
...@@ -778,18 +689,22 @@ class H10(): ...@@ -778,18 +689,22 @@ class H10():
except: except:
print(path_data, '删除111111111') print(path_data, '删除111111111')
def read_files(self, path, asin): def if_csv_path(self, file_path):
columns_to_include = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
'Sponsored ASINs',
'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
try: try:
with open(file_path, 'r', encoding='utf-8') as f:
f.read()  # read the whole file (not just check the path) to confirm the downloaded CSV is complete and readable
return True
except:
print('文件路径不存在')
return False
def read_files(self, path, asin):
time_strftime = time.strftime("%Y-%m-%d", time.localtime()) time_strftime = time.strftime("%Y-%m-%d", time.localtime())
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv' file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv'
df = pd.read_csv(file_path, usecols=columns_to_include) state = self.if_csv_path(file_path)
print('读取文件11111::', file_path) print('读取文件11111::', file_path)
except: if state == False:
try:
# 获取当前日期 # 获取当前日期
current_date = datetime.date.today() current_date = datetime.date.today()
# 计算前一天日期 # 计算前一天日期
...@@ -797,48 +712,53 @@ class H10(): ...@@ -797,48 +712,53 @@ class H10():
# 格式化前一天日期为字符串 # 格式化前一天日期为字符串
previous_date_str = previous_date.strftime("%Y-%m-%d") previous_date_str = previous_date.strftime("%Y-%m-%d")
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{previous_date_str}.csv' file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{previous_date_str}.csv'
df = pd.read_csv(file_path, usecols=columns_to_include)
print('读取文件2222222::', file_path) print('读取文件2222222::', file_path)
except: state = self.if_csv_path(file_path)
if state == False:
self.driver.refresh() self.driver.refresh()
time.sleep(5) time.sleep(5)
print('重新下载文件:', asin,path) print('重新下载文件:', asin, path)
self.webdrvier_html(asin, None) self.webdrvier_html(asin, None)
time.sleep(5) time.sleep(5)
time_strftime = time.strftime("%Y-%m-%d", time.localtime()) time_strftime = time.strftime("%Y-%m-%d", time.localtime())
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv' file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv'
df = pd.read_csv(file_path, usecols=columns_to_include)
print('读取文件333333::', file_path) print('读取文件333333::', file_path)
return df columns = pd.read_csv(file_path, nrows=0).columns.tolist()
def sava_data(self, path): def contains_chinese(text):
print('self.err_asin_list::', self.err_asin_list) return bool(re.search(r'[\u4e00-\u9fff]', text))
df_asin_data_list = []
for asin in self.asin_list: is_chinese_header = any(contains_chinese(col) for col in columns)
print(asin, '333333333333333', self.err_asin_list) if is_chinese_header:
if asin not in self.err_asin_list: print("表头是中文")
df = self.read_files(path, asin) columns_to_include_zh = ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
# columns_to_include = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend', '广告推广ASIN 数',
# 'Sponsored ASINs', '竞品数', 'CPR', '标题密度', '亚马逊推荐', '自然',
# 'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic', '亚马逊推荐排名', '广告排名', '自然排名']
# 'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank'] df = pd.read_csv(file_path, usecols=columns_to_include_zh)
# try: # 中文 -> 英文映射
# time_strftime = time.strftime("%Y-%m-%d", time.localtime()) df.rename(columns={
# file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{time_strftime}.csv' '关键词词组': 'keyword',
# print('file_path1111', file_path) 'Cerebro IQ 得分': 'cerebro_iq_score',
# df = pd.read_csv(file_path, usecols=columns_to_include) '搜索量': 'search_volume',
# print('读取文件::', file_path) '搜索量趋势': 'search_volume_trend',
# except: '广告推广ASIN 数': 'sponsored_asins',
# # 获取当前日期 '竞品数': 'competing_product',
# current_date = datetime.date.today() 'CPR': 'cpr',
# # 计算前一天日期 '标题密度': 'title_desity',
# previous_date = current_date - datetime.timedelta(days=1) '亚马逊推荐': 'amazon_recommended',
# # 格式化前一天日期为字符串 '自然': 'organic',
# previous_date_str = previous_date.strftime("%Y-%m-%d") '亚马逊推荐排名': 'amazon_recommended_rank',
# file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin}_{previous_date_str}.csv' '广告排名': 'sponsored_rank',
# print('file_path22222222222', file_path) '自然排名': 'organic_rank'
# df = pd.read_csv(file_path, usecols=columns_to_include) }, inplace=True)
# print('读取文件::', file_path) else:
print("表头是英文")
columns_to_include_en = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
'Sponsored ASINs',
'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
df = pd.read_csv(file_path, usecols=columns_to_include_en)
df.rename(columns={ df.rename(columns={
'Keyword Phrase': 'keyword', 'Keyword Phrase': 'keyword',
'Cerebro IQ Score': 'cerebro_iq_score', 'Cerebro IQ Score': 'cerebro_iq_score',
...@@ -854,6 +774,15 @@ class H10(): ...@@ -854,6 +774,15 @@ class H10():
'Sponsored Rank': 'sponsored_rank', 'Sponsored Rank': 'sponsored_rank',
'Organic Rank': 'organic_rank' 'Organic Rank': 'organic_rank'
}, inplace=True) }, inplace=True)
return df
def sava_data(self, path):
print('self.err_asin_list::', self.err_asin_list)
df_asin_data_list = []
for asin in self.asin_list:
print(asin, '333333333333333', self.err_asin_list)
if asin not in self.err_asin_list:
df = self.read_files(path, asin)
df['asin'] = asin df['asin'] = asin
df['sku'] = self.sku df['sku'] = self.sku
df_asin_data_list.append(df) df_asin_data_list.append(df)
...@@ -885,7 +814,6 @@ class H10(): ...@@ -885,7 +814,6 @@ class H10():
print('调接口更新数据:', data) print('调接口更新数据:', data)
sku_data_list = data.split('|') sku_data_list = data.split('|')
sku = sku_data_list[0] sku = sku_data_list[0]
if self.read_product_sku(sku):
self.sku_state = False self.sku_state = False
if sku not in self.sku_list: if sku not in self.sku_list:
self.sku_list.append(sku) self.sku_list.append(sku)
...@@ -899,7 +827,7 @@ class H10(): ...@@ -899,7 +827,7 @@ class H10():
} }
url = f'http://120.79.147.190:8080/soundasia_selection/updateKeyWords/selWords?site={site}&asins={asins}' url = f'http://120.79.147.190:8080/soundasia_selection/updateKeyWords/selWords?site={site}&asins={asins}'
print(url) print(url)
for i in range(5): for i in range(20):
try: try:
resp = requests.get(url, timeout=30, headers=headers).json() resp = requests.get(url, timeout=30, headers=headers).json()
self.data = {"sku": self.sku, self.data = {"sku": self.sku,
...@@ -910,9 +838,9 @@ class H10(): ...@@ -910,9 +838,9 @@ class H10():
break break
except Exception as e: except Exception as e:
print("请求java 接口报错:", e, f"\n{traceback.format_exc()}") print("请求java 接口报错:", e, f"\n{traceback.format_exc()}")
time.sleep(3) time.sleep(105)
continue continue
print('type_resp::',type(resp)) print('type_resp::', type(resp))
# core核心词 # core核心词
core_list = resp['result']['core'].split('\n') core_list = resp['result']['core'].split('\n')
# updown长尾词 # updown长尾词
...@@ -929,9 +857,10 @@ class H10(): ...@@ -929,9 +857,10 @@ class H10():
sql_delete_bsr = f"delete from product_audit where product_sku in ('{tuple(df_save.product_sku)[0]}') and site='{self.site_url}';" sql_delete_bsr = f"delete from product_audit where product_sku in ('{tuple(df_save.product_sku)[0]}') and site='{self.site_url}';"
else: else:
sql_delete_bsr = f"delete from product_audit where product_sku in {tuple(set(df_save.product_sku))} and site='{self.site_url}';" sql_delete_bsr = f"delete from product_audit where product_sku in {tuple(set(df_save.product_sku))} and site='{self.site_url}';"
print('sql_delete_bsr', sql_delete_bsr)
conn.execute(sql_delete_bsr) conn.execute(sql_delete_bsr)
df_save.to_sql("product_audit", con=self.engine_us, self.engine_us.to_sql(df_save, "product_audit",
if_exists='append', index=False) if_exists='append')
except Exception as e: except Exception as e:
print("save_competition:", e, f"\n{traceback.format_exc()}") print("save_competition:", e, f"\n{traceback.format_exc()}")
print('存储优质词报错。重连数据库') print('存储优质词报错。重连数据库')
...@@ -941,9 +870,6 @@ class H10(): ...@@ -941,9 +870,6 @@ class H10():
print(f'存储 core核心词 updown长尾词 报错, \n{e, traceback.format_exc()}') print(f'存储 core核心词 updown长尾词 报错, \n{e, traceback.format_exc()}')
self.mysql_connect() self.mysql_connect()
time.sleep(5) time.sleep(5)
else:
self.asin_state_5_list.append(sku)
self.sku_state = True
# 定义一个函数来获取三列的最小值(忽略为0的值) # 定义一个函数来获取三列的最小值(忽略为0的值)
def get_min(self, row): def get_min(self, row):
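# NOTE: the body of get_min is not shown in this hunk. A minimal sketch of what it might look like,
# given the comment above and the rank0/rank1/... columns built in save_competition (an assumption,
# not the original implementation):
#     vals = [v for v in row if pd.notna(v) and v != 0]
#     return min(vals) if vals else None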
...@@ -953,13 +879,11 @@ class H10(): ...@@ -953,13 +879,11 @@ class H10():
def save_competition(self, path, asin_list, site_url, site): def save_competition(self, path, asin_list, site_url, site):
print('self.err_asins_adv_list;;', self.err_asins_adv_list) print('self.err_asins_adv_list;;', self.err_asins_adv_list)
if asin_list[0] not in ''.join(self.err_asins_adv_list): if asin_list[0] not in ''.join(self.err_asins_adv_list):
try:
time_strftime = time.strftime("%Y-%m-%d", time.localtime()) time_strftime = time.strftime("%Y-%m-%d", time.localtime())
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv' file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
print('file_pathsave_competition1111111', file_path) print('file_pathsave_competition1111111', file_path)
df = pd.read_csv(file_path) state = self.if_csv_path(file_path)
except: if state==False:
# 获取当前日期
current_date = datetime.date.today() current_date = datetime.date.today()
# 计算前一天日期 # 计算前一天日期
previous_date = current_date - datetime.timedelta(days=1) previous_date = current_date - datetime.timedelta(days=1)
...@@ -967,8 +891,20 @@ class H10(): ...@@ -967,8 +891,20 @@ class H10():
previous_date_str = previous_date.strftime("%Y-%m-%d") previous_date_str = previous_date.strftime("%Y-%m-%d")
file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{previous_date_str}.csv' file_path = fr'{path}\{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{previous_date_str}.csv'
print('file_pathsave_competition2222', file_path) print('file_pathsave_competition2222', file_path)
df = pd.read_csv(file_path)
# 创建一个字典来映射原始列名和新的列名 # 创建一个字典来映射原始列名和新的列名
columns = pd.read_csv(file_path, nrows=0).columns.tolist()
def contains_chinese(text):
return bool(re.search(r'[\u4e00-\u9fff]', text))
is_chinese_header = any(contains_chinese(col) for col in columns)
if is_chinese_header:
print("save_competition 表头是中文")
column_mapping = {
'关键词词组': 'keyword_phrase',
'自然排名': 'rank0',
}
else:
print("save_competition 表头是英文")
column_mapping = { column_mapping = {
'Keyword Phrase': 'keyword_phrase', 'Keyword Phrase': 'keyword_phrase',
'Position (Rank)': 'rank0', 'Position (Rank)': 'rank0',
...@@ -976,6 +912,7 @@ class H10(): ...@@ -976,6 +912,7 @@ class H10():
# 将asin_list中的元素作为列名,并给它们一个新的列名(rank1, rank2, rank3, ...) # 将asin_list中的元素作为列名,并给它们一个新的列名(rank1, rank2, rank3, ...)
for i, asin in enumerate(asin_list[1:], start=1): for i, asin in enumerate(asin_list[1:], start=1):
column_mapping[asin] = f'rank{i}' column_mapping[asin] = f'rank{i}'
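# the mapping above assumes the multi-ASIN Cerebro export has a 'Position (Rank)' column for the
# first ASIN plus one column named after each additional ASIN; they are renamed rank0, rank1, ... here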
df = pd.read_csv(file_path)
df.rename(columns=column_mapping, inplace=True) df.rename(columns=column_mapping, inplace=True)
# 只保留包含rank的列 # 只保留包含rank的列
rank_columns = [col for col in df.columns if col.startswith('rank')] rank_columns = [col for col in df.columns if col.startswith('rank')]
...@@ -992,6 +929,8 @@ class H10(): ...@@ -992,6 +929,8 @@ class H10():
print(new_df.columns) print(new_df.columns)
new_df.replace({np.nan: None}, inplace=True) # 将 NaN 替换为 None new_df.replace({np.nan: None}, inplace=True) # 将 NaN 替换为 None
# print(' 低竞争 优质词') # print(' 低竞争 优质词')
# "join() 里存在 None。 用 astype(str) 清洗。"
new_df['keyword_phrase'] = new_df['keyword_phrase'].astype(str)
competition_phrase_keywords = '&&&'.join(list(new_df['keyword_phrase'])) competition_phrase_keywords = '&&&'.join(list(new_df['keyword_phrase']))
rank_list = list(new_df['rank']) rank_list = list(new_df['rank'])
competition_phrase_rank = ','.join(str(x) for x in rank_list) competition_phrase_rank = ','.join(str(x) for x in rank_list)
...@@ -1004,18 +943,11 @@ class H10(): ...@@ -1004,18 +943,11 @@ class H10():
try: try:
if new_df.shape[0] > 0: if new_df.shape[0] > 0:
with self.engine_us.begin() as conn: with self.engine_us.begin() as conn:
if len(set(new_df.keyword_phrase)) == 1: sql_delete = f"DELETE FROM adv_low_competition_phrase WHERE site = '{site_url}' AND sku = '{self.sku}' ;"
sql_delete = f"""delete from adv_low_competition_phrase where site = '{site_url}' and sku='{self.sku}' and keyword_phrase in ("{tuple(new_df.keyword_phrase)[0]}");""" print("sql_delete:", sql_delete)
conn.execute(sql_delete) conn.execute(sql_delete)
else: self.engine_us.to_sql(new_df, "adv_low_competition_phrase", if_exists='append'
sql_delete = text(
"DELETE FROM adv_low_competition_phrase WHERE site = :site_url AND sku = :sku AND keyword_phrase IN :keyword_phrases;"
) )
print("sql_delete:", sql_delete)
conn.execute(sql_delete, site_url=site_url, sku=self.sku,
keyword_phrases=tuple(set(new_df['keyword_phrase'].tolist())))
new_df.to_sql("adv_low_competition_phrase", con=self.engine_us, if_exists='append',
index=False)
except Exception as e: except Exception as e:
print("save_competition:", e, f"\n{traceback.format_exc()}") print("save_competition:", e, f"\n{traceback.format_exc()}")
print('存储优质词报错。重连数据库') print('存储优质词报错。重连数据库')
...@@ -1026,16 +958,17 @@ class H10(): ...@@ -1026,16 +958,17 @@ class H10():
def requests_updateSkuByAsinH10Data_api(self, data): def requests_updateSkuByAsinH10Data_api(self, data):
print('调用接口:', self.data) print('调用接口:', self.data)
# url = 'https://xcu.yswg.com.cn/api/selections/updateSkuByAsinH10Data'
# url = 'http://120.24.90.10:80/api/ComprehensiveProject/updateSkuByAsinH10Data' for i in range(5):
url = 'https://xcu.yswg.com.cn/api/ComprehensiveProject/updateSkuByAsinH10Data'
try: try:
res = requests.post(url, json=data, timeout=30) res = requests.post(url, json=data, timeout=30)
print(res.status_code) print(res.status_code)
print(res.text) print(res.text)
print(res.json()) print(res.json(), 1111112222)
except: break
pass except Exception as e:
print(e, 'requests_updateSkuByAsinH10Data_api 报错 2323232323')
time.sleep(10)
def send_ms(self, ms): def send_ms(self, ms):
if self.useremail_state == False: if self.useremail_state == False:
...@@ -1064,8 +997,8 @@ class H10(): ...@@ -1064,8 +997,8 @@ class H10():
else: else:
path = r'C:\Users\ASUS\Downloads' path = r'C:\Users\ASUS\Downloads'
print('当前路径:', path) print('当前路径:', path)
self.email_name = 'H10961961@outlook.com' self.email_name = 'yashengweige678@outlook.com'
self.pw = 'soundasia961961@' self.pw = '987654321yswg@' # 'yashengweige678@outlook.com', '987654321yswg@'
self.web_drver() self.web_drver()
while True: while True:
self.data = {} self.data = {}
...@@ -1137,15 +1070,13 @@ class H10(): ...@@ -1137,15 +1070,13 @@ class H10():
self.del_file(path, self.asin_list[0]) self.del_file(path, self.asin_list[0])
# 修改状态4 # 修改状态4
self.db_change_state_common(self.err_asin_list, 4) self.db_change_state_common(self.err_asin_list, 4)
# 修改状态5
self.update_sku_syn()
else: else:
self.db_change_state_common(self.asin_list, 1) self.db_change_state_common(self.asin_list, 1)
time.sleep(3600) time.sleep(3600)
self.useremail_state = True self.useremail_state = True
break break
else:
self.update_sku_syn()
self.mysql_connect(site) self.mysql_connect(site)
time.sleep(randint(20, 50)) time.sleep(randint(20, 50))
new_date = datetime.datetime.now().strftime("%H") new_date = datetime.datetime.now().strftime("%H")
...@@ -1158,5 +1089,6 @@ class H10(): ...@@ -1158,5 +1089,6 @@ class H10():
if new_date == '08': if new_date == '08':
self.driver.refresh() self.driver.refresh()
if __name__ == '__main__': if __name__ == '__main__':
H10().run() H10().run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.secure_db_client import get_remote_engine
from amazon_spider.VPS_IP import pppoe_ip
from amazon_params import py_ja3
from utils.asin_parse import ParseAsinUs
from utils.requests_param import Requests_param_val
from queue import Queue
from lxml import etree
import requests
import urllib3
from datetime import datetime
import json
import pandas as pd
import threading
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
import ast
class ai_async_asin_pg():
def __init__(self):
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.item_queue = Queue() # 存储 item 详情数据队列
self.pg_connect()
self.sp_asin_queue = Queue()
self.spider_state = None
self.update_ai_asin_analyze_log_list = []
month = time.strftime("%m")
day = time.strftime("%d")
if int(day) > 10:
_month = int(month)
else:
if int(month) > 1:
_month = int(month) - 1 # 上个月
else:
_month = int(month)
if _month < 10:
_month = str(f'0{_month}')
self.topic_asin_html = f'asin_html_2025_{str(_month)}'
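# e.g. run on 2025-03-05 (day <= 10) -> previous month -> 'asin_html_2025_02';
# run on 2025-03-15 (day > 10) -> current month -> 'asin_html_2025_03' (January always stays '01')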
def get_params(self, site_name='us'):
self.site_name = site_name # 站点
self.reuests_para_val = Requests_param_val(site_name=self.site_name)
self.cookies_queue = Queue() # cookie队列
self.cookie_dict_delete_id = {}
# 返回 对应站点的host,首页链接
self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie(num=168)
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
def pg_connect(self):
self.engine_pg = get_remote_engine(
site_name='us', # -> database "selection"
db_type='postgresql_15_outer', # -> server-side alias for the outer PostgreSQL 15 instance
)
def get_asin(self):
while True:
if self.queries_asin_queue.empty() == False and self.spider_state is None:
asin_queu = self.queries_asin_queue.get()
elif self.sp_asin_queue.empty() == False:
self.spider_state = '竞品asin'
print('执行竞品asin 抓取')
asin_queu = self.sp_asin_queue.get()
else:
break
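# each queue item is a '|-|'-delimited string: asin|-|task_id|-|site_name|-|module,
# e.g. 'B09658Q5RP|-|82|-|us|-|Amazon:asin'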
print('::asin_queu::: ', asin_queu)
queu_list = asin_queu.split('|-|')
print('queu_list:::', queu_list)
asin = queu_list[0]
task_id = queu_list[1]
site_name = queu_list[2]
module = queu_list[3]
if module == 'Amazon:asin':
sub_step = 'Amazon:asin:竞品'
elif module == 'Amazon:asinList':
sub_step = 'Amazon:asinList:详情'
else:
sub_step = None
self.get_params(site_name=site_name)
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
# 获取组装cookie
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None)
headers["cookie"] = cookie_str
scraper_url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
print('scraper_url::', scraper_url)
try:
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码,。asin---> ', asin)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
continue
except Exception as e:
print("请求错误错误: 。asin---> ", asin, '错误:', e)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
continue
response_url = resp.url
response = resp.text
response_s = etree.HTML(response)
if self.reuests_para_val.check_amazon_not_page(response): # asin 已下架 状态 4 Listen Now
continue
if self.reuests_para_val.check_amazon_page(response, response_url): # 检查是不是正常商品页面
continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin): # 检查是否被重定向
continue
# 获取邮编
try:
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
print(ingress, ' 打印 邮编 ', resp.url)
except Exception as e:
print('asin 不是正常页面', asin)
continue
try:
ingress = ingress[0].strip()
except:
ingress = None
if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + module)
continue
div_dp = response_s.xpath('//div[@id="dp"]')
if div_dp:
items = ParseAsinUs(resp=response, asin=asin, site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'task_id': task_id, 'url': scraper_url, 'sub_step': sub_step,
'title': items["title"], 'img_url': items["img_url"],
'rating': items["rating"], 'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"], 'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"],
'sp_num': items["sp_num"], 'describe': items["describe"],
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'],'review_json_list':items['review_json_list']
}
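# the item dict above (one per ASIN) is queued and later JSON-serialized into the html_json
# column of ai_asin_analyze_spider by save_data()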
print(item)
# a = None
# if result_list_json and module == 'Amazon:asin' and self.spider_state is None:
# is_sp_asin_state = None
# result_list_dict = json.loads(result_list_json)
# print(asin, '判断是否有竞品asin')
# for result_dict in result_list_dict:
# # Based on your recent shopping trends # Frequently purchased items with fast delivery
# # Customers who viewed this item also viewed # Brand in this category on Amazon
# sp_type = 'Based on your recent shopping trends'
# if result_dict.get(sp_type):
# print(asin, '找到有竞品asin。 数量:', len(result_dict[sp_type]))
# for i in result_dict[sp_type]:
# sp_asin = i + '|-|' + task_id + '|-|' + site_name + '|-|' + module
# self.sp_asin_queue.put(sp_asin)
# is_sp_asin_state = 111
# a = 1
# if is_sp_asin_state is None:
# print('没有找到竞品asin')
# self.item_queue.put(item)
# # self.save_data()
# # self.update_ai_asin_analyze_log([int(task_id)], '成功')
# a = 1
self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
# if self.spider_state == '竞品asin':
# self.item_queue.put(item)
# a = 1
#
# if module == 'Amazon:asinList':
# self.item_queue.put(item)
# a = 1
# if a is None:
# self.item_queue.put(item)
else:
print('asin 商品 异常')
def update_ai_asin_analyze_log(self, task_id_list, status):
if task_id_list:
task_id_list = list(set(task_id_list))
while True:
try:
with self.engine_pg.begin() as conn:
for task_id in task_id_list:
sql_update = f"""UPDATE ai_asin_analyze_log a set spider_status='{status}' where a.task_id = {task_id}"""
print('UPDATE_sql:', sql_update)
conn.execute(sql_update)
break
except Exception as e:
print('更新 ai_asin_analyze_log 失败', e)
time.sleep(5)
def save_data(self):
self.pg_connect()
items_data_list = []
update_time = int(time.time())
task_id_list = []
while True:
if self.item_queue.empty() == False:
items = self.item_queue.get()
unique_key = self.site_name + ':' + items['asin']
items_data_list.append(
[int(items['task_id']), items['url'], items['sub_step'], '爬取成功', items, update_time, unique_key])
task_id_list.append(int(items['task_id']))
else:
break
if task_id_list:
self.update_ai_asin_analyze_log(task_id_list, '成功')
while True:
try:
print('存储数据, 数量', len(items_data_list))
if items_data_list:
print(len(items_data_list))
df_asin_detail = pd.DataFrame(data=items_data_list,
columns=['task_id', 'url', 'sub_step', 'status', 'html_json',
'create_time',
'unique_key'])
df_asin_detail['html_json'] = df_asin_detail['html_json'].apply(
lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (dict, list)) else x
)
self.engine_pg.to_sql(df_asin_detail, 'ai_asin_analyze_spider', if_exists='append')
break
except Exception as e:
print('存储报错::', e)
time.sleep(10)
else:
print('save_data 存储数据, 数量', len(items_data_list))
def task(self):
result = 1 + 1
print("执行结果:", result)
def read_ai_asin(self):
time_ip_num = 0
while True:
try:
time_ip_num += 1
self.pg_connect()
for module in ['Amazon:asin','Amazon:asinList']:
if module == 'Amazon:asin':
# pass
sql = f"SELECT elem->>'boyris' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module='{module}' and spider_status='未开始' for update;"
else:
sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and spider_status='未开始' for update;"""
# sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and task_id=39 for update;"""
print(sql)
df_read = self.engine_pg.read_then_update(
select_sql=sql,
update_table='ai_asin_analyze_log',
set_values={"spider_status": '爬取中'}, # 把库存清零
where_keys=["task_id"], # WHERE sku = :sku
)
print(f'开始 {module} 任务:', sql)
if not df_read.empty:
if module == 'Amazon:asin':
_asin_list = ast.literal_eval(df_read['asin'][0])
asin_id_list = []
for _aisn in _asin_list:
asin_data_list = list(
_aisn + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + module)
asin_id_list.extend(asin_data_list)
else:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + module)
print(asin_id_list)
for asin_id in asin_id_list:
print(asin_id)
self.queries_asin_queue.put(asin_id)
html_thread = []
for i in range(5):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
for t2 in html_thread:
t2.join()
self.save_data()
time.sleep(5)
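# between 10:00 and 22:00, redial PPPoE via pppoe_ip() roughly every 60 polling rounds
# to rotate the outbound IP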
if 10 <= datetime.now().hour < 22:
if time_ip_num > 60:
pppoe_ip()
time_ip_num = 0
time.sleep(5)
# break
except Exception as e:
print('查询报错:', e)
# break
if __name__ == '__main__':
ai_async_asin_pg().read_ai_asin()
...@@ -3,17 +3,16 @@ import sys ...@@ -3,17 +3,16 @@ import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录 sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from secure_db_client import get_remote_engine from secure_db_client import get_remote_engine
import traceback
from curl_cffi import requests from curl_cffi import requests
from utils.db_connect import BaseUtils from utils.db_connect import BaseUtils
import re import re
from lxml import etree from lxml import etree
os.environ['NO_PROXY'] = 'amazon.com' os.environ['NO_PROXY'] = 'amazon.com'
import json import json
from urllib.parse import urlparse from urllib.parse import urlparse
import datetime
import time
class Amazon_reviewer(): class Amazon_reviewer():
...@@ -111,6 +110,35 @@ class Amazon_reviewer(): ...@@ -111,6 +110,35 @@ class Amazon_reviewer():
"review_data_img": review_img} "review_data_img": review_img}
print(items) print(items)
def pg_get_asin(self):
while True:
try:
print('轮询 pg 查询:', datetime.now().strftime("%m-%d %H:%M:%S"))
engine_pg = self.pg_connect()
spider_state_sql = """select asin,task_id from ai_asin_analyze_spider where status = '未开始' limit 20 """
print('spider_state_sql:', spider_state_sql)
df_asin = engine_pg.read_sql(spider_state_sql)
if not df_asin.empty:
update_time = int(time.time())
with engine_pg.begin() as conn:
index_tuple = tuple(df_asin['task_id'])
if len(index_tuple) == 1:
sql_update = f"""UPDATE ai_asin_analyze_spider a set status='爬取中',update_time='{update_time}' where a.task_id in ({index_tuple[0]})"""
else:
sql_update = f"""UPDATE ai_asin_analyze_spider a set status='爬取中',update_time='{update_time}' where a.task_id in {index_tuple}"""
print('UPDATE_sql:', sql_update)
conn.execute(sql_update)
_asin_lis = list(df_asin.asin + '|-|' + df_asin.task_id.astype("U"))
print("_asin_lis:::", _asin_lis, )
print("_asin_lis::: len ", len(_asin_lis))
run_spider(_asin_lis) # 传递asin 列表
time.sleep(3)
# break
except Exception as e:
print('查询 mysql_get_asin 报错::', e, f"\n{traceback.format_exc()}")
def run(self): def run(self):
self.redis_db() self.redis_db()
self.get_asin_reviewer() self.get_asin_reviewer()
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.secure_db_client import get_remote_engine
from amazon_params import py_ja3
from utils.asin_parse import ParseAsinUs
from amazon_spider.VPS_IP import pppoe_ip
from utils.requests_param import Requests_param_val
from queue import Queue
from lxml import etree
import requests
import urllib3
import pandas as pd
from datetime import datetime
import json
import threading
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
class ai_async_asin_pg():
def __init__(self, site_name='us'):
self.site_name = site_name
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.item_queue = Queue() # 存储 item 详情数据队列
self.pg_connect()
self.sp_asin_queue = Queue()
self.spider_state = None
self.update_ai_asin_analyze_log_list = []
month = time.strftime("%m")
day = time.strftime("%d")
if int(day)>10:
_month = int(month)
else:
if int(month) > 1:
_month = int(month) - 1 # 上个月
else:
_month = int(month)
if _month < 10:
_month = str(f'0{_month}')
self.topic_asin_html = f'asin_html_2025_{str(_month)}'
def get_params(self):
# 站点
self.reuests_para_val = Requests_param_val(site_name=self.site_name)
self.cookies_queue = Queue() # cookie队列
self.cookie_dict_delete_id = {}
# 返回 对应站点的host,首页链接
self.site_url, self.host = self.reuests_para_val.get_site_url(self.site_name)
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie(num=168)
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
def pg_connect(self):
self.engine_pg = get_remote_engine(
site_name=self.site_name, # -> database "selection"
db_type='postgresql_15_outer', # -> server-side alias for the outer PostgreSQL 15 instance
)
return self.engine_pg
def get_asin(self):
while True:
if self.queries_asin_queue.empty() == False and self.spider_state is None:
asin_queu = self.queries_asin_queue.get()
elif self.sp_asin_queue.empty() == False:
self.spider_state = '竞品asin'
print('执行竞品asin 抓取')
asin_queu = self.sp_asin_queue.get()
else:
break
# ['B09658Q5RP|-|82|-|us|-|6248', 'B0CSPVS7JL|-|82|-|us|-|6249']
print('::asin_queu::: ', asin_queu)
queu_list = asin_queu.split('|-|')
print('queu_list:::', queu_list)
asin = queu_list[0]
task_id = queu_list[1]
site_name = queu_list[2]
id_str = queu_list[3]
sub_step = queu_list[4]
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
self.cookie_dict_delete_id = cookies_dict
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
# 获取组装cookie
cookie_str = self.reuests_para_val.get_cookie_str(self.cookies_queue)
headers = self.reuests_para_val.requests_amazon_headers(host=self.host, site_url=self.site_url,
asin=asin, scraper_url=None)
headers["cookie"] = cookie_str
scraper_url = self.site_url + 'dp/' + asin + "?th=1&psc=1"
print('scraper_url::', scraper_url)
try:
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码,。asin---> ', asin)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
continue
except Exception as e:
print("请求错误错误: 。asin---> ", asin, '错误:', e)
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
continue
response_url = resp.url
response = resp.text
response_s = etree.HTML(response)
if self.reuests_para_val.check_amazon_not_page(response): # asin 已下架 状态 4 Listen Now
continue
if self.reuests_para_val.check_amazon_page(response, response_url): # 检查是不是正常商品页面
continue
if self.reuests_para_val.check_amazon_allow_redirects(response_url, asin): # 检查是否被重定向
continue
# 获取邮编
try:
ingress = response_s.xpath("//span[@id='glow-ingress-line2']/text()")
print(ingress, ' 打印 邮编 ', resp.url)
except Exception as e:
print('asin 不是正常页面', asin)
continue
try:
ingress = ingress[0].strip()
except:
ingress = None
if ingress:
if self.reuests_para_val.check_amazon_ingress(ingress):
if self.spider_state == '竞品asin':
self.sp_asin_queue.put(
asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
else:
self.queries_asin_queue.put(asin + '|-|' + task_id + '|-|' + site_name + '|-|' + id_str)
continue
div_dp = response_s.xpath('//div[@id="dp"]')
if div_dp:
items = ParseAsinUs(resp=response, asin=asin, site_name=self.site_name).xpath_html()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
item = {'asin': items["asin"], 'task_id': task_id, 'id': id_str,'sub_step':sub_step, 'url': scraper_url,
'title': items["title"], 'img_url': items["img_url"],
'rating': items["rating"], 'total_comments': items["total_comments"],
'price': items["price"], "rank": items["rank"], 'category': items["category"],
'launch_time': items["launch_time"], 'volume': items["volume"],
'weight': items["weight"], "page_inventory": items["page_inventory"],
"buy_box_seller_type": items["buy_box_seller_type"],
"asin_vartion_list": items["asin_vartion_list"], 'title_len': items["title_len"],
'img_num': items["img_num"], 'img_type': items["img_type"],
'activity_type': items["activity_type"],
'one_two_val': items["one_two_val"], 'three_four_val': items["three_four_val"],
'eight_val': items["eight_val"],
'qa_num': items["qa_num"], 'five_star': items["five_star"], 'four_star': items["four_star"],
'three_star': items["three_star"],
'two_star': items["two_star"], 'one_star': items["one_star"], 'low_star': items["low_star"],
'together_asin': items["together_asin"],
'brand': items["brand"], 'ac_name': items["ac_name"], 'material': items["material"],
'node_id': items["node_id"],
'sp_num': items["sp_num"], 'describe': items["describe"],
'weight_str': items["weight_str"], 'package_quantity': items['package_quantity'],
'pattern_name': items['pattern_name'], 'seller_id': items["seller_id"],
'variat_num': items['variat_num'],
'site_name': self.site_name, 'best_sellers_rank': items["best_sellers_rank"],
'best_sellers_herf': items["best_sellers_herf"], 'account_url': items["account_url"],
'account_name': items["account_name"], 'parentAsin': items["parentAsin"],
'asinUpdateTime': new_date, 'follow_sellers': items['sellers_num'],
'all_best_sellers_herf': items['all_best_sellers_herf'],
'product_description': items['product_description'], 'buy_sales': items['buySales'],
'image_view': items['image_view'], 'product_json': items['product_json'],
'product_detail_json': items['productdetail_json'],
'review_ai_text': items['review_ai_text'], 'review_label_json': items['review_label_json'],
'lob_asin_json': items['lob_asin_json'],
'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
'compare_similar_asin_json': items['compare_similar_asin_json'],
'customer_reviews_json': items['customer_reviews_json'],
'together_asin_json': items['together_asin_json'],
'min_match_asin_json': items['min_match_asin_json'], 'seller_json': items['seller_json'],
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'],
'review_json_list': items['review_json_list']
}
print(item)
self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
else:
print('asin 商品 异常')
def save_data(self):
self.pg_connect()
items_data_list = []
id_list = []
while True:
if self.item_queue.empty() == False:
items = self.item_queue.get()
items_data_list.append([int(items['task_id']), items['asin'], items['site_name'], '成功', items, int(items['id']),items['sub_step']])
id_list.append(int(items['id']))
else:
break
if items_data_list:
while True:
try:
self.pg_connect()
print('存储数据, 数量', len(items_data_list))
with self.engine_pg.begin() as conn:
if len(set(id_list)) == 1:
sql_delete = f"delete from ai_asin_analyze_spider where id in ({tuple(id_list)[0]});"
else:
sql_delete = f"delete from ai_asin_analyze_spider where id in {tuple(set(id_list))};"
print('删除:',sql_delete)
conn.execute(sql_delete)
print(len(items_data_list))
df_asin_detail = pd.DataFrame(data=items_data_list,
columns=['task_id', 'unique_key', 'site_name', 'status', 'html_json',
'id','sub_step'])
df_asin_detail['html_json'] = df_asin_detail['html_json'].apply(
lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, (dict, list)) else x
)
self.engine_pg.to_sql(df_asin_detail, 'ai_asin_analyze_spider', if_exists='append')
break
except Exception as e:
print('存储报错::', e)
self.pg_connect()
time.sleep(10)
def init_list(self):
print("=======清空变量==========")
self.asin_not_found_list = [] # 4
self.asin_not_sure_list = [] # 6
self.asin_not_foot_list = [] # 7
self.asin_not_foot2_list = [] # 8
self.asin_not_buyBox_list = [] # 9
self.asin_not_response_list = [] # 10
self.asin_not_redirect_list = [] # 12
self.asin_not_div_id_dp_list = [] # 13 返回html没有包含div @id=dp,状态13
self.requests_error_asin_list = [] # 1
self.asin_list_update = [] # 3
self.item_queue = Queue() # 存储 item 详情数据队列
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.buyBox_list = [] # 卖家名称 url 列表
self.asin_detail_list = [] # 存储asin 详情的列表
self.buyBoxname_asin_list = [] # asin 卖家的列表
self.delete_cookies_list = [] # 存储出现中国邮编的cookie
self.star_list = []
self.add_cart_asin_list = [] # 存储绑定购买的asin
self.asin_brand_list = []
self.bs_category_asin_list = []
self.bs_category_asin_list_pg = []
self.reuests_para_val.kafuka_producer_str.close(timeout=10)
self.asin_video_list = []
self.cookies_queue = Queue() # cookie队列
self.item_queue = Queue() # 存储 item 详情数据队列
self.queries_asin_queue = Queue() # 需要爬取的asin队列
self.buyBox_list = [] # 卖家名称 url 列表
self.asin_detail_list = [] # 存储asin 详情的列表
self.buyBoxname_asin_list = [] # asin 卖家的列表item
self.delete_cookies_list = [] # 存储出现中国邮编的cookie
self.cookie_dict_delete_id = {}
self.star_list = [] # 存储星级百分比
self.add_cart_asin_list = [] # 存储 绑定购买的asin
self.asin_brand_list = [] # 存储asin 对应 的品牌
self.bs_category_asin_list = [] # 存储 asin 详情 bsr 文本类目
self.bs_category_asin_list_pg = [] # 存储 asin 详情 bsr 文本类目
# 验证码 1
self.yzm_err_total_list = []
# 异常 2
self.asin_request_errp_total_list = []
# 成功 3
self.success_asin_total_list = []
# 每小时
self.hour_total_count_list = []
# 总请求 4
self.request_total_count_list = []
def run_ai_asin(self, asin_id_list):
self.get_params()
print(asin_id_list)
for asin_id in asin_id_list:
print(asin_id)
self.queries_asin_queue.put(asin_id)
html_thread = []
for i in range(5):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
for t2 in html_thread:
t2.join()
self.save_data()
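# init_list() below resets every per-batch queue/list (and closes the kafka producer, empties the
# cookie queue) so the next batch handed over by select_asin() starts from a clean state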
self.init_list()
def select_asin():
time_ip_num = 0
while True:
try:
time_ip_num += 1
for site in ['us', 'de', 'uk']:
select_sql = f"""select id, site_name, task_id, unique_key as asin,sub_step from ai_asin_analyze_spider where sub_step = 'AsinInfoRepository:详情' and status = '未开始' and site_name='{site}' order by task_id"""
print('select_sql::', select_sql)
engine_pg15 = ai_async_asin_pg(site_name='us').pg_connect()
df_read = engine_pg15.read_then_update(
select_sql=select_sql,
update_table='ai_asin_analyze_spider',
set_values={"status": '爬取中'}, # 把库存清零
where_keys=["id", "site_name"], # WHERE sku = :sku
)
if not df_read.empty:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + df_read.id.astype("U") + '|-|' + df_read.sub_step)
print(asin_id_list)
ai_async_asin_pg(site_name=site).run_ai_asin(asin_id_list)
time.sleep(5)
if 10 <= datetime.now().hour < 22:
if time_ip_num > 180:
pppoe_ip()
time_ip_num = 0
time.sleep(5)
except Exception as e:
print(e,2333333)
time.sleep(5)
if __name__ == '__main__':
select_asin()