Commit 3cd71f28 by Peng

no message

# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/../../../../../../:\Users\ASUS\Desktop\py_spider\.idea/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="4">
<item index="0" class="java.lang.String" itemvalue="httpx" />
<item index="1" class="java.lang.String" itemvalue="mysqlclient" />
<item index="2" class="java.lang.String" itemvalue="jsonpath" />
<item index="3" class="java.lang.String" itemvalue="mysql-connector-python" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (eve_Python)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/py_spider.iml" filepath="$PROJECT_DIR$/.idea/py_spider.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.7 (eve_Python)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
\ No newline at end of file
import uuid
import base64
import json
from datetime import timedelta
from flask import Flask, request, jsonify, g
from flask_bcrypt import Bcrypt
import redis
import pymysql
from pymysql.err import OperationalError
app = Flask(__name__)
# Keep non-ASCII characters (such as Chinese) when emitting JSON
app.config['JSON_AS_ASCII'] = False
DB_CONFIG = {
'host': '120.77.232.73',
'port': 3306,
'user': 'yswg_it_cangchu',
'password': 'Yswg@inv-cangchu241011420',
'db': 'inventory'
}
# REDIS_CONN = {
# "redis_host": "120.77.232.73",
# "redis_port": 6379,
# "redis_pwd": "yswgInventoryTest@202307#",
# "redis_db": 1
#
# }
REDIS_CONN = {
"redis_host": "113.100.143.162",
"redis_port": 6379,
"redis_pwd": "fG7#vT6kQ1pX",
"redis_db": 14
}
def mysql_db_conn():
connect_mysql_db = pymysql.connect(host=DB_CONFIG['host'], port=DB_CONFIG['port'],
user=DB_CONFIG['user'],
password=DB_CONFIG['password'], database=DB_CONFIG['db'],
charset="utf8mb4")
return connect_mysql_db
def redis_db_conn():
redis_client = redis.Redis(host=REDIS_CONN['redis_host'], port=REDIS_CONN['redis_port'],
password=REDIS_CONN['redis_pwd'], db=REDIS_CONN['redis_db'])
return redis_client
# Initialize Flask-Bcrypt
dbcrypt = Bcrypt(app)
# Generate a token from a UUID
def generate_token():
raw = uuid.uuid4().bytes
token = base64.urlsafe_b64encode(raw).rstrip(b'=').decode('utf-8')
return token
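# Illustrative note (not used by the service): uuid4 gives 16 random bytes, so
# the URL-safe base64 form above is always 22 characters once the '==' padding
# is stripped, e.g. 'kq3vX0s1QkO7mZ5cJ2w9aA'.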
# Shared helper: get the user info for the current request
def get_current_user():
"""
Return the current user info (dict) from flask.g, or None if it is missing.
"""
return getattr(g, 'current_user', None)
# Middleware: validate the token before every request (except login)
@app.before_request
def verify_token_middleware():
# The login endpoint itself does not require a token
if request.endpoint == 'login':
return None
token = request.headers.get('inventory-token')
print(' 从请求的 headers 获取 token: ', token)
if not token:
return jsonify({'code': 401, 'error': '缺少 inventory-token'})
redis_client = redis_db_conn()
user_json = redis_client.get(token)
redis_client.close()
print('根据 token 去redis查询 是否过期')
if not user_json:
return jsonify({'code': 401, 'error': '无效或已过期的 token'})
try:
g.current_user = json.loads(user_json)
except json.JSONDecodeError:
return jsonify({'code': 500, 'error': '解析信息失败'})
@app.route('/index', methods=['GET'])
def index_():
user = get_current_user()
print('打印请求用户的信息:', user)
return jsonify({
'message': f'欢迎,{user["name"]}!',
'user_id': user['id']
})
@app.route('/user/members/index', methods=['GET'])
def user_index():
user = get_current_user()
print('打印请求用户的信息:', user)
return jsonify({
'message': f'欢迎,{user["name"]}!',
'user_id': user['id']
})
@app.route('/login', methods=['POST'])
def login():
data = request.get_json()
username = data.get('username')
password = data.get('password')
print(username)
print(password)
if not username or not password:
return jsonify({"code": 400, 'error': '用户名和密码不能为空'})
# Look up the user in MySQL
try:
conn = mysql_db_conn()
# Use pymysql.cursors.DictCursor so rows come back as dicts
with conn.cursor(pymysql.cursors.DictCursor) as cursor:
sql = "SELECT `password`,id,name,email FROM users WHERE wechat_id=%s LIMIT 1"
print(sql)
cursor.execute(sql, (username,))
user = cursor.fetchone()
print('查询用户是否存在:', user)
except OperationalError:
# Database connection failed; check the configuration
return jsonify({"code": 500, 'error': '服务端 错误'})
finally:
try:
conn.close()
except:
pass
if user is None:
return jsonify({"code": 404, 'error': '用户不存在'})
# Verify the password
if not dbcrypt.check_password_hash(user['password'], password):
return jsonify({"code": 401, 'error': '密码 或 用户名 错误'})
# Generate a token and store it in Redis
token = generate_token()
user_obj = {
'id': user['id'],
'name': user['name'],
'email': user['email'],
}
print(token)
# Keep Chinese characters when serializing
redis_client = redis_db_conn()
redis_client.setex(token, timedelta(hours=23), json.dumps(user_obj, ensure_ascii=False))
redis_client.close()
return jsonify({"code": 200, 'token': token}), 200
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)
'''
Deployed on the h5 physical machine; exposes an API for Chao ge to recognize text in images.
'''
from paddleocr import PaddleOCR
import logging
from flask import Flask, request, jsonify
app = Flask(__name__)
logging.getLogger('ppocr').setLevel(logging.ERROR)
def OCR(language, url):
try:
# language = 'en'
# url = 'http://soundasia.oss-cn-shenzhen.aliyuncs.com/product_img/2024/12/05/EQ6368_1733363300_%E7%94%BB%E6%9D%BF%203.jpg'
# path = r'D:\新建文件夹\requests_files'
ocr = PaddleOCR(use_angle_cls=True, lang=language)
result = ocr.ocr(url, cls=True)
data_list = []
for idx in range(len(result)):
res = result[idx]
for line in res:
xy_dict = {}
xy_dict['a'] = {'x': line[0][0][0], 'y': line[0][0][1]}
xy_dict['b'] = {'x': line[0][1][0], 'y': line[0][1][1]}
xy_dict['c'] = {'x': line[0][2][0], 'y': line[0][2][1]}
xy_dict['d'] = {'x': line[0][3][0], 'y': line[0][3][1]}
xy_dict['value'] = line[1][0]
xy_dict['accuracy'] = round(line[1][1], 3)
data_list.append(xy_dict)
# data_json = json.dumps(data_list)
if data_list:
return data_list
else:
return None
# with open(rf'{path}', 'w', encoding='utf-8')as f:
# f.write(data_json)
except Exception as e:
print('识别错误。。', e)
return 'error'
@app.route('/cron/staff/toOCR', methods=['GET'])
def get_args():
language = request.args.get('language')
url = request.args.get('url')
if language and url and 'http' in url:
data_json = OCR(language, url)
if data_json == 'error':
items = {"code": 500, "message": "Something went wrong while downloading models", 'items': None}
elif data_json:
items = {"code": 200, "message": "success", 'items': data_json}
else:
items = {"code": 200, "message": "check Image Url", 'items': data_json}
else:
items = {"code": 400, "message": "Parameter error", 'items': None}
return jsonify(items)
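# Hedged usage sketch (not called anywhere; host and image URL are placeholders):
# the endpoint expects `language` and an http(s) `url` as query parameters and
# returns one dict per recognized line (corner points a-d, value, accuracy).
def _ocr_call_example():
    import requests  # local import so the sketch has no side effects
    resp = requests.get('http://127.0.0.1:10245/cron/staff/toOCR',
                        params={'language': 'en', 'url': 'http://example.com/sample.jpg'})
    print(resp.json())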
if __name__ == '__main__':
app.run('0.0.0.0', 10245)
import os
import sys
from sqlalchemy import text
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.db_connect import BaseUtils
from flask import Flask, request, jsonify
import pandas as pd
app = Flask(__name__)
def db_mysql_connect():
mysql_db = BaseUtils().mysql_connect()
return mysql_db
def select_workflow_progress(sql_select):
print('查询语句:',sql_select)
mysql_connect = db_mysql_connect()
df = pd.read_sql(sql_select, con=mysql_connect)
if not df.empty:
return list(df['status_val'])[0]
else:
return None
def update_workflow_progress(sql_update):
print('更新语句:',sql_update)
mysql_connect = db_mysql_connect()
stmt = text(sql_update)
with mysql_connect.begin() as conn:
conn.execute(stmt)
@app.route('/workflow_progress', methods=['POST'])
def get_args():
print(request.form) # print all form data
site_name = request.form.get('site_name')
sql = request.form.get('sql')
sql_type = request.form.get('sql_type')
if sql_type == 'select':
status_val = select_workflow_progress(sql)
items = {"code": 200, "message": "select success", 'status_val': status_val}
elif sql_type == 'update':
update_workflow_progress(sql)
items = {"code": 200, "message": "update success", 'status_val': 1}
else:
items = {"code": 400, "message": "Parameter error"}
return jsonify(items)
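# Hedged usage sketch (not called anywhere; host and SQL are placeholders): the
# endpoint reads form fields, not JSON, and dispatches on sql_type.
def _workflow_progress_example():
    import requests  # local import so the sketch has no side effects
    form = {
        'site_name': 'us',
        'sql_type': 'select',
        'sql': 'select status_val from workflow_progress limit 1',
    }
    resp = requests.post('http://127.0.0.1:10249/workflow_progress', data=form)
    print(resp.json())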
if __name__ == '__main__':
app.run('0.0.0.0', 10249)
import curl_cffi
#
# url = 'https://d2ogs1k0ty8acr.cloudfront.net/sales?rank=24&category=Electronics&store=us'
# raw_headers = """
# authority:d2ogs1k0ty8acr.cloudfront.net
# method:GET
# path:/sales?rank=24&category=Electronics&store=us
# scheme:https
# Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7
# Accept-Encoding:gzip, deflate, br, zstd
# Accept-Language:zh-CN,zh-TW;q=0.9,zh;q=0.8
# Cache-Control:no-cache
# Cookie:datadome=I2Xd7hJbOOYOWoCj9PbMVSNufWb42ywpgwdj7RI9kREb6k0v7aCy~kDSo__8k5lh7T3jth9fNBL7k5n3r9Yy8hV38dJOsAxfhobwa9UDqvLCJqXGIwf9Cqu6LCJHtXhd
# Pragma:no-cache
# Sec-Ch-Device-Memory:8
# Sec-Ch-Ua:"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"
# Sec-Ch-Ua-Arch:"x86"
# Sec-Ch-Ua-Full-Version-List:"Not_A Brand";v="8.0.0.0", "Chromium";v="120.0.6099.216", "Google Chrome";v="120.0.6099.216"
# Sec-Ch-Ua-Mobile:?0
# Sec-Ch-Ua-Model:""
# Sec-Ch-Ua-Platform:"Windows"
# Sec-Fetch-Dest:document
# Sec-Fetch-Mode:navigate
# Sec-Fetch-Site:none
# Sec-Fetch-User:?1
# Upgrade-Insecure-Requests:1
# User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36
# """
#
# # 转换为 dict
# headers = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'Accept-Encoding': 'gzip, deflate, br, zstd', 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
# 'Cache-Control': 'no-cache',
# 'Pragma': 'no-cache',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
#
# # 打印结果
# print(headers)
# print(url)
# resp = curl_cffi.get(url, headers=headers, impersonate="chrome")
# print(resp.text)
# # url = 'http://192.168.0.113/login'
# # data = {'username':"pengyanbing","password":'15112376559'}
# # resp = requests.post(url,json=data)
# # print(resp.json())
#
# # url = 'http://192.168.2.28:5000/user/members/index'
# # headers={
# # 'inventory-token':"uPaMtIvnQmu5ZT3PvsKJQg"
# # }
# # resp = requests.get(url,headers=headers)
# # print(resp.json())
#
#
# # url = 'http://192.168.0.113'
# # headers = {
# # 'inventory-token': "-6XaaX9iQre-wF1F056OmQ"
# # }
# # resp = requests.get(url, headers=headers)
# # print(resp.text)
# # print(resp.status_code)
# # data = {'username':"pengyanbing","password":'15112376559'}
# # resp = requests.post(url,json=data)
# # print(resp.json())
import requests
headers = {
"Referer":"https://www.sellersprite.com/v2/tools/sales-estimator",
"Origin":"https://www.sellersprite.com",
"Accept":"application/json, text/javascript, */*; q=0.01",
"Accept-Encoding":"gzip, deflate, br, zstd",
"Accept-Language":"zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control":"no-cache",
'Cookie':'ecookie=Q5geIEdVjaBAFtuN_CN; _ga=GA1.1.24257081.1704176282; MEIQIA_TRACK_ID=2BvHiTIbRbKCGdu4mZU9P4u1xzI; MEIQIA_VISIT_ID=2vOZXWYZ1ELMqt7Myhc0FyMlff2; 242a48f291ecd6f60332=d29147f2a3b4af73e978691a1045ab80; current_guest=bXPLbIar9Oez_250519-188489; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1747649492,1747708080; HMACCOUNT=38079C97791291E4; _fp=65dbbe41a37f8f9fbe702eba96328267; _gcl_au=1.1.487208183.1744018086.1200924220.1747708256.1747708255; rank-guest-user=4585677471mscMIvqmsIKuDCd6Kk4qriCTUW/dsNaMOv1Wq60RTVlOh4JrbdHK8tpXYGZtnJcT; rank-login-user=4585677471mscMIvqmsIKuDCd6Kk4qrsdASXjkEtoV+LcVpQ7P3gFN62n1Lptd2QCYukHNHXPR; rank-login-user-info="eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjQ1ODU2Nzc0NzFtc2NNSXZxbXNJS3VEQ2Q2S2s0cXJzZEFTWGprRXRvVitMY1ZwUTdQM2dGTjYybjFMcHRkMlFDWXVrSE5IWFBSIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJzTlBhZl84YTl3VzNmVmg3YU5zOWpBIiwiaWF0IjoxNzQ3NzA4MjU0LCJleHAiOjE3NDc3OTQ2NTQsIm5iZiI6MTc0NzcwODE5NCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.g-eTs0HWdNlmPVb8xDOl1cYOYg80ChXnEc1o5v2d3FY4W8jL5O_LuEf9anR98X23pNUycv88TlaEByxQdYQs2XxEnPcwRBbgDf41pyi0iQvaR9vrpgW-hQsC71XAyfkwOwc5IwTLIthyOxOnlkFtfSCpxqBiOYoStYFAiWeY6qaurYPyANtIgAFaNR7uY6XqakEFJgMvUSoMRlkX6ALNa2gz-OcHZx-Qm4HKpvrgL2OweHsK7A4NaVa6JB69AlVDEND2ZVF6uGt0atU-WQZdMkMqHmzvPNg3GoWrzY3bh4QkWfTwZwNOSmCAoZHNxIdMRwSm29P-Vh3-7vV8VCHwxg; ao_lo_to_n="4585677471mscMIvqmsIKuDCd6Kk4qrmyq1MPFtV4nWaYtgg8lvaAxNW0YMtfNTVxZ5ufIPleIBLuqMQPNii6zzLnEerwJ0yhaH8xWXO7wiSC3X5OM3Kc="; _gaf_fp=c4ea36bde61bd39fcc4758118ac26b47; _ga_38NCVF2XST=GS2.1.s1747708079$o13$g1$t1747708318$j50$l0$h1030740079; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1747708319; _ga_CN0F80S6GL=GS2.1.s1747708080$o21$g1$t1747708319$j0$l0$h0; JSESSIONID=9108499B42608BD47FEF2BEFFAA63B6C',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": "US",
"cid": "2617941011",
"bsr": "200"
}
response = curl_cffi.post(url, headers=headers,data=data,impersonate="chrome")
print(response.text)
print(response)
\ No newline at end of file
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
import requests
import pymysql
from utils.requests_param import Requests_param_val
from amazon_params.params import DB_CONN_DICT, PG_CONN_DICT
from queue import Queue
import time
import threading
import random
"""谷歌翻译"""
class Google_translates():
def __init__(self,site_name):
self.site_name = site_name
self.init_db(site_name)
def init_db(self,site_name):
self.site_name = site_name
if site_name == 'us':
self.connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection", charset="utf8mb4")
else:
self.connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site_name,
charset="utf8mb4")
self.cursor = self.connect.cursor()
self.reuests_para_val = Requests_param_val(site_name=self.site_name, spider="seller_account_product")
self.name_id_queue = Queue()
self.insert_list = []
def translate(self,fromLang="auto", toLang="zh-CN"):
# https://translate.googleapis.com/translate_a/single?client=gtx&sl=auto&tl=zh-CN&dt=t&q=Carretillas
while True:
if self.name_id_queue.empty() == False:
querys = self.name_id_queue.get()
print(querys)
print(f" id:", querys[0], " 分类:", querys[1])
while True:
try:
headers = {}
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(50, 104)}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
headers['user-agent'] = ua
url = 'https://translate.googleapis.com/translate_a/single?client=gtx&sl=' + fromLang + '&tl=' + toLang + '&dt=t&q=' + querys[1].replace(' ', '%20').replace('&', '%26')
print(url)
data = requests.get(url,timeout=15,headers=headers).json()
print(data)
if data[0][0][0]:
self.insert_list.append((querys[0],data[0][0][0].strip()))
break
except Exception as e:
if "'NoneType' object is not subscriptable" in str(e):
break
else:
print("google API翻译接口异常:",e,url)
time.sleep(random.uniform(2, 10))
continue
else:
break
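# Hedged alternative (not wired in): urllib.parse.quote encodes more characters
# than the manual replace of spaces and '&' above, e.g.
#   from urllib.parse import quote
#   url = ('https://translate.googleapis.com/translate_a/single'
#          '?client=gtx&sl=' + fromLang + '&tl=' + toLang + '&dt=t&q=' + quote(querys[1]))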
def db_read_data(self, i):
while True:
try:
self.connect.ping(reconnect=True)
print(f"当前请求 类目 {i}")
sql_1 = f'select id,en_name from {self.site_name}_bs_category where nodes_num={i} and ch_name is NULL'
print("获取请求 类目sql_1:", sql_1)
time.sleep(3)
self.cursor.execute(sql_1)
name_1_list = self.cursor.fetchall()
return name_1_list
except Exception as e:
print("db_read_data 报错:", e)
time.sleep(10)
self.init_db(self.site_name)
continue
def get_all_read_data(self):
while True:
try:
self.connect.ping(reconnect=True)
sql_1 = f'select id,en_name from {self.site_name}_bs_category where ch_name is NULL'
print("获取请求 类目sql_1:", sql_1)
time.sleep(3)
self.cursor.execute(sql_1)
name_1_list = self.cursor.fetchall()
return name_1_list
except Exception as e:
print("db_read_data 报错:", e)
time.sleep(10)
self.init_db(self.site_name)
continue
def init_list(self):
self.insert_list = []
self.name_id_queue = Queue()
def save_date(self):
for name_id in self.insert_list:
while True:
try:
self.connect.ping(reconnect=True)
update_sql_ch_name = f'UPDATE {self.site_name}_bs_category set ch_name=%s WHERE id=%s'
print("update:", update_sql_ch_name, name_id)
self.cursor.execute(update_sql_ch_name, (name_id[1], name_id[0]))
self.connect.commit()
break
except Exception as e:
print(e)
time.sleep(10)
self.init_db(self.site_name)
continue
def run(self):
num = 1
while True:
print(num)
time.sleep(3)
name_1_list = self.db_read_data(num)
if name_1_list and num < 15:
for name_1 in name_1_list:
self.name_id_queue.put(name_1)
html_thread = []
for i in range(20):
thread2 = threading.Thread(target=self.translate)
html_thread.append(thread2)
for ti in html_thread:
ti.start()
time.sleep(random.uniform(2, 5))
for t2 in html_thread:
t2.join()
print("self.insert_list: ",self.insert_list)
print(f"当前 {num} 级分类 全部翻译完毕", len(self.insert_list))
num += 1
else:
break
self.save_date()
self.init_list()
def run_start(self):
name_1_list = self.get_all_read_data()
if name_1_list:
for name_1 in name_1_list:
self.name_id_queue.put(name_1)
html_thread = []
for i in range(20):
thread2 = threading.Thread(target=self.translate)
html_thread.append(thread2)
for ti in html_thread:
ti.start()
time.sleep(random.uniform(2, 5))
for t2 in html_thread:
t2.join()
print("self.insert_list: ", self.insert_list)
print(f"当前 级分类 全部翻译完毕", len(self.insert_list))
self.save_date()
self.init_list()
if __name__ == '__main__':
spider_us = Google_translates(site_name='us')
spider_us.run_start() # translate every category name that is still untranslated
spider_uk = Google_translates(site_name='uk')
spider_uk.run_start()
spider_fr = Google_translates(site_name='fr')
spider_fr.run_start()
spider_es = Google_translates(site_name='es')
spider_es.run_start()
spider_it = Google_translates(site_name='it')
spider_it.run_start()
spider_de = Google_translates(site_name='de')
spider_de.run_start()
\ No newline at end of file
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.db_connect import BaseUtils
import time
import pandas as pd
"""bsr分类 拆分,。"""
class BsCateDetail(BaseUtils):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.engine = self.mysql_connect()
self.df_cate = pd.DataFrame()
self.df_std = pd.DataFrame()
self.df_std_save = pd.DataFrame()
def db_read_data(self):
sql = f"select * from {self.site_name}_bs_category;"
self.df_cate = pd.read_sql(sql, con=self.engine)
self.df_cate['cate_current_node'] = self.df_cate.nodes_num - 1
print(self.df_cate.shape)
def handle_data(self):
node_list = list(set(self.df_cate.cate_current_node))
node_list.remove(0)
print("node_list:", node_list)
cols_id_list = []
for node in node_list:
cols_id_list.extend((f"cate_{node}_id", f"cate_{node}_en_name"))
df_node = self.df_cate.loc[self.df_cate.cate_current_node == node, ['id', 'p_id', 'en_name']]
df_node.rename(
columns={"id": f"cate_{node}_id", "p_id": f"cate_{node}_p_id", "en_name": f"cate_{node}_en_name"}, inplace=True)
if node == 1:
self.df_std = df_node
else:
self.df_std = self.df_std.merge(df_node, left_on=f"cate_{node - 1}_id", right_on=f"cate_{node}_p_id", how='left')
print(node, df_node.shape)
self.df_std_save = self.df_std.loc[:, cols_id_list]
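# Hedged illustration of the merge above (toy values, not real data): each
# level-n frame joins onto level n-1 via cate_n_p_id == cate_(n-1)_id, so a
# two-level tree collapses into one wide row per path, e.g.
#   cate_1_id  cate_1_en_name  cate_2_id  cate_2_en_name
#   1          Electronics     11         Accessories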
def db_save_data(self):
with self.engine.begin() as conn:
sql_truncate = f"truncate {self.site_name}_bs_category_detail;"
conn.execute(sql_truncate)
self.df_std_save.to_sql(f"{self.site_name}_bs_category_detail", con=self.engine, if_exists="append", index=False)
def run(self):
while True:
try:
self.db_read_data()
self.handle_data()
self.db_save_data()
break
except Exception as e:
self.mysql_reconnect(table_name=f"{self.site_name}_bs_category", e=e)
continue
if __name__ == '__main__':
for site_name in ['us', 'uk', 'de', 'es', 'fr', 'it']:
s_time = time.time()
print("site_name:", site_name)
handle_obj = BsCateDetail(site_name=site_name)
handle_obj.run()
print("程序总耗时:", time.time() - s_time)
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from utils.db_connect import BaseUtils
from amazon_params.params import DB_CONN_DICT
import math
import pandas as pd
import time
import pymysql
import requests
"""计算销量,均值差"""
class CalculateMean(BaseUtils):
def __init__(self, site_name='us', nums_start=0, nums_step=10000000, year=2025, week=1, flag_uniformity=False):
super().__init__()
self.site_name = site_name
self.engine = self.mysql_connect()
self.nums_start = nums_start # LIMIT start offset
self.nums_step = nums_step # LIMIT step size
self.year = year
self.week = week
self.year_month = ''
self.df_sum = pd.DataFrame()
self.df_repeat = pd.DataFrame()
self.df_repeat_list = []
self.cate_list = []
def send_mes(self, site_name):
month = time.strftime("%m")
year = time.strftime("%Y")
_year_month = f'{year}_{int(month)}'
with self.engine.begin() as conn:
update_sql_state = f"""
UPDATE {site_name}_one_category
SET STATE=4
WHERE `name` IN (
SELECT `name` FROM (
SELECT `name` FROM {site_name}_one_category WHERE rank=1 AND orders=0 AND `year_month`="{_year_month}"
) AS temp_table
);
"""
print('update_sql_state:',update_sql_state)
conn.execute(update_sql_state)
def db_read_data(self):
month = time.strftime("%m")
year = time.strftime("%Y")
_year_month = f'{year}_{int(month)}'
print(_year_month)
print(f"读取 {self.site_name}_one_category")
sql = f"select * from {self.site_name}_one_category where state!=4 and name = 'Health & Household' and `year_month`='{_year_month}';"
print('查询原始表:', sql)
self.df_sum = pd.read_sql(sql, con=self.engine)
# # sort
self.df_sum.sort_values(by=['name', 'rank'], inplace=True)
print("self.df_sum.shape1:", self.df_sum.shape)
# # drop duplicate rows, keep the last one
self.df_sum.drop_duplicates(['name', 'rank'], keep='last', inplace=True)
print("self.df_sum.shape2:", self.df_sum.shape)
print(self.df_sum.values.tolist())
self.cate_list = list(set(self.df_sum.name))
sql_select = f"SELECT `year_month` from selection.week_20_to_30 WHERE `week`={int(self.week)} and `year`={self.year}"
print(sql_select, 'sql_select:')
df = pd.read_sql(sql_select, con=self.engine)
self.year_month = list(df['year_month'])[0] if list(df['year_month']) else ''
print("self.year_month:", self.year_month)
def handle_data(self):
for cate_name in self.cate_list:
index_ = self.cate_list.index(cate_name)
df = self.df_sum.loc[self.df_sum.name == cate_name]
df['rate'] = 1
data_list = []
rank_list = list(df['rank'])
orders_list = list(df['orders'])
rate_list = list(df['rate'])
for rank, orders, rate in zip(rank_list, orders_list, rate_list):
print("index:", index_, cate_name, rank, orders, rate)
if orders==0:
break
index = rank_list.index(rank)
if index + 1 < len(rank_list):
rank_next = rank_list[index + 1]
orders_next = orders_list[index + 1]
rate_next = rate_list[index + 1]
rank_diff = rank_next - rank
orders_diff = orders - orders_next
rate_diff = rate - rate_next
orders_avg = orders_diff / rank_diff
rate_avg = rate_diff / rank_diff
rank_range_list = list(range(rank, rank_next, 1))
for rank_range in rank_range_list:
index_range = rank_range_list.index(rank_range)
s = orders - orders_avg * index_range
r = rate - rate_avg * index_range
data_list.append([rank_range, s, r])
else:
data_list.append([rank, orders, rate])
print("data_list:", len(data_list))
df_repeat = pd.DataFrame(data_list, columns=['rank', 'orders', 'rate'])
df_repeat['name'] = cate_name
print(df_repeat.head(10))
self.df_repeat_list.append(df_repeat)
time.sleep(4444)
self.df_repeat = pd.concat(self.df_repeat_list)
self.df_repeat.drop(columns=['rate'], inplace=True)
self.df_repeat['orders_day'] = self.df_repeat.orders.apply(lambda x: math.ceil(x / 30))
self.df_repeat['year_month'] = self.year_month
self.df_repeat['week'] = self.week
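# Hedged worked example of the interpolation above (numbers are made up): with
# rank 1 -> 1000 orders and the next anchor rank 4 -> 700 orders, rank_diff=3,
# orders_diff=300 and orders_avg=100, so ranks 1, 2, 3 are filled with
# 1000, 900, 800 before the loop moves on to the rank-4 anchor.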
def db_save_data(self):
with self.engine.begin() as conn:
print(f"delete from {self.site_name}_one_category_report_pyb where `year_month`='{self.year_month}';")
conn.execute(f"delete from {self.site_name}_one_category_report_pyb where `year_month`='{self.year_month}';")
# sql = f'select en_name as name,category_id from {self.site_name}_bs_category WHERE nodes_num =2 and delete_time is NULL'
sql = f"select en_name as name, category_id from {self.site_name}_bs_category where 1 = 1 and nodes_num = 2 group by en_name, category_id"
df_en_name = pd.read_sql(sql, con=self.engine)
# use merge to match the two frames on the name column
self.df_repeat = pd.merge(self.df_repeat, df_en_name, on='name', how='left')
self.df_repeat = self.df_repeat.loc[self.df_repeat.orders >= 1] # keep ranks whose monthly orders are greater than 0
self.df_repeat.to_sql(f"{self.site_name}_one_category_report_pyb", con=self.engine, if_exists="append", index=False)
def run(self):
self.send_mes(self.site_name)
self.db_read_data()
self.handle_data()
self.db_save_data()
def sendMessage(self, week, site_name):
db = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'],
user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'],
database='selection', charset="utf8mb4")
cursor = db.cursor(cursor=pymysql.cursors.DictCursor)
time_strftime = time.strftime("%Y-%m-%d %X", time.localtime())
update_workflow_progress = f"update workflow_progress set status_val=3,status='抓取结束' where page='ASIN销量' and date_info='2025-{week}' and site_name='{site_name}' and date_type='week'"
print(update_workflow_progress)
cursor.execute(update_workflow_progress)
db.commit()
cursor.close()
db.close()
url = 'http://47.112.96.71:8082/selection/sendMessage'
data = {
'account': 'pengyanbing,fangxingjun,wangrui4',
'title': f"{site_name} 站点类目销量统计",
'content': str(self.week) + f' 周 {site_name}站点类目销量计算 已结束,请确认下一步流程!时间:' + time_strftime
}
try:
requests.post(url=url, data=data, timeout=15)
except:
pass
if __name__ == '__main__':
week = time.strftime("%W")
# week = '04'
print("week 周:", week)
time.sleep(2)
handle_obj_us = CalculateMean(site_name='us', year=2025, week=week)
handle_obj_us.run()
# handle_obj_us.sendMessage(week, site_name='us')
# handle_obj_uk = CalculateMean(site_name='uk', year=2025, week=week)
# handle_obj_uk.run()
# handle_obj_uk.sendMessage(week, site_name='uk')
# handle_obj_de = CalculateMean(site_name='de', year=2025, week=week)
# handle_obj_de.run()
# handle_obj_de.sendMessage(week, site_name='de')
# handle_obj_fr = CalculateMean(site_name='fr', year=2025, week=week)
# handle_obj_fr.run()
# handle_obj_fr.sendMessage(week, site_name='fr')
# handle_obj_es = CalculateMean(site_name='es', year=2025, week=week)
# handle_obj_es.run()
# handle_obj_es.sendMessage(week, site_name='es')
# handle_obj_it = CalculateMean(site_name='it', year=2025, week=week)
# handle_obj_it.run()
# handle_obj_it.sendMessage(week, site_name='it')
from curl_cffi import requests
import json
class dow_shutterstock_img():
def __init__(self):
self.Cookie = 'n_v=35c0dc14612; locale=zh; NEXT_LOCALE=zh; stck_anonymous_id=472cdbf8-94ab-4400-bc2e-1282144cd384; sstk_anonymous_id=472cdbf8-94ab-4400-bc2e-1282144cd384; ssnext=true; hl=zh; sstk_session_start=2024-07-25T01%3A18%3A40.151Z; stck_session_id=0448368f-52a7-4e28-a347-ab0f05ac3aaa; sstk_session_id=0448368f-52a7-4e28-a347-ab0f05ac3aaa; visit_id=81138438424; visitor_id=73374677017; htjs_anonymous_id=472cdbf8-94ab-4400-bc2e-1282144cd384; OptanonCachedGroups=,C0001,C0003,C0002,C0005,C0004,C0007,; gtm_monit_roll=35; _gcl_au=1.1.2007568107.1721870326; _ga=GA1.1.167863445.1721870327; __rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%225zW82mi1vrTw1mhmBI1x%22%7D; FPID=FPID2.2.jbbGJsFa%2F4QZTUcoZS3oBakRd4Dy31w%2BcrLWua3ga58%3D.1721870327; FPLC=rZnqDf%2Bn%2BS52gDBS2ipJg%2BAWydfKAh%2B9delXhynY3fwgGwb1bpQqEorUA7kHpdBKH5bONHU5lKe2ZUWSO2vga3C5IRiisVuQEKJf2G7LQX5PDSEpmwe3aduB%2FPp4tA%3D%3D; __ssid=9124e091e2c0f8a255d533447d3edf8; slireg=https://scout.us1.salesloft.com; sliguid=3b796208-9a66-40cc-91c6-9144ef0f91d0; slirequested=true; _4c_=%7B%22_4c_mc_%22%3A%22af3661e3-cfe6-459a-bc7a-efefde831641%22%7D; extole_access_token=CQ6KFSOGH1TV2OF5Q09VSV4DNO; did=50c367d8-bf47-462d-b543-ff94bda0d3f7; accts_customer=daiting55; accts_customer_sso1=331232635-undefined; next.sid=s%3A4Hs9k_fNrFzMq_FEPYNsShMtQWMbkjMT.EMdAfR%2F4H6hJNs662VC98Nn69ekJZV9yYwam00S5frk; Authorization=1%2FeyJjbGllbnRfaWQiOiJhMGI3Ni1hN2Y1ZS1mZWRlYy1iYmViZS1mYTY1Yi04NTcxOSIsInJlYWxtIjoiY3VzdG9tZXIiLCJzY29wZSI6InVzZXIudmlldyB1c2VyLmVtYWlsIHVzZXIuYWRkcmVzcyB1c2VyLmVkaXQgb3JnYW5pemF0aW9uLnZpZXcgb3JnYW5pemF0aW9uLmFkZHJlc3MgY29sbGVjdGlvbnMudmlldyBjb2xsZWN0aW9ucy5lZGl0IGxpY2Vuc2VzLnZpZXcgbGljZW5zZXMuY3JlYXRlIG1lZGlhLnVwbG9hZCBtZWRpYS5zdWJtaXQgbWVkaWEuZWRpdCBwdXJjaGFzZXMudmlldyBwdXJjaGFzZXMuY3JlYXRlIiwidXR2IjoiOGJSVSIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJ1c2VybmFtZSI6ImRhaXRpbmc1NSIsInVzZXJfaWQiOjMzMTIzMjYzNSwib3JnYW5pemF0aW9uX2lkIjpudWxsLCJwYXJlbnRfb3JnYW5pemF0aW9uX2lkcyI6W10sImN1c3RvbWVyX2lkIjozMzEyMzI2MzUsImV4cCI6MTcyMTg3NDYxM30.GTyqg0nXBm4LFrxY9OsIOQnVKAAi-XjO4XzTP8f3hGTI-IYqI6csxzbCzrwjhUpVaYfjqBe2xRn6f35NRzoZDg; htjs_user_id=331232635; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22472cdbf8-94ab-4400-bc2e-1282144cd384%22%7D; search-component-cookie=pagination; downlink=mid; OptanonConsent=isGpcEnabled=0&datestamp=Thu+Jul+25+2024+09%3A44%3A43+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202403.2.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=f9ee0de2-d442-4486-a0b8-fc7418338d88&interactionCount=1&isAnonUser=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0005%3A1%2CC0004%3A1%2CC0007%3A1&AwaitingReconsent=false; _ga_5JRYE4Y8J9=GS1.1.1721870327.1.1.1721871885.60.0.0; _ga_SSGTMSSTK=GS1.1.1721870327.1.1.1721871885.0.0.593396081; _uetsid=d96bba304a2311ef91b8b97ed1dee9b6|1atmgbl|2|fnr|0|1667; _uetvid=d96bbea04a2311ef898be93aba9723f6|tngh3d|1721871886172|7|1|bat.bing.com/p/insights/c/l; datadome=7tnXqyo18T3wrY9BkHxkVF_5Bk39NamIoV9SzIO~iHwDJU2fnbgpuH65b2FEeMW~kI3is7vvxi~zSIG_fIL~9JG_d9Hge9StDOHiRJtzre~8TZpTB0j7tMUg~ZM2c7LH',
def get_imgid_data_index(self):
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Cookie': self.Cookie,
'Pragma': 'no-cache',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
url = 'https://www.shutterstock.com/api/s/dam/holdings/search?include=media-item%2Cmedia-item.track-assets&sort=-licensedAt&useMms=True&channel=shutterstock&page[size]=50&page[number]=1&filter[assetStatus]=comped%2Clicensed&language=zh'
resp = requests.get(url, headers=headers)
# resp_dict = resp.json()
# totalPages = resp_dict['meta']['pagination']['totalPages'] # total number of pages
# totalRecords = resp_dict['meta']['pagination']['totalRecords'] # total number of images
# size = resp_dict['meta']['pagination']['size'] # number of images returned per page
print(resp.text)
# get_imgid_data(int(totalPages))
def get_imgid_data(self, totalPages):
for i in range(2, totalPages + 1):
next_page_url = f'https://www.shutterstock.com/api/s/dam/holdings/search?include=media-item%2Cmedia-item.track-assets&sort=-licensedAt&useMms=True&channel=shutterstock&page[size]=50&page[number]={i}&filter[assetStatus]=comped%2Clicensed&language=zh'
print(next_page_url)
def dow_image_eps(self, content_id):
headers = {
"Host": "www.shutterstock.com",
'Cookie': self.Cookie,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Content-Type": "application/json",
"Accept": "application/json",
"Origin": "https://www.shutterstock.com",
"Referer": "https://www.shutterstock.com/zh/catalog/licenses",
"Accept-Language": "zh-CN,zh;q=0.9",
}
url = "https://www.shutterstock.com/napi/licensees/current/redownload"
params = {
"v": "35c0dc14612"
}
data = {"required_cookies": "", "content": [
{"content_id": content_id, "content_type": "photo", "content_size": "vector", "content_format": "eps",
"license_name": "standard", "show_modal": True}]}
data = json.dumps(data, separators=(',', ':'))
response = requests.post(url, headers=headers, params=params, data=data, verify=False)
print(response.json())
meta_items = response.json()
downloadUrl = meta_items['meta']['licensedContent'][0]['downloadUrl']
resp = requests.get(downloadUrl)
with open(rf'C:\Users\ASUS\Downloads\{content_id}.eps', 'wb')as f:
f.write(resp.content)
# The response returns the download link, for example:
# {"meta":{"licensedContent":[{"licenseName":"standard","contentFormat":"eps","contentId":"2254280157","contentSize":"vector","contentType":"photo","downloadUrl":"https://download.shutterstock.com/gatekeeper/W3siZCI6ICJzaHV0dGVyc3RvY2stbWVkaWEiLCAiayI6ICJwaG90by8yMjU0MjgwMTU3L3ZlY3Rvci5lcHMiLCAiZGMiOiAiaWRsXzEyMyIsICJlIjogMTcyMjM1NDcwMywgIm0iOiAxfSwgImQzTzh6QlFzNDNiaHlEcmFRcUNJek5lRUNjRSJd/shutterstock_2254280157.eps","isRedownload":true,"contentLicenseId":"e1f8df6ea1080f070e19283dcb6f1cc97b","downloadId":"deprecated"}]}}
def select_totalRecords(self):
"""
Look up the total from the previous download; (current total - previous total) / 50 gives the number of pages that still need downloading.
"""
if __name__ == '__main__':
# get_imgid_data(302)
dow_shutterstock_img().get_imgid_data_index()
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from amazon_params import py_ja3
# import requests
from curl_cffi import requests,Curl
from sqlalchemy import create_engine
import pandas as pd
from queue import Queue
import threading
import time
import random
import os
from amazon_params.params import DB_CONN_DICT, PG_CONN_DICT
import urllib3
import re
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from utils.requests_param import Requests_param_val
# sess = requests.Session()
import traceback
class Amazon_Img():
def __init__(self, site_name):
self.site_name = site_name
self.mysql_reconnect()
self.asin_img_queue = Queue()
self.asin_state_list = []
self.asin_not_find = []
self.asin_imgurl_null = []
self.reuests_para_val = Requests_param_val(site_name=self.site_name, spider="seller_account_product")
def mysql_reconnect(self):
"""
Connection of mysql.
"""
nums = 0
while True:
nums += 1
try:
if self.site_name == 'us':
db = 'selection'
else:
db = f'selection_{self.site_name}'
self.engine_pg = create_engine(
f'mysql+pymysql://{DB_CONN_DICT["mysql_user"]}:' + f'{DB_CONN_DICT["mysql_pwd"]}@{DB_CONN_DICT["mysql_host"]}:{DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4') # , pool_recycle=3600
break
except Exception as e:
print("error_mysql_connect:", e, f"\n{traceback.format_exc()}")
time.sleep(nums * 20)
continue
def downlad_img(self):
while True:
if self.asin_img_queue.empty() == False:
querys = self.asin_img_queue.get()
query = querys.split('|-|')
print('请求:::', query)
asin = query[0]
asin_img_path = 'amazon'
img_url = query[1]
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
}
if img_url == 'null' or img_url == 'None' or img_url == '':
self.asin_imgurl_null.append(asin)
continue
try:
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 114)}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
headers['user-agent'] = ua
# sess.mount("'https://m.media-amazon.com", py_ja3.DESAdapter())
item_proxy8 = {'http': 'http://t16208953323855:ib80eped@i537.kdltps.com:15818/',
'https': 'http://t16208953323855:ib80eped@i537.kdltps.com:15818/'}
item_proxy9 = {'http': 'http://t16244760032579:0gj5rbnp@n378.kdltps.com:15818/',
'https': 'http://t16244760032579:0gj5rbnp@n378.kdltps.com:15818/'}
item_proxy10 = {'http': 'http://t17780866032960:57b65ww2@i633.kdltps.com:15818/',
'https': 'http://t17780866032960:57b65ww2@i633.kdltps.com:15818/'}
item_proxy11 = {'http': 'http://t16450128695918:2vqy8epc@x783.kdltps.com:15818/',
'https': 'http://t16450128695918:2vqy8epc@x783.kdltps.com:15818/'}
item_proxy12 = {'http': 'http://t16450137765728:8cw2orjz@t357.kdltps.com:15818/',
'https': 'http://t16450137765728:8cw2orjz@t357.kdltps.com:15818/'}
proxy_list = [item_proxy8, item_proxy9, item_proxy10, item_proxy11, item_proxy12]
proxy_ = random.choice(proxy_list)
curl = Curl(cacert="/path/to/your/cert")
session = requests.Session(curl=curl)
r = session.get(img_url, headers=headers,proxies=proxy_, timeout=60,verify=False,impersonate="chrome110") # 获取网页
except Exception as e:
print('========================请求报错:', e)
self.asin_not_find.append(asin)
continue
try:
asin_upper = asin.upper()
print(asin, "存储路径", rf"{asin_img_path}/{asin_upper}.jpg")
path_1 = fr"/mnt/data/img_data/{asin_img_path}"
if not os.path.exists(path_1): # check whether the path exists
os.makedirs(path_1)
with open(rf"/mnt/data/img_data/{asin_img_path}/{asin_upper}.jpg",
'wb') as f: # open the target path for binary writing; the handle is f
f.write(r.content) # write the response's binary content into f
self.asin_state_list.append(asin)
except Exception as e:
print('++++++++++++++++++++++++++存储报错=====', e)
self.asin_not_find.append(asin)
else:
break
def update_asin_state(self, state=2, asin_list=None):
df = self.df_read.loc[(self.df_read.asin_compet.isin(asin_list))]
asin_tuple = tuple(df.asin_compet)
print(state, '修改状态::', len(asin_tuple))
while True:
try:
print('修改状态')
with self.engine_pg.begin() as conn:
# states: 1 = rolled back, 3 = success
if asin_list:
if len(asin_tuple) == 1:
sql_update = f"update us_self_asin_compet_amazon set state={state} where asin_compet in ('{asin_tuple[0]}') and state=2;"
else:
sql_update = f"update us_self_asin_compet_amazon set state={state} where asin_compet in {asin_tuple} and state=2;"
conn.execute(sql_update)
break
except Exception as e:
print(e, '444444444444')
self.mysql_reconnect()
time.sleep(5)
continue
def read_img_url(self):
while True:
try:
with self.engine_pg.begin() as conn:
sql_read = f'SELECT asin_compet,img_url,id FROM us_self_asin_compet_amazon where state=1 LIMIT 1000 FOR UPDATE'
print(sql_read)
a = conn.execute(sql_read)
self.df_read = pd.DataFrame(a, columns=['asin_compet', 'img_url', 'id'])
self.df_read.drop_duplicates(['asin_compet'], inplace=True)
if self.df_read.shape[0] == 0:
return []
index_tuple = tuple(self.df_read['id'])
print('更改状态 2 ', len(index_tuple))
if len(index_tuple) == 1:
sql_update = f"""UPDATE us_self_asin_compet_amazon a set state=2 where a.id in ('{index_tuple[0]}')"""
else:
sql_update = f"""UPDATE us_self_asin_compet_amazon a set state=2 where a.id in {index_tuple}"""
conn.execute(sql_update)
asin_img_list = list(
self.df_read.asin_compet + '|-|' + self.df_read.img_url)
# print(asin_img_list)
return asin_img_list
except Exception as e:
print("读取数据出bug并等待5s继续", e)
self.mysql_reconnect()
time.sleep(3)
continue
def run(self):
while True:
asin_img_list = self.read_img_url()
if asin_img_list:
for asin_img in asin_img_list:
self.asin_img_queue.put(asin_img)
html_thread = []
for i in range(40):
thread2 = threading.Thread(target=self.downlad_img)
html_thread.append(thread2)
for ti in html_thread:
ti.start()
time.sleep(0.125)
for t2 in html_thread:
t2.join()
if self.asin_state_list:
self.update_asin_state(state=3, asin_list=self.asin_state_list)
self.asin_state_list = []
if self.asin_not_find:
self.update_asin_state(state=1, asin_list=self.asin_not_find)
self.asin_not_find = []
if self.asin_imgurl_null:
self.update_asin_state(state=5, asin_list=self.asin_imgurl_null)
self.asin_imgurl_null = []
# break
else:
break
if __name__ == '__main__':
Amazon_Img('us').run()
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from lxml import etree
import time
from selenium.webdriver.support import expected_conditions as EC
from sqlalchemy import create_engine
import pandas as pd
import random
from sqlalchemy.engine import URL
class dow_seller_help():
def __init__(self):
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url_us = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database='selection',
query={"charset": "utf8mb4"}
)
self.engine_us_mysql = create_engine(url_us)
def db_(self):
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url_us = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database='selection',
query={"charset": "utf8mb4"}
)
self.engine_us_mysql = create_engine(
url_us)
def download_day_st(self, site='us'):
print(f'下载 {site} 站点数据')
for i in range(1):
try:
pr_name = "chrome.exe"
os.system('%s%s' % ("taskkill /F /IM ", pr_name))
except Exception as e:
print("强制关闭chrome.exe失败:", e)
port = 9222
params_ = ""
# params_ = "--blink-settings=imagesEnabled=false"
os.system(f'start Chrome {params_} --remote-debugging-port={port}')
chrome_options = Options()
# disable image loading
chrome_options.add_argument('--blink-settings=imagesEnabled=false') # this works
chrome_options.add_experimental_option("debuggerAddress", f"127.0.0.1:{port}")
driver = webdriver.Chrome(r'chromedriver.exe', options=chrome_options)
# driver = webdriver.Chrome(f"{self.path_driver}/chromedriver.exe", options=chrome_options)
return driver
def get_help_hub(self, driver):
driver.get('https://sellercentral.amazon.com/help/hub/reference')
time.sleep(5)
return driver
def save_data(self, data_list):
df_seller_help = pd.DataFrame(data=data_list, columns=['category', 'outer_html', 'number'])
df_seller_help.to_sql('seller_help_html', con=self.engine_us_mysql, if_exists='append', index=False)
# def get_category_url(self):
# # 调用 get_category_url 存储每个大分类下的所有url
# with open(r'C:\Users\ASUS\Downloads\Merch Collab-html.html', 'r', encoding='utf-8')as f:
# # with open(r'新建文本文档.html', 'r', encoding='utf-8')as f:
# resp = f.read()
# resp_html = etree.HTML(resp)
# all_a_list = resp_html.xpath("//kat-tab[@id='hh-nav-tree-widget-tab']//a")
# print('all_a_list::', len(all_a_list))
# data = []
# for _a in all_a_list[21:]:
# title_href = _a.xpath('./@href')
# title = _a.xpath('.//span/text()')
# num = re.findall(r'reference/(.*)', title_href[0])
# data.append([title[0], title_href[0], num[0]])
# print(data)
# print(len(data))
# df_seller_help = pd.DataFrame(data=data, columns=['zh_name', 'url', 'number'])
# df_seller_help.drop_duplicates(['number'], inplace=True)
# df_seller_help.to_sql('seller_help_syn', con=self.engine_us_mysql, if_exists='append', index=False)
def select_url(self):
sql = 'select zh_name, number, url from seller_help_syn where state=1'
syn_data_df = pd.read_sql(sql, con=self.engine_us_mysql)
syn_data_list = list(syn_data_df.zh_name + '|' + syn_data_df.number + '|' + syn_data_df.url)
return syn_data_list
def update_url_state(self, number):
with self.engine_us_mysql.begin() as conn:
up_sql = f"UPDATE seller_help_syn set state=3 WHERE number='{number}'"
print('up_sql:', up_sql)
conn.execute(up_sql)
def run(self):
driver = self.download_day_st()
driver = self.get_help_hub(driver)
time.sleep(5)
syn_data_list = self.select_url()
for syn_data in syn_data_list:
categroy_data_list = syn_data.split('|')
print(categroy_data_list)
for i in range(3):
try:
data_list = []
title_name = categroy_data_list[0]
title_href = categroy_data_list[2]
number = categroy_data_list[1]
print('小类链接', title_href)
print('具体小类名称', title_name)
print('编码', number)
url = 'https://sellercentral.amazon.com' + title_href
print('请求url:', url)
driver.get(url)
time.sleep(5)
wait = WebDriverWait(driver, 15) # wait up to 15 seconds
element = wait.until(EC.presence_of_element_located((By.ID, "full-help-page")))
html = etree.HTML(driver.page_source)
label_list = html.xpath("//kat-breadcrumb[@role='list']//kat-breadcrumb-item/@label")
jion_label = ' _ '.join(label_list).replace(' _ ...', '')
jion_label = jion_label.replace('...', '').replace('\n', '')
div_element = driver.find_element(By.ID, "full-help-page")
# get all of the HTML inside the div
inner_html = div_element.get_attribute("innerHTML")
print(inner_html) # print the div's inner HTML
data_list.append([jion_label, inner_html, number, ])
self.save_data(data_list)
print('jion_label:::', jion_label)
self.update_url_state(number)
time.sleep(random.uniform(3, 10))
break
except Exception as e:
print('报错:', e)
time.sleep(15)
driver.refresh()
self.db_()
continue
if __name__ == '__main__':
dow_seller_help().run()
"""
Given a brand name, query the trademark site (TMview) for the corresponding registrations; deployed on the h7 physical machine.
"""
from flask import Flask, request, jsonify
import requests
import traceback
import json
app = Flask(__name__)
# Set JSON_AS_ASCII to False so responses keep non-ASCII characters
app.config['JSON_AS_ASCII'] = False
class brand_tmview():
def __init__(self):
self.data_dict = {}
self.items = {}
self.data_dict_list = []
def get_tmview_data(self):
# Fetch trademark codes; they start with a country prefix, e.g. US500000074325647, UK500000074325647
url = "https://www.tmdn.org/tmview/api/search/results"
headers = {
"Accept": "application/json",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/json; charset=utf-8",
"Origin": "https://www.tmdn.org",
"Pragma": "no-cache",
"Referer": "https://www.tmdn.org/tmview/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"sec-ch-ua": "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Google Chrome\";v=\"120\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\""
}
data = {"page": "1", "pageSize": "50", "criteria": "I", "basicSearch": self.keyword,
"fOffices": ["ES", "FR", "GB", "IT", "US", "WO", "DE", "CA"], "fTMStatus": ["Registered", "Filed"],
"fields": ["ST13", "markImageURI", "tmName", "tmOffice", "applicationNumber", "applicationDate",
"tradeMarkStatus", "niceClass"]}
ST13_list = []
try:
response = requests.post(url, headers=headers, json=data, timeout=30).json()
trademarks_list = response['tradeMarks']
print('trademarks_list::', trademarks_list)
if trademarks_list:
for trademarks in trademarks_list:
ST13_list.append({"ST13": trademarks['ST13'], 'niceClass': trademarks['niceClass'],
'tmOffice': trademarks['tmOffice'],
'asin': self.asin, 'site': self.site, 'brand': self.keyword})
if ST13_list:
self.get_trademark_detail(ST13_list)
else:
self.data_dict_list.append(response)
except Exception as e:
print("error_info:", e, f"\n{traceback.format_exc()}")
def get_trademark_detail(self, ST13_list):
# Fetch the registrant name for each trademark code
for ST13_data in ST13_list:
url = f'https://www.tmdn.org/tmview/api/trademark/detail/{ST13_data["ST13"]}?translate=true'
print('获取权利人名称url:', url)
headers = {
"Accept": "application/json",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/json; charset=utf-8",
"Pragma": "no-cache",
"Referer": "https://www.tmdn.org/tmview/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"sec-ch-ua": "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Google Chrome\";v=\"120\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\""
}
applicants_list = []
for i in range(3):
try:
response = requests.get(url, headers=headers,timeout=30).json()
applicants_list = response['applicants']
print('查看申请人详情::', applicants_list)
break
except:
headers['Host'] = 'www.tmdn.org'
continue
for applicants in applicants_list:
ST13 = ST13_data["ST13"]
niceClass = ST13_data['niceClass']
tmOffice = ST13_data['tmOffice']
asin = ST13_data['asin']
brand = ST13_data['brand']
site = ST13_data['site']
full_name = applicants.get('fullName')
if full_name is None:
full_name = applicants.get('organizationName')
data_dict = {"ST13":ST13,'fullName': full_name, 'niceClass': niceClass,
'tmOffice': tmOffice,
'asin': asin, 'site': site, 'brand': brand}
print('data_dictdata_dict::', data_dict)
self.data_dict_list.append(data_dict)
self.unique_data()
def unique_data(self):
# set of (fullName, niceClass, tmOffice) keys that have already been seen
seen = set()
# list holding the de-duplicated records
unique_data_list = []
# walk the record list
for item in self.data_dict_list:
# build the de-duplication key
key = (item['fullName'], tuple(sorted(item['niceClass'])), item['tmOffice'])
# a key not yet in the set is a new entry: remember it and keep the item
if key not in seen:
seen.add(key)
unique_data_list.append(item)
self.data_dict_list = unique_data_list
def run(self, keyword, asin, site):
print('搜索品牌名称:', keyword, asin, site)
self.keyword = keyword
self.asin = asin
self.site = site
self.get_tmview_data()
return self.data_dict_list
# if __name__ == '__main__':
# data_dict_list = brand_tmview().run('coospider-repta','B0C5D7TZ33','us')
# print(data_dict_list)
#
@app.route('/tmview', methods=['GET'])
def get_args():
asin = request.args.get('asin')
site = request.args.get('site')
keyword = request.args.get('brand')
if site and asin and keyword:
data_dict_list = brand_tmview().run(keyword, asin, site)
if data_dict_list:
items = {"code": 200, "message": "success", "data_dict": data_dict_list}
else:
items = {"code": 500, "message": "Retry", "data_dict": []}
return jsonify(items)
else:
response = {'message': 'Request parameter error', 'code': 401, 'data_dict': []}
return jsonify(response)
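# Hedged usage sketch (not called anywhere; the host is a placeholder, the query
# values come from the commented example above):
def _tmview_call_example():
    import requests  # local import so the sketch has no side effects
    resp = requests.get('http://127.0.0.1:10247/tmview',
                        params={'brand': 'coospider-repta', 'asin': 'B0C5D7TZ33', 'site': 'us'})
    print(resp.json())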
if __name__ == '__main__':
app.run('0.0.0.0', 10247)
import time
import random
import uuid
import json
from urllib.parse import quote_plus
from collections import deque
from concurrent.futures import ThreadPoolExecutor
import requests
from lxml import etree
import pandas as pd
from sqlalchemy import create_engine, text, bindparam
from sqlalchemy.engine import URL
from sqlalchemy.pool import NullPool
from utils.requests_param import Requests_param_val
from utils.db_connect import BaseUtils
from threading_spider.db_connectivity import connect_db
from amazon_params.params import DB_CONN_DICT
class CategoryParser:
@staticmethod
def safe_index(seq, idx, default=None):
return seq[idx] if 0 <= idx < len(seq) else default
@classmethod
def parse(cls, nodes_num, url):
parts = url.rstrip('/').split('/')
has_ref = 'ref=' in url
if not has_ref:
ref_suffix = None
cid = cls.safe_index(parts, -1)
first_id = cls.safe_index(parts, -2)
else:
ref_suffix = cls.safe_index(parts, -1)
first_id = cls.safe_index(parts, -3)
cid = cls.safe_index(parts, -2)
# Determine level
if nodes_num == 1:
level = 1
elif url.endswith('_0'):
level = 2
elif url.endswith(f"{first_id}_1"):
level = 3
else:
level = 4
# Compute parent
if level == 1:
pid, cid, first_id = None, '0', None
elif level == 2:
pid = '0'
first_id = cid
elif level == 3:
pid = first_id
else:
pid = ref_suffix.split('_')[-1] if ref_suffix else None
return {
'category_id': cid,
'category_first_id': first_id,
'category_parent_id': pid
}
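# Hedged worked example (the URL shape mirrors Amazon best-seller paths; the ids
# are illustrative): for nodes_num=2 and
# url='https://www.amazon.com/Best-Sellers/zgbs/electronics/172282/ref=zg_bs_nav_electronics_0'
# the split gives ref_suffix='ref=zg_bs_nav_electronics_0', cid='172282',
# first_id='electronics'; the trailing '_0' marks level 2, so parse() returns
# {'category_id': '172282', 'category_first_id': '172282', 'category_parent_id': '0'}.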
class RequestSession:
def __init__(self, site):
self.site = site
self.site_url = f'https://www.amazon.{site}'
self.host = f'www.amazon.{site}'
self.session = requests.Session()
self.cookies = deque()
self.param_val = Requests_param_val(site_name=site)
def ensure_cookies(self):
if not self.cookies:
ck = self.param_val.get_cookie()
for v in ck.values():
self.cookies.append(v)
def next_cookie(self):
self.ensure_cookies()
raw = self.cookies.popleft()
try:
lst = json.loads(raw)
except Exception:
lst = eval(raw)
# Ensure list
if isinstance(lst, dict):
lst = [lst]
if not isinstance(lst, list):
lst = []
# Build cookie string
cookie_str = ''
for c in lst:
try:
cookie_str += f"{c['name']}={c['value']};"
except Exception:
continue
return cookie_str
def build_headers(self, referer):
n = random.randint(70, 114)
ua = (
f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
f"AppleWebKit/537.36 (KHTML, like Gecko) "
f"Chrome/{n}.0.{random.randint(1000,5000)}.{random.randint(1,181)} Safari/537.36"
)
headers = {
'Connection': 'close',
'Authority': self.host,
'Accept': 'text/html,*/*',
'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
'Origin': referer,
'Referer': referer,
'User-Agent': ua,
'Cookie': self.next_cookie()
}
headers[random.choice('abcdef')] = str(uuid.uuid4())
return headers
def fetch(self, url, retries=5, timeout=20):
for attempt in range(retries):
try:
hdr = self.build_headers(url)
resp = self.session.get(url, headers=hdr, timeout=timeout, verify=False)
txt = resp.text
if resp.status_code in (200, 201) and 'Enter the characters' not in txt:
return etree.HTML(txt)
except Exception as e:
print(f"[fetch] attempt {attempt} error: {e}")
time.sleep(1 + attempt)
raise RuntimeError(f"无法获取 URL: {url}")
class BSRCategorySpider(BaseUtils):
def __init__(self, site):
super().__init__()
self.site = site
self.req = RequestSession(site)
self.parser = CategoryParser()
self._init_db()
self.asin_items = []
self.week = int(time.strftime("%W"))
self.year_month = time.strftime("%Y_%m")
self.time_strftime_ = time.strftime("%Y-%m-%d", time.localtime())
self.columns = [
'bsr_id', 'asin', 'bsr_rank', 'price',
'rating', 'reviews', 'week', 'year_month', 'cate_current_id'
]
def _init_db(self):
cfg = DB_CONN_DICT
url = URL.create(
drivername='mysql+pymysql',
username=cfg['mysql_user'],
password=cfg['mysql_pwd'],
host=cfg['mysql_host'],
port=cfg['mysql_port'],
database=cfg['mysql_db'],
query={'charset': 'utf8mb4'}
)
self.engine = create_engine(
url,
poolclass=NullPool,
connect_args={'connect_timeout': 10}
)
def process_node(self, rec, level):
_id, name, path, cid = rec
tree = self.req.fetch(path)
xpath_expr = (
f"//div[@role='treeitem' and span/text()='{name}']"
"/following-sibling::div[@role='group']/div[@role='treeitem']/a"
)
a_nodes = tree.xpath(xpath_expr)
parent_info = self.parser.parse(level, path)
next_recs = []
for a in a_nodes:
href = a.xpath('./@href')[0]
full = self.req.site_url + href
info = self.parser.parse(level+1, full)
next_recs.append((info['category_id'], name, full, parent_info['category_id']))
self._collect_asin(tree, _id, parent_info['category_id'])
return next_recs
def _collect_asin(self, tree, bsr_id, current_cid):
recs = tree.xpath("//div[@class='p13n-desktop-grid']/@data-client-recs-list")
if not recs:
return
data_list = eval(recs[0])
for data in data_list:
asin = data.get('id')
rank = data.get('metadataMap', {}).get('render.zg.rank')
self.asin_items.append(
(bsr_id, asin, int(rank), None, None, None,
self.week, self.year_month, current_cid)
)
def run_level(self, level, max_workers=10):
df = pd.read_sql(
f"SELECT id,en_name,path,category_id FROM {self.site}_bs_category "
f"WHERE nodes_num={level} AND category_state=1 AND delete_time IS NULL",
self.engine
)
if df.empty:
return []
ids = tuple(df['category_id'])
# mark processed using expanding
stmt = text(
f"UPDATE {self.site}_bs_category SET category_state=2 "
f"WHERE category_id IN :ids"
).bindparams(bindparam('ids', expanding=True))
with self.engine.begin() as conn:
conn.execute(stmt, {'ids': ids})
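# Note on the expanding bindparam above (behavioural sketch, not executed here):
# SQLAlchemy rewrites ':ids' into one placeholder per tuple element at execution
# time, e.g. 'IN (%(ids_1)s, %(ids_2)s, ...)', so any number of category ids can
# be passed without building the IN clause by hand.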
recs = list(df.to_records(index=False))
next_recs = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
for out in executor.map(lambda r: self.process_node(r, level), recs):
next_recs.extend(out)
return next_recs
def save_asin_data(self):
if not self.asin_items:
return
df = pd.DataFrame(self.asin_items, columns=self.columns)
df['date_info'] = self.time_strftime_
df.drop_duplicates(['asin', 'bsr_rank', 'cate_current_id'], inplace=True)
df.to_sql(
f'{self.site}_bs_category_top100_asin',
con=self.engine,
if_exists='append',
index=False
)
def send_ms(self):
for _ in range(3):
try:
url = 'http://selection.yswg.com.cn:8080/soundasia_selection/workflow/emit'
data = {
'dateType': 'day',
'reportDate': self.time_strftime_,
'statusVal': 3,
'siteName': self.site,
'remark': 'bsr榜单爬取完毕',
'isEnd': '是',
'tableName': f'{self.site}_bs_category_top100_asin',
'status': 'bsr榜单爬取完毕'
}
resp = requests.post(url, headers={'Content-Type':'application/json'}, json=data, timeout=10)
                print('notification response:', resp.text)
break
except Exception as e:
print('send_ms error:', e)
time.sleep(5)
def run(self, max_level=4):
level = 1
while level <= max_level:
next_nodes = self.run_level(level)
if not next_nodes:
break
level += 1
self.save_asin_data()
if __name__ == '__main__':
for site in ['us']:
spider = BSRCategorySpider(site)
spider.run()
spider.send_ms()
import requests
import time
import json
import random
requests.packages.urllib3.disable_warnings()
def test(authorization):
    # auto-tap: keep sending clicker/tap requests until availableTaps runs out
url = 'https://api.hamsterkombat.io/clicker/tap'
h = {
"Host": "api.hamsterkombat.io",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"Accept": "*/*",
"Access-Control-Request-Method": "POST",
"Access-Control-Request-Headers": "authorization,content-type",
"Origin": "https://hamsterkombat.io",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Dest": "empty",
"Referer": "https://hamsterkombat.io/",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh,zh-CN;q=0.9,en;q=0.8"
}
s = requests.session()
res = s.options(url, headers=h, verify=False)
# print(res.text)
h2 = {
"Host": "api.hamsterkombat.io",
"Connection": "keep-alive",
"Content-Length": "55",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"sec-ch-ua": "\"Google Chrome\";v=\"125\", \"Chromium\";v=\"125\", \"Not.A/Brand\";v=\"24\"",
"accept": "application/json",
"content-type": "application/json",
"sec-ch-ua-mobile": "?0",
"authorization": authorization,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
"sec-ch-ua-platform": "\"Windows\"",
"Origin": "https://hamsterkombat.io",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Dest": "empty",
"Referer": "https://hamsterkombat.io/",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh,zh-CN;q=0.9,en;q=0.8"
}
r_int = random.randint(99, 150)
data = {
"count": r_int,
"availableTaps": 1963 - r_int,
"timestamp": int(str(time.time()).split('.')[0])
}
data = json.dumps(data, separators=(',', ':'))
res2 = s.post(url, headers=h2, data=data, verify=False)
print(res2.text)
# print(res2.text)
# print(res2.text)
if res2.json()['clickerUser']['availableTaps'] >= 1:
time.sleep(random.randint(4, 10))
test(authorization)
def upgrades_buy(authorization):
    # fetch the upgrades-for-buy list (the response includes the dailyCombo info)
url = 'https://api.hamsterkombatgame.io/clicker/upgrades-for-buy'
h = {
"Host": "api.hamsterkombatgame.io",
"Connection": "keep-alive",
"Authorization": authorization,
"User-Agent": "Mozilla/5.0 (Linux; Android 12; K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/95.0.4638.74 Safari/537.36",
"Accept": "*/*",
"Origin": "https://hamsterkombatgame.io",
"X-Requested-With": "org.telegram.messenger",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Dest": "empty",
"Referer": "https://hamsterkombatgame.io/",
"Accept-Language": "en,zh-CN;q=0.9,zh;q=0.8,en-US;q=0.7",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "0",
}
res = requests.post(url, headers=h, verify=False)
print(res.json())
print(res.json()[
'dailyCombo']) # {'upgradeIds': [], 'bonusCoins': 5000000, 'isClaimed': False, 'remainSeconds': 31267}
authorization = "Bearer 17248120209751wF8gOWJiDNbXsFD2PdSW5AlpoIfM8REW62lLwPXgqaVLmACNtAL0UJblr21RRRU7478944675"
# upgrades_buy(authorization)
while True:
try:
test(authorization)
now = time.localtime()
print(time.strftime("%Y-%m-%d %H:%M:%S", now))
        print('waiting 12 minutes')
        time.sleep(12 * 60)
        print('done waiting')
    except Exception as e:
        print('error, retrying:', e)
        time.sleep(30)
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import curl_cffi
from lxml import etree
from threading_spider.db_connectivity import connect_db
import json
import time
import pymysql
import random
def get_cid():
    # fetch the BSR top-level category names and ids for every site and store them under the us site
url = 'https://www.sellersprite.com/v2/tools/sales-estimator'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
}
resp = curl_cffi.get(url, headers=headers, impersonate="chrome")
html = etree.HTML(resp.text)
data_category = html.xpath("//script[@id='data-category']/text()")[0]
print(data_category)
save_site_category(json.loads(data_category))
def junglescout_spider(db_base):
month = time.strftime("%m")
year = time.strftime("%Y")
year_month = f'{year}_{int(month)}'
db_class = connect_db(db_base)
    cursor_mysql_db, connect_mysql_db = db_class.mysql_db()  # mysql connection
    cursor_us, connect_us = db_class.us_mysql_db()  # us-site mysql connection
    cursor_us = connect_us.cursor(cursor=pymysql.cursors.DictCursor)  # return rows as dicts
category_name_sql_select = f"select `name`,c_id from all_site_category where site='{db_base}' and state =1"
print(category_name_sql_select)
cursor_us.execute(category_name_sql_select)
category_name_list = cursor_us.fetchall()
print('category_name_list:',category_name_list)
for category_name in category_name_list:
db_class = connect_db(db_base)
print(db_base)
        cursor_mysql_db, db = db_class.mysql_db()  # mysql connection
        db_class_us = connect_db('us')
        cursor_us_mysql_db, db_us = db_class_us.us_mysql_db()  # us-site mysql connection
        # cursor = db.cursor(cursor=pymysql.cursors.DictCursor)  # rows as dicts
print(category_name['name'], category_name['c_id'])
name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
        # ranks to probe: 1, 10, 30, 50, every 100 up to 10000, every 1000 up to 20000, then 25000-50000 in steps of 5000
        rank_list = (
            [1, 10, 30, 50]
            + list(range(100, 10001, 100))
            + list(range(11000, 20001, 1000))
            + list(range(25000, 50001, 5000))
        )
for i in rank_list:
headers = {
"Referer": "https://www.sellersprite.com/v2/tools/sales-estimator",
"Origin":"https://www.sellersprite.com",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.19240078.1751854600; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751854601; HMACCOUNT=28ABEEABEFA97E4A; _gcl_au=1.1.536675967.1751854601; MEIQIA_TRACK_ID=2zWlEnsYAqnZRdhJqJ5txX7tpXm; MEIQIA_VISIT_ID=2zWlEmUkBQV745rliAtXEdAk0CJ; ecookie=ZyZ05gxOxlDTPkM1_CN; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=e03eac62da4f8988dc796341e1bd822c; current_guest=jsxcNvsgBJO1_250707-100340; rank-login-user=502219157192wVgAJpdturGN5Im+nPDQqTtoVYwVNo1oWP9MD0mtMHFwS3LrhtAUhuCnvMHsCl; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjUwMjIxOTE1NzE5MndWZ0FKcGR0dXJHTjVJbStuUERRcVR0b1ZZd1ZObzFvV1A5TUQwbXRNSEZ3UzNMcmh0QVVodUNudk1Ic0NsIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiIwZ01FdlJuNWJ1dlZhVW5IZ1lKSDFRIiwiaWF0IjoxNzUxODU0NjA1LCJleHAiOjE3NTE5NDEwMDUsIm5iZiI6MTc1MTg1NDU0NSwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Ujr6_K3vHIQRw3x52QAQdTftMy6GbZ_TunmFMgW76onCy3EkBzx7uxEv-42zRRXgKLMUfJz2t0ierqXV6Evh9i-o5F0ZUBREzm48LHpGSw6Iupjx4Udc3VQwVqgiUOmYBvnTAQqmaj6iA5l06zAZcVNHQASZ5xe5QFUCllIOL0m8tf3Xad6T8u5oLHRHTTuyy5nDAqLu6ZxVOqUYYXsIzq9H2qAsPhqIgRy_5Av1zyoAcQErddadCe25H_ILmKO0Az9ANIFg4o1r_is_VFVZpGvbz8nCN0JLuY3uajAjf2JXoEzhHT9YbMP0o2TrZDRPdORV3HVK1N5uvghRaRyJvw; ao_lo_to_n="502219157192wVgAJpdturGN5Im+nPDfbd9htCMUGF/tdMS8/gmBNzv9/utYT5ucwmHHPC71S6i4RnT3fLUZW/nDI61eZx1uqLqr+hBy0X/aeJ6c/sSSc="; rank-guest-user=502219157192wVgAJpdturGN5Im+nPDYsyQgRxjbXtKYdDjju8ax0OkcsNUNGWP3xY6uiwKVVO; JSESSIONID=96FF611DCBDF20B9C6C921EAD2A55205; _ga_38NCVF2XST=GS2.1.s1751854600$o1$g1$t1751854612$j48$l0$h1855838417; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1751854612; _ga_CN0F80S6GL=GS2.1.s1751854600$o1$g1$t1751854613$j47$l0$h0',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
            data = {
                "station": "US",
                "cid": category_name['c_id'],  # category id
                "bsr": f"{i}"  # rank
            }
            print(category_name['name'], 'request payload data::', data)
            response = None
            for i1 in range(3):
                try:
                    resp = curl_cffi.post(url, headers=headers, data=data, impersonate="chrome", timeout=300)
                    print(resp.url)
                    # print(resp.text)
                    response = json.loads(resp.text)
                    break
                except Exception as e:
                    print('request failed, retrying:', e)
                    time.sleep(random.uniform(15, 35.75))
            if response is None:
                # every retry failed for this rank, move on to the next category
                break
response_data = response['data']
print('code::', response['code'])
print('message::', response['message'])
print('estMonSales::', response_data['estMonSales'])
            est = response_data.get('estMonSales')
            if est is None:
                # no estimate returned, stop climbing ranks for this category
                break
            if est == 0.0:
                print(f"{category_name['name']} rank {i}: monthly sales 0, breaking out of the loop.")
                break
            # fractional values like 0.3 or 0.99 do not occur here, so any remaining est is >= 1
            print(type(est))
            print('got data:', category_name['name'], i, est, year_month)
sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month))
time.sleep(random.uniform(30, 95.75))
# break
for i in range(4):
try:
inset_sql = f"INSERT INTO {db_base}_one_category (name, rank,orders,`year_month`) values (%s, %s, %s, %s)"
print(inset_sql)
cursor_mysql_db.executemany(inset_sql, name_rnak_list)
db.commit()
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{category_name['c_id']}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
break
except Exception as e:
print('存储失败:',e)
db_class = connect_db(db_base)
print(db_base)
cursor_mysql_db, db = db_class.mysql_db() # mysql
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20)
print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(120, 240.5))
def save_site_category(site_bsr_dict=None):
db_class = connect_db('us')
cursor_us_mysql_db, db = db_class.us_mysql_db() # mysql
for i in site_bsr_dict.keys():
print(i)
        delete_sql = f'DELETE from all_site_category where `name` ="{i}"'  # remove the old rows first
print(delete_sql)
cursor_us_mysql_db.execute(delete_sql)
db.commit()
site_category_list = site_bsr_dict[i]
for site_category in site_category_list:
insert_sql = f'insert into all_site_category (site,`name`, c_id)values (%s,%s,%s)'
cursor_us_mysql_db.execute(insert_sql, (i, site_category['categoryLabel'], site_category['cid']))
db.commit()
db_class = connect_db(i)
cursor_site_mysql_db, db = db_class.mysql_db() # mysql
def run():
# get_cid()
junglescout_spider('us')
if __name__ == '__main__':
run()
import random
import time
import js2py
import ctypes
def int_overflow(val):
maxint = 2147483647
if not -maxint-1 <= val <= maxint:
val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
return val
def unsigned_right_shitf(n, i, f):
    # emulate JavaScript-style 32-bit shifts (>>> and <<) in Python
    if f == ">":
        # negative numbers are reinterpreted as unsigned 32-bit ints first
        if n < 0:
            n = ctypes.c_uint32(n).value
        # shift counts are normally positive; a negative count flips the direction for JS compatibility
        if i < 0:
            return -int_overflow(n << abs(i))
        return int_overflow(n >> i)
    else:
        # negative numbers are reinterpreted as unsigned 32-bit ints first
        if n < 0:
            n = ctypes.c_uint32(n).value
        # shift counts are normally positive; a negative count flips the direction for JS compatibility
        if i < 0:
            return -int_overflow(n >> abs(i))
        return int_overflow(n << i)
def xor32bit(a, b):
m = (a ^ b) % (2**32)
if m > (2**16):
m -= 2**32
return m
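# Quick sanity check (illustrative only) of the JS 32-bit shift emulation above, assuming the
# helpers are meant to mirror the JavaScript operators:
#   unsigned_right_shitf(-1, 28, ">")  ->  15           (same as  -1 >>> 28  in JS)
#   unsigned_right_shitf(1, 31, "<")   ->  -2147483648  (same as   1 << 31   in JS)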
# def get_A_array():
# with open("js.js", "r", encoding="utf-8")as f:
# m = js2py.EvalJs()
# m.execute(f.read())
# return 1854735704
# return m.A()
def p0(V):
    # t = str(hex(int(time.time()) * 1000)).replace("0x", "")
    # V = f":false:1854735704:0:0/{t}/11/true/-480/1854735704/ed8ccea"
    # ":redefine:1854735704:18:576"
    d = V[::-1]
    print(d)
    n = ''
    for ch in d:
        n += format(ord(ch), 'x')
    return n
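# p0 above hex-encodes the reversed input string: each character of the reversed string becomes its
# lowercase hex char code, concatenated, e.g. p0("ab") -> "6261". This is what produces the long hex
# string captured in the comments inside LastEventID below.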
# def get_j_array(V, T):
# with open("js.js", "r", encoding="utf-8")as f:
# m = js2py.EvalJs()
# m.execute(f.read())
# return m.j(V, T)
def T_N(d=43):
return int(random.random() * d)
def get_j_array(y, z):
A = 1315423911 ^ (unsigned_right_shitf(z, 16, "<"))
B = len(y) - 1
C = None
while B >= 0:
C = ord(y[B])
A ^= ((unsigned_right_shitf(A, 5, "<") + C) + unsigned_right_shitf(A, 2, ">"))
B -= 1
return abs(A & 2147483647)
def LastEventID(n):
T = T_N()
h1 = str(hex(T)).replace("0x", '')
h2 = len(h1)
# h5
j = get_j_array(n, len(n))
print(j)
# j = 192801281
p5 = "0"+str(hex(j)).replace("0x", '')
# a = get_A_array()
# 1213187751
a = 1854735704
# ":redefine:1854735704:18:576"
V = f"G-898E1ED0395CFD90:redefine:{a}:{T}:{T * 32}"
# print(V)
t = str(hex(int(time.time()) * 1000)).replace("0x", "")
# print(t)
V += f"/{t}/11/true/-480/{a}/0a494e6"
# print(V)
j = get_j_array(V, T)
    # debug capture of the six joined pieces returned below:
    #   [0] "3665...3839382d47" (p0(V), the hex-encoded reversed V)
    #   [1] "3" (h1)   [2] 1 (h2)   [3] 4   [4] "5c41b2a7" (hex(j))   [5] "0b7dea01" (p5)
return "".join([p0(V), h1, str(h2), '4', str(hex(j)).replace("0x", ""), f'{p5}'])
n = """{"data":[{"num":"5004339145","fc":0,"sc":0}],"guid":"","timeZoneOffset":-480}"""
print(LastEventID(n))
# 366534393461302f343037353337343538312f3038342d2f657572742f31312f66363263613765623838312f36393a333a343037353337343538313a656e6966656465723a303944464335393330444531453839382d47
\ No newline at end of file
from bs4 import BeautifulSoup
import re
from sqlalchemy import create_engine
import pandas as pd
import json
from bs4 import BeautifulSoup, NavigableString, Tag
engine_us_mysql = create_engine(
f'mysql+pymysql://XP_Yswg2025_PY:Gd1pGJog1ysLMLBdML8w81@rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com:3306/selection?charset=utf8mb4')
is_not_table = False
def get_h2_all_text(table):
    # version 1 (kept commented out originally): walked h2.find_next_siblings() and collected the
    # cleaned get_text() of each sibling until the next heading, without keeping image links;
    # superseded by version 2 below
    # version 2: find all headings of the given tag and, for each one, collect the following sibling
    # text until the next heading, inlining image links as "[Image: src]" placeholders
h2_tags = soup.find_all(f"{table}")
result = []
for h2 in h2_tags:
h2_text = h2.get_text(strip=True)
texts = []
        # walk the siblings after the current heading until the next h2 is reached
        for sibling in h2.next_siblings:
            # (an earlier draft used find_next_siblings() and replaced img tags up front; the logic
            #  below handles text nodes and tag nodes separately instead)
            # stop once the next h2 heading is reached
if isinstance(sibling, Tag) and sibling.name == "h2":
break
if isinstance(sibling, NavigableString):
t = sibling.strip()
if t:
texts.append(t)
            # tag nodes
            elif isinstance(sibling, Tag):
                # replace any img tags inside the node with "[Image: src]" placeholders first
for img in sibling.find_all("img"):
img_src = img.get("src")
replacement = f" [Image: {img_src}] "
img.replace_with(replacement)
                raw_text = sibling.get_text()
                # collapse runs of whitespace
                cleaned_text = re.sub(r"\s+", " ", raw_text).strip()
if cleaned_text:
texts.append(cleaned_text)
if len(h2_text) == 0 and texts:
global is_not_table
is_not_table = True
else:
result.append({"name": h2_text, "text": texts})
return result
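# Illustrative shape of the return value (hypothetical input), one dict per heading:
#   [{"name": "Shipping", "text": ["First paragraph...", "[Image: https://example.com/img.png]"]}]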
sql = 'select category,outer_html from seller_help_html'
df = pd.read_sql(sql, con=engine_us_mysql)
category_data_list = []
for index, row in df.iterrows():
category = row['category']
category_name = re.sub(r'\s+', ' ', category).strip()
html_content = row['outer_html']
soup = BeautifulSoup(html_content, "html.parser")
    # remove unwanted elements: the "Top" scroll-to-top button and the "Was this article helpful?" module
top_button = soup.find("kat-button", class_="hh-scroll-to-top-box")
if top_button:
top_button.decompose()
helpful_div = soup.find("div", class_="help-hmd")
if helpful_div:
helpful_div.decompose()
result = get_h2_all_text('h2')
if len(result) == 0:
result = get_h2_all_text('h1')
elif len(result) == 1:
results = get_h2_all_text('h1')
result.extend(results)
if is_not_table:
results = get_h2_all_text('h1')
result.extend(results)
is_not_table = False
# category_data_list.append(result)
print(result)
result_json = json.dumps(result)
category_data_list.append([category_name, result_json, html_content])
# break
#
# data_list = []
# for category_data in category_data_list:
# for category in category_data:
# if category['text']and category['name']:
# data_list.append([category['name'],category['text'][0]])
#
# print(data_list)
# df_seller_asin_account = pd.DataFrame(data=data_list,columns=['category_name','text'])
# df_seller_asin_account.to_csv(r'C:\Users\ASUS\Downloads\seller_help_data.csv', index=False, encoding='utf-8-sig')
# df_seller_asin_account = pd.DataFrame(data=category_data_list, columns=['category_name', 'result_json', 'outer_html'])
# df_seller_asin_account.to_sql('seller_help_data', con=engine_us_mysql, if_exists='append', index=False)
import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LinearRegression
def estimate_by_interpolation(df_known, min_rank, max_rank):
df_k = df_known.set_index('rank').sort_index()
idx = range(min_rank, max_rank+1)
df_full = df_k.reindex(idx)
df_full['orders'] = df_full['orders'].interpolate()
df_full = df_full.reset_index().rename(columns={'index':'rank'})
df_full['orders_day'] = df_full['orders'].apply(lambda x: math.ceil(x/30))
return df_full
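# estimate_by_interpolation reindexes the known (rank, orders) samples onto every rank in
# [min_rank, max_rank] and fills the gaps with pandas' default linear interpolation;
# orders_day is the monthly figure divided by 30 and rounded up.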
def estimate_by_powerlaw(df_known, min_rank, max_rank):
ranks = np.log(df_known['rank'].values).reshape(-1,1)
orders = np.log(df_known['orders'].values)
model = LinearRegression().fit(ranks, orders)
a = np.exp(model.intercept_)
b = -model.coef_[0]
ranks_full = np.arange(min_rank, max_rank+1)
orders_pred = a * (ranks_full ** -b)
df_pred = pd.DataFrame({
'rank': ranks_full,
'orders_day': np.ceil(orders_pred/30)
})
return df_pred
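# estimate_by_powerlaw fits a straight line in log-log space, i.e. ln(orders) = ln(a) - b*ln(rank),
# which is equivalent to the power law orders ≈ a * rank**(-b). Rough worked example with the
# sample data below: rank 10000 -> ~3240 orders/month and rank 16000 -> ~2220 orders/month give
# b ≈ ln(3240/2220) / ln(16000/10000) ≈ 0.80, so doubling the rank cuts orders by about 2**-0.8 ≈ 0.57.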
# assume df_known holds the known May samples: (rank, orders)
df_known = pd.DataFrame({
'rank':[10000,11000,12000,13000,14000,15000,16000],
'orders':[3240,3000,2790,2640,2460,2340,2220]
})
df_interp = estimate_by_interpolation(df_known, 10000, 20000)
df_power = estimate_by_powerlaw(df_known, 10000, 20000)
print("=== 插值预测(前10行) ===")
print(df_interp.head(30))
print("\n=== 幂律预测(前10行) ===")
print(df_power.head(30))
category_name,text
"Policies, agreements, and guidelines",Amazon Services Business Solutions AgreementChanges to the Amazon Services Business Solutions AgreementProgram PoliciesChanges to program policiesIntellectual Property for Rights OwnersInternational selling agreementsAdditional GuidelinesAbout seller facial dataAbout Brand Registry facial data verificationUse of business credit reportsDirective on Administrative Cooperation–7th Amendment (DAC7)About the INFORM Consumers ActReport Infringement form: Enter ASINs in bulkUsing the Report Infringement form
import requests
data = {
"username":"pengyanbing",
'password':"15112376559"
}
# url = 'http://192.168.2.28:5000/login'
# resp = requests.post(url,json=data)
url = 'http://192.168.2.28:5000/user/members/index'
resp = requests.get(url,headers={'inventory-token':'fFZ7P4XpSA6nxaH7Xw7aHQ'})
print(resp.content.decode('utf-8'))
DB_CONFIG = {
'host': '120.77.232.73',
'port': 3306,
'user': 'yswg_it_cangchu',
'password': 'Yswg@inv-cangchu241011420',
'db': 'inventory'
}
# REDIS_CONN = {
# "redis_host": "120.77.232.73",
# "redis_port": 6379,
# "redis_pwd": "yswgInventoryTest@202307#",
# "redis_db": 1
#
# }
REDIS_CONN = {
"redis_host": "113.100.143.162",
"redis_port": 6379,
"redis_pwd": "fG7#vT6kQ1pX",
"redis_db": 14
}
\ No newline at end of file
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import paramiko
from datetime import datetime, timedelta
import subprocess
from func_timeout.exceptions import FunctionTimedOut
from func_timeout import func_set_timeout
import time
# runs on the PPPoE dial-up VPS: picks up the downloaded images and uploads them to the server
class temu_image():
def __init__(self, ):
        # SFTP server connection info
self.host = "61.145.136.61"
self.port = 218
self.username = "vps_ftp"
self.password = "ysws123!@#"
self.ssh_connect()
def ssh_connect(self):
        while True:
            try:
                # create the SSH client
                ssh = paramiko.SSHClient()
                ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                # connect to the SFTP server
                ssh.connect(self.host, self.port, self.username, self.password, timeout=15)
                # open an SFTP session
                self.sftp = ssh.open_sftp()
                break
            except Exception as e:
                print('connection failed, retrying:', e)
                time.sleep(5)
def delete_img_path(self, path):
try:
command = f"rm {path}"
            # run the rm command through the shell
subprocess.run(command, shell=True)
except Exception as e:
print(f"Error deleting file: {e}")
def to_fies(self):
        # scan only the top-level directories under /run whose names start with "2023"
        top_directory = '/run'
for dir_name in os.listdir(top_directory):
if dir_name.startswith("2023") and os.path.isdir(os.path.join(top_directory, dir_name)):
# Your code logic for processing the directories goes here
current_directory = os.path.join(top_directory, dir_name)
# Read files in the current directory
for file_name in os.listdir(os.path.join(current_directory, 'mnt/data/img_data/temu')):
if file_name.lower().endswith(('.jpg', '.jpeg', '.png')):
image_path = os.path.join(current_directory, 'mnt/data/img_data/temu', file_name)
while True:
try:
self.to_server_img(image_path, file_name)
break
                            except FunctionTimedOut as e:
                                print('network dropped:', e)
                                time.sleep(10)
                                # re-establish the SFTP connection and retry the upload
                                self.ssh_connect()
@func_set_timeout(15)
    def to_server_img(self, local_file_path, file_name):
        print('uploading image')
        current_time = datetime.now()
        # format the time as 'YYYYMMDDHH'
        formatted_time = current_time.strftime('%Y%m%d%H')
        # target directory on the remote server
        remote_file_path = f"/vps_ftp/temu/{formatted_time}/"
        try:
            # switch into the target directory, creating it first if it does not exist
            try:
                self.sftp.chdir(remote_file_path)  # try to switch into the target directory
            except IOError:
                self.sftp.mkdir(remote_file_path)  # create the target directory
                self.sftp.chdir(remote_file_path)  # and switch into it
            # upload the local file to the remote server, then remove the local copy
            remote_file_path = f"/vps_ftp/temu/{formatted_time}/{file_name}"
            print('remote path:', remote_file_path)
            print('local path:', local_file_path)
            self.sftp.put(local_file_path, remote_file_path)
            self.delete_img_path(local_file_path)
except (paramiko.SSHException, OSError) as e:
print(f"Error uploading file: {e}")
if __name__ == '__main__':
temu_image().to_fies()
\ No newline at end of file
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
import random
ORIGIN_CIPHERS = ('ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:'
'DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES')
class DESAdapter(HTTPAdapter):
def __init__(self, *args, **kwargs):
"""
A TransportAdapter that re-enables 3DES support in Requests.
"""
CIPHERS = ORIGIN_CIPHERS.split(':')
random.shuffle(CIPHERS)
CIPHERS = ':'.join(CIPHERS)
md5_list = [':!aNULL:!eNULL:!MD5', ':!aNULL:!MD5:!DSS']
self.CIPHERS = CIPHERS + random.choice(md5_list)
# self.CIPHERS = CIPHERS + ':!aNULL:!MD5:!DSS'
super().__init__(*args, **kwargs)
def init_poolmanager(self, *args, **kwargs):
context = create_urllib3_context(ciphers=self.CIPHERS)
kwargs['ssl_context'] = context
return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
def proxy_manager_for(self, *args, **kwargs):
context = create_urllib3_context(ciphers=self.CIPHERS)
kwargs['ssl_context'] = context
return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
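# Minimal usage sketch (standard requests Session API): mounting the adapter randomises the cipher
# order per adapter instance, which changes the TLS (JA3) fingerprint presented to the server:
#
#   import requests
#   s = requests.Session()
#   s.mount('https://', DESAdapter())
#   # s.get('https://example.com')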
\ No newline at end of file
import pandas as pd
from urllib.parse import quote
# helper: read a DataFrame from the database
def get_data_from_database(connection, query):
return pd.read_sql(query, connection)
def db_read_data(engine_pg):
    # collect (search_term, url) rows here
    result_list = []
    query = "SELECT search_term FROM us_search_term_month_merchantwords WHERE state=1"
    print(query)
    result_df = get_data_from_database(engine_pg, query)
    result_df.drop_duplicates(['search_term'], inplace=True)
    print('us_search_term_month_merchantwords::', result_df.shape)
    # build the three page URLs for every search term and collect them
    for search_term in result_df['search_term']:
        urls = build_urls(search_term)
        result_list.extend(urls)
    # build the initial DataFrame
    df_search_term = pd.DataFrame(data=result_list, columns=['search_term', 'url'])
    print(df_search_term.shape)
    # keep only the rows whose URL is at most 450 characters long
    long_url_rows = df_search_term['url'].str.len() <= 450
    data_df = df_search_term[long_url_rows]
    print('writing merchantwords data to pg6')
with engine_pg.begin() as conn:
data_df.to_sql(f'us_search_term_month_syn_merchantwords', con=engine_pg, if_exists="append", index=False)
update_sql = f"update us_search_term_month_merchantwords set state =3 where state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from us_search_term_month_syn_merchantwords where state =3"
print(deletesql)
conn.execute(deletesql)
# build the search URLs (pages 1-3) for one search term
def build_urls(search_term):
url_template = f"https://www.amazon.com/s?k={{search_term}}&page={{page_number}}"
search_term_chinese = quote(search_term, 'utf-8')
search_term_chinese = search_term_chinese.replace("'", '%27').replace("/", '%2F')
urls = [
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=1),
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=2),
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=3)
]
return [[search_term, url] for url in urls]
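# Illustrative output for a hypothetical term: build_urls("dog bed") yields three rows, one per page:
#   ["dog bed", "https://www.amazon.com/s?k=dog%20bed&page=1"]   (and &page=2, &page=3)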
# if __name__ == '__main__':
# # 传一个 数据库链接
# db_read_data(engine_pg)
\ No newline at end of file
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import pymysql
from amazon_params.params import DB_CONN_DICT, PG_CONN_DICT
from sqlalchemy import create_engine
import pandas as pd
import time
from threading_spider.db_connectivity import connect_db
def init_db(site_name):
if site_name == 'us':
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'],
user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection", charset="utf8mb4")
else:
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'],
user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site_name,
charset="utf8mb4")
cursor = connect.cursor()
if site_name == 'us':
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT['pg_user']}:{PG_CONN_DICT['pg_pwd']}@{PG_CONN_DICT['pg_host']}:{PG_CONN_DICT['pg_port']}/selection",
encoding='utf-8')
else:
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT['pg_user']}:{PG_CONN_DICT['pg_pwd']}@{PG_CONN_DICT['pg_host']}:{PG_CONN_DICT['pg_port']}/selection_{site_name}",
encoding='utf-8')
    print('syncing erp asins to pg')
_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
sql_read = f"SELECT asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory, category, volume, weight, rank, launch_time, video_url, add_url, material, created_at, updated_at, img_num, img_type, qa_num, brand, ac_name, node_id, sp_num, mpn, online_time, `describe`, one_star, two_star, three_star, four_star, five_star, low_star, asin_type, is_coupon, search_category, weight_str, date_info, site,account_name, other_seller_name, bsr_date_info, account_id FROM us_self_asin_detail where date_info = '{_time}'"
print(sql_read)
cursor.execute(sql_read)
data_search_term = cursor.fetchall()
df = pd.DataFrame(data=data_search_term,
columns=['asin', 'img_url', 'title', 'title_len', 'price', 'rating', 'total_comments',
'buy_box_seller_type', 'page_inventory', 'category', 'volume', 'weight', 'rank',
'launch_time', 'video_url', 'add_url', 'material', 'created_at', 'updated_at', 'img_num',
'img_type', 'qa_num', 'brand', 'ac_name', 'node_id', 'sp_num', 'mpn', 'online_time',
'describe', 'one_star', 'two_star', 'three_star', 'four_star', 'five_star', 'low_star',
'asin_type', 'is_coupon', 'search_category', 'weight_str', 'date_info', 'site',
'account_name', 'other_seller_name', 'bsr_date_info', 'account_id'])
df.loc[df['launch_time'] == '0000-00-00', 'launch_time'] = None
df.to_sql('us_self_asin_detail_2024', con=engine_pg, if_exists="append", index=False)
    title = site_name + ' site: daily sync of us_self_asin_detail data'
    content = f'{_time} sync to pg us_self_asin_detail_2024 succeeded'
    account = 'pengyanbing'
    connect_db(None).send_mg(account, title, content)
    print('syncing company asins to pg and updating the asin_type flag in us_bsr_asin_detail / us_nsr_asin_detail')
sql_self_asin = f"SELECT asin,created_at,updated_at FROM us_self_asin"
print(sql_self_asin)
cursor.execute(sql_self_asin)
self_aisn_data = cursor.fetchall()
df_asin = pd.DataFrame(data=self_aisn_data, columns=['asin', 'created_at', 'updated_at'])
with engine_pg.begin() as conn:
tr_sql = 'TRUNCATE us_self_asin'
conn.execute(tr_sql)
        print('start saving')
        df_asin.to_sql('us_self_asin', con=engine_pg, if_exists="append", index=False)
        print('saved')
with engine_pg.begin() as conn:
u_bsr_sql = 'UPDATE us_bsr_asin_detail b SET asin_type = 1 FROM us_self_asin a WHERE a.asin = b.asin and b.asin_type !=1;'
print(u_bsr_sql)
conn.execute(u_bsr_sql)
u_nsr_sql = 'UPDATE us_nsr_asin_detail b SET asin_type = 1 FROM us_self_asin a WHERE a.asin = b.asin and b.asin_type !=1;'
print(u_nsr_sql)
conn.execute(u_nsr_sql)
    title = site_name + ' site: daily sync of us_self_asin data'
    content = f'{_time} sync to pg succeeded; asin_type updated in us_nsr_asin_detail and us_bsr_asin_detail'
    account = 'pengyanbing'
    connect_db(None).send_mg(account, title, content)
# # #备份 _one_category_report 数据
# year_month = time.strftime('%Y_%m', time.localtime(time.time()))
# year_month =year_month.replace('_0','_')
# category_report_sql = f'select cate_1_id,`name`,rank,orders,orders_day,`year_month`,created_at,updated_at,`week` from {site_name}_one_category_report where `year_month` = "{year_month}"'
# print(category_report_sql)
# cursor.execute(category_report_sql)
# category_report_data = cursor.fetchall()
# df = pd.DataFrame(data=category_report_data,
# columns=['cate_1_id', 'name', 'rank', 'orders', 'orders_day', 'year_month', 'created_at',
# 'updated_at', 'week'])
# df.to_sql(f'{site_name}_one_category_report', con=engine_pg, if_exists="append", index=False)
    # monitor how many search-term rows were loaded
if site_name == 'us':
sql_read = f"SELECT DISTINCT(date_info) FROM {site_name}_search_term_pg where state in (1, 2) GROUP BY date_info"
print(sql_read)
df = pd.read_sql(sql_read, con=engine_pg)
date_info = list(df.date_info)
print(date_info)
if date_info:
report_date = date_info[0]
print("report_date: ", report_date)
ymd = report_date.replace('-', '_')
for i in ['zr', 'sp', 'sb', 'ac', 'er', 'tr', 'bs', 'hr']:
sql_count = f"select count(id) from {site_name}_search_term_rank_{i}_{ymd}"
print(sql_count)
df_count = pd.read_sql(sql_count, con=engine_pg)
count = list(df_count.values)[0][0]
print(count)
                if i == 'er':
                    if count < 10000:
                        print('data parsing error: row count too low')
                        title = site_name + ' site: weekly search-term crawl'
                        content = report_date + ' weekly search term ' + i + f' rows loaded: {count}'
                        account = 'pengyanbing'
                        connect_db(None).send_mg(account, title, content)
                elif i == 'ac' or i == 'tr':
                    if count < 90000:
                        print('data parsing error: row count too low')
                        title = site_name + ' site: weekly search-term crawl'
                        content = report_date + ' weekly search term ' + i + f' rows loaded: {count}'
                        account = 'pengyanbing'
                        connect_db(None).send_mg(account, title, content)
                else:
                    if count < 200000:
                        print('data parsing error: row count too low')
                        title = site_name + ' site: weekly search-term crawl'
                        content = report_date + ' weekly search term ' + i + f' rows loaded: {count}'
                        account = 'pengyanbing'
                        connect_db(None).send_mg(account, title, content)
if __name__ == '__main__':
init_db('us')
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import pandas as pd
from utils.db_connect import BaseUtils
import math
from urllib.parse import quote
class count_all_syn_st_id(BaseUtils):
def __init__(self, site_name=None,month=None,data_info=None,engine_db_num=14):
super().__init__()
if int(month) < 10:
month = '0' + str(month)
self.site_name = site_name # 站点
self.month = month
if engine_db_num == 14:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
else:
self.engine_pg = self.pg_connect_6()
self.engine_db_num = engine_db_num
if site_name == "us":
self.site_url = 'https://www.amazon.com/'
self.host = 'www.amazon.com'
elif site_name == 'uk':
self.site_url = 'https://www.amazon.co.uk/' # 站点url
self.host = 'www.amazon.co.uk'
elif site_name == 'de':
self.site_url = 'https://www.amazon.de/'
self.host = 'www.amazon.de'
elif site_name == 'fr':
self.site_url = 'https://www.amazon.fr/'
self.host = 'www.amazon.fr'
elif site_name == 'es':
self.site_url = 'https://www.amazon.es/'
self.host = 'www.amazon.es'
elif site_name == 'it':
self.site_url = 'https://www.amazon.it/'
self.host = 'www.amazon.it'
data_info = f'2025-{month}'
self.data_info = data_info
self.table_data_info = data_info.replace('-','_')
def get_minid_maxid(self):
        # find the min and max pending id
        print(self.site_name, ' querying min and max id')
query = f"SELECT MIN(id) AS min_id, MAX(id) AS max_id FROM {self.site_name}_all_syn_st_month_{self.table_data_info} where state in (1,2)"
print(query)
result = pd.read_sql(query, self.engine_pg)
if result.shape[0] > 0:
min_id = result['min_id'].values[0]
max_id = result['max_id'].values[0]
            # split the id range into 150 roughly equal groups
            num_groups = 150
            group_size = math.ceil((max_id - min_id + 1) / num_groups)
            group_ranges = [(i, i + group_size - 1) for i in range(min_id, max_id + 1, group_size)]
            id_list = []
            for group_start, group_end in group_ranges:
                print(f"Group: {group_start} - {group_end}")
                id_list.append([f'{group_start}-{group_end}', self.table_data_info])
print(id_list)
df_asin_img_video = pd.DataFrame(data=id_list, columns=['minid_maxid', 'yaer_month'])
with self.engine.begin() as conn:
delete_sql = f'DELETE from {self.site_name}_syn_asin_all_minid_maxid where state <4'
print('delete_sql::',delete_sql)
conn.execute(delete_sql)
df_asin_img_video.to_sql(f'{self.site_name}_syn_asin_all_minid_maxid', con=self.engine,
if_exists='append',
index=False)
def search_term_syn(self):
# 初始化一个空的 DataFrame
result_list = []
if self.engine_db_num == 6:
query = f"SELECT search_term FROM {self.site_name}_search_term_month_merchantwords WHERE state=1 and id <5000001"
elif self.engine_db_num == 14:
query = f"SELECT search_term FROM {self.site_name}_search_term_month WHERE month={self.month} and state in (1,2)"
print(query)
result_df = self.get_data_from_database(self.engine_pg, query)
result_df.drop_duplicates(['search_term'], inplace=True)
print('_search_term_month::',result_df.shape)
# 对每个搜索关键词生成 URL 并添加到结果列表
for search_term in result_df['search_term']:
urls = self.build_urls(search_term)
result_list.extend(urls)
# 创建初始 DataFrame
df_search_term = pd.DataFrame(data=result_list, columns=['search_term', 'url'])
print(df_search_term.shape)
# 找出超过 450 字符长度的 URL 行的索引
long_url_rows = df_search_term['url'].str.len() <= 450
# 筛选保留不超过 450 字符长度的 URL 行
data_df = df_search_term[long_url_rows]
data_df['month'] = f'{self.month}'
data_df['date_info'] = self.data_info
print(data_df)
print(data_df.shape)
with self.engine_pg.begin() as conn:
if self.engine_db_num == 14:
data_df.to_sql(f'{self.site_name}_search_term_month_syn',con=self.engine_pg, if_exists="append",
index=False)
update_sql = f"update {self.site_name}_search_term_month set state =3 where date_info='2025-{self.month}' and state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from {self.site_name}_search_term_month_syn where date_info < '{self.data_info}'"
print(deletesql)
conn.execute(deletesql)
elif self.engine_db_num == 6:
print('pg6 写入数据 merchantwords')
data_df.to_sql(f'{self.site_name}_search_term_month_syn_merchantwords', con=self.engine_pg, if_exists="append",
index=False)
update_sql = f"update us_search_term_month_merchantwords set state =3 where state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from {self.site_name}_search_term_month_syn_merchantwords where state =3"
print(deletesql)
conn.execute(deletesql)
# 从数据库获取数据的函数
def get_data_from_database(self, connection, query):
return pd.read_sql(query, connection)
# 构建 URL 的函数
def build_urls(self, search_term):
url_template = f"{self.site_url}s?k={{search_term}}&page={{page_number}}"
search_term_chinese = quote(search_term, 'utf-8')
search_term_chinese = search_term_chinese.replace("'", '%27').replace("/", '%2F')
urls = [
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=1),
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=2),
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=3)
]
return [[search_term, url] for url in urls]
if __name__ == '__main__':
import time
    # choose the database by engine: the spider library is 14, merchantwords search terms use 6; the spider normally uses 14, adjust as needed
month = 6
engine_db_num = 14
# for site in ['de','uk']:
for site in ['de']:
time.sleep(0)
count_all_syn_st_id(site_name=site,month=month).get_minid_maxid()
# count_all_syn_st_id(site_name=site,month=month,engine_db_num=engine_db_num).search_term_syn()
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import pandas as pd
from utils.db_connect import BaseUtils
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
from utils.requests_param import Requests_param_val
import traceback
print('saving competitor asin data')
class Save_asin_competitive(BaseUtils):
def __init__(self, site_name=None):
super().__init__()
self.site_name = site_name # 站点
self.asin_detail_list = []
self.stop_item_queue = True
self.init_db_names()
self.read_size = 100
# 解析
if site_name == 'us':
self.asin_cols = ['asin', 'title', 'img_url', 'rating', 'total_comments', 'price', "rank", 'category',
'launch_time',
'volume', 'weight', 'title_len', "video_url",
"add_url", 'material', 'sp_num', 'activity_type', 'one_two_val', 'three_four_val'
]
else:
self.asin_cols = ['asin', 'title', 'img_url', 'rating', 'total_comments', 'price', "rank", 'category',
'launch_time',
'volume', 'weight', 'title_len', "video_url",
"add_url", 'material', 'sp_num', 'activity_type', 'one_two_val', 'three_four_val',
'five_six_val']
self.reuests_para_val = Requests_param_val(site_name=self.site_name)
def init_db_names(self):
"""
1. 初始化数据库连接
2. 初始化数据库表名
"""
self.engine = self.mysql_connect()
self.db_syn = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_competitive_aisn_syn'][2:]
self.db_competitive_aisn = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_competitive_aisn'][2:]
def process_item(self, item_queue, requests_error_asin_list, asin_not_found_list, asin_not_foot2_list,
asin_not_response_list, asin_not_redirect_list, asin_not_div_id_dp_list):
print("=================开始存储数据======================")
self.requests_error_asin_list = requests_error_asin_list
self.asin_not_found_list = asin_not_found_list
self.asin_not_foot2_list = asin_not_foot2_list
self.asin_not_response_list = asin_not_response_list
self.asin_not_redirect_list = asin_not_redirect_list
self.asin_not_div_id_dp_list = asin_not_div_id_dp_list
while True:
if item_queue.empty() == False:
item = item_queue.get()
item_list = []
# 需要存到数据库的字段
item_list.append(item['asin'])
item_list.append(item['title'])
item_list.append(item['img_url'])
item_list.append(item['rating'])
item_list.append(item['total_comments'])
item_list.append(item['price'])
item_list.append(item['rank'])
item_list.append(item['category'])
item_list.append(item['launch_time'])
item_list.append(item['volume'])
item_list.append(item['weight'])
item_list.append(item['title_len'])
item_list.append(item['video_url'])
item_list.append(item['add_url'])
item_list.append(item['material'])
item_list.append(item['sp_num'])
item_list.append(item['activity_type'])
item_list.append(item['one_two_val'])
item_list.append(item['three_four_val'])
if self.site_name == 'us':
pass
else:
item_list.append(item['five_six_val'])
self.asin_detail_list.append(item_list)
else:
if item_queue.empty():
self.save_data()
self.asin_detail_list = []
print("结束--跳出--存储")
break
def read_db_data(self):
while True:
try:
with self.engine.begin() as conn:
sql_read = f'SELECT asin,id FROM {self.db_syn} WHERE STATE=1 and length(asin) =10 LIMIT {self.read_size} for update;'
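                    # "LIMIT ... FOR UPDATE" claims a batch of pending rows and locks them inside this
                    # transaction, so concurrent workers cannot grab the same asins; the claimed rows
                    # are then immediately marked state=2 ("in progress") below before the lock is released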
print(sql_read)
a = conn.execute(sql_read)
self.df_read = pd.DataFrame(a, columns=['asin', 'id'])
self.df_read.drop_duplicates(['asin'], inplace=True)
if self.df_read.shape[0] == 0:
print("*************** us_competitive_aisn 详情抓取完毕 ****************")
self.stop_item_queue = False
self.spider_de_feedback = True
return []
self.index_tuple = tuple(self.df_read['id'])
if len(self.index_tuple) == 1:
sql_update = f"""UPDATE {self.db_syn} a set state=2 where a.id in ({self.index_tuple[0]})"""
else:
sql_update = f"""UPDATE {self.db_syn} a set state=2 where a.id in {self.index_tuple}"""
conn.execute(sql_update)
asin_list = list(self.df_read.asin)
return asin_list
except Exception as e:
print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")
self.mysql_reconnect(table_name=self.db_syn, e=e)
continue
def save_data(self):
df_self_asin_detail = pd.DataFrame(data=self.asin_detail_list, columns=self.asin_cols)
self.asin_list_update = list(df_self_asin_detail.asin)
while True:
try:
if self.asin_detail_list:
df_self_asin_detail.to_sql(f"{self.db_competitive_aisn}", con=self.engine, if_exists='append',
index=False)
break
except Exception as e:
try:
print(f"存储{self.asin_detail_list}失败,等待5s继续", e, f"\n{traceback.format_exc()}")
except:
pass
self.mysql_reconnect(table_name=f"{self.db_competitive_aisn}", e=e)
continue
if self.requests_error_asin_list:
self.db_change_state(state=1)
if self.asin_list_update:
self.db_change_state(state=3)
if self.asin_not_found_list:
self.db_change_state(state=4)
if self.asin_not_foot2_list:
self.db_change_state(state=8)
if self.asin_not_response_list:
self.db_change_state(state=10)
if self.asin_not_redirect_list:
self.db_change_state(state=12)
if self.asin_not_div_id_dp_list:
self.db_change_state(state=13)
def db_change_state(self, state=2):
if state == 1:
self.db_change_state_common(state=state, asin_list=self.requests_error_asin_list)
if state == 3:
self.db_change_state_common(state=state, asin_list=self.asin_list_update)
elif state == 4:
self.db_change_state_common(state=state, asin_list=self.asin_not_found_list)
elif state == 8:
self.db_change_state_common(state=state, asin_list=self.asin_not_foot2_list)
elif state == 10:
self.db_change_state_common(state=state, asin_list=self.asin_not_response_list)
elif state == 12:
self.db_change_state_common(state=state, asin_list=self.asin_not_redirect_list)
elif state == 13:
self.db_change_state_common(state=state, asin_list=self.asin_not_div_id_dp_list)
def db_change_state_common(self, state, asin_list):
print(f"==================== 存储状态 {state} 数据 ========== {len(asin_list)} ========")
df = self.df_read.loc[self.df_read.asin.isin(asin_list)]
# if state == 3:
# # 剔除状态 7,9 的id
# df = self.df_read.loc[
# (self.df_read.asin.isin(asin_list)) & ~(self.df_read.asin.isin(self.asin_not_foot_list)) & ~(
# self.df_read.asin.isin(self.asin_not_buyBox_list))]
id_tuple = tuple(df.id)
while True:
try:
with self.engine.begin() as conn:
# 1,3:1--回滚;3--成功
if id_tuple:
if len(id_tuple) == 1:
sql_update = f"update {self.db_syn} set state={state} where id in ({id_tuple[0]}) and state=2;"
else:
sql_update = f"update {self.db_syn} set state={state} where id in {id_tuple} and state=2;"
conn.execute(sql_update)
break
except Exception as e:
print(f"更改{self.db_syn}表的state={state}出错", e, f"\n{traceback.format_exc()}")
self.mysql_reconnect(table_name=self.db_syn, e=e)
continue
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import pandas as pd
from amazon_params.db_connect import BaseUtils
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
from amazon_params.requests_param import Requests_param_val
import traceback
print('saving asins for the test keywords')
class Save_asin_detail(BaseUtils):
def __init__(self, site_name=None, proxy_name=None):
super().__init__()
self.site_name = site_name # 站点
self.asin_detail_list = []
self.stop_item_queue = True
self.cols = ['asin', 'sku', 'adv_num']
self.init_db_names()
self.read_size = 100
print("存储 调用 配置 proxy_name:", proxy_name)
self.reuests_para_val = Requests_param_val(site_name=self.site_name, proxy_name=proxy_name)
def init_db_names(self):
"""
1. 初始化数据库连接
2. 初始化数据库表名
"""
self.engine = self.mysql_connect()
self.db_syn = DB_REQUESTS_ASIN_PARAMS['db_potential_product_asin_syn'][3:]
self.potential_product_asin = DB_REQUESTS_ASIN_PARAMS['db_potential_product_asin'][3:]
def process_item(self, item_queue, requests_error_asin_list, asin_list_update, asin_not_found_list,
asin_not_sure_list, asin_not_foot_list, asin_not_foot2_list, asin_not_buyBox_list,
asin_not_response_list, asin_not_redirect_list, asin_not_div_id_dp_list,
asin_variation_list, star_list, add_cart_asin_list, bs_category_asin_list):
print("=================开始存储数据======================")
while True:
if item_queue.empty() == False:
item = item_queue.get()
item_list = []
# 需要存到数据库的字段
item_list.append(item['asin'])
item_list.append(item['sku'])
item_list.append(item['sp_num'])
self.asin_detail_list.append(item_list)
else:
if item_queue.empty():
self.save_data(requests_error_asin_list, asin_list_update, asin_not_found_list, asin_not_sure_list,
asin_not_foot_list, asin_not_foot2_list, asin_not_buyBox_list,
asin_not_response_list, asin_not_redirect_list, asin_not_div_id_dp_list,
asin_variation_list, star_list, add_cart_asin_list, bs_category_asin_list)
self.asin_detail_list = []
print("结束--跳出--存储")
break
def read_db_data(self):
while True:
try:
with self.engine.begin() as conn:
sql_read = f'SELECT id,asin,sku FROM {self.db_syn} WHERE asin_state=1 LIMIT {self.read_size} for update;'
print(sql_read)
a = conn.execute(sql_read)
self.df_read = pd.DataFrame(a, columns=['id', 'asin', 'sku'])
self.df_read.drop_duplicates(['asin'], inplace=True)
if self.df_read.shape[0] == 0:
print('*********** asin 数据抓取 完毕 *****************')
self.stop_item_queue = False
return []
self.index_tuple = tuple(self.df_read['id'])
if len(self.index_tuple) == 1:
sql_update = f"""UPDATE {self.db_syn} a set asin_state=2 where a.id in ({self.index_tuple[0]})"""
else:
sql_update = f"""UPDATE {self.db_syn} a set asin_state=2 where a.id in {self.index_tuple}"""
conn.execute(sql_update)
asin_list = list(self.df_read.asin + '|' + self.df_read.sku)
return asin_list
except Exception as e:
print("读取数据出bug并等待5s继续", e, f"\n{traceback.format_exc()}")
self.mysql_reconnect(table_name=self.db_syn, e=e)
continue
def save_data(self, requests_error_asin_list, asin_list_update, asin_not_found_list, asin_not_sure_list,
asin_not_foot_list, asin_not_foot2_list, asin_not_buyBox_list, asin_not_response_list,
asin_not_redirect_list, asin_not_div_id_dp_list, asin_variation_list, star_list,
add_cart_asin_list, bs_category_asin_list):
while True:
try:
df_asin_detail = pd.DataFrame(data=self.asin_detail_list, columns=self.cols)
self.asin_list_update = list(df_asin_detail.asin)
df_asin_detail.drop_duplicates(['asin', 'sku'], inplace=True) # 去重
# if df_asin_star.shape[0] > 0:
# with self.engine.begin() as conn:
# if len(set(df_asin_star.asin)) == 1:
# sql_delete_star = f"delete from {self.db_asin_star} where week={week_} and asin in ('{tuple(df_asin_star.asin)[0]}');"
# else:
# sql_delete_star = f"delete from {self.db_asin_star} where week={week_} and asin in {tuple(set(df_asin_star.asin))};"
# conn.execute(sql_delete_star)
df_asin_detail.to_sql(self.potential_product_asin, con=self.engine, if_exists='append', index=False)
break
except Exception as e:
print(f"存储'{self.db_asin_star}'失败,等待5s继续", e, f"\n{traceback.format_exc()}")
self.mysql_reconnect(table_name=self.db_asin_star, e=e)
continue
if requests_error_asin_list:
self.db_change_state(state=1, asin_list=requests_error_asin_list)
if self.asin_list_update:
self.db_change_state(state=3, asin_list=self.asin_list_update)
if asin_not_found_list:
self.db_change_state(state=4, asin_list=asin_not_found_list)
if asin_not_sure_list:
self.db_change_state(state=6, asin_list=asin_not_sure_list)
        if asin_not_foot_list:
            self.db_change_state(state=7, asin_list=asin_not_foot_list)  # page footer missing
if asin_not_foot2_list:
self.db_change_state(state=8, asin_list=asin_not_foot2_list)
        if asin_not_buyBox_list:
            self.db_change_state(state=9,
                                 asin_list=asin_not_buyBox_list)  # only image and title present, no other data, e.g. https://www.amazon.com/dp/B016IBFUWC/qid=1637397113
        if asin_not_response_list:
            self.db_change_state(state=10,
                                 asin_list=asin_not_response_list)  # https://www.amazon.com/dp/B08G3HL4PR returns a blank html page
if asin_not_redirect_list:
self.db_change_state(state=12, asin_list=asin_not_redirect_list)
if asin_not_div_id_dp_list:
self.db_change_state(state=13, asin_list=asin_not_div_id_dp_list)
def db_change_state(self, state=2, asin_list=None):
self.db_change_state_common(state=state, asin_list=asin_list)
def db_change_state_common(self, state, asin_list):
print(f"==================== 存储状态 {state} 数据 ========== {len(asin_list)} ========")
df = self.df_read.loc[self.df_read.asin.isin(asin_list)]
        if state == 3:
            # originally intended to exclude ids already marked state 7/9; currently the same filter as above
            df = self.df_read.loc[self.df_read.asin.isin(asin_list)]
id_tuple = tuple(df.id)
while True:
try:
with self.engine.begin() as conn:
# 1,3:1--回滚;3--成功
if id_tuple:
if len(id_tuple) == 1:
sql_update = f"update {self.db_syn} set asin_state={state} where id in ({id_tuple[0]}) and asin_state=2;"
else:
sql_update = f"update {self.db_syn} set asin_state={state} where id in {id_tuple} and asin_state=2;"
conn.execute(sql_update)
break
except Exception as e:
print(f"更改{self.db_syn}表的state={state}出错", e, f"\n{traceback.format_exc()}")
self.mysql_reconnect(table_name=self.db_syn, e=e)
continue
import pandas as pd
from urllib.parse import quote
from utils.common_util import CommonUtil
from utils.db_util import DBUtil
# big-data pipeline script: after the search terms are uploaded, insert them into the crawl table
class Integrate_search_term():
def __init__(self, site_name=None, week=None, month=None, date_info=None, date_type=''):
if int(week) < 10:
week = '0' + str(week)
self.site_name = site_name # 站点
self.week = week
self.date_info = date_info
if site_name == "us":
self.site_url = 'https://www.amazon.com/'
elif site_name == 'uk':
self.site_url = 'https://www.amazon.co.uk/' # 站点url
elif site_name == 'de':
self.site_url = 'https://www.amazon.de/'
elif site_name == 'fr':
self.site_url = 'https://www.amazon.fr/'
elif site_name == 'es':
self.site_url = 'https://www.amazon.es/'
elif site_name == 'it':
self.site_url = 'https://www.amazon.it/'
self.engine_pg = DBUtil.get_db_engine("postgresql_14", site_name)
self.engine_mysql = DBUtil.get_db_engine("mysql", "us")
self.date_info_pre = self.get_pre_week(date_info)
self.date_type = date_type
def fetch_year_month_by_week(self):
if self.date_type == 'week':
sql = f"select `year_month` from date_20_to_30 WHERE year_week='{self.date_info}' and week_day=1;"
df = pd.read_sql(sql, con=self.engine_mysql)
self.date_info = list(df.year_month)[0]
self.month = list(df.year_month)[0]
month = list(df.year_month)[0].split('-')
self.month = int(month[-1])
elif self.date_type == 'month':
self.month = int(self.date_info.split('-')[-1])
def search_term_syn(self):
# 初始化一个空的 DataFrame
result_list = []
# 根据不同的 site_name 获取数据并合并
if self.site_name == 'us':
self.fetch_year_month_by_week()
query = f"SELECT search_term FROM {self.site_name}_search_term_month WHERE date_info='{self.date_info}' and state in (1,2)"
print(query)
result_df = self.get_data_from_database(self.engine_pg, query)
else:
query = f"SELECT search_term FROM {self.site_name}_search_term WHERE week={self.week} and state in (1,2)"
print(query)
result_df = self.get_data_from_database(self.engine_pg, query)
result_df.drop_duplicates(['search_term'], inplace=True)
print(result_df.shape)
# 对每个搜索关键词生成 URL 并添加到结果列表
for search_term in result_df['search_term']:
urls = self.build_urls(search_term)
result_list.extend(urls)
# 创建初始 DataFrame
df_search_term = pd.DataFrame(data=result_list, columns=['search_term', 'url'])
print(df_search_term.shape)
# 找出超过 450 字符长度的 URL 行的索引
long_url_rows = df_search_term['url'].str.len() <= 450
# 筛选保留不超过 450 字符长度的 URL 行
data_df = df_search_term[long_url_rows]
if self.site_name == 'us':
data_df['month'] = f'{self.month}'
data_df['date_info'] = self.date_info
else:
data_df['week'] = f'{self.week}'
data_df['date_info'] = self.date_info
print(data_df)
print(data_df.shape)
data_df.to_sql(f'{self.site_name}_search_term_month_syn', con=self.engine_pg, if_exists="append",
index=False)
if self.site_name == 'us':
with self.engine_pg.begin() as conn:
update_sql = f"update us_search_term_month set state =3 where date_info='{self.date_info}' and state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from {self.site_name}_search_term_month_syn where date_info < '{self.date_info}'"
print(deletesql)
conn.execute(deletesql)
deletesql = f"DELETE from {self.site_name}_search_term_syn where state = 1"
conn.execute(deletesql)
else:
with self.engine_pg.begin() as conn:
delete_sql = f"DELETE from {self.site_name}_search_term_syn where date_info<'{self.date_info_pre}' and state not in (1,2)"
print(delete_sql)
conn.execute(delete_sql)
deletesql = f"DELETE from {self.site_name}_search_term_syn where state = 1"
conn.execute(deletesql)
# data_df.to_sql(f'{self.site_name}_search_term_syn', con=self.engine_pg, if_exists="append",
# index=False)
# 从数据库获取数据的函数
def get_data_from_database(self, connection, query):
return pd.read_sql(query, connection)
# 构建 URL 的函数
def build_urls(self, search_term):
url_template = f"{self.site_url}s?k={{search_term}}&page={{page_number}}"
search_term_chinese = quote(search_term, 'utf-8')
search_term_chinese = search_term_chinese.replace("'", '%27').replace("/", '%2F')
urls = [
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=1),
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=2),
url_template.format(
search_term=search_term_chinese.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(',
'%28').replace(
')', '%29'), page_number=3)
]
return [[search_term, url] for url in urls]
def get_pre_week(self, date_info):
engine = DBUtil.get_db_engine("mysql", "us")
with engine.connect() as connection:
sql = f"""
select year_week
from date_20_to_30
where year_week < '{date_info}'
order by year_week desc
limit 1 """
result = connection.execute(sql)
pre_week = result.cursor.fetchone()[0]
return pre_week
if __name__ == '__main__':
# week = 49
# month = None
# for site in ['de','uk']:
date_info = CommonUtil.get_sys_arg(1, None)
site_name = CommonUtil.get_sys_arg(2, None)
date_type = CommonUtil.get_sys_arg(3, None)
    assert date_info is not None, "date_info must not be empty!"
    assert site_name is not None, "site_name must not be empty!"
    assert date_type is not None, "date_type must not be empty!"
year, week = CommonUtil.split_month_week_date("week", date_info)
Integrate_search_term(site_name=site_name, week=week, month=99, date_info=date_info,
date_type=date_type).search_term_syn()
import sys
import os
import traceback
sys.path.append(os.path.dirname(sys.path[0]))  # add the parent directory to the import path
import pandas as pd
import json
from threading_spider.db_connectivity import connect_db
# Update the bsr_orders_list and bsr_orders_sale_list columns of the keepa us_keepa_trend_b* tables
def str_to_list(df_resp):
df_asin_list = list(
df_resp.asin.astype(str) + '|-|' + df_resp.last_price.astype(str) + '|-|' + df_resp.first_bsr_list.astype(
str) + '|-|' + df_resp.first_bsr_label.astype(str))
print('df_asin_list::', df_asin_list)
return df_asin_list
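# Example of one joined element (hypothetical values):
# 'B0ABC12345|-|19.99|-|[12, 15, None]|-|Electronics'
# i.e. asin, last_price, first_bsr_list and first_bsr_label concatenated with the '|-|' separator.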
def update_asin_orders():
db_class = connect_db('us')
engine_pg = db_class.pg_db() # pg
while True:
try:
for table_name in ['us_keepa_trend_b08', 'us_keepa_trend_b09', 'us_keepa_trend_other', 'us_keepa_trend_b07',
'us_keepa_trend_b00_b06']:
select_sql = f'select asin, last_price, first_bsr_list, first_bsr_label from {table_name} where bsr_orders_list is null ORDER BY id FETCH FIRST 1000 ROWS ONLY FOR UPDATE;'
print('select_sql::', select_sql)
df_resp = pd.read_sql(select_sql, con=engine_pg)
df_asin_list = str_to_list(df_resp)
list_asin = []
for df_asin in df_asin_list:
asin_bsr_price_list = df_asin.split('|-|')
list_asin.append(asin_bsr_price_list)
data_list = []
for asins in list_asin:
print(asins)
asin = asins[0]
if asins[1] and asins[1] != 'nan':
price = asins[1]
else:
price = 0.0
# first_bsr_list may contain Python-style 'None'; convert it to JSON 'null' before parsing
bsr_list = json.loads(asins[2].replace('None', 'null'))
rank_lists = [] if bsr_list is None else list(bsr_list)
category = asins[3]
# Keep only the non-null ranks
rank_list = [x for x in rank_lists if x is not None]
if rank_list:
if len(rank_list) == 1:
sql_read = f"""SELECT orders,orders*{price} as "销售额" from us_one_category_report_2023_05 where name = '{category}' and rank in ('{rank_list[0]}')"""
else:
sql_read = f"""SELECT orders,orders*{price} as "销售额" from us_one_category_report_2023_05 where name = '{category}' and rank in {tuple(rank_list)}"""
df = pd.read_sql(sql_read, con=engine_pg)
if not df['销售额'].empty and not df['orders'].empty:
# Collect the values to update
orders = str(list(df.orders))
orders_sale = str(list(df['销售额']))
data_list.append([orders, orders_sale, asin])
else:
data_list.append(['[-1]', '[-1]', asin])
else:
data_list.append(['[-1]', '[-1]', asin])
# Build the update SQL statement
print('Building update SQL for table::', table_name)
sql_update = f"UPDATE {table_name} SET bsr_orders_list = %s, bsr_orders_sale_list = %s WHERE asin = %s"
with engine_pg.begin() as conn:
# Execute the batch update within one transaction
conn.execute(sql_update, data_list)
print('Update finished for table:', table_name)
except Exception as e:
db_class = connect_db('us')
engine_pg = db_class.pg_db() # pg
print("存储 数据错误数据错误", e, f"\n{traceback.format_exc()}")
print(sleect_sql)
if __name__ == '__main__':
update_asin_orders()
import subprocess
import time
import requests
import random
def is_internet_available():
try:
print('Checking whether the network is available')
n = random.randint(70, 114)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'User-Agent': ua,
}
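# Probe a highly available site; any HTTP response within the 5-second timeout counts as "online"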
r = requests.get("https://www.baidu.com", timeout=5, headers=headers)
print('Probe request to baidu.com returned:', r.status_code)
return True
except Exception as e:
print(e)
return False
def pppoe_ip():
nums = 0
while True:
try:
print('Start switching IP')
result = subprocess.run(['sh', '/root/pppoe.sh'], capture_output=True, text=True, check=True)
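# /root/pppoe.sh is assumed to re-dial the PPPoE connection so the host obtains a new public IP
# (assumption; the script itself is not shown here).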
print(result.stdout)
time.sleep(2)
print(result.stderr)  # show any stderr output from the script
print('Finished running the pppoe.sh script')
time.sleep(1)
# Check whether the network connection is available again after re-dialing
if is_internet_available():
print("Network connection is available")
# Exit the retry loop once the connection is back
break
else:
print("没有网络连接",nums)
nums += 10
time.sleep(nums * 2)
except subprocess.CalledProcessError as e:
print('IP switch failed', e)
continue
if __name__ == '__main__':
is_internet_available()