以图搜竞品

parent d7f6ddab
# NOTE: the uk / de sites have not been tested
from loguru import logger
import json
from py_spider.utils.secure_db_client import get_remote_engine
# NOTE: the uk / de sites have not been tested. For the camera-scan (photo) API, the uk site's application/username is amzn-mbl-cscan-ais; the URL is unchanged: https://match-visualsearch.amazon.com/vsearch/2.0
site_name_secret_dict = {
"us": {'application':'amzn-mbl-cscan-us','username':'amzn-mbl-cscan-us','secret':'5b6874d3a20417591bd5464a25a37bc6','snap_url':'https://match-visualsearch.amazon.com'},
"uk": {'application':'amzn-mbl-cscan-uk','username':'amzn-mbl-cscan-uk','secret':'c1a79f745bbe6a8824ce3178c8b549ad','snap_url':'https://match-visualsearch-uk.amazon.com'},
......@@ -461,7 +465,43 @@ de_devices_list = [
]
# us_cookie_dict ={"i18n-prefs": "USD", "lc-main": "en_US", "session-id": "137-6850115-5627367", "session-id-time": "2082787201l", "session-token": "NDR0kD9QBkk81plbn2yEFeEJ205I0LEzv9G4Q3nG90PVEmpZEtEiL6Ixj3alIu5KBvaiePi9IOHhzSh48qBNQH2yroklDf1+rDTtJ8HNzcfae0ZEL3/szlwFWXRsxH7UwfHip9R66v9ZoO5ni9mrz+4J6QgjTmrzCW7uVjTMthdesVgOk2SVmxyfoLQnmYuScp17v+ayrztH3VFIg/VyhzNBlVbkZDIHg2WB3F+U0JHkiQfiZB1q2uCMWOEIOwRkCGhaRrWHbBgpQtvHnf+Iw2KxVkl2gv1jZga2iNQUhO6cIr2Py5V7evAyuXXhjRaBzIl/STRh58Jr9ctbhdBXnTQsbThexXLn", "skin": "noskin", "ubid-main": "133-5836273-3821350"}
# uk_cookie_dict ={"i18n-prefs": "HKD", "lc-acbuk": "en_GB", "session-id": "521-9387816-0582061", "session-id-time": "2082787201l", "session-token": "\"ANvQ1SZ9VE6Y9cGcEdwhYcEpI1Jql7bOXZ7Q+qdBbb41IuuIFXvUzDSoaXOzPlcUTa3CIMoLIIzD2QOAWnN2h1E/c/iXEzpM8qd4LqR9otFV6PQecFkZKnC/i5v3QIguYgLt3KB9halOcaKi+KXcpG1b5jjYSicsMknbJMQVgfkiqMDXOLcxIns2cHs1xpXgf/DrjOHNpIB6Q24VKaqLTuKbWd8biHv86NGyoBNIs1kkjJxdwZhvWtQC/rHqoiz0e853W2lkrUrdr6Ko3dSIYojEBEgVuglCENJWLmkQzqEMxcIzNgcrpf4ZSlRHhMpmi2D14A6rwrwdo/JMozJSol+Wx3dL7j/AEGiXa1IWwrA=\"", "ubid-acbuk": "259-9451176-7460703"}
# de_cookie_dict ={"i18n-prefs": "HKD", "lc-acbde": "de_DE", "session-id": "258-7262769-1488647", "session-id-time": "2082787201l", "session-token": "RxPtESM/cUQ4JAxsgMz8crm9EMbFhHLK4vPgZzTj2w23v5yWkzjXZsVVsErU5Oi8qzT3Le8nxU1zxeDEWuAg2IL+5fb2VwhE9UOQpLMmnOBWaukpW0CKFMOTvJU6Fu7+IZhWpJWt+ypu5b0KEYVi7CyTsHKh38q8N2m7tGBwb94NyT4tLHyE69XovznI5HV7y19J0aJrDyEYBfk00m6/SwxxOhRbkOXWVIagnGK4N1uQ27n9827rd7o1TpHQc8sWnn1bDAvYbCtGOaUYUHJ8HxnZPsTjKZ2b8OgzIKE9nlvpoAAQOArarZlZxneJTfUY++ucym9Te+RUvPp6SrhpPJyP7W425mA9", "ubid-acbde": "257-6077242-9988057"}
# Hard-coded per-site cookie snapshots (fallback/test data).
# NOTE(review): these look stale — the DB (via get_cookies) appears to be the
# primary cookie source; confirm before relying on these values.
us_cookie_dict ={"i18n-prefs": "USD", "lc-main": "en_US", "session-id": "137-6850115-5627367", "session-id-time": "2082787201l", "session-token": "NDR0kD9QBkk81plbn2yEFeEJ205I0LEzv9G4Q3nG90PVEmpZEtEiL6Ixj3alIu5KBvaiePi9IOHhzSh48qBNQH2yroklDf1+rDTtJ8HNzcfae0ZEL3/szlwFWXRsxH7UwfHip9R66v9ZoO5ni9mrz+4J6QgjTmrzCW7uVjTMthdesVgOk2SVmxyfoLQnmYuScp17v+ayrztH3VFIg/VyhzNBlVbkZDIHg2WB3F+U0JHkiQfiZB1q2uCMWOEIOwRkCGhaRrWHbBgpQtvHnf+Iw2KxVkl2gv1jZga2iNQUhO6cIr2Py5V7evAyuXXhjRaBzIl/STRh58Jr9ctbhdBXnTQsbThexXLn", "skin": "noskin", "ubid-main": "133-5836273-3821350"}
uk_cookie_dict ={"i18n-prefs": "HKD", "lc-acbuk": "en_GB", "session-id": "521-9387816-0582061", "session-id-time": "2082787201l", "session-token": "\"ANvQ1SZ9VE6Y9cGcEdwhYcEpI1Jql7bOXZ7Q+qdBbb41IuuIFXvUzDSoaXOzPlcUTa3CIMoLIIzD2QOAWnN2h1E/c/iXEzpM8qd4LqR9otFV6PQecFkZKnC/i5v3QIguYgLt3KB9halOcaKi+KXcpG1b5jjYSicsMknbJMQVgfkiqMDXOLcxIns2cHs1xpXgf/DrjOHNpIB6Q24VKaqLTuKbWd8biHv86NGyoBNIs1kkjJxdwZhvWtQC/rHqoiz0e853W2lkrUrdr6Ko3dSIYojEBEgVuglCENJWLmkQzqEMxcIzNgcrpf4ZSlRHhMpmi2D14A6rwrwdo/JMozJSol+Wx3dL7j/AEGiXa1IWwrA=\"", "ubid-acbuk": "259-9451176-7460703"}
de_cookie_dict ={"i18n-prefs": "HKD", "lc-acbde": "de_DE", "session-id": "258-7262769-1488647", "session-id-time": "2082787201l", "session-token": "RxPtESM/cUQ4JAxsgMz8crm9EMbFhHLK4vPgZzTj2w23v5yWkzjXZsVVsErU5Oi8qzT3Le8nxU1zxeDEWuAg2IL+5fb2VwhE9UOQpLMmnOBWaukpW0CKFMOTvJU6Fu7+IZhWpJWt+ypu5b0KEYVi7CyTsHKh38q8N2m7tGBwb94NyT4tLHyE69XovznI5HV7y19J0aJrDyEYBfk00m6/SwxxOhRbkOXWVIagnGK4N1uQ27n9827rd7o1TpHQc8sWnn1bDAvYbCtGOaUYUHJ8HxnZPsTjKZ2b8OgzIKE9nlvpoAAQOArarZlZxneJTfUY++ucym9Te+RUvPp6SrhpPJyP7W425mA9", "ubid-acbde": "257-6077242-9988057"}
def get_cookies(site_name: str) -> "dict[str, str] | None":
    """
    Read the newest cookie record for a site from its database.

    :param site_name: site short code (us / uk / de)
    :return: cookie dict on success, None on any failure
             (fixed annotation: the original claimed ``dict[str, str]``
             but every failure path returns ``None``)
    """
    try:
        # Reject sites that have no entry in the credential/config mapping.
        if site_name not in site_name_secret_dict:
            logger.error(f"输入站点 {site_name} 的不存在 ")
            return None
        # Connect to the per-site database (server-side alias "mysql").
        engine = get_remote_engine(
            site_name=site_name,
            db_type="mysql",
        )
        if not engine:
            logger.error('连接数据库出错')
            return None
        # site_name was validated above, so the f-string table name is safe.
        sql = f"select * from {site_name}_cookies order by updated_time desc limit 1;"
        df_cookies = engine.read_sql(sql)
        if df_cookies.empty:
            logger.error("错误:未查询到 cookie 数据")
            return None
        cookies_dict = json.loads(df_cookies.iloc[0]['cookies'])
        # json.loads may yield a list/str/number for malformed rows — reject those.
        if not isinstance(cookies_dict, dict):
            logger.error("转换结果不是字典")
            return None
        return cookies_dict
    except Exception as e:
        logger.error(f"错误:{str(e)}")
        return None
# logger.success(get_cookies('de'))
\ No newline at end of file
import threading
import time
import random
import json
import datetime
from typing import List, Dict
from loguru import logger
from amazon_configs import site_name_secret_dict
from py_spider.utils.secure_db_client import get_remote_engine
def get_cookies_from_db(site_name: str) -> List[Dict]:
    """
    Load the latest 100 cookie rows for a site from its database.

    (Fixed docstring: the original mentioned a ``days`` parameter that does
    not exist; the dead commented-out code referencing it was removed.)

    :param site_name: site short code (us / uk / de)
    :return: list of {'cookie': dict, 'updated_ts': float} wrappers,
             empty list on any failure
    """
    if site_name not in site_name_secret_dict:
        logger.error(f"站点 {site_name} 不在配置列表中")
        return []
    try:
        # 1. Database connection.
        engine = get_remote_engine(site_name=site_name, db_type="mysql")
        if not engine:
            logger.error(f"站点 {site_name} 数据库连接失败")
            return []
        # 2. Newest 100 rows; site_name was validated against the config above.
        sql = f"select * from {site_name}_cookies order by updated_time desc limit 100;"
        logger.info(f"[{site_name}] 执行SQL查询最近 100条 Cookie: {sql.strip()}")
        df_cookies = engine.read_sql(sql)
        if df_cookies.empty:
            logger.warning(f"站点 {site_name} 无最近100条 Cookie数据")
            return []
        # 3. Parse each row into a {'cookie', 'updated_ts'} wrapper.
        valid_cookies = []
        for _, row in df_cookies.iterrows():
            try:
                cookie_str = row.get('cookies')         # raw cookie payload (str or dict)
                updated_date = row.get('updated_time')  # row update time
                if cookie_str:
                    if isinstance(cookie_str, str):
                        cookie_dict = json.loads(cookie_str)
                    else:
                        cookie_dict = cookie_str  # already a dict
                    # Minimal validity check: a usable Amazon cookie has a session-id.
                    if "session-id" in cookie_dict:
                        if isinstance(updated_date, str):
                            # Accept both "YYYY-mm-dd HH:MM:SS" and the ISO "T" separator.
                            clean_str = updated_date.replace('T', ' ')
                            updated_ts = datetime.datetime.strptime(clean_str, "%Y-%m-%d %H:%M:%S")
                            updated_ts = updated_ts.timestamp()
                        else:
                            # Non-string timestamp (e.g. driver-native type):
                            # fall back to "now" so the entry is treated as fresh.
                            updated_ts = time.time()
                        valid_cookies.append({
                            'cookie': cookie_dict,     # plain dict for requests
                            'updated_ts': updated_ts   # used for expiry checks
                        })
            except Exception as e:
                logger.warning(f"解析单条Cookie失败: {e}")
                continue
        return valid_cookies
    except Exception as e:
        logger.error(f"[{site_name}] 数据库读取异常: {e}")
        return []
# print(get_cookies_from_db('us'))
class CookiePoolManager:
    """
    Thread-safe singleton cookie pool.

    Keeps one pool per site; refreshes from the DB behind a cooldown and
    expires entries after one day based on each entry's own ``updated_ts``.
    """
    _instance = None
    _lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        # Double-checked locking: only the first caller initialises the pools.
        if not cls._instance:
            with cls._lock:
                if not cls._instance:
                    cls._instance = super(CookiePoolManager, cls).__new__(cls)
                    cls._instance._init_pools()
        return cls._instance

    def _init_pools(self):
        """Initialise per-site pool state."""
        # Structure:
        # { site: { "data": [{'cookie': {...}, 'updated_ts': ts}, ...],
        #           "initial_count": int, "last_db_check": ts } }
        self.pools = {
            site: {"data": [], "initial_count": 0, "last_db_check": 0}
            for site in ["us", "uk", "de"]
        }
        self.cookie_expire_seconds = 1 * 24 * 60 * 60  # per-entry max age: 1 day
        self.db_check_interval = 1 * 60                # DB query cooldown: 1 minute

    def _refresh_from_db(self, site_name: str):
        """Reload the site's pool from the DB; stamp last_db_check either way."""
        # Returns a list of {'cookie': dict, 'updated_ts': float} wrappers.
        new_data = get_cookies_from_db(site_name)
        if new_data:
            with self._lock:
                self.pools[site_name]["data"] = new_data
                self.pools[site_name]["initial_count"] = len(new_data)
                self.pools[site_name]["last_db_check"] = time.time()
            logger.success(f"[{site_name}] Cookie池已刷新,数量: {len(new_data)}")
        else:
            # Stamp the check time even on failure so the cooldown still applies.
            # FIX: this write now happens under the lock, like the success path.
            with self._lock:
                self.pools[site_name]["last_db_check"] = time.time()
            logger.warning(f"[{site_name}] 数据库无有效数据")

    def get_cookie(self, site_name: str) -> Dict[str, str]:
        """
        Return one usable cookie dict for the site ({} when none available).

        Applies, in order: empty-pool init, below-half forced refresh
        (cooldown-limited), random pick, and per-entry 1-day expiry with a
        cooldown-limited refresh fallback.
        """
        site_pool = self.pools.get(site_name)
        if not site_pool:
            # Unknown site: create the pool structure lazily.
            # FIX: the original initialised a "last_refresh" key here, so the
            # "last_db_check" reads below raised KeyError for such sites.
            self.pools[site_name] = {"data": [], "initial_count": 0, "last_db_check": 0}
            site_pool = self.pools[site_name]
        # 1. Empty pool -> try to fill it.
        if not site_pool["data"]:
            self._refresh_from_db(site_name)
        # 2. Below half of the initial size -> forced refresh.
        # Only when a refresh has ever succeeded (initial_count > 0), and only
        # outside the cooldown, so an empty DB cannot cause refresh loops.
        current_count = len(site_pool["data"])
        if site_pool["initial_count"] > 0 and current_count < (site_pool["initial_count"] / 2):
            if (time.time() - site_pool["last_db_check"]) > self.db_check_interval:
                logger.warning(f"[{site_name}] 可用Cookie({current_count}) 低于 初始({site_pool['initial_count']}) 一半,触发强制刷新...")
                self._refresh_from_db(site_name)
        wrapper = {}
        # 3. Random pick under the lock.
        with self._lock:
            if site_pool["data"]:
                wrapper = random.choice(site_pool["data"])
        if not wrapper:
            return {}
        # 4. Per-entry expiry based on the wrapper's own updated_ts.
        current_ts = time.time()
        cookie_ts = wrapper.get('updated_ts', current_ts)
        if (current_ts - cookie_ts) > self.cookie_expire_seconds:
            last_check = site_pool["last_db_check"]
            if (current_ts - last_check) > self.db_check_interval:
                logger.info(f"[{site_name}] 抽到过期数据(>1天) {wrapper},触发数据库更新...")
                self._refresh_from_db(site_name)
                # Re-draw after the refresh.
                with self._lock:
                    if site_pool["data"]:
                        wrapper = random.choice(site_pool["data"])
            else:
                # Within the cooldown: degrade and keep using the stale entry
                # rather than hammering the database.
                logger.debug(f"[{site_name}] 数据过期但处于冷却期,降级使用")
        return wrapper.get('cookie', {})

    def mark_invalid(self, site_name: str, bad_cookie: Dict):
        """Remove every pool entry whose session-id matches bad_cookie's."""
        if not bad_cookie:
            return
        with self._lock:
            # FIX: bail out for unknown sites instead of KeyError-ing on the
            # assignment below.
            if site_name not in self.pools:
                return
            pool_data = self.pools[site_name].get("data", [])
            bad_sid = bad_cookie.get("session-id")
            if bad_sid:
                original_len = len(pool_data)
                # FIX: guard entries whose 'cookie' is missing/None
                # (the original c.get('cookie').get(...) could AttributeError).
                self.pools[site_name]["data"] = [
                    c for c in pool_data if c.get('cookie', {}).get("session-id") != bad_sid
                ]
                new_len = len(self.pools[site_name]["data"])
                # Log only when something was actually removed.
                if new_len < original_len:
                    logger.warning(f"[{site_name}] 移除失效Cookie (SID: {bad_sid}...), 剩余: {new_len}")


# Module-level singleton used by callers.
cookie_manager = CookiePoolManager()
\ No newline at end of file
# server.py
from flask import Flask, request, jsonify
from loguru import logger
# 引入逻辑类
from inv_img_double_search import AmazonImageSearch, SITE_CONFIG_MAPPER
app = Flask(__name__)
# Return JSON with non-ASCII characters unescaped so Chinese text stays readable.
app.config['JSON_AS_ASCII'] = False
# ==========================================
# 新增:首页接口 (浏览器直接访问)
# ==========================================
@app.route('/', methods=['GET'])
def index():
    """
    Landing page: report service status and show a minimal HTML usage doc.
    """
    # The HTML below is a runtime string served verbatim — do not edit casually.
    html_content = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>Amazon 以图搜图服务</title>
<style>
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 40px; line-height: 1.6; color: #333; }
.container { background: #f9f9fa; padding: 30px; border-radius: 10px; border: 1px solid #e1e4e8; }
h1 { color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 10px; }
.status { color: #27ae60; font-weight: bold; font-size: 1.2em; }
.endpoint { background: #2c3e50; color: #fff; padding: 5px 10px; border-radius: 4px; font-family: monospace; }
.method { background: #e67e22; color: white; padding: 2px 6px; border-radius: 3px; font-size: 0.8em; font-weight: bold; }
pre { background: #2d2d2d; color: #f8f8f2; padding: 15px; border-radius: 5px; overflow-x: auto; }
</style>
</head>
<body>
<div class="container">
<h1>📸 Amazon Image Search API</h1>
<p>状态:<span class="status">✅ 服务正在运行 (Service is Running)</span></p>
<h3>接口信息</h3>
<p>URL: <span class="endpoint">/api/search_image</span> <span class="method">POST</span></p>
<h3>请求示例 (JSON)</h3>
<pre>
{
"image_url": "https://m.media-amazon.com/images/I/51i3aMcjmOL._SL600_.jpg",
"site_name": "us", // 可选: us
"search_mode": "default" // 可选: default, full_image
}
</pre>
<h3>健康检查</h3>
<p>URL: <a href="/health">/health</a> <span class="method">GET</span></p>
</div>
</body>
</html>
"""
    return html_content
# ==========================================
# 核心业务接口
# ==========================================
@app.route('/api/search_image', methods=['POST'])
def search_image_api():
    """
    Amazon reverse-image search endpoint (POST).

    JSON body: image_url (required), site_name (default "us"),
    search_mode (default "default").
    """
    # Validate the request body first: must be JSON.
    body = request.get_json(silent=True)
    if not body:
        return jsonify({"code": 400, "msg": "Body必须是JSON格式"}), 400
    # Pull out the parameters with their defaults.
    image_url = body.get("image_url")
    site_name = body.get("site_name", "us")
    search_mode = body.get("search_mode", "default")
    # Required-field and supported-site checks.
    if not image_url:
        return jsonify({"code": 400, "msg": "缺少参数: image_url"}), 400
    if site_name not in SITE_CONFIG_MAPPER:
        return jsonify({"code": 400, "msg": f"不支持的站点: {site_name},支持: {list(SITE_CONFIG_MAPPER.keys())}"}), 400
    try:
        logger.info(f"收到API请求: Site={site_name}, Mode={search_mode}, URL={image_url}")
        # Run the actual image search.
        searcher = AmazonImageSearch(site_name=site_name)
        outcome = searcher.search(image_url, search_mode=search_mode)
        # Business-level failure is reported as HTTP 500.
        search_failed = outcome.get("error") or outcome.get("success") == 0
        if search_failed:
            return jsonify({"code": 500, "msg": "识别失败", "data": outcome}), 500
        return jsonify({"code": 200, "msg": "success", "data": outcome})
    except Exception as e:
        logger.error(f"服务内部错误: {e}")
        return jsonify({"code": 500, "msg": f"Server Error: {str(e)}"}), 500
@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: report that the service is up."""
    status_payload = {"status": "ok", "service": "Amazon Image Search"}
    return jsonify(status_payload)
if __name__ == "__main__":
    # Start the Flask dev server on all interfaces, port 5000.
    logger.info("Flask 服务正在启动...")
    # logger.info("请访问 http://127.0.0.1:5000 查看首页")
    app.run(host='0.0.0.0', port=5000, debug=False)
\ No newline at end of file
# Smoke-test client for the /api/search_image Flask endpoint.
import requests
import json
url = "http://192.168.200.210:5000/api/search_image"
# url = "http://127.0.0.1:5000/api/search_image"
# Request parameters
payload = {
    # Image URL to search by; commented entries are alternative manual test
    # cases ("ok" / "no" records whether recognition previously succeeded).
    # "image_url": "https://m.media-amazon.com/images/I/31Yq1IRqKGL._SR240,220_.jpg", # ok
    # "image_url": "https://m.media-amazon.com/images/I/21cOsqYwDjL._SR480,440_.jpg", # ok
    # "image_url": "https://m.media-amazon.com/images/I/21cOsqYwDjL._SR480,440_.jpg", # ok
    # "image_url": "https://m.media-amazon.com/images/I/41vTSMa6mNL._SR480,440_.jpg", # ok
    # "image_url": "https://m.media-amazon.com/images/I/71G1BAeYlNL._AC_SX300_SY300_QL70_FMwebp_.jpg", # no
    # "image_url": "https://m.media-amazon.com/images/I/41ryNvEnNCL._AC_SL1500_.jpg", # ok
    # "image_url": "https://m.media-amazon.com/images/I/71IFE6W6THL._AC_SY550_.jpg", # ok
    "image_url": "https://soundasia.oss-cn-shenzhen.aliyuncs.com/yswg-img-test/SoundasiaAmazon/competitor_image/2025/1124/84759f5d6514435e83d0fb728a57451c.jpg", # ok
    # "image_url": "https://imgtool.net/cdn/image/2023/20230401_1.jpg", # ok
    # Target site
    "site_name": "us",
    # Mode (default: normal mode, full_image: whole-image mode) — mainly for app-side recognition
    # "search_mode": "full_image" # omitted -> server defaults to "default"
}
try:
    print("开始请求!")
    # Send the POST request
    response = requests.post(url, json=payload, timeout=120)
    # Parse the result
    if response.status_code == 200:
        result = response.json()
        print("\n 请求成功!")
        print(f"状态码: {result['code']}")
        print(f"消息: {result['msg']}")
        data = result.get('data', {})
        print(f"识别成功: {data.get('success')}")
        print(f"找到商品数: {data.get('total_items')}")
        print(f"耗时: {data.get('duration')}")
        print(f"搜索url: {data.get('search_url')}")
        # Print every returned item for inspection
        items = data.get('items', [])
        if items:
            print("\n--- 商品示例 ---")
            for item in items:
                print(f"ASIN: {item['asin']} | 价格: {item['price']} | 评分:{item['rating']} | img_url:{item['img_url']} | 品牌名:{item['brand']} | 销量:{item['bought']} | 评论数:{item['total_comments']} | asin来源:{item['source']} | 标题: {item['title'][:50]}...")
    else:
        print(f" 请求失败: {response.status_code}")
        print(response.text)
except Exception as e:
    print(f" 发生错误: {e}")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment