no message

parent 1f19360c
......@@ -465,7 +465,7 @@ de_devices_list = [
]
# Cookie values used for the US site configuration.
# The locale cookie key must be exactly "lc-main"; a key containing a space
# ("lc- main") is a different cookie name and leaves "lc-main" unset.
# NOTE(review): the session token below is a hard-coded credential; consider
# loading it from configuration or a secret store instead of source control.
us_cookie_dict = {
    "i18n-prefs": "USD",
    "lc-main": "en_US",
    "session-id": "137-6850115-5627367",
    "session-id-time": "2082787201l",
    "session-token": "NDR0kD9QBkk81plbn2yEFeEJ205I0LEzv9G4Q3nG90PVEmpZEtEiL6Ixj3alIu5KBvaiePi9IOHhzSh48qBNQH2yroklDf1+rDTtJ8HNzcfae0ZEL3/szlwFWXRsxH7UwfHip9R66v9ZoO5ni9mrz+4J6QgjTmrzCW7uVjTMthdesVgOk2SVmxyfoLQnmYuScp17v+ayrztH3VFIg/VyhzNBlVbkZDIHg2WB3F+U0JHkiQfiZB1q2uCMWOEIOwRkCGhaRrWHbBgpQtvHnf+Iw2KxVkl2gv1jZga2iNQUhO6cIr2Py5V7evAyuXXhjRaBzIl/STRh58Jr9ctbhdBXnTQsbThexXLn",
    "skin": "noskin",
    "ubid-main": "133-5836273-3821350",
}
uk_cookie_dict ={"i18n-prefs": "HKD", "lc-acbuk": "en_GB", "session-id": "521-9387816-0582061", "session-id-time": "2082787201l", "session-token": "\"ANvQ1SZ9VE6Y9cGcEdwhYcEpI1Jql7bOXZ7Q+qdBbb41IuuIFXvUzDSoaXOzPlcUTa3CIMoLIIzD2QOAWnN2h1E/c/iXEzpM8qd4LqR9otFV6PQecFkZKnC/i5v3QIguYgLt3KB9halOcaKi+KXcpG1b5jjYSicsMknbJMQVgfkiqMDXOLcxIns2cHs1xpXgf/DrjOHNpIB6Q24VKaqLTuKbWd8biHv86NGyoBNIs1kkjJxdwZhvWtQC/rHqoiz0e853W2lkrUrdr6Ko3dSIYojEBEgVuglCENJWLmkQzqEMxcIzNgcrpf4ZSlRHhMpmi2D14A6rwrwdo/JMozJSol+Wx3dL7j/AEGiXa1IWwrA=\"", "ubid-acbuk": "259-9451176-7460703"}
de_cookie_dict ={"i18n-prefs": "HKD", "lc-acbde": "de_DE", "session-id": "258-7262769-1488647", "session-id-time": "2082787201l", "session-token": "RxPtESM/cUQ4JAxsgMz8crm9EMbFhHLK4vPgZzTj2w23v5yWkzjXZsVVsErU5Oi8qzT3Le8nxU1zxeDEWuAg2IL+5fb2VwhE9UOQpLMmnOBWaukpW0CKFMOTvJU6Fu7+IZhWpJWt+ypu5b0KEYVi7CyTsHKh38q8N2m7tGBwb94NyT4tLHyE69XovznI5HV7y19J0aJrDyEYBfk00m6/SwxxOhRbkOXWVIagnGK4N1uQ27n9827rd7o1TpHQc8sWnn1bDAvYbCtGOaUYUHJ8HxnZPsTjKZ2b8OgzIKE9nlvpoAAQOArarZlZxneJTfUY++ucym9Te+RUvPp6SrhpPJyP7W425mA9", "ubid-acbde": "257-6077242-9988057"}
......
......@@ -130,23 +130,21 @@ def search_image_api(item: ImageSearchRequest):
# VPS 标记为失败 (success=0)
logger.warning(f"任务 {task_id} 状态3 VPS返回失败(success=0). 总耗时: {elapsed:.1f}s")
# 判断时间是否超过 1 分钟
if elapsed > 60:
logger.error("VPS失败状态3且已超时超过60s,不再兜底,直接返回失败")
if elapsed > 30:
logger.error("VPS失败状态3且已超时超过30s,不再兜底,直接返回失败")
return JSONResponse({"code": 400, "msg": "识别处理失败", "data": res},status_code=400)
else:
logger.info("耗时未超60s,转本地兜底重试...")
logger.info("耗时未超30s,转本地兜底重试...")
break # 跳出循环,去执行本地逻辑
except:
except Exception as e:
# 极少情况:JSON解析失败,视为失败走兜底
logger.error(f"VPS结果处理异常: {e}")
break
break
if elapsed > 60:
if elapsed > 30:
logger.warning(f"任务 {task_id} 状态为{state} 超时({elapsed:.1f})执行兜底")
break
time.sleep(0.3)
# ============ 3. 本地兜底执行 ============
try:
logger.info(f"本地兜底执行: {image_url}")
......
......@@ -18,8 +18,9 @@ from scrapy import Selector
from requests.exceptions import RequestException, JSONDecodeError, Timeout, ConnectionError, HTTPError
from loguru import logger
from cookie_manager import cookie_manager
from chrome_us_list import get_random_ua
# 导入配置
from curl_cffi import requests as curl_cffi_requests
from amazon_configs import (
site_name_secret_dict,
us_devices_list,
......@@ -69,7 +70,7 @@ def get_image_size(image_data: bytes) -> Optional[Dict[str, int]]:
return None
def get_page_num(total: int | None, count_per_page: int | None) -> int:
def get_page_num(total: int , count_per_page: int ) -> int:
if not isinstance(total, int) or not isinstance(count_per_page, int):
return 0
if total <= 0 or count_per_page <= 0:
......@@ -201,7 +202,7 @@ class AmazonImageSearch:
# 1. 检查 JSON 结构是否完整
if "style-snap" not in response_json or "searchResult" not in response_json["style-snap"]:
# 结构不对,直接报错
raise ValueError("App端响应结构异常: 缺少 style-snap 或 searchResult")
raise ValueError("App端响应结构 异常: 缺少 style-snap 或 searchResult")
results = response_json["style-snap"]["searchResult"]
......@@ -284,7 +285,7 @@ class AmazonImageSearch:
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"accept-language": "zh-CN,zh;q=0.9",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
resp = self._retry_request("GET", url, headers=headers, cookies=strict_cookies)
......@@ -303,7 +304,7 @@ class AmazonImageSearch:
"referer": WEB_REFERER_URL.format(base_url=self.base_url),
"accept": "*/*",
"accept-language": "zh-CN,zh;q=0.9",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# 直接使用 bytes 数据
......@@ -417,25 +418,59 @@ class AmazonImageSearch:
# logger.info(f'解析成功 当前页{len(items)}条数据')
return items
def _fetch_single_page(self, url: str, page: int, headers: Dict[str, str]) -> List[Dict[str, Any]]:
@staticmethod
def extract_chrome_version_from_ua(ua: str) -> Optional[int]:
    """Return the Chrome major version embedded in a User-Agent string.

    The UA is searched for ``Chrome/<digits>.`` and the digits are returned
    as an int (for example ``Chrome/136.0.7015.93`` yields ``136``).

    Args:
        ua: User-Agent header value to inspect.

    Returns:
        The major version number, or ``None`` when the UA contains no
        ``Chrome/<digits>.`` token.
    """
    found = re.search(r"Chrome/(\d+)\.", ua)
    if found is None:
        return None
    return int(found.group(1))
@staticmethod
def get_tls_fingerprint_by_chrome_version(version: Optional[int]) -> str:
    """Map a Chrome major version to a curl_cffi ``impersonate`` profile name.

    Args:
        version: Chrome major version, or ``None`` when it is unknown.

    Returns:
        The profile name for the highest threshold that ``version`` reaches.
        ``None`` maps to ``"chrome120"``; versions below 95 fall back to
        ``"chrome104"``.

    NOTE(review): the fallback for versions below 95 ("chrome104") is a newer
    profile than the one used for 95-99 ("chrome99") - confirm this is
    intended.
    """
    if version is None:
        return "chrome120"

    # (minimum major version, profile) pairs, checked from newest to oldest.
    thresholds = (
        (130, "chrome131"),
        (120, "chrome120"),
        (110, "chrome110"),
        (100, "chrome101"),
        (95, "chrome99"),
    )
    for floor, profile in thresholds:
        if version >= floor:
            return profile

    # Default fallback for anything older than the lowest threshold.
    return "chrome104"
def _build_browser_headers(self):
    """Build request headers plus the TLS profile that matches their UA.

    A User-Agent is obtained from ``get_random_ua()``, its Chrome major
    version is extracted, and that version is mapped to a curl_cffi
    ``impersonate`` profile name. Intended to be called once per request.

    Returns:
        A ``(headers, tls)`` tuple: the header dict carrying the chosen
        User-Agent, and the impersonation profile name derived from it.
    """
    user_agent = get_random_ua()
    chrome_major = self.extract_chrome_version_from_ua(user_agent)
    impersonate_profile = self.get_tls_fingerprint_by_chrome_version(chrome_major)

    request_headers = {
        "accept": "text/html,application/xhtml+xml,*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "user-agent": user_agent,
    }
    return request_headers, impersonate_profile
def _fetch_single_page(self, url: str, page: int) -> List[Dict[str, Any]]:
    """Fetch one result page and return its parsed items.

    Args:
        url: Result URL; ``&page=<page>`` is appended to it.
        page: Page number to request.

    Returns:
        The items produced by ``self._parse_items`` for the response body,
        or an empty list if anything fails (the failure is logged, not
        raised).
    """
    try:
        # Headers and the matching TLS profile are rebuilt for every request.
        # The request is issued exactly once, after `headers` exists: a
        # leftover `_retry_request(..., headers=headers, ...)` call placed
        # before this assignment would read `headers` while still unbound and
        # make every page fall into the except branch.
        headers, tls = self._build_browser_headers()
        resp_p = curl_cffi_requests.get(
            f"{url}&page={page}",
            impersonate=tls,
            headers=headers,
            cookies=self.cookies,
        )
        return self._parse_items(resp_p.text)
    except Exception as e:
        # Deliberate best-effort: a failed page contributes no items instead
        # of aborting the whole crawl.
        logger.warning(f"第{page}页失败: {e}")
        return []
def _fetch_results(self, url: str) -> Dict[str, Any]:
headers = {
"accept": "text/html,application/xhtml+xml,*/*",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
"viewport-width": "1343"
}
all_items = []
# logger.info("正在获取第1页...")
resp = self._retry_request("GET", url, headers=headers, cookies=self.cookies)
# resp = self._retry_request("GET", url, headers=headers, cookies=self.cookies)
headers, tls = self._build_browser_headers()
resp = curl_cffi_requests.get(url, headers=headers, impersonate=tls, cookies=self.cookies)
logger.info(f'tls版本为:{tls},ua为 {headers.get("user-agent")}')
page1_items = self._parse_items(resp.text) # 第一页数据解析
all_items.extend(page1_items) # 第一页数据添加到列表
total_res = re.search(r'"totalResultCount":(\d+)', resp.text)
......@@ -450,7 +485,7 @@ class AmazonImageSearch:
logger.info(f"开始并发爬取第 2 到 {pages} 页...")
with ThreadPoolExecutor(max_workers=4) as executor:
futures = [
executor.submit(self._fetch_single_page, url, p, headers)
executor.submit(self._fetch_single_page, url, p)
for p in range(2, pages + 1)
]
for future in as_completed(futures):
......@@ -460,7 +495,13 @@ class AmazonImageSearch:
df = pd.DataFrame(all_items)
# 按 asin 列去重 再转回字典
# pandas 会把 None 转成 NaN, 在 to_dict 之后清洗, 否则 JSONResponse 序列化会报错
final_items = df.drop_duplicates(subset=['asin']).to_dict('records') if not df.empty else []
for item in final_items:
for key in item:
v = item[key]
if isinstance(v, float) and (v != v): # NaN != NaN 是 True
item[key] = None
return {"total_items": len(final_items), "items": final_items}
# === 图片下载方法 ===
......@@ -660,14 +701,14 @@ class AmazonImageSearch:
if __name__ == "__main__":
# 测试用图片链接
# test_img_url = "https://soundasia.oss-cn-shenzhen.aliyuncs.com/yswg-img/SoundasiaAmazon/file/2025/1125/bdb9b06102184048b6eb9db3b39bb97e.png"
test_img_url = "https://yswg-private-test.oss-cn-shenzhen.aliyuncs.com/SoundasiaAmazon/competitor_image/2025/1127/88e90bbd317a42ea80cc9128ea333e6c.svg" # 图片权限问题
test_img_url = "https://soundasia.oss-cn-shenzhen.aliyuncs.com/yswg-img/SoundasiaAmazon/competitor_image/2026/0615/8cdddea95ac645ca8a822300ae717c4d.png"
# test_img_url = "https://yswg-private-test.oss-cn-shenzhen.aliyuncs.com/SoundasiaAmazon/competitor_image/2025/1127/88e90bbd317a42ea80cc9128ea333e6c.svg" # 图片权限问题
# test_img_url = "https://m.media-amazon.com/images/I/71IFE6W6THL._AC_UL320_.jpg"
# test_img_url = "https://m.media-amazon.com/images/I/71IFE6W6THL._AC_SY550_.jpg"
# test_img_url = "https://m.media-amazon.com/images/I/71G1BAeYlNL._AC_SX300_SY300_QL70_FMwebp_.jpg"
try:
client = AmazonImageSearch(site_name="uk")
client = AmazonImageSearch(site_name="us")
logger.info("\n=== 测试默认模式 ===")
result = client.search(test_img_url, search_mode="default")
# logger.success(f"Result: Success={default_result}")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment