Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
4b08a2f8
Commit
4b08a2f8
authored
Jun 15, 2026
by
PC-20230618BYKI\Administrator
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
1f19360c
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
66 additions
and
27 deletions
+66
-27
amazon_configs.py
...pider_code/app_amazon_image_recognition/amazon_configs.py
+1
-1
fastapi_server.py
...pider_code/app_amazon_image_recognition/fastapi_server.py
+6
-8
inv_img_double_search.py
...ode/app_amazon_image_recognition/inv_img_double_search.py
+59
-18
No files found.
hjm_spider_code/app_amazon_image_recognition/amazon_configs.py
View file @
4b08a2f8
...
...
@@ -465,7 +465,7 @@ de_devices_list = [
]
us_cookie_dict
=
{
"i18n-prefs"
:
"USD"
,
"lc-main"
:
"en_US"
,
"session-id"
:
"137-6850115-5627367"
,
"session-id-time"
:
"2082787201l"
,
"session-token"
:
"NDR0kD9QBkk81plbn2yEFeEJ205I0LEzv9G4Q3nG90PVEmpZEtEiL6Ixj3alIu5KBvaiePi9IOHhzSh48qBNQH2yroklDf1+rDTtJ8HNzcfae0ZEL3/szlwFWXRsxH7UwfHip9R66v9ZoO5ni9mrz+4J6QgjTmrzCW7uVjTMthdesVgOk2SVmxyfoLQnmYuScp17v+ayrztH3VFIg/VyhzNBlVbkZDIHg2WB3F+U0JHkiQfiZB1q2uCMWOEIOwRkCGhaRrWHbBgpQtvHnf+Iw2KxVkl2gv1jZga2iNQUhO6cIr2Py5V7evAyuXXhjRaBzIl/STRh58Jr9ctbhdBXnTQsbThexXLn"
,
"skin"
:
"noskin"
,
"ubid-main"
:
"133-5836273-3821350"
}
us_cookie_dict
=
{
"i18n-prefs"
:
"USD"
,
"lc-
main"
:
"en_US"
,
"session-id"
:
"137-6850115-5627367"
,
"session-id-time"
:
"2082787201l"
,
"session-token"
:
"NDR0kD9QBkk81plbn2yEFeEJ205I0LEzv9G4Q3nG90PVEmpZEtEiL6Ixj3alIu5KBvaiePi9IOHhzSh48qBNQH2yroklDf1+rDTtJ8HNzcfae0ZEL3/szlwFWXRsxH7UwfHip9R66v9ZoO5ni9mrz+4J6QgjTmrzCW7uVjTMthdesVgOk2SVmxyfoLQnmYuScp17v+ayrztH3VFIg/VyhzNBlVbkZDIHg2WB3F+U0JHkiQfiZB1q2uCMWOEIOwRkCGhaRrWHbBgpQtvHnf+Iw2KxVkl2gv1jZga2iNQUhO6cIr2Py5V7evAyuXXhjRaBzIl/STRh58Jr9ctbhdBXnTQsbThexXLn"
,
"skin"
:
"noskin"
,
"ubid-main"
:
"133-5836273-3821350"
}
uk_cookie_dict
=
{
"i18n-prefs"
:
"HKD"
,
"lc-acbuk"
:
"en_GB"
,
"session-id"
:
"521-9387816-0582061"
,
"session-id-time"
:
"2082787201l"
,
"session-token"
:
"
\"
ANvQ1SZ9VE6Y9cGcEdwhYcEpI1Jql7bOXZ7Q+qdBbb41IuuIFXvUzDSoaXOzPlcUTa3CIMoLIIzD2QOAWnN2h1E/c/iXEzpM8qd4LqR9otFV6PQecFkZKnC/i5v3QIguYgLt3KB9halOcaKi+KXcpG1b5jjYSicsMknbJMQVgfkiqMDXOLcxIns2cHs1xpXgf/DrjOHNpIB6Q24VKaqLTuKbWd8biHv86NGyoBNIs1kkjJxdwZhvWtQC/rHqoiz0e853W2lkrUrdr6Ko3dSIYojEBEgVuglCENJWLmkQzqEMxcIzNgcrpf4ZSlRHhMpmi2D14A6rwrwdo/JMozJSol+Wx3dL7j/AEGiXa1IWwrA=
\"
"
,
"ubid-acbuk"
:
"259-9451176-7460703"
}
de_cookie_dict
=
{
"i18n-prefs"
:
"HKD"
,
"lc-acbde"
:
"de_DE"
,
"session-id"
:
"258-7262769-1488647"
,
"session-id-time"
:
"2082787201l"
,
"session-token"
:
"RxPtESM/cUQ4JAxsgMz8crm9EMbFhHLK4vPgZzTj2w23v5yWkzjXZsVVsErU5Oi8qzT3Le8nxU1zxeDEWuAg2IL+5fb2VwhE9UOQpLMmnOBWaukpW0CKFMOTvJU6Fu7+IZhWpJWt+ypu5b0KEYVi7CyTsHKh38q8N2m7tGBwb94NyT4tLHyE69XovznI5HV7y19J0aJrDyEYBfk00m6/SwxxOhRbkOXWVIagnGK4N1uQ27n9827rd7o1TpHQc8sWnn1bDAvYbCtGOaUYUHJ8HxnZPsTjKZ2b8OgzIKE9nlvpoAAQOArarZlZxneJTfUY++ucym9Te+RUvPp6SrhpPJyP7W425mA9"
,
"ubid-acbde"
:
"257-6077242-9988057"
}
...
...
hjm_spider_code/app_amazon_image_recognition/fastapi_server.py
View file @
4b08a2f8
...
...
@@ -130,23 +130,21 @@ def search_image_api(item: ImageSearchRequest):
# VPS 标记为失败 (success=0)
logger
.
warning
(
f
"任务 {task_id} 状态3 VPS返回失败(success=0). 总耗时: {elapsed:.1f}s"
)
# 判断时间是否超过 1 分钟
if
elapsed
>
6
0
:
logger
.
error
(
"VPS失败状态3且已超时超过
6
0s,不再兜底,直接返回失败"
)
if
elapsed
>
3
0
:
logger
.
error
(
"VPS失败状态3且已超时超过
3
0s,不再兜底,直接返回失败"
)
return
JSONResponse
({
"code"
:
400
,
"msg"
:
"识别处理失败"
,
"data"
:
res
},
status_code
=
400
)
else
:
logger
.
info
(
"耗时未超
6
0s,转本地兜底重试..."
)
logger
.
info
(
"耗时未超
3
0s,转本地兜底重试..."
)
break
# 跳出循环,去执行本地逻辑
except
:
except
Exception
as
e
:
# 极少情况:JSON解析失败,视为失败走兜底
logger
.
error
(
f
"VPS结果处理异常: {e}"
)
break
break
if
elapsed
>
60
:
if
elapsed
>
30
:
logger
.
warning
(
f
"任务 {task_id} 状态为{state} 超时({elapsed:.1f})执行兜底"
)
break
time
.
sleep
(
0.3
)
# ============ 3. 本地兜底执行 ============
try
:
logger
.
info
(
f
"本地兜底执行: {image_url}"
)
...
...
hjm_spider_code/app_amazon_image_recognition/inv_img_double_search.py
View file @
4b08a2f8
...
...
@@ -18,8 +18,9 @@ from scrapy import Selector
from
requests.exceptions
import
RequestException
,
JSONDecodeError
,
Timeout
,
ConnectionError
,
HTTPError
from
loguru
import
logger
from
cookie_manager
import
cookie_manager
from
chrome_us_list
import
get_random_ua
# 导入配置
from
curl_cffi
import
requests
as
curl_cffi_requests
from
amazon_configs
import
(
site_name_secret_dict
,
us_devices_list
,
...
...
@@ -69,7 +70,7 @@ def get_image_size(image_data: bytes) -> Optional[Dict[str, int]]:
return
None
def
get_page_num
(
total
:
int
|
None
,
count_per_page
:
int
|
None
)
->
int
:
def
get_page_num
(
total
:
int
,
count_per_page
:
int
)
->
int
:
if
not
isinstance
(
total
,
int
)
or
not
isinstance
(
count_per_page
,
int
):
return
0
if
total
<=
0
or
count_per_page
<=
0
:
...
...
@@ -201,7 +202,7 @@ class AmazonImageSearch:
# 1. 检查 JSON 结构是否完整
if
"style-snap"
not
in
response_json
or
"searchResult"
not
in
response_json
[
"style-snap"
]:
# 结构不对,直接报错
raise
ValueError
(
"App端响应结构异常: 缺少 style-snap 或 searchResult"
)
raise
ValueError
(
"App端响应结构
异常: 缺少 style-snap 或 searchResult"
)
results
=
response_json
[
"style-snap"
][
"searchResult"
]
...
...
@@ -284,7 +285,7 @@ class AmazonImageSearch:
headers
=
{
"accept"
:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
,
"accept-language"
:
"zh-CN,zh;q=0.9"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/1
42
.0.0.0 Safari/537.36"
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/1
20
.0.0.0 Safari/537.36"
}
resp
=
self
.
_retry_request
(
"GET"
,
url
,
headers
=
headers
,
cookies
=
strict_cookies
)
...
...
@@ -303,7 +304,7 @@ class AmazonImageSearch:
"referer"
:
WEB_REFERER_URL
.
format
(
base_url
=
self
.
base_url
),
"accept"
:
"*/*"
,
"accept-language"
:
"zh-CN,zh;q=0.9"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/1
42
.0.0.0 Safari/537.36"
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/1
20
.0.0.0 Safari/537.36"
}
# 直接使用 bytes 数据
...
...
@@ -417,25 +418,59 @@ class AmazonImageSearch:
# logger.info(f'解析成功 当前页{len(items)}条数据')
return
items
def
_fetch_single_page
(
self
,
url
:
str
,
page
:
int
,
headers
:
Dict
[
str
,
str
])
->
List
[
Dict
[
str
,
Any
]]:
@staticmethod
def extract_chrome_version_from_ua(ua: str) -> Optional[int]:
    """Extract the Chrome major version from a User-Agent string.

    For example, a UA containing ``Chrome/136.0.7015.93`` yields ``136``.

    Args:
        ua: The User-Agent header value. ``None``, empty, or non-string
            values are tolerated and treated as "no Chrome token found".

    Returns:
        The major version as an ``int``, or ``None`` when *ua* does not
        contain a ``Chrome/<digits>.`` token.
    """
    # Guard first: re.search raises TypeError on None / non-string input,
    # which would otherwise propagate out of the header-building path.
    if not ua or not isinstance(ua, str):
        return None
    match = re.search(r"Chrome/(\d+)\.", ua)
    return int(match.group(1)) if match else None
@staticmethod
def get_tls_fingerprint_by_chrome_version(version: Optional[int]) -> str:
    """Map a Chrome major version to a curl_cffi ``impersonate`` target name.

    Args:
        version: Chrome major version, or ``None`` when it is unknown.

    Returns:
        The impersonation target string chosen for that version.
    """
    # Unknown version: use the "chrome120" target.
    if version is None:
        return "chrome120"

    # (minimum major version, target) pairs, checked from newest to oldest;
    # the first floor the version reaches wins.
    floors = (
        (130, "chrome131"),
        (120, "chrome120"),
        (110, "chrome110"),
        (100, "chrome101"),
        (95, "chrome99"),
    )
    for floor, target in floors:
        if version >= floor:
            return target

    # Default fallback for anything older than 95.
    # NOTE(review): this is a newer target than the 95-109 buckets return --
    # confirm that is intended.
    return "chrome104"
def _build_browser_headers(self):
    """Build request headers plus the TLS fingerprint matching their UA.

    Obtains a User-Agent from ``get_random_ua()``, derives its Chrome major
    version, and maps that to an impersonation target. Meant to be called
    once per outgoing request.

    Returns:
        A ``(headers, tls)`` tuple: the header dict carrying the chosen
        User-Agent, and the impersonation target string for that UA.
    """
    user_agent = get_random_ua()
    chrome_major = self.extract_chrome_version_from_ua(user_agent)
    fingerprint = self.get_tls_fingerprint_by_chrome_version(chrome_major)
    # Header names are reproduced exactly as the original spelled them
    # (mixed casing included).
    return {
        "accept": "text/html,application/xhtml+xml,*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "user-agent": user_agent,
    }, fingerprint
def
_fetch_single_page
(
self
,
url
:
str
,
page
:
int
)
->
List
[
Dict
[
str
,
Any
]]:
try
:
resp_p
=
self
.
_retry_request
(
"GET"
,
f
"{url}&page={page}"
,
headers
=
headers
,
cookies
=
self
.
cookies
)
headers
,
tls
=
self
.
_build_browser_headers
()
resp_p
=
curl_cffi_requests
.
get
(
f
"{url}&page={page}"
,
impersonate
=
tls
,
headers
=
headers
,
cookies
=
self
.
cookies
)
return
self
.
_parse_items
(
resp_p
.
text
)
except
Exception
as
e
:
logger
.
warning
(
f
"第{page}页失败: {e}"
)
return
[]
def
_fetch_results
(
self
,
url
:
str
)
->
Dict
[
str
,
Any
]:
headers
=
{
"accept"
:
"text/html,application/xhtml+xml,*/*"
,
"user-agent"
:
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
,
"viewport-width"
:
"1343"
}
all_items
=
[]
# logger.info("正在获取第1页...")
resp
=
self
.
_retry_request
(
"GET"
,
url
,
headers
=
headers
,
cookies
=
self
.
cookies
)
# resp = self._retry_request("GET", url, headers=headers, cookies=self.cookies)
headers
,
tls
=
self
.
_build_browser_headers
()
resp
=
curl_cffi_requests
.
get
(
url
,
headers
=
headers
,
impersonate
=
tls
,
cookies
=
self
.
cookies
)
logger
.
info
(
f
'tls版本为:{tls},ua为 {headers.get("user-agent")}'
)
page1_items
=
self
.
_parse_items
(
resp
.
text
)
# 第一页数据解析
all_items
.
extend
(
page1_items
)
# 第一页数据添加到列表
total_res
=
re
.
search
(
r'"totalResultCount":(\d+)'
,
resp
.
text
)
...
...
@@ -450,7 +485,7 @@ class AmazonImageSearch:
logger
.
info
(
f
"开始并发爬取第 2 到 {pages} 页..."
)
with
ThreadPoolExecutor
(
max_workers
=
4
)
as
executor
:
futures
=
[
executor
.
submit
(
self
.
_fetch_single_page
,
url
,
p
,
headers
)
executor
.
submit
(
self
.
_fetch_single_page
,
url
,
p
)
for
p
in
range
(
2
,
pages
+
1
)
]
for
future
in
as_completed
(
futures
):
...
...
@@ -460,7 +495,13 @@ class AmazonImageSearch:
df
=
pd
.
DataFrame
(
all_items
)
# 按 asin 列去重 再转回字典
# pandas 会把 None 转成 NaN, 在 to_dict 之后清洗, 否则 JSONResponse 序列化会报错
final_items
=
df
.
drop_duplicates
(
subset
=
[
'asin'
])
.
to_dict
(
'records'
)
if
not
df
.
empty
else
[]
for
item
in
final_items
:
for
key
in
item
:
v
=
item
[
key
]
if
isinstance
(
v
,
float
)
and
(
v
!=
v
):
# NaN != NaN 是 True
item
[
key
]
=
None
return
{
"total_items"
:
len
(
final_items
),
"items"
:
final_items
}
# === 图片下载方法 ===
...
...
@@ -660,14 +701,14 @@ class AmazonImageSearch:
if
__name__
==
"__main__"
:
# 测试用图片链接
# test_img_url = "https://soundasia.oss-cn-shenzhen.aliyuncs.com/yswg-img/SoundasiaAmazon/file/2025/1125/bdb9b06102184048b6eb9db3b39bb97e
.png"
test_img_url
=
"https://yswg-private-test.oss-cn-shenzhen.aliyuncs.com/SoundasiaAmazon/competitor_image/2025/1127/88e90bbd317a42ea80cc9128ea333e6c.svg"
# 图片权限问题
test_img_url
=
"https://soundasia.oss-cn-shenzhen.aliyuncs.com/yswg-img/SoundasiaAmazon/competitor_image/2026/0615/8cdddea95ac645ca8a822300ae717c4d
.png"
#
test_img_url = "https://yswg-private-test.oss-cn-shenzhen.aliyuncs.com/SoundasiaAmazon/competitor_image/2025/1127/88e90bbd317a42ea80cc9128ea333e6c.svg" # 图片权限问题
# test_img_url = "https://m.media-amazon.com/images/I/71IFE6W6THL._AC_UL320_.jpg"
# test_img_url = "https://m.media-amazon.com/images/I/71IFE6W6THL._AC_SY550_.jpg"
# test_img_url = "https://m.media-amazon.com/images/I/71G1BAeYlNL._AC_SX300_SY300_QL70_FMwebp_.jpg"
try
:
client
=
AmazonImageSearch
(
site_name
=
"u
k
"
)
client
=
AmazonImageSearch
(
site_name
=
"u
s
"
)
logger
.
info
(
"
\n
=== 测试默认模式 ==="
)
result
=
client
.
search
(
test_img_url
,
search_mode
=
"default"
)
# logger.success(f"Result: Success={default_result}")
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment