Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
dfdb4f08
Commit
dfdb4f08
authored
Nov 17, 2025
by
PC-20230618BYKI\Administrator
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
亚马逊新版app端 以图识物 支持默认识别和全图识别 增加了随机设备获取 和适配 uk us de站点 uk de未经过抓包测试
parent
9dcae35d
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
320 additions
and
0 deletions
+320
-0
app_img_search_api.py
...r_code/app_amazon_image_recognition/app_img_search_api.py
+320
-0
No files found.
hjm_spider_code/app_amazon_image_recognition/app_img_search_api.py
0 → 100644
View file @
dfdb4f08
import
hashlib
import
time
import
uuid
import
random
from
typing
import
Optional
,
Dict
,
Any
from
loguru
import
logger
import
requests
import
json
from
PIL
import
Image
from
requests.exceptions
import
RequestException
,
JSONDecodeError
# 导入所有站点相关配置
from
amazon_configs
import
(
site_name_secret_dict
,
us_devices_list
,
uk_devices_list
,
de_devices_list
,
us_cookie_dict
,
uk_cookie_dict
,
de_cookie_dict
)
# 配置日志
# logger.add("amazon_search_optimized.log", rotation="10 MB", level="INFO")
# Per-site runtime configuration: maps each site code to the device pool and
# the cookie dict imported from amazon_configs.
SITE_CONFIG_MAPPER = {
    site_code: {"devices": device_pool, "cookies": cookie_jar}
    for site_code, device_pool, cookie_jar in (
        ("us", us_devices_list, us_cookie_dict),
        ("uk", uk_devices_list, uk_cookie_dict),
        ("de", de_devices_list, de_cookie_dict),
    )
}

# Web URL template used to open a list of ASINs in the browser; the
# ``bbx_asin_list`` placeholder receives the "|"-joined ASINs.
AMAZON_SEARCH_BASE_URL = (
    "https://www.amazon.com/s"
    "?rh=p_78:{bbx_asin_list}"
    "&rank=asin-scores-asc-rank"
    "&searchMethod=CameraSearch"
)

# Retry policy: number of attempts, and pause in seconds between attempts.
RETRY_TIMES, RETRY_DELAY = 5, 1
def get_image_size(image_path: str) -> Optional[Dict[str, int]]:
    """Read the pixel dimensions of an image file.

    :param image_path: path of the image to open with PIL.
    :return: ``{"width": ..., "height": ...}``, or ``None`` when the file is
        missing or cannot be read (the failure is logged, never raised).
    """
    try:
        with Image.open(image_path) as picture:
            pixel_width, pixel_height = picture.size
    except FileNotFoundError:
        # A missing file gets its own message so it is easy to tell apart
        # from decoding problems.
        logger.error(f"图片文件未找到: {image_path}")
        return None
    except Exception as exc:
        logger.error(f"获取图片尺寸失败 ({image_path}): {exc}")
        return None
    else:
        return {"width": pixel_width, "height": pixel_height}
class AmazonImageSearch:
    """Client for the Amazon app ``style-snap`` image-recognition endpoint.

    One instance is bound to a single site. At construction it loads the
    site's secret/cookie/device configuration, picks one random device and
    generates a client device id; both are reused for every request made
    through the instance.
    """

    # Search modes accepted by search().
    _SEARCH_MODES = ("default", "full_image")

    def __init__(self, site_name: str):
        """Load the configuration for ``site_name`` and prepare request headers.

        :param site_name: site key; must exist in both ``site_name_secret_dict``
            and ``SITE_CONFIG_MAPPER``.
        :raises ValueError: if the site is unknown, has no device/cookie
            configuration, or its device list is empty.
        """
        # Validate the site before touching any configuration.
        if site_name not in site_name_secret_dict:
            raise ValueError(
                f"不支持的站点: {site_name},支持站点:{list(site_name_secret_dict.keys())}"
            )
        if site_name not in SITE_CONFIG_MAPPER:
            raise ValueError(f"站点 {site_name} 缺少设备或Cookie配置")

        self.site_name = site_name
        # Base site configuration; the keys read in this class are 'secret',
        # 'username', 'application' and 'snap_url'.
        self.site_config = site_name_secret_dict[site_name]
        # Device list and cookies for this site.
        self.site_specific = SITE_CONFIG_MAPPER[site_name]
        self.cookies = self.site_specific["cookies"]
        self.session_id = self.cookies.get("session-id", '')

        # One random device per client; its fields are merged into query_metadata.
        self.device_info = self._get_random_device()
        # Random client device id, stable for the lifetime of this instance.
        self.client_device_id = str(uuid.uuid4())

        logger.info(
            f"客户端初始化完成 - 站点: {self.site_name}, "
            f"随机设备: {self.device_info.get('clientDevice')}, "
            f"clientDeviceId: {self.client_device_id}"
        )

        # Request headers shared by every call.
        # NOTE(review): "ubid-main" is read for every site; presumably the
        # uk/de cookie dicts use a different ubid key - confirm against
        # amazon_configs, otherwise this header is sent empty for those sites.
        self.headers = {
            "x-amz-access-token": "",
            "x-amz-lens-session-auth-token": self.cookies.get("session-token", ""),
            "x-amz-lens-session-id": self.session_id,
            "x-amz-lens-ubid": self.cookies.get("ubid-main", ""),
            "accept-encoding": "gzip",
            "user-agent": "okhttp/4.9.1",
        }
        self.snap_url = f"{self.site_config['snap_url']}/style-snap/2.0"

    def _get_random_device(self) -> Dict[str, str]:
        """Pick one random entry from the site's device list.

        :raises ValueError: if the device list is empty.
        """
        devices = self.site_specific["devices"]
        if not devices:
            raise ValueError(f"站点 {self.site_name} 的设备列表为空")
        return random.choice(devices)

    def _generate_auth_params(self) -> Dict[str, str]:
        """Build the ``ts`` / ``authtoken`` pair sent with each request.

        ``ts`` is the current Unix time in whole seconds (as a string) and
        ``authtoken`` is the hex SHA-512 of secret + username + application + ts.
        """
        ts = str(int(time.time()))
        combined = (
            f"{self.site_config['secret']}{self.site_config['username']}"
            f"{self.site_config['application']}{ts}"
        )
        authtoken = hashlib.sha512(combined.encode("utf-8")).hexdigest()
        return {"ts": ts, "authtoken": authtoken}

    def _build_query_metadata(self, extra_params: Optional[Dict[str, str]] = None) -> str:
        """Serialize the ``query_metadata`` form field to a JSON string.

        :param extra_params: optional entries merged last, so they override
            both the base values and the device fields.
        :return: JSON text containing session/client identifiers, fixed client
            settings and the fields of the randomly chosen device.
        """
        base_params = {
            "amznSessionId": self.session_id,
            "clientVersion": "30.20.2.100",
            "cardsVersion": "1.0",
            "clientMessageVersion": "1.0",
            "amznDirectedCustomerId": "",
            "clientDeviceId": self.client_device_id,
            # A fresh clientId is generated for every call.
            "clientId": str(uuid.uuid4()),
            "sourceType": "Photo",
            "ingressSource": "ctp",
            "uiMode": "stylesnap",
            # Device fields come last, so they win over any key above.
            **self.device_info,
        }
        if extra_params:
            base_params.update(extra_params)
        return json.dumps(base_params)

    @staticmethod
    def _dedupe_asins(asins):
        """Return ``asins`` without duplicates, keeping first-seen order."""
        # dict keys keep insertion order, unlike set(), so the result is
        # deterministic and follows the order returned by the API.
        return list(dict.fromkeys(asins))

    @staticmethod
    def _build_bounding_box(image_size: Dict[str, int], offset: int) -> Dict[str, int]:
        """Return a crop box inset by ``offset`` pixels on every side.

        :param image_size: dict with ``width`` and ``height`` in pixels.
        :param offset: requested inset; it is reduced for very small images so
            the box never extends past the image.
        :return: dict with ``tlx``/``tly`` (top-left), ``brx``/``bry``
            (bottom-right) and ``imh``/``imw`` (image height/width).
        """
        width = image_size["width"]
        height = image_size["height"]
        # Clamp the inset so that inset + 1 still fits inside the shorter side.
        inset = max(0, min(offset, max(0, min(width, height) - 1)))
        return {
            "tlx": inset,
            "tly": inset,
            "brx": max(width - inset, inset + 1),
            "bry": max(height - inset, inset + 1),
            "imh": height,
            "imw": width,
        }

    def _parse_response(self, response_json: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract the ASIN list from an API response.

        Reads ``response_json["style-snap"]["searchResult"][0]["bbxAsinList"]``.

        :return: dict with ``is_app`` (1), ``asin_list_app`` ("|"-joined unique
            ASINs in response order) and ``search_url``; ``None`` when the list
            is empty or the response does not have the expected shape.
        """
        try:
            search_result = response_json["style-snap"]["searchResult"][0]
            bbx_asin_list = search_result.get("bbxAsinList", [])
            if not bbx_asin_list:
                logger.warning("响应中 bbxAsinList 为空")
                return None

            joined_asins = "|".join(self._dedupe_asins(bbx_asin_list))
            return {
                "is_app": 1,
                "asin_list_app": joined_asins,
                # NOTE(review): the template points at www.amazon.com for
                # every site, including uk/de - confirm this is intended.
                "search_url": AMAZON_SEARCH_BASE_URL.format(bbx_asin_list=joined_asins),
            }
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            # AttributeError covers a searchResult entry that is not a dict.
            logger.error(f"解析响应失败: {e}. 响应内容: {response_json}")
            return None

    def _perform_request(self, **kwargs) -> Dict[str, Any]:
        """POST with retries and return the decoded JSON body.

        :param kwargs: forwarded to ``requests.post``; ``timeout`` defaults to
            10 seconds when the caller does not provide one.
        :raises RequestException: after ``RETRY_TIMES`` failed attempts; the
            last underlying error is attached as ``__cause__``.
        """
        # setdefault avoids a duplicate-keyword TypeError if a caller passes timeout.
        kwargs.setdefault("timeout", 10)
        last_error = None
        for attempt in range(RETRY_TIMES):
            try:
                response = requests.post(**kwargs)
                response.raise_for_status()  # non-2xx responses raise HTTPError
                return response.json()
            except JSONDecodeError as e:
                last_error = e
                logger.error(
                    f"请求失败 (第 {attempt + 1}/{RETRY_TIMES} 次): 响应不是有效的JSON格式。"
                )
            except RequestException as e:
                last_error = e
                logger.error(f"请求失败 (第 {attempt + 1}/{RETRY_TIMES} 次): {e}")

            # Pause between attempts, but not after the final one.
            if attempt < RETRY_TIMES - 1:
                logger.warning(f"将在 {RETRY_DELAY} 秒后重试...")
                time.sleep(RETRY_DELAY)

        raise RequestException(
            f"API请求在 {RETRY_TIMES} 次尝试后最终失败。"
        ) from last_error

    def _default_search(self, image_path: str) -> Dict[str, Any]:
        """Step 1: upload the image for default recognition.

        :param image_path: local path of the image to upload.
        :return: decoded JSON response.
        :raises FileNotFoundError: if the image file does not exist.
        :raises RequestException: if the request fails after all retries.
        """
        logger.info(
            f"开始默认识别 (站点: {self.site_name}, 图片: {image_path}),站点链接:{self.snap_url}"
        )
        try:
            with open(image_path, "rb") as f:
                image_data = f.read()
        except FileNotFoundError:
            logger.error(f"无法读取图片文件: {image_path}")
            raise

        auth_params = self._generate_auth_params()
        # The default search additionally sends "orientation".
        query_metadata = self._build_query_metadata({"orientation": "-1"})

        # (None, value) tuples are sent as plain multipart form fields.
        files = {
            "application": (None, self.site_config['application']),
            "query_metadata": (None, query_metadata),
            "authtoken": (None, auth_params['authtoken']),
            "lang": (None, "en_US"),
            "username": (None, self.site_config['username']),
            "ts": (None, auth_params['ts']),
            "file": ("image.jpg", image_data, "image/jpeg"),
        }
        return self._perform_request(url=self.snap_url, files=files, headers=self.headers)

    def _full_image_search(self, query_id: str, image_path: str) -> Dict[str, Any]:
        """Step 2: request recognition over (almost) the whole image.

        No image is uploaded here; the request refers to the earlier upload
        through ``query_id`` and sends a bounding box instead.

        :param query_id: ``queryId`` returned by the default search.
        :param image_path: local image path, used only to read its dimensions.
        :return: decoded JSON response.
        :raises ValueError: if the image dimensions cannot be read.
        :raises RequestException: if the request fails after all retries.
        """
        logger.info(f"开始全图识别 (Query ID: {query_id[:10]}...)")

        image_size = get_image_size(image_path)
        if not image_size:
            raise ValueError("无法获取图片尺寸,无法进行全图搜索。")

        # Crop box inset by a random 0-2 px from every edge.
        bounding_box = self._build_bounding_box(image_size, random.randint(0, 2))

        auth_params = self._generate_auth_params()
        query_metadata = self._build_query_metadata()

        form_data = {
            "mainQueryId": (None, query_id),
            "uiMode": (None, "stl_bbx_reformulation"),
            "application": (None, self.site_config['application']),
            "query_metadata": (None, query_metadata),
            "authtoken": (None, auth_params['authtoken']),
            "inputBoundingBox": (None, json.dumps(bounding_box)),
            "imageHash": (None, ""),
            "lang": (None, "en_US"),
            "username": (None, self.site_config['username']),
            "ts": (None, auth_params['ts']),
        }
        return self._perform_request(url=self.snap_url, files=form_data, headers=self.headers)

    def search(self, image_path: str, search_mode: str = "default") -> Dict[str, Any]:
        """Run an image search.

        :param image_path: local path of the image file.
        :param search_mode: ``'default'`` or ``'full_image'``.
        :return: result dict with the keys ``is_web``, ``is_app``,
            ``asin_list_web``, ``asin_list_app``, ``asin_list_join``,
            ``site_name``, ``search_url`` and ``mode``. On any failure the
            default (empty) values are returned and the error is logged;
            request and parsing errors are not propagated to the caller.
        """
        # Default return value, handed back unchanged on every failure path.
        base_result = {
            "is_web": 0,
            "is_app": 0,
            "asin_list_web": "",
            "asin_list_app": "",
            "asin_list_join": "",
            "site_name": self.site_name,
            "search_url": None,
            "mode": search_mode,
        }

        if not self.session_id:
            logger.error("Cookies中缺少'session-id',无法继续。")
            return base_result

        # Reject unknown modes before uploading anything.
        if search_mode not in self._SEARCH_MODES:
            logger.error(f"不支持的搜索模式: {search_mode}")
            return base_result

        try:
            # Step 1: default search (full-image mode needs its queryId).
            response = self._default_search(image_path)

            if search_mode == "full_image":
                query_id = response.get("queryId")
                if not query_id:
                    logger.error("默认识别未返回 queryId,无法进行全图搜索。")
                    return base_result
                # Step 2: second request carrying the queryId.
                response = self._full_image_search(query_id, image_path)

            parsed_result = self._parse_response(response)
            if parsed_result:
                base_result.update(parsed_result)
                base_result["asin_list_join"] = parsed_result["asin_list_app"]
        except Exception as e:
            # Top-level boundary: callers always get a result dict back.
            logger.error(f"处理 '{search_mode}' 模式搜索时发生异常: {e}")

        return base_result
if __name__ == "__main__":
    # Sample image used for this manual smoke run.
    image_file_path = "temp_img/B0BYNB2J6W.jpg"

    # (log banner, search_mode) pairs, exercised in this order for each site.
    mode_runs = (
        ("\n--- 默认识别模式 ---", "default"),
        ("\n--- 全图识别模式 ---", "full_image"),
    )

    try:
        # Exercise every configured site in turn.
        for site in ("us", "uk", "de"):
            logger.info(f"\n{'=' * 20} 测试站点: {site} {'=' * 20}")
            client = AmazonImageSearch(site_name=site)

            for banner, mode in mode_runs:
                logger.info(banner)
                result = client.search(image_file_path, search_mode=mode)
                logger.info(f"结果: {result}")
    except ValueError as e:
        # Raised by the constructor for an unknown or unconfigured site.
        logger.error(f"初始化失败: {e}")
    except Exception as e:
        logger.error(f"执行过程中发生错误: {e}")
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment