selection-new / spider · Commits · 000d315d

Commit 000d315d, authored Jul 28, 2025 by Peng
Commit message: no message
Parent: 8dd9963f

Showing 7 changed files with 176 additions and 77 deletions (+176, −77)
py_spider/amazon_every_day_spider/get_junglescout_rank.py    +4    −4
py_spider/amazon_every_day_spider/junglescout_spider.py      +3    −3
py_spider/amazon_save_db/save_all_syn_st_minid_maxid.py      +1    −1
py_spider/amazon_spider/asin_detail_pg.py                    +3    −4
py_spider/amazon_spider/recall_cases_spider.py               +118  −59
py_spider/amzon_parse_db_html/pares_html.py                  +3    −3
py_spider/utils/asin_parse.py                                +44   −3
py_spider/amazon_every_day_spider/get_junglescout_rank.py

@@ -350,7 +350,7 @@ def junglescout_spider(db_base):
         "Accept-Encoding": "gzip, deflate, br, zstd",
         "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
         "Cache-Control": "no-cache",
-        'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _gcl_au=1.1.420472597.1749119222.719336435.1751886424.1751886424; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751516385,1751886410,1751973053,1752031904; HMACCOUNT=800EBCCFB4C6BBFB; 65722c3d8208b58d42f9=7dc2ebaa5e4a51182da4ade1aacd8dc4; rank-guest-user=6159802571t3e3obe8rwmCywrH0Xq28vOMfd8Q+siSpAi1WiGPGuuMcYrYhXyf/QpgeBCBdgCT; rank-login-user=6159802571t3e3obe8rwmCywrH0Xq28mIqu6gO0eXYPrSqY9RlSIznMsavLuIJkOkjELzcr/d1; rank-login-user-info="eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjYxNTk4MDI1NzF0M2Uzb2JlOHJ3bUN5d3JIMFhxMjhtSXF1NmdPMGVYWVByU3FZOVJsU0l6bk1zYXZMdUlKa09rakVMemNyL2QxIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJJUGFLc3VqMkZsUmpPR1NRQnIxYkJRIiwiaWF0IjoxNzUyMDMxOTE2LCJleHAiOjE3NTIxMTgzMTYsIm5iZiI6MTc1MjAzMTg1Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.mLIjN_qO4K8w18IDVa0GCRY3MODTmJhZlQaPbgBjeYJRPDwteHfkfqFS_GFyLu4svoahzyFRxkdnKhxs1x90QxQ-7QCwjwypbk8On6gMarKl8jopo9sJbZITvk8mrqtoT6N34LZ1ash35iAkIuPZONPMH8_cp5NxiSC70J12fvIT9ZXp-9zvEk6WV8qQ3pRr0yRuGnSsuWjVvDE9WRNpE3ZmYS_EUBroA51yBEPdS8aBThRuuVGt4HuqrPXp9ZwHoiOcRYu1VcQu-wpIAhLfXcnY1vJA3FXm7w_H00DOGZuM9HRcxdg6Fj-2WP5FvCxbE8z5n1-zbQMs_J8JVaVXgQ; ao_lo_to_n="6159802571t3e3obe8rwmCywrH0Xq28osFyhyxlRsfXXDx9AUjMD2qAFgWUPkLF84KewBkZoL5OL21x5jznuxdPNdiJfglPNE7YH03Vk5CofaP+MGH3y8="; _gaf_fp=01fef3c14bfcaf5a01438f74a677e95a; _ga_38NCVF2XST=GS2.1.s1752031904$o47$g1$t1752031923$j41$l0$h1543227925; _ga_CN0F80S6GL=GS2.1.s1752031906$o46$g1$t1752031924$j42$l0$h0; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752035308; JSESSIONID=165F9BAA752FE5B22CCD7C5BB7B62F2F',
+        'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1752031904,1752460043,1752653436,1753353401; HMACCOUNT=800EBCCFB4C6BBFB; 894cdd1d9741ce0c9757=827b7d3d13ed7bd6b4b1b24d0246b3dc; 3d854e1bcd61963fdf05=38fcb3b742a48aa345ddfd7136bc60ee; _gaf_fp=f297033bfe53aa9891ffe2842271566b; _gcl_au=1.1.420472597.1749119222.1054917286.1753685435.1753685437; rank-guest-user=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9lihUCQIaVmrHXjbpSRP/Ca0F; rank-login-user=6303473571KK6FnhfedvWg9tSSyk3xj2GRIc/8HSm4vuPYVHI5vKLXnssgei5ccK1dG8fkQSFI; rank-login-user-info=eyJuaWNrbmFtZSI6IuW4heWTpSIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTgzKioqKjczNDciLCJ0b2tlbiI6IjYzMDM0NzM1NzFLSzZGbmhmZWR2V2c5dFNTeWszeGoyR1JJYy84SFNtNHZ1UFlWSEk1dktMWG5zc2dlaTVjY0sxZEc4ZmtRU0ZJIn0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJKc2pZSlZWeFZzTVptVWFvMzgtZ3RRIiwiaWF0IjoxNzUzNjg1NDM2LCJleHAiOjE3NTM3NzE4MzYsIm5iZiI6MTc1MzY4NTM3Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIxNSwicGkiOm51bGwsIm5uIjoi5biF5ZOlIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxODMwNzk2NzM0NyIsImVtIjoiMzE1OTk4MDg5MkBxcS5jb20iLCJtbCI6IkcifQ.EaQ7Md7iVOpjZDogkiS2DlndhFPt3GzL2t33LXnh9Z5Itr3A8scFM_tzrYuzXqF6a-BDIMFe90SdDtU18zs9WTTl6_Phv3AEqcDe6WDfPAhB_KMa15VYAE5-b9d3lgIukKR8ZZyAMpiJzcmIWShmqxrhCNQD0ER3b7idaJpSrJiKnwV-tj6La52WJ6BmVRAk8gst0p5h-SYVnNz9iNaSXLc2Dx-hHZvMVNU27yfbJgKPpzRxgh7TOD7O-cT0WrEoKvTSw9e81gG9bgvKuA_bD-z3ePhgM6prUfceWszD88KH8PcXua9s_8ZM4bgrMyKMHswLtwyLhWePcvtHUp6yyQ; ao_lo_to_n=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9liibP9br/hwQ1Dlb4xDZyVPrTQIst5JCVz4PpnUIlDMGE07YVPYBWOm3Hrx4PaVkgaQ=; _ga_38NCVF2XST=GS2.1.s1753685428$o61$g1$t1753685444$j44$l0$h984121357; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1753685445; _ga_CN0F80S6GL=GS2.1.s1753685429$o59$g1$t1753685445$j44$l0$h0; JSESSIONID=F09543D3A3D6F890BAD0F422FCA49942',
         "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
     }
     url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
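The only change in this hunk is the refreshed hard-coded SellerSprite session cookie (new rank-login-user, Sprite-X-Token and JSESSIONID values). As a hedged aside, one way to stop re-committing rotating credentials is to load them from the environment; a minimal sketch, assuming a SELLERSPRITE_COOKIE variable that is not part of this repo:

import os

# Assumption: the session cookie is exported as SELLERSPRITE_COOKIE
# before the spider runs, instead of being pasted into the source.
headers = {
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
    "Cache-Control": "no-cache",
    "Cookie": os.environ.get("SELLERSPRITE_COOKIE", ""),
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}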
@@ -368,7 +368,7 @@ def junglescout_spider(db_base):
             response = json.loads(response.text)
             break
         except:
-            time.sleep(random.uniform(15, 35.75))
+            time.sleep(random.uniform(15, 30.75))
     response_data = response['data']
     print('code::', response['code'])
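This hunk only narrows the retry back-off ceiling from 35.75 s to 30.75 s. For reference, the surrounding pattern is a bounded retry loop with a randomized sleep between attempts; a self-contained sketch of that shape (the fetch_with_retry name and attempts parameter are illustrative, not from the repo):

import json
import random
import time

def fetch_with_retry(do_request, attempts=4):
    # Try the request a few times; on any failure, sleep a random
    # interval (as tuned in this commit) before the next attempt.
    for _ in range(attempts):
        try:
            response = do_request()
            return json.loads(response.text)  # parsed dict, as in the diff
        except Exception:
            time.sleep(random.uniform(15, 30.75))
    return None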
@@ -387,7 +387,7 @@ def junglescout_spider(db_base):
             print('获取数据:', category_name['name'], i, est, year_month)
             sales = int(est)
             name_rnak_list.append((category_name['name'], i, sales, year_month))
-            time.sleep(random.uniform(20, 75.75))
+            time.sleep(random.uniform(20, 65.75))
             # break
         for i in range(4):
             try:

@@ -408,7 +408,7 @@ def junglescout_spider(db_base):
     cursor_us_mysql_db, db_us = db_class_us.us_mysql_db()  # us 站点 mysql
     time.sleep(20)
     print('当前完成。获取下一个分类销量')
-    time.sleep(random.uniform(120, 240.5))
+    time.sleep(random.uniform(90, 200.5))

 def save_site_category(site_bsr_dict=None):
py_spider/amazon_every_day_spider/junglescout_spider.py

@@ -85,12 +85,12 @@ def junglescout_spider(db_base):
         "Accept-Encoding": "gzip, deflate, br, zstd",
         "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
         "Cache-Control": "no-cache",
-        'Cookie': '_ga=GA1.1.19240078.1751854600; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751854601; HMACCOUNT=28ABEEABEFA97E4A; _gcl_au=1.1.536675967.1751854601; MEIQIA_TRACK_ID=2zWlEnsYAqnZRdhJqJ5txX7tpXm; MEIQIA_VISIT_ID=2zWlEmUkBQV745rliAtXEdAk0CJ; ecookie=ZyZ05gxOxlDTPkM1_CN; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=e03eac62da4f8988dc796341e1bd822c; current_guest=jsxcNvsgBJO1_250707-100340; rank-login-user=502219157192wVgAJpdturGN5Im+nPDQqTtoVYwVNo1oWP9MD0mtMHFwS3LrhtAUhuCnvMHsCl; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjUwMjIxOTE1NzE5MndWZ0FKcGR0dXJHTjVJbStuUERRcVR0b1ZZd1ZObzFvV1A5TUQwbXRNSEZ3UzNMcmh0QVVodUNudk1Ic0NsIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiIwZ01FdlJuNWJ1dlZhVW5IZ1lKSDFRIiwiaWF0IjoxNzUxODU0NjA1LCJleHAiOjE3NTE5NDEwMDUsIm5iZiI6MTc1MTg1NDU0NSwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Ujr6_K3vHIQRw3x52QAQdTftMy6GbZ_TunmFMgW76onCy3EkBzx7uxEv-42zRRXgKLMUfJz2t0ierqXV6Evh9i-o5F0ZUBREzm48LHpGSw6Iupjx4Udc3VQwVqgiUOmYBvnTAQqmaj6iA5l06zAZcVNHQASZ5xe5QFUCllIOL0m8tf3Xad6T8u5oLHRHTTuyy5nDAqLu6ZxVOqUYYXsIzq9H2qAsPhqIgRy_5Av1zyoAcQErddadCe25H_ILmKO0Az9ANIFg4o1r_is_VFVZpGvbz8nCN0JLuY3uajAjf2JXoEzhHT9YbMP0o2TrZDRPdORV3HVK1N5uvghRaRyJvw; ao_lo_to_n="502219157192wVgAJpdturGN5Im+nPDfbd9htCMUGF/tdMS8/gmBNzv9/utYT5ucwmHHPC71S6i4RnT3fLUZW/nDI61eZx1uqLqr+hBy0X/aeJ6c/sSSc="; rank-guest-user=502219157192wVgAJpdturGN5Im+nPDYsyQgRxjbXtKYdDjju8ax0OkcsNUNGWP3xY6uiwKVVO; JSESSIONID=96FF611DCBDF20B9C6C921EAD2A55205; _ga_38NCVF2XST=GS2.1.s1751854600$o1$g1$t1751854612$j48$l0$h1855838417; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1751854612; _ga_CN0F80S6GL=GS2.1.s1751854600$o1$g1$t1751854613$j47$l0$h0',
+        'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; a40ac813159995d028ba=3d9b7c15f5787387e62acd734f598f23; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751973053,1752031904,1752460043,1752653436; HMACCOUNT=800EBCCFB4C6BBFB; rank-guest-user=8301172571YFpPM/DhYDVQzRAgRu7tcQTFTi48nSnOk/TNMkep2gdtR77QXyNfDPmFlYbdSsdL; rank-login-user=8301172571YFpPM/DhYDVQzRAgRu7tcWqD2KCbe1WiKcOarfxTCdls3AJ9lNFy+VA8a+RTm195; rank-login-user-info=eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjgzMDExNzI1NzFZRnBQTS9EaFlEVlF6UkFnUnU3dGNXcUQyS0NiZTFXaUtjT2FyZnhUQ2RsczNBSjlsTkZ5K1ZBOGErUlRtMTk1In0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJLcVRRV2RPbVNNcjlKTU1qYTdXRjFRIiwiaWF0IjoxNzUyNjUzNDM4LCJleHAiOjE3NTI3Mzk4MzgsIm5iZiI6MTc1MjY1MzM3OCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.caY2QxTbtUVg7CQXvNJcmVo1YU0TGy3AD01dIddF76PHjYbbFh5a8zZAdAXnAKM1wNcs39d1MM8Wa-uoXHiitqDlCZsWyF9aXzco9L4wn-yU4xlMYsf7VoquZI6bxaMT2TNeX6vgQBod-NeXHYFpZQWdrH5sfZHQypkpRINb_o1QwaWvZrjufj1UwYdiypryBxTDyCuLfD4djU0PLMRXvifY6Ef86VNjAlsY8gFqDdHiVLixR2GWGdKRtoG74Ak5DX2eMDT6ak-OMrWYOaikthxIXiqdADTq2tvUCmjO4pE0oYnWhSEx9-UABo7jxJ0v_Af8B6AVu7ccC0NUUvWBMA; ao_lo_to_n=8301172571YFpPM/DhYDVQzRAgRu7tca/7vKUOAtDW4w4LhsAzrvlsqk8xCK+opMY27DGtrDKlwUwhqg///+C6QOw12iRKNIq9mCOV5+ORmOA+PwqisF4=; _gaf_fp=0f3f9e0c791b5513d38aa715d0624aab; _gcl_au=1.1.420472597.1749119222.448034571.1752653439.1752653439; JSESSIONID=0F617D64E2FD6DD92F3BB10935E3C846; _ga_38NCVF2XST=GS2.1.s1752653436$o51$g1$t1752653450$j46$l0$h366949276; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752653451; _ga_CN0F80S6GL=GS2.1.s1752653437$o50$g1$t1752653451$j46$l0$h0',
         "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
     }
     url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
     data = {
-        "station": "US",
+        "station": "UK",
         "cid": category_name['c_id'],  # 分类id
         "bsr": f"{i}"  # 排名
     }
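Read together, this hunk shows one sales-estimator request: a station code (switched here from "US" to "UK"), a category id and a BSR rank posted to bsr.json with the headers above. A minimal sketch of that call, assuming plain requests semantics — elsewhere this repo posts through an impersonate-capable client, so treat the transport as an assumption:

import requests  # assumption: the repo may use curl_cffi's requests instead

url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
payload = {
    "station": "UK",   # marketplace; this commit flips it from "US"
    "cid": "12345",    # placeholder for category_name['c_id'] (分类id)
    "bsr": "100",      # placeholder for the loop variable i (排名)
}
resp = requests.post(url, data=payload, timeout=60)  # headers omitted for brevity
print(resp.status_code)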
@@ -167,7 +167,7 @@ def save_site_category(site_bsr_dict=None):
 def run():
     # get_cid()
-    junglescout_spider('us')
+    junglescout_spider('uk')

 if __name__ == '__main__':
py_spider/amazon_save_db/save_all_syn_st_minid_maxid.py

@@ -158,7 +158,7 @@ if __name__ == '__main__':
     month = 7
     engine_db_num = 14
     # for site in ['de','uk']:
-    for site in ['uk']:
+    for site in ['us']:
         time.sleep(0)
         count_all_syn_st_id(site_name=site, month=month).get_minid_maxid()
         # count_all_syn_st_id(site_name=site,month=month,engine_db_num=engine_db_num).search_term_syn()
py_spider/amazon_spider/asin_detail_pg.py

@@ -214,7 +214,8 @@ class async_asin_pg():
                 'created_time': new_date, 'current_asin': items['current_asin'],
                 'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
                 'bundles_this_asins_json': items['bundles_this_asins_data_json'],
-                'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json']
+                'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
+                'bundle_asin_component_json': items['bundle_asin_component_json']
             }
             if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
                 item['five_six_val'] = items['five_six_val']

@@ -222,8 +223,6 @@ class async_asin_pg():
                 item['five_six_val'] = None
             # 第二次请求
             _response_text = None
-            # if (item['weight'] is None and item['volume'] is None and item['rank'] is None and item[
-            #     'launch_time'] is None) or (item['variat_num'] > 0 and is_variat == '0'):
             if item['variat_num'] > 0 and is_variat == '0':
                 self.request_total_count_list.append(4)
             if item['variat_num'] > 0:

@@ -478,7 +477,7 @@ class async_asin_pg():
     def run(self):
         asin_list = self.save_asin_detail.read_db_data()
-        # asin_list = ['B07BXM8RZ3|2025-01|1|1|null|null','B07FM8P1Z1|2025-01|1|1|null|null','B07TWHCK69|2025-01|1|1|null|null']
+        # asin_list = ['B0BPKK2BMN|2025-01|1|1|null|null']
         if asin_list:
             for asin in asin_list:
                 self.queries_asin_queue.put(asin)
py_spider/amazon_spider/recall_cases_spider.py

@@ -14,7 +14,7 @@ import html
 import re
 from html import unescape
 import urllib.parse
+from sqlalchemy import text

 class recall_cases():
@@ -90,6 +90,15 @@ class recall_cases():
         if response_detail:
             recall_date_list = response_detail.xpath("//div[contains(text(),'Recall Date:')]/parent::div/text()")
             product_title_list = response_detail.xpath("//div[contains(text(),'Name of Product:')]/parent::div/text()")
+            if product_title_list:
+                matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title_list[-1].strip())
+                if matches:
+                    brand = ','.join(matches)
+                else:
+                    brand = None
+            else:
+                brand = None
             hazard_list = response_detail.xpath("//div[contains(text(),'危险:')]/parent::div//p//text()")
             image_url_list = response_detail.xpath("//div[@id='recall-gallery-img']//li/img/@src")
             recall_date = recall_date_list[-1].strip() if recall_date_list else None  # 召回日期
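The added branch derives a brand guess by collecting runs of Latin words (hyphens and ® included) from the product title, which effectively filters out CJK text. A standalone check of that regex on an illustrative mixed-language title (the sample string is made up):

import re

title = "保温杯 Stanley Quencher 旅行水杯"  # illustrative, not from the dataset
matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', title.strip())
brand = ','.join(matches) if matches else None
print(brand)  # -> Stanley Quencher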
@@ -98,7 +107,7 @@ class recall_cases():
             image_url = 'https://www.cpsc.gov' + image_url_list[0].strip() if image_url_list else None  # 图片
             if recall_date:
                 recall_date = self._parse_date_str(recall_date)
-            data_list = ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href]
+            data_list = ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href, brand]
             return data_list
         else:
             return None
@@ -140,7 +149,7 @@ class recall_cases():
                 df = pd.DataFrame(data=save_data_list,
                                   columns=['data_type', 'recall_date', 'product_title', 'hazard',
                                            'image_url',
-                                           'ext_url', 'recall_title', 'country'])
+                                           'ext_url', 'brand', 'recall_title', 'country'])
                 df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
             except:
                 is_None = False
@@ -207,6 +216,7 @@ class recall_cases():
         dict_item = response.json()
         data_lists = dict_item['data']
         for data in data_lists:
+            print(data, '344444444')
             data_list = []
             try:
                 # 逐项解码
@@ -234,18 +244,24 @@ class recall_cases():
                 response2 = self._request(headers=headers, url=url)
                 response_detail = etree.HTML(response2.text)
                 src_list = response_detail.xpath("//div[@id='recall-photos']//img/@src")
+                Brand_list = response_detail.xpath("//div[contains(text(),'Brand Name')]/following-sibling::div//text()")
+                if Brand_list:
+                    brand = ''.join(Brand_list).strip()
+                else:
+                    brand = None
+                print(brand, 'Brand_list::', Brand_list)
                 if src_list:
                     image_url = 'https://www.fda.gov' + src_list[0]
                 else:
                     image_url = None
                 print('image_url:', image_url)
                 data_list.append(['us_fba_recalls', date, link_text, hazard, image_url, url, recall_title, 'us',
-                                  product_category])
+                                  product_category, brand])
                 try:
                     df = pd.DataFrame(data=data_list,
                                       columns=['data_type', 'recall_date', 'product_title', 'hazard',
                                                'image_url',
-                                               'ext_url', 'recall_title', 'country', 'product_category'])
+                                               'ext_url', 'recall_title', 'country', 'product_category', 'brand'])
                     df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
                     df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
                 except:
@@ -260,7 +276,7 @@ class recall_cases():
     def ec_europa_eu(self):
         '欧盟召回'
-        for i in range(0, 32):
+        for i in range(1, 33):
             url = 'https://ec.europa.eu/safety-gate-alerts/public/api/notification/carousel/?'
             data = {"language": "en", "page": f"{i}"}
             headers = {
@@ -284,6 +300,7 @@ class recall_cases():
             print(data, '请求列表页url:', url)
             is_None = True
             response = requests.post(url, headers=headers, impersonate="chrome120", timeout=120, json=data)
+            print(response.url)
             if response:
                 content = response.json()['content']
                 for ids in content:
@@ -314,11 +331,17 @@ class recall_cases():
                     recall_title = items['product']['versions'][0]['description']
                     print(product_title)
                     print(recall_title)
+                    brands = items['product']['brands']
+                    if brands:
+                        brand = brands[0].get('brand')
+                    else:
+                        brand = None
+                    print('brand::1', brand)
                     hazard = items['risk']['versions'][0]['riskDescription']
                     print(hazard)
-                    ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(items['id']) + '?lang=en'
+                    ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(
+                        items['id']) + '?lang=en'
+                    print('ext_url::', ext_url)
                     if items['product']['photos']:
                         image_id = items['product']['photos'][0]['id']
                         image_url = f'https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{image_id}'
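The brand now comes from the Safety Gate payload's product.brands list, guarded against an empty list. A slightly more defensive variant, sketched under the assumption that 'product' or 'brands' could also be absent from the JSON (the diff itself does not guard for that):

def extract_brand(items):
    # items is the parsed alert JSON; brands is a list of {'brand': ...} dicts
    brands = (items.get('product') or {}).get('brands') or []
    return brands[0].get('brand') if brands else None

print(extract_brand({'product': {'brands': [{'brand': 'ACME'}]}}))  # -> ACME
print(extract_brand({'product': {}}))                               # -> None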
@@ -326,30 +349,36 @@ class recall_cases():
                         image_url = None
                     print(image_url)
                     data_list.append(
-                        [date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu', ext_url,
-                         data_json])
+                        [date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu',
+                         ext_url, data_json, brand])
-                    # try:
-                    #     df = pd.DataFrame(data=data_list,
-                    #                       columns=['recall_date', 'product_category', 'product_title', 'recall_title',
-                    #                                'hazard', 'data_type', 'image_url',
-                    #                                'country', 'ext_url', 'data_json'])
-                    #     df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
-                    #     df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
+                    keys = [
+                        "recall_date", "product_category", "product_title", "recall_title",
+                        "hazard", "data_type", "image_url", "country", "ext_url", "data_json", "brand"
+                    ]
+                    # 把 list of list 转成 list of dict
+                    dict_list = [dict(zip(keys, row)) for row in data_list]
                     with self.mysql_db.begin() as conn:
                         conn.execute(
-                            f"insert into recall_cases_data (recall_date, product_category, product_title,recall_title,hazard,"
-                            f"data_type,image_url,country,ext_url,data_json) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE recall_date = values(recall_date),product_title=values (product_title),ext_url=values (ext_url)",
-                            data_list)
-                    # except:
-                    #     is_None = False
-                    #     break
+                            text("""
+                                INSERT INTO recall_cases_data
+                                (recall_date, product_category, product_title, recall_title, hazard, data_type, image_url, country, ext_url, data_json, brand)
+                                VALUES (:recall_date, :product_category, :product_title, :recall_title, :hazard, :data_type, :image_url, :country, :ext_url, :data_json, :brand)
+                                ON DUPLICATE KEY UPDATE
+                                    recall_date = VALUES(recall_date),
+                                    product_title = VALUES(product_title),
+                                    ext_url = VALUES(ext_url)
+                            """),
+                            dict_list
+                        )
                 if is_None == False:
                     break
             else:
                 break
             time.sleep(random.uniform(2, 8))

     def ec_europa_uk(self):
         'https://www.gov.uk/product-safety-alerts-reports-recalls?page=2'
         url = 'https://www.gov.uk/product-safety-alerts-reports-recalls'
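The conn.execute rewrite in the hunk above swaps a %s-style f-string INSERT fed a list of lists for a SQLAlchemy text() statement with named binds fed a list of dicts, so each row binds by column name while ON DUPLICATE KEY UPDATE preserves the MySQL upsert. A condensed, self-contained sketch of the same pattern (engine URL, column subset and sample row are placeholders):

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:pwd@localhost/db")  # placeholder DSN

keys = ["recall_date", "product_title", "ext_url"]
data_list = [["2025-07-28", "Example product", "https://example.com/alert/1"]]
# Turn list-of-lists into list-of-dicts so the named binds line up by key
dict_list = [dict(zip(keys, row)) for row in data_list]

with engine.begin() as conn:
    conn.execute(
        text("""
            INSERT INTO recall_cases_data (recall_date, product_title, ext_url)
            VALUES (:recall_date, :product_title, :ext_url)
            ON DUPLICATE KEY UPDATE product_title = VALUES(product_title)
        """),
        dict_list,  # a list of dicts triggers executemany
    )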
@@ -392,6 +421,8 @@ class recall_cases():
                 print('产品标题:', product_title)
                 hazard_list = resp_html.xpath("//p[contains(text(),'Hazard:')]/text()")
                 print('风险:', hazard_list)
+                Brand_list = resp_html.xpath("//td[contains(text(),'Brand')]/following-sibling::td/text()")
+                brand = Brand_list[0].strip() if Brand_list else None
                 image_url_list = resp_html.xpath("//span[@class='attachment-inline']/a/@href")
                 product_category = product_category[0].strip() if product_category else None
                 product_title = product_title[0].strip().replace('Product:', '') if product_title else None
@@ -399,13 +430,13 @@ class recall_cases():
                 image_url_list = image_url_list[0].strip() if image_url_list else None
                 data_list.append(
                     [recall_title, detail_url, recall_date, product_category, product_title,
-                     hazard_list, image_url_list, 'uk_recall', 'uk'])
+                     hazard_list, image_url_list, 'uk_recall', 'uk', brand])
         if data_list:
             try:
                 df = pd.DataFrame(data=data_list,
                                   columns=['recall_title', 'ext_url', 'recall_date', 'product_category',
                                            'product_title',
-                                           'hazard', 'image_url', 'data_type', 'country'])
+                                           'hazard', 'image_url', 'data_type', 'country', 'brand'])
                 df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
                 df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
@@ -554,9 +585,12 @@ class recall_cases():
     def get_globalrecalls(self):
         # sql = 'SELECT data_json FROM global_recalls_data'
         # df_data = pd.read_sql(sql, con=self.mysql_db)
         list_url = 'https://globalrecalls.oecd.org/ws/search.xqy?end=20&lang=en&order=desc&q=&sort=date&start=0&uiLang=en'
+        # list_url = f'https://globalrecalls.oecd.org/ws/search.xqy?end={i}&lang=en&order=desc&q=&sort=date&start={i - 20}&uiLang=en'
+        print('请求url', list_url)
+        # 'https://globalrecalls.oecd.org/ws/search.xqy?end=200&lang=en&order=desc&q=&sort=date&start=180&uiLang=en'
         headers = {
             'Accept': '*/*',
             'Accept-Encoding': 'gzip, deflate, br, zstd',
             'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
@@ -574,7 +608,8 @@ class recall_cases():
             for result in result_list:
                 countryId = result['countryId']
                 imageUri = result['imageUri']
                 if countryId.lower() in ['us', 'ca', 'mx', 'nl', 'sa', 'se', 'pl', 'tr', 'be', 'uk', 'de', 'es', 'fr', 'it',
                                          'jp']:
                     date_time = result['date']
                     extUrl = result['extUrl']
@@ -600,7 +635,11 @@ class recall_cases():
                     if 'ENTITY_NOT_FOUN' in resp.text:
                         continue
                     items_data = resp.json()
+                    brands = items_data['product']['brands']
+                    if brands:
+                        brand = brands[0].get('brand')
+                    else:
+                        brand = None
                     time.sleep(random.uniform(1, 3))
                     items['country'] = countryId
                     items['reacll_time'] = date_time
@@ -616,17 +655,19 @@ class recall_cases():
                     items['data_type'] = 'global_recalls'
                     items['product_title'] = re.findall(r'^(.*?)\s*;', title_name + ';')[0]
                     items['ext_url'] = extUrl
+                    items['brand'] = brand
                     data_json = json.dumps(items_data)
                     data_list.append([items['data_type'], items['product_title'], items['productCategory'],
                                       items['reacll_time'], items['riskDescription'], items['country'],
                                       items['image_url'],
-                                      items['recall_title'], items['ext_url'], data_json])
-                    print(items)
+                                      items['recall_title'], items['ext_url'], data_json, items['brand']])
+                    print('itemsitems::', items)
             try:
                 df = pd.DataFrame(data=data_list,
                                   columns=['data_type', 'product_title', 'product_category', 'recall_date',
                                            'hazard',
-                                           'country', 'image_url', 'recall_title', 'ext_url', 'data_json'])
+                                           'country', 'image_url', 'recall_title', 'ext_url', 'data_json', 'brand'])
                 df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
             except:
                 print('数据重复=====')
@@ -635,17 +676,18 @@ class recall_cases():
                     print('没有解析到id')
                     items = {}
                     url = result['uri']
                     items['country'] = countryId  # 站点
                     encoded_url = urllib.parse.quote(url)
                     _url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri=' + encoded_url
+                    print('_url::', _url)
                     resp = requests.get(_url, headers=headers, timeout=60)
                     result = resp.json()
                     print("result::", result)
                     time.sleep(random.uniform(1, 3))
                     extUrl = result['recall']['extUrl']  # 详情url
                     imageUri = result['recall']['images'][0]['imageUri']
                     encode_imageUri = urllib.parse.quote(imageUri)
                     imaurl = f"https://globalrecalls.oecd.org/ws/getdocument.xqy?uri={encode_imageUri}"  # 图片
                     date_time = result['recall']['date']
                     items['reacll_time'] = date_time
                     title_name = result['recall']['product.name']
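Both detail fetches here percent-encode the recall's internal uri before appending it to the getrecall.xqy query string, and the same is done for imageUri. A standalone sketch of that encoding step (the sample uri is made up):

import urllib.parse

uri = "recall/us/2025/example recall#1"  # illustrative value with reserved characters
encoded_url = urllib.parse.quote(uri)
_url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri=' + encoded_url
print(_url)  # spaces and '#' are escaped so the uri survives as one query value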
@@ -653,7 +695,7 @@ class recall_cases():
                     if recall_title is None:
                         recall_title = result['recall']['images'][0]['alt.text']
                     if recall_title:
                         recall_title.replace('Image of ', '')
                     hazard = result['recall']['hazard']
                     items['recall_title'] = recall_title
                     items['productCategory'] = result['recall']['product.type']
@@ -667,7 +709,7 @@ class recall_cases():
                                       items['reacll_time'], items['riskDescription'], items['country'],
                                       items['image_url'],
                                       items['recall_title'], items['ext_url'], data_json])
                     print('没有解析到id的数据:', items)
             try:
                 df = pd.DataFrame(data=data_list,
                                   columns=['data_type', 'product_title', 'product_category', 'recall_date',
@@ -677,6 +719,7 @@ class recall_cases():
             except:
                 print('没有解析到id 存储 数据重复=====')
                 continue

     def run(self):
         # self.global_recalls()
         self.get_globalrecalls()
@@ -685,25 +728,41 @@ class recall_cases():
         self.ec_europa_eu()
         self.ec_europa_uk()
         self.gov_uk()
-        # """
-        # 数据类型,属于那个国的
-        # eu_recall
-        # global_recalls
-        # uk_drug_device 1
-        # uk_recall 2
-        # us_fba_recalls 3
-        # us_recalls_product
-        # """
-        # with self.mysql_db.begin() as conn:
-        #     sql = "SELECT data_json FROM recall_cases_data WHERE data_type='eu_recall'"
-        #     df_data = pd.read_sql(sql, con=self.mysql_db)
-        #     data_json_list = list(df_data.data_json)
-        #     for data_json in data_json_list:
-        #         data_dict = json.loads(data_json)
-        #         print(data_dict['product']['photos'][0]['id'])
-        #         imgurl = f"https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{data_dict['product']['photos'][0]['id']}"
-        #         up_sql = f"update recall_cases_data set image_url ='{imgurl}' WHERE data_type='eu_recall' and image_url like '%%/image/{data_dict['product']['photos'][0]['id']}'"
-        #         print(up_sql)
-        #         conn.execute(up_sql)
+        # with self.mysql_db.begin() as conn:
+        #     sql = "SELECT id,product_title FROM recall_cases_data WHERE data_type='us_recalls_product'"
+        #     df_data = pd.read_sql(sql, con=self.mysql_db)
+        #     df_data['id'] = df_data['id'].fillna('').astype(str)
+        #     df_data['product_title'] = df_data['product_title'].fillna('').astype(str)
+        #     data_json_list = list(df_data.id+ "|=||+||" + df_data.product_title)
+        #     for data_json_id in data_json_list:
+        #         if data_json_id:
+        #             data_json_id_list = data_json_id.split('|=||+||')
+        #             id = data_json_id_list[0]
+        #             product_title = data_json_id_list[1]
+        #             print(product_title)
+        #             if bool(re.search(r'[\u4e00-\u9fff]', product_title)):
+        #                 # print(product_title,'23333333')
+        #                 matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title.strip())
+        #                 if matches:
+        #                     brand = ','.join(matches)
+        #                 else:
+        #                     brand = None
+        #                 print(id, brand,'23444444444')
+        #                 if brand:
+        #                     brand = brand.replace('"','').replace("'","")
+        #                     up_sql = f"""update recall_cases_data set brand ="{brand}" WHERE id={id}"""
+        #                     print(up_sql)
+        #                     conn.execute(up_sql)

 if __name__ == '__main__':
py_spider/amzon_parse_db_html/pares_html.py

@@ -35,7 +35,7 @@ class Parse_asin_html():
             print('没有该 asin html')

     def search_term_html(self, site_name='us', month='04'):
-        sql = f"SELECT search_term,page,html FROM search_term_html_2025_{month} WHERE search_term='lace white tops for women' and site_name = '{site_name}'"
+        sql = f"SELECT search_term,page,html FROM search_term_html_2025_{month} WHERE search_term='resin kit' and site_name = '{site_name}'"
         print(sql)
         df = pd.read_sql(sql, con=engine_strrocks)
         print(df.values)

@@ -52,8 +52,8 @@ class Parse_asin_html():
             print('没有该 搜索词 html')

     def run(self):
-        self.asin_html()
-        # self.search_term_html(site_name='us',month='04')
+        # self.asin_html()
+        self.search_term_html(site_name='uk', month='07')

 if __name__ == '__main__':
py_spider/utils/asin_parse.py

@@ -402,7 +402,7 @@ class ParseAsinUs(object):
                 break
         if min_match_asin_data_list:
             min_match_asin_json = json.dumps(min_match_asin_data_list, ensure_ascii=False)
-        # bundles_this_asins ,Bundles with this item B0BPV8R4K8
+        # bundles_this_asins ,Bundles with this item B0BPV8R4K8 变体下方位置。和五点描述挨着
         bundles_this_asins_data_list = []
         bundles_this_asins_data_json = None
         for i in ASIN_XPATH['bundles_this_asins']:
@@ -436,7 +436,48 @@ class ParseAsinUs(object):
                 break
         if bundles_this_asins_data_list:
             bundles_this_asins_data_json = json.dumps(bundles_this_asins_data_list, ensure_ascii=False)
+        # 捆绑销售 B0DD8W2DZD This bundle contains 2 items
+        href_asin_list = self.response_s.xpath(
+            "//div[@class='bundle-title']/following-sibling::div//div[@class='bundle-components']//div[contains(@id,'bundle-component-details-component-title')]/a/@href")
+        bundle_asin_component_list = []
+        if href_asin_list:
+            bundle_component_asin_list = []
+            for href_asin in href_asin_list:
+                i_asin_list = re.findall(r'(?:[A-Z0-9]{10}|[0-9]{10})', href_asin)
+                bundle_component_asin_list.append(i_asin_list[0])
+            if bundle_component_asin_list:
+                bundle_component_asin_list = list(set(bundle_component_asin_list))
+                for bundle_component_asin in bundle_component_asin_list:
+                    print('bundle_component_asin:', bundle_component_asin)
+                    bundle_title_list = self.response_s.xpath(
+                        f"//a[contains(@href,'{bundle_component_asin}')]/parent::div[contains(@id,'component-details-component-title')]/a/text()")
+                    bundle_asin_title = bundle_title_list[0] if bundle_title_list else None
+                    bundle_img_list = self.response_s.xpath(f"//a[contains(@href,'{bundle_component_asin}')]/img/@src")
+                    bundle_asin_img = bundle_img_list[0] if bundle_img_list else None
+                    bundle_review_list = self.response_s.xpath(
+                        rf"//a[contains(@href,'{bundle_component_asin}')]/i[contains(@class,'component-details-component-review')]//following-sibling::span/text()")
+                    bundle_asin_review = bundle_review_list[0] if bundle_review_list else None
+                    bundle_starslist = self.response_s.xpath(
+                        rf"//a[contains(@href,'{bundle_component_asin}')]/i[contains(@class,'component-details-component-review-stars')]/@class")
+                    bundle_stars = bundle_starslist[0] if bundle_starslist else None
+                    bundle_stars_list = re.findall(r'a-star-(.*?) ', bundle_stars)
+                    bundle_asin_star = bundle_stars_list[0].replace('-', '.') if bundle_stars_list else None
+                    bundle_asin_price_list = self.response_s.xpath(
+                        f"//a[contains(@href,'{bundle_component_asin}')]/parent::div/following-sibling::div[contains(@class,'component-details-component-prices')]/span/text()")
+                    bundle_asin_price = bundle_asin_price_list[0] if bundle_asin_price_list else None
+                    bundle_asin_point_list = self.response_s.xpath(
+                        f"//a[contains(@href,'{bundle_component_asin}')]/parent::div/following-sibling::ul/li[contains(@id,'component-details-component-bullet-point')]/span/text()")
+                    bundle_asin_point = '|-|'.join(bundle_asin_point_list) if bundle_asin_point_list else None
+                    bundle_component_asin_item = {"bundle_component_asin": bundle_component_asin,
+                                                  "bundle_asin_title": bundle_asin_title,
+                                                  'bundle_asin_img': bundle_asin_img,
+                                                  "bundle_asin_review": bundle_asin_review,
+                                                  "bundle_asin_star": bundle_asin_star,
+                                                  "bundle_asin_price": bundle_asin_price,
+                                                  "bundle_asin_point": bundle_asin_point}
+                    bundle_asin_component_list.append(bundle_component_asin_item)
+        if bundle_asin_component_list:
+            bundle_asin_component_json = json.dumps(bundle_asin_component_list)
+        else:
+            bundle_asin_component_json = None
         # 五点描述
         for i in ASIN_XPATH['five_data']:
             five_text_list = self.response_s.xpath(i)
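Two small parsing moves in the added block are easy to verify in isolation: the 10-character ASIN is pulled from each component href with a character-class regex, and the star rating is recovered from Amazon's a-star-4-5 style CSS class. A standalone check (the sample strings are illustrative, not captured from a live page):

import re

href = "/dp/B0DD8W2DZD/ref=bundle_comp"  # made-up component link
asin = re.findall(r'(?:[A-Z0-9]{10}|[0-9]{10})', href)[0]
print(asin)  # -> B0DD8W2DZD

stars_class = "a-icon a-icon-star a-star-4-5 component-details-component-review-stars "
star = re.findall(r'a-star-(.*?) ', stars_class)[0].replace('-', '.')
print(star)  # -> 4.5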
@@ -2815,7 +2856,7 @@ class ParseAsinUs(object):
                 'customer_reviews_json': customer_reviews_json, 'together_asin_json': together_asin_json,
                 'min_match_asin_json': min_match_asin_json, 'seller_json': seller_json, 'current_asin': current_asin,
                 'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json,
-                'video_m3u8': video_m3u8, 'result_list_json': result_list_json}
+                'video_m3u8': video_m3u8, 'result_list_json': result_list_json, 'bundle_asin_component_json': bundle_asin_component_json}
             if self.site_name == 'us':
                 item['three_four_val'] = Join_Prime_int
             elif self.site_name in ['uk', 'fr', 'it', 'es']: