Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
44057a7b
Commit
44057a7b
authored
Apr 17, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
6360879f
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
67 additions
and
39 deletions
+67
-39
dow_category_edge.py
py_spider/amazon_every_day_spider/dow_category_edge.py
+67
-39
No files found.
py_spider/amazon_every_day_spider/dow_category_edge.py
View file @
44057a7b
...
...
@@ -41,7 +41,6 @@ class dow_category_Product:
self
.
engine_us_mysql
=
None
self
.
engine_pg
=
None
self
.
num
=
0
week
=
time
.
strftime
(
"
%
W"
)
yaer
=
time
.
strftime
(
'
%
Y'
,
time
.
localtime
(
time
.
time
()))
self
.
y_w
=
f
"{yaer}-{week}"
...
...
@@ -52,26 +51,14 @@ class dow_category_Product:
self
.
engine_us_mysql
=
get_remote_engine
(
site_name
=
'us'
,
db_type
=
'mysql'
)
self
.
engine_pg
=
get_remote_engine
(
site_name
=
site
,
db_type
=
'postgresql_15_outer'
)
self
.
num
=
0
week
=
time
.
strftime
(
"
%
W"
)
yaer
=
time
.
strftime
(
'
%
Y'
,
time
.
localtime
(
time
.
time
()))
self
.
y_w
=
f
"{yaer}-{week}"
# ---------------------- Driver 管理 & 稳定层 ----------------------
def _kill_edge_process(self):
    """Force-terminate running ``msedge.exe`` processes via ``taskkill``.

    Kept from the original flow: force-closing Edge is meant to reduce odd
    crashes caused by a locked browser profile. Any exception raised while
    issuing the command is printed and otherwise ignored.
    """
    kill_command = "taskkill /F /IM msedge.exe"
    try:
        os.system(kill_command)
    except Exception as err:
        print("强制关闭msedge.exe失败:", err)
def
_build_driver
(
self
):
print
(
'初始化'
)
try
:
pr_name
=
"msedge.exe"
os
.
system
(
'
%
s
%
s'
%
(
"taskkill /F /IM "
,
pr_name
))
except
Exception
as
e
:
print
(
"强制关闭chrome.exe失败:"
,
e
)
os
.
system
(
"taskkill /F /IM msedge.exe"
)
time
.
sleep
(
2
)
edge_options
=
Options
()
...
...
@@ -82,7 +69,7 @@ class dow_category_Product:
edge_options
.
add_argument
(
r'--user-data-dir=C:\Users\FLA主账号客服维权使用\AppData\Local\Microsoft\Edge\User Data'
)
edge_options
.
add_argument
(
'--profile-directory=Default'
)
# 降低
“首次运行/恢复弹窗/扩展”
对启动的干扰(不影响登录态)
# 降低
"首次运行/恢复弹窗/扩展"
对启动的干扰(不影响登录态)
edge_options
.
add_argument
(
"--no-first-run"
)
edge_options
.
add_argument
(
"--no-default-browser-check"
)
edge_options
.
add_argument
(
"--disable-extensions"
)
...
...
@@ -128,6 +115,16 @@ class dow_category_Product:
def
_jitter
(
self
,
a
=
0.6
,
b
=
1.6
):
time
.
sleep
(
random
.
uniform
(
a
,
b
))
def
_check_login_redirect
(
self
,
driver
):
"""检测是否被重定向到登录页,是则直接退出程序。"""
try
:
url
=
driver
.
current_url
.
lower
()
if
'signin'
in
url
or
'sign-in'
in
url
or
'ap/signin'
in
url
:
print
(
'[退出] 检测到登录页,session 已过期,请重新登录后再运行'
)
sys
.
exit
(
1
)
except
Exception
:
pass
def
_safe_action
(
self
,
action_name
,
fn
,
driver
,
site
=
None
):
"""
统一的稳定执行器:
...
...
@@ -148,25 +145,28 @@ class dow_category_Product:
msg
=
str
(
e
)
.
lower
()
print
(
f
"[{action_name}] 第{attempt}次失败:{e}"
)
# 常见
“页面崩溃/断连/渲染挂掉”
关键词
# 常见
"页面崩溃/断连/渲染挂掉"
关键词
crash_like
=
any
(
x
in
msg
for
x
in
[
"page crash"
,
"crash"
,
"renderer"
,
"disconnected"
,
"not connected to devtools"
,
"session deleted"
,
"cannot determine loading status"
,
"target window already closed"
])
# Timeout(页面未加载/JS超时)也值得先 refresh 一次,避免直接重启整个浏览器
should_retry
=
crash_like
or
isinstance
(
e
,
TimeoutException
)
# 先尝试 refresh(刷新后能继续跑)
if
attempt
<
self
.
MAX_ACTION_RETRY
and
crash_like
:
if
attempt
<
self
.
MAX_ACTION_RETRY
and
should_retry
:
try
:
print
(
f
"[{action_name}] 尝试 refresh 恢复..."
)
driver
.
refresh
()
self
.
_jitter
(
3
,
6
)
# 恢复后重新定位到 category-insights
)
# 恢复后重新定位到 category-insights
if
site
:
self
.
_ensure_category_insights
(
driver
,
site
)
continue
except
Exception
as
e2
:
print
(
f
"[{action_name}] refresh 也失败:{e2}"
)
print
(
f
"[{action_name}] refresh 也失败,触发重启:{e2}"
)
raise
NeedRestart
(
f
"[{action_name}] refresh 失败:{e2}"
)
# 走到这里:说明需要重启 driver
break
except
Exception
as
e
:
...
...
@@ -203,6 +203,7 @@ class dow_category_Product:
return
True
self
.
_safe_action
(
"open_category_insights"
,
_open
,
driver
,
site
=
site
)
self
.
_check_login_redirect
(
driver
)
# 导航完成后检测是否被重定向到登录页
def
_click_site
():
self
.
_select_site_radio
(
driver
,
site
)
...
...
@@ -219,7 +220,6 @@ class dow_category_Product:
time
.
sleep
(
1
)
html
=
etree
.
HTML
(
driver
.
page_source
)
self
.
save_category
(
html
)
print
(
333333333333333333333333
)
global
syn_state
syn_state
=
True
...
...
@@ -239,7 +239,7 @@ class dow_category_Product:
for
Category
in
Category_list
:
try
:
# 关键:每个大循环都做一次
“崩溃检测+必要时恢复”
# 关键:每个大循环都做一次
"崩溃检测+必要时恢复"
self
.
_safe_action
(
"loop_healthcheck"
,
lambda
:
True
,
driver
,
site
=
site
)
self
.
cilik_site
(
driver
)
...
...
@@ -343,8 +343,10 @@ class dow_category_Product:
'//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()'
)
if
ratio_list
:
search_ratio
=
re
.
findall
(
r'(.*?)‰'
,
ratio_list
[
0
])[
0
]
return_ratio
=
re
.
findall
(
r'(.*?)
%
'
,
ratio_list
[
1
])[
0
]
_sr
=
re
.
findall
(
r'(.*?)‰'
,
ratio_list
[
0
])
search_ratio
=
_sr
[
0
]
if
_sr
else
None
_rr
=
re
.
findall
(
r'(.*?)
%
'
,
ratio_list
[
1
])
if
len
(
ratio_list
)
>
1
else
[]
return_ratio
=
_rr
[
0
]
if
_rr
else
None
else
:
search_ratio
=
None
return_ratio
=
None
...
...
@@ -353,8 +355,10 @@ class dow_category_Product:
'//div[@class="big-text-section-name"][1]/div[@class="sub-text"]/text()'
)
if
product_ratio_list
:
product_average
=
re
.
findall
(
r'(.*?)‰'
,
product_ratio_list
[
0
])[
0
]
return_product_average
=
re
.
findall
(
r'(.*?)
%
'
,
product_ratio_list
[
1
])[
0
]
_pa
=
re
.
findall
(
r'(.*?)‰'
,
product_ratio_list
[
0
])
product_average
=
_pa
[
0
]
if
_pa
else
None
_rpa
=
re
.
findall
(
r'(.*?)
%
'
,
product_ratio_list
[
1
])
if
len
(
product_ratio_list
)
>
1
else
[]
return_product_average
=
_rpa
[
0
]
if
_rpa
else
None
else
:
product_average
=
None
return_product_average
=
None
...
...
@@ -403,11 +407,11 @@ class dow_category_Product:
total
=
0.0
for
num_str
in
sta_list
:
total
+=
float
(
num_str
)
results
=
[
float
(
num
)
/
total
if
float
(
num
)
!=
0
else
0
for
num
in
sta_list
]
five_star
=
round
(
results
[
0
],
2
)
three_star
=
round
(
results
[
1
],
2
)
two_star
=
round
(
results
[
2
],
2
)
one_star
=
round
(
results
[
3
],
2
)
results
=
[
float
(
num
)
/
total
if
total
>
0
and
float
(
num
)
!=
0
else
0
for
num
in
sta_list
]
five_star
=
round
(
results
[
0
],
2
)
if
len
(
results
)
>
0
else
0
three_star
=
round
(
results
[
1
],
2
)
if
len
(
results
)
>
1
else
0
two_star
=
round
(
results
[
2
],
2
)
if
len
(
results
)
>
2
else
0
one_star
=
round
(
results
[
3
],
2
)
if
len
(
results
)
>
3
else
0
else
:
five_star
=
0
three_star
=
0
...
...
@@ -425,7 +429,7 @@ class dow_category_Product:
pattern
=
''
matches_list
=
re
.
findall
(
pattern
,
big_text_Advertisement
)
ad_spend
=
matches_list
[
0
]
if
matches_list
else
None
majority_spend
=
matches_list
[
1
]
if
matches_list
else
None
majority_spend
=
matches_list
[
1
]
if
len
(
matches_list
)
>
1
else
None
else
:
ad_spend
=
0
majority_spend
=
0
...
...
@@ -487,7 +491,10 @@ class dow_category_Product:
save_Category_list
.
append
([
Category_name
,
Product_name
[
0
],
Keyword
[
0
],
float
(
search_ratio
),
float
(
product_average
),
float
(
return_ratio
),
float
(
return_product_average
),
float
(
search_ratio
)
if
search_ratio
is
not
None
else
None
,
float
(
product_average
)
if
product_average
is
not
None
else
None
,
float
(
return_ratio
)
if
return_ratio
is
not
None
else
None
,
float
(
return_product_average
)
if
return_product_average
is
not
None
else
None
,
self
.
y_w
,
big_text_sller
,
big_text_brand
,
big_text_asin
,
big_text_new_asin
,
big_text_per_asin
,
big_text_Advertisement
,
big_text_star
,
big_brand_int
,
big_asin_int
,
big_new_asin_int
,
big_per_asin_int
,
five_star
,
three_star
,
two_star
,
...
...
@@ -533,6 +540,7 @@ class dow_category_Product:
'news_adv_spend'
,
'news_majority_spend'
])
df
=
df
.
where
(
pd
.
notnull
(
df
),
None
)
self
.
engine_pg
.
to_sql
(
df
,
f
'{site}_aba_profit_category_insights'
,
if_exists
=
"append"
)
print
(
'存储成功 pg'
)
break
...
...
@@ -571,7 +579,6 @@ class dow_category_Product:
driver
.
close
()
driver
.
quit
()
print
(
'重新启动 浏览器,'
)
self
.
run
()
raise
NeedRestart
(
"num>1 trigger restart"
)
except
NeedRestart
as
e
:
...
...
@@ -606,7 +613,8 @@ class dow_category_Product:
majority_spend
=
0
if
input
:
products_aggregate_sales
=
input
.
get
(
'products_aggregate_sales'
,
[])[
0
]
_agg
=
input
.
get
(
'products_aggregate_sales'
,
[])
products_aggregate_sales
=
_agg
[
0
]
if
_agg
else
None
if
products_aggregate_sales
:
split
=
products_aggregate_sales
.
split
(
"|"
)
sales_amount_str
=
self
.
safe_get
(
split
,
1
,
''
)
.
partition
(
"$"
)[
-
1
]
...
...
@@ -683,8 +691,10 @@ class dow_category_Product:
most_popular_keyword_list
=
[]
if
most_popular_list
:
for
most_popular
in
most_popular_list
:
most_popular_keyword
=
most_popular
.
xpath
(
'.//div[2]/text()'
)[
0
]
most_popular_b_nums
=
most_popular
.
xpath
(
'.//div/b/text()'
)[
0
]
_kw_list
=
most_popular
.
xpath
(
'.//div[2]/text()'
)
most_popular_keyword
=
_kw_list
[
0
]
if
_kw_list
else
None
_bn_list
=
most_popular
.
xpath
(
'.//div/b/text()'
)
most_popular_b_nums
=
_bn_list
[
0
]
if
_bn_list
else
None
most_popular_keyword_list
.
append
({
"most_popular_keywords"
:
most_popular_keyword
,
'most_popular_search_nums'
:
most_popular_b_nums
...
...
@@ -692,8 +702,10 @@ class dow_category_Product:
top_ratio_list
=
html_top
.
xpath
(
'//div[@class="big-text-section-name"][1]/div[@class="big-text"]/text()'
)
if
top_ratio_list
:
search_ratio
=
re
.
findall
(
r'(.*?)‰'
,
top_ratio_list
[
0
])[
0
]
return_ratio
=
re
.
findall
(
r'(.*?)
%
'
,
top_ratio_list
[
1
])[
0
]
_sr
=
re
.
findall
(
r'(.*?)‰'
,
top_ratio_list
[
0
])
search_ratio
=
_sr
[
0
]
if
_sr
else
None
_rr
=
re
.
findall
(
r'(.*?)
%
'
,
top_ratio_list
[
1
])
if
len
(
top_ratio_list
)
>
1
else
[]
return_ratio
=
_rr
[
0
]
if
_rr
else
None
else
:
search_ratio
=
None
return_ratio
=
None
...
...
@@ -739,6 +751,8 @@ class dow_category_Product:
time
.
sleep
(
1.5
)
return
self
.
analysis_top_Newly_html
(
driver
)
return
self
.
_safe_action
(
"click_newly_launched"
,
_do
,
driver
,
site
=
self
.
site_name
)
except
NeedRestart
:
raise
except
Exception
:
return
"{}"
...
...
@@ -747,19 +761,31 @@ class dow_category_Product:
self
.
mysql_connect
(
site
=
self
.
site_name
)
select_sql
=
'select category from seller_category_insights_syn where state =1'
df
=
self
.
engine_pg
.
read_sql
(
select_sql
)
if
df
.
shape
[
0
]
>
0
:
category_list
=
list
(
df
.
category
)
print
(
category_list
)
if
category_list
:
return
category_list
else
:
return
None
else
:
for
i
in
range
(
5
):
try
:
self
.
mysql_connect
()
workflow_everyday_list
=
[
[
self
.
site_name
,
self
.
y_w
,
'类目分析抓取完成'
,
3
,
f
'{self.site_name}_aba_profit_category_insights'
,
'week'
,
'类目分析'
,
'是'
]
]
print
(
workflow_everyday_list
)
df_seller_asin_account
=
pd
.
DataFrame
(
data
=
workflow_everyday_list
,
columns
=
[
'site_name'
,
'date_info'
,
'status'
,
'status_val'
,
'table_name'
,
'date_type'
,
'page'
,
'is_end'
])
self
.
engine_us_mysql
.
to_sql
(
df_seller_asin_account
,
'workflow_progress'
,
if_exists
=
'append'
)
break
except
:
print
(
'存储报错 类目分析抓取完成'
)
self
.
mysql_connect
(
site
=
self
.
site_name
)
time
.
sleep
(
10
)
return
None
def
save_category
(
self
,
html
):
Category_list
=
html
.
xpath
(
'//h2[contains(text(),"Category")]/following-sibling::div/div'
)
...
...
@@ -769,6 +795,8 @@ class dow_category_Product:
for
Category
in
Category_list
:
Category_name
=
Category
.
xpath
(
'./@id'
)
Category_label
=
Category
.
xpath
(
'.//@label'
)
if
not
Category_name
or
not
Category_label
:
continue
self
.
category_item
[
Category_label
[
0
]]
=
Category_name
[
0
]
Categorys_list
.
append
(
Category_name
[
0
])
Categorys_list_syn
.
append
([
Category_label
[
0
]])
...
...
@@ -779,7 +807,7 @@ class dow_category_Product:
if
syn_state
is
False
:
now
=
datetime
.
datetime
.
now
()
is_monday
=
(
now
.
weekday
()
==
0
)
is_9_am
=
(
now
.
hour
==
11
)
# 保持你原判断
is_9_am
=
(
now
.
hour
==
11
)
if
is_monday
and
is_9_am
:
TRUNCATE_SQL
=
'TRUNCATE seller_category_insights_syn'
conn
.
execute
(
TRUNCATE_SQL
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment