Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
d4dde086
Commit
d4dde086
authored
Jan 21, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
本次主要解决:页面加载完成后未显示详情时先进行判断,不再继续往下执行而导致报错。新增:15s 内页面源码中没有该 id 时,重新请求页面。
parent
2a634fbb
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
83 additions
and
72 deletions
+83
-72
H10_spider.py
py_spider/amazon_spider/H10_spider.py
+83
-72
No files found.
py_spider/amazon_spider/H10_spider.py
View file @
d4dde086
...
...
@@ -66,6 +66,38 @@ class H10():
s
.
connect
((
'baidu.com'
,
0
))
ip
=
s
.
getsockname
()[
0
]
# You are viewing a demo of Cerebro
"""
H10测试账号
账号:yswg006@hotmail.com # 124 126 共用
密码:Chianbugye@8346148
yswg304@outlook.com # 122
Chinabuye@467138
YSWGHF422023@outlook.com # 244
soundasia422023@
CherryY2023@outlook.com # 245
20230322Yy@
H10961961@outlook.com # 246
soundasia961961@
X18756082657@outlook.com # 247
Zyx13075039897@
wretyu2023@outlook.com # 127
Sffgserter@1
a18756082657@outlook.com # 121
12345678Ll@
账号:yashengweige678@outlook.com # 120
密码:987654321yswg@
账号:yswg12345678@outlook.com # 126 信用卡有问题
密码:yswg654321@
"""
user_pw_dict
=
{
'192.168.10.244'
:
[
r'C:\Users\win10-244\Downloads'
,
'YSWGHF422023@outlook.com'
,
'soundasia422023@'
],
'192.168.10.245'
:
[
r'C:\Users\win10-245\Downloads'
,
'CherryY2023@outlook.com'
,
'20230322Yy@'
],
...
...
@@ -75,7 +107,7 @@ class H10():
'192.168.0.121'
:
[
r'C:\Users\1\Downloads'
,
'a18756082657@outlook.com'
,
'12345678Ll@'
],
'192.168.0.126'
:
[
r'C:\Users\Administrator\Downloads'
,
'yswg12345678@outlook.com'
,
'yswg654321@'
],
'192.168.0.127'
:
[
r'C:\Users\1\Downloads'
,
'wretyu2023@outlook.com'
,
'Sffgserter@1'
],
'192.168.0.122'
:
[
r'C:\Users\1\Downloads'
,
'y
ashengweige678@outlook.com'
,
'987654321yswg@
'
],
'192.168.0.122'
:
[
r'C:\Users\1\Downloads'
,
'y
swg304@outlook.com'
,
'Chinabuye@467138
'
],
'192.168.0.124'
:
[
r'C:\Users\1\Downloads'
,
'yswg006@hotmail.com'
,
'Chianbugye@8346148'
],
}
user_pw_list
=
user_pw_dict
.
get
(
ip
)
...
...
@@ -395,9 +427,19 @@ class H10():
except
:
pass
def wait_page(self, timeout=30):
    """Wait until the page's input element is ready for interaction.

    Blocks until an ``<input>`` located under the element with
    ``id="re-container"`` becomes clickable, or until ``timeout``
    seconds have passed. The elapsed time is printed in both cases.

    Args:
        timeout: Maximum number of seconds to wait (default 30).

    Returns:
        True if the input became clickable within ``timeout``;
        False if the wait raised ``TimeoutException``.
    """
    # Use the monotonic clock for the elapsed-time report: unlike
    # time.time(), it cannot jump if the system clock is adjusted
    # while we are waiting.
    start = time.monotonic()
    wait = WebDriverWait(self.driver, timeout)
    locator = (By.XPATH, '//*[@id="re-container"]//input')
    try:
        # Only the wait itself is guarded, so the handler cannot mask
        # an unrelated failure.
        wait.until(EC.element_to_be_clickable(locator))
    except TimeoutException:
        print("wait_page timeout, used:", time.monotonic() - start)
        return False
    print("wait_page ok, used:", time.monotonic() - start)
    return True
def
webdrvier_html
(
self
,
asin
,
asinstype
):
# 点击选择站点
for
i
in
range
(
4
):
for
i
in
range
(
5
):
try
:
_url
=
self
.
driver
.
current_url
if
"concurrent-sessions"
in
_url
or
'signin'
in
_url
:
...
...
@@ -405,7 +447,10 @@ class H10():
if
asin
not
in
self
.
err_asin_list
and
self
.
useremail_state
:
print
(
'cerebro界面'
,
self
.
site_name_url
)
self
.
driver
.
get
(
f
'https://members.helium10.com/cerebro?accountId={self.account_id}'
)
time
.
sleep
(
10
)
if
not
self
.
wait_page
(
timeout
=
15
):
print
(
'页面未加载出来'
)
continue
time
.
sleep
(
2
)
if
'You are viewing a demo of Cerebro'
in
self
.
driver
.
page_source
:
print
(
self
.
email_name
,
'账号过期'
)
self
.
driver
.
refresh
()
...
...
@@ -448,11 +493,11 @@ class H10():
try
:
self
.
driver
.
execute_script
(
f
"""document.querySelector("img[loading='lazy']").click()"""
)
time
.
sleep
(
1
.5
)
time
.
sleep
(
1
)
except
:
self
.
driver
.
execute_script
(
f
"""document.querySelector("img[alt='{alt}']").click()"""
)
time
.
sleep
(
1
.5
)
time
.
sleep
(
1
)
self
.
verify
()
# 切换站点
self
.
driver
.
execute_script
(
f
"""document.querySelector("div[data-value='{host}']").click()"""
)
...
...
@@ -472,7 +517,7 @@ class H10():
# 点击 get keyword
time
.
sleep
(
1
)
self
.
driver
.
execute_script
(
'document.querySelector("#CerebroSearchButtons > button").click()'
)
time
.
sleep
(
2
)
time
.
sleep
(
1
)
html
=
self
.
driver
.
page_source
if
'You have reached the limit of the uses'
in
html
:
self
.
useremail_state
=
False
...
...
@@ -493,7 +538,7 @@ class H10():
time
.
sleep
(
2
)
try
:
if
'searched this product before'
in
html
or
'先前已搜索过此产品'
in
html
:
print
(
'33333333333'
)
print
(
'33333333333
444444
'
)
self
.
driver
.
execute_script
(
"""document.querySelector("button[data-testid='runnewsearch']").click()"""
)
sleep
(
randint
(
3
,
8
))
...
...
@@ -502,7 +547,7 @@ class H10():
print
(
'点击 run 报错'
)
# 点击下载
self
.
driver
.
execute_script
(
'window.scrollBy(0, 300);'
)
time
.
sleep
(
2
)
time
.
sleep
(
1
)
html
=
self
.
driver
.
page_source
if
'You have reached the limit of the uses'
in
html
:
self
.
useremail_state
=
False
...
...
@@ -518,10 +563,9 @@ class H10():
break
elif
'errorCodes.undefined'
in
html
:
continue
sleep
(
randint
(
13
,
28
))
time
.
sleep
(
5
)
sleep
(
randint
(
15
,
30
))
self
.
verify
()
time
.
sleep
(
2
.5
)
time
.
sleep
(
2
)
if
'Wrong entered data or no results'
in
html
:
print
(
'没有报告可下载2222'
,
asin
)
self
.
err_asin_list
.
append
(
asin
)
...
...
@@ -532,17 +576,26 @@ class H10():
break
elif
'errorCodes.undefined'
in
html
:
continue
time
.
sleep
(
5
)
elif
'errors.common.502'
in
html
:
print
(
'没有报告可下载333'
,
asin
)
self
.
err_asin_list
.
append
(
asin
)
break
if
asinstype
:
try
:
print
(
'点击显示下拉框'
)
button_js
=
'document.querySelector("#CerebroFilter > div > div.sc-dzXNMW.dufncf > div.sc-hFCjLd.igMWUF > div > button").click()'
self
.
driver
.
execute_script
(
button_js
)
time
.
sleep
(
2
)
html
=
self
.
driver
.
page_source
resp
=
etree
.
HTML
(
html
)
try
:
print
(
'Amazons Choice获取元素'
)
time
.
sleep
(
2
)
div_class
=
resp
.
xpath
(
'''//div[contains(text(),"Amazon Choice")]/parent::div/following-sibling::div/@class|//div[contains(text(),"Amazon's Choice")]/parent::div/following-sibling::div/@class'''
)
except
:
print
(
'报错22222222222222'
)
if
asinstype
:
time
.
sleep
(
2
)
print
(
'点击选择亚马逊精选 勾选'
)
time
.
sleep
(
2
)
try
:
script
=
f
"""
const elements = document.querySelectorAll("div[class='{div_class[0]}']>div");
...
...
@@ -553,7 +606,7 @@ class H10():
if
i
==
2
:
self
.
err_asins_adv_list
.
append
(
asin
)
self
.
driver
.
execute_script
(
script
)
time
.
sleep
(
2
)
time
.
sleep
(
1
)
html1
=
self
.
driver
.
page_source
resp1
=
etree
.
HTML
(
html1
)
span_class
=
resp1
.
xpath
(
...
...
@@ -561,15 +614,15 @@ class H10():
# 选择亚马逊精选参数1
self
.
driver
.
execute_script
(
f
"""document.querySelector("div[class='{span_class}']").click()"""
)
time
.
sleep
(
2
)
time
.
sleep
(
1
)
# 选择亚马逊精选参数2
self
.
driver
.
execute_script
(
f
"""document.querySelector("div[class='{span_class}']").click()"""
)
time
.
sleep
(
2
)
time
.
sleep
(
1
)
# 点击添加
self
.
driver
.
execute_script
(
"""document.querySelector("button[data-testid='applyfilters']").click()"""
)
time
.
sleep
(
6.5
)
time
.
sleep
(
3
)
# 下载报告
# 点击下载csv按钮
self
.
driver
.
execute_script
(
...
...
@@ -742,57 +795,6 @@ class H10():
print
(
'重新下载文件222:'
,
asin
,
path
)
self
.
webdrvier_html
(
asin
,
None
)
self
.
if_csv_path
(
file_path
)
# columns = pd.read_csv(file_path, nrows=0).columns.tolist()
#
# def contains_chinese(text):
# return bool(re.search(r'[\u4e00-\u9fff]', text))
# is_chinese_header = any(contains_chinese(col) for col in columns)
# if is_chinese_header:
# print("表头是中文")
# columns_to_include_zh = ['关键词词组', 'Cerebro IQ 得分', '搜索量', '搜索量趋势',
# '广告推广ASIN 数',
# '竞品数', 'CPR', '标题密度', '亚马逊推荐', '自然',
# '亚马逊推荐排名', '广告排名', '自然排名']
# df = pd.read_csv(file_path, usecols=columns_to_include_zh)
# # 中文 -> 英文映射
# df.rename(columns={
# '关键词词组': 'keyword',
# 'Cerebro IQ 得分': 'cerebro_iq_score',
# '搜索量': 'search_volume',
# '搜索量趋势': 'search_volume_trend',
# '广告推广ASIN 数': 'sponsored_asins',
# '竞品数': 'competing_product',
# 'CPR': 'cpr',
# '标题密度': 'title_desity',
# '亚马逊推荐': 'amazon_recommended',
# '自然': 'organic',
# '亚马逊推荐排名': 'amazon_recommended_rank',
# '广告排名': 'sponsored_rank',
# '自然排名': 'organic_rank'
# }, inplace=True)
# else:
# print("表头是英文")
# columns_to_include_en = ['Keyword Phrase', 'Cerebro IQ Score', 'Search Volume', 'Search Volume Trend',
# 'Sponsored ASINs',
# 'Competing Products', 'CPR', 'Title Density', 'Amazon Recommended', 'Organic',
# 'Amazon Rec. Rank', 'Sponsored Rank', 'Organic Rank']
# df = pd.read_csv(file_path, usecols=columns_to_include_en)
# df.rename(columns={
# 'Keyword Phrase': 'keyword',
# 'Cerebro IQ Score': 'cerebro_iq_score',
# 'Search Volume': 'search_volume',
# 'Search Volume Trend': 'search_volume_trend',
# 'Sponsored ASINs': 'sponsored_asins',
# 'Competing Products': 'competing_product',
# 'CPR': 'cpr',
# 'Title Density': 'title_desity',
# 'Amazon Recommended': 'amazon_recommended',
# 'Organic': 'organic',
# 'Amazon Rec. Rank': 'amazon_recommended_rank',
# 'Sponsored Rank': 'sponsored_rank',
# 'Organic Rank': 'organic_rank'
# }, inplace=True)
header_config
=
{
"chinese"
:
{
"columns"
:
[
'关键词词组'
,
'Cerebro IQ 得分'
,
'搜索量'
,
'搜索量趋势'
,
...
...
@@ -973,6 +975,15 @@ class H10():
previous_date_str
=
previous_date
.
strftime
(
"
%
Y-
%
m-
%
d"
)
file_path
=
fr
'{path}
\
{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{previous_date_str}.csv'
print
(
'file_pathsave_competition2222'
,
file_path
)
state
=
self
.
if_csv_path
(
file_path
)
if
state
==
False
:
time
.
sleep
(
3
)
file_path
=
fr
'{path}
\
{self.site_name_csv.upper()}_AMAZON_cerebro_{asin_list[0]}_{time_strftime}.csv'
print
(
'file_pathsave_competition3333'
,
file_path
)
state
=
self
.
if_csv_path
(
file_path
)
if
state
==
False
:
self
.
nex_page
(
self
.
asin_list
,
asinstype
=
1
)
# 创建一个字典来映射原始列名和新的列名
columns
=
pd
.
read_csv
(
file_path
,
nrows
=
0
)
.
columns
.
tolist
()
def
contains_chinese
(
text
):
...
...
@@ -1079,8 +1090,8 @@ class H10():
else
:
path
=
r'C:\Users\ASUS\Downloads'
print
(
'当前路径:'
,
path
)
self
.
email_name
=
'y
ashengweige678@outlook
.com'
self
.
pw
=
'
987654321yswg@
'
# 'yashengweige678@outlook.com', '987654321yswg@'
self
.
email_name
=
'y
swg006@hotmail
.com'
self
.
pw
=
'
Chianbugye@8346148
'
# 'yashengweige678@outlook.com', '987654321yswg@'
self
.
web_drver
()
while
True
:
self
.
data
=
{}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment