Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
067652bd
Commit
067652bd
authored
May 08, 2026
by
Peng
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
8066e738
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
49 additions
and
18 deletions
+49
-18
dow_seller_data.py
py_spider/amazon_every_day_spider/dow_seller_data.py
+49
-18
alipay_account_order_szcygjgs.py
py_spider/amazon_spider/alipay_account_order_szcygjgs.py
+0
-0
No files found.
py_spider/amazon_every_day_spider/dow_seller_data.py
View file @
067652bd
import
os
from
playwright.sync_api
import
sync_playwright
from
playwright.sync_api
import
sync_playwright
from
secure_db_client
import
get_remote_engine
from
secure_db_client
import
get_remote_engine
import
pandas
as
pd
import
pandas
as
pd
...
@@ -12,6 +13,7 @@ class One688LoginSpider(object):
...
@@ -12,6 +13,7 @@ class One688LoginSpider(object):
def
__init__
(
self
,
site
=
'us'
):
def
__init__
(
self
,
site
=
'us'
):
self
.
site
=
site
self
.
site
=
site
self
.
data
=
None
self
.
data
=
None
self
.
post_url
=
'/ox-api/graphql'
month
=
time
.
strftime
(
"
%
m"
)
month
=
time
.
strftime
(
"
%
m"
)
yaer
=
time
.
strftime
(
'
%
Y'
,
time
.
localtime
(
time
.
time
()))
yaer
=
time
.
strftime
(
'
%
Y'
,
time
.
localtime
(
time
.
time
()))
self
.
y_w
=
f
"{yaer}-{month}"
self
.
y_w
=
f
"{yaer}-{month}"
...
@@ -29,15 +31,17 @@ class One688LoginSpider(object):
...
@@ -29,15 +31,17 @@ class One688LoginSpider(object):
return
self
.
engine_us_mysql
return
self
.
engine_us_mysql
def
print_request_finished
(
self
,
request
):
def
print_request_finished
(
self
,
request
):
# 拦截请求获取数据
# 拦截请求获取数据
,只接受 data.niches 形态的 graphql 响应,避免被同 URL 的其他响应覆盖
if
self
.
post_url
in
request
.
url
:
if
self
.
post_url
in
request
.
url
:
if
request
.
response
():
resp
=
request
.
response
()
if
resp
:
try
:
try
:
self
.
data
=
request
.
response
()
.
json
()
body
=
resp
.
json
()
print
(
self
.
data
)
if
isinstance
(
body
,
dict
)
and
isinstance
(
body
.
get
(
'data'
),
dict
)
and
'niches'
in
body
[
'data'
]:
self
.
data
=
body
print
(
self
.
data
)
except
Exception
as
e
:
except
Exception
as
e
:
print
(
'拦截url报错:'
,
e
,
f
"
\n
{traceback.format_exc()}"
)
print
(
'拦截url报错:'
,
e
,
f
"
\n
{traceback.format_exc()}"
)
self
.
data
=
None
def
select_category_json
(
self
):
def
select_category_json
(
self
):
sql
=
'SELECT category_json,id FROM seller_product_opportunity_syn where state=1'
sql
=
'SELECT category_json,id FROM seller_product_opportunity_syn where state=1'
...
@@ -143,11 +147,17 @@ class One688LoginSpider(object):
...
@@ -143,11 +147,17 @@ class One688LoginSpider(object):
)
)
def
crawl
(
self
,
url
):
def
crawl
(
self
,
url
):
self
.
page
.
on
(
"requestfinished"
,
self
.
print_request_finished
)
# 请求 指定的 URL(监听器在 main 里只注册一次,避免重复绑定累积)
# 请求 指定的 URL
# goto 前清空旧响应,避免被上一次的数据污染
self
.
data
=
None
self
.
page
.
goto
(
url
)
self
.
page
.
goto
(
url
)
self
.
page
.
wait_for_timeout
(
25000
)
# 轮询等待 niches 响应到达,最多 30 秒;拿到立即返回,不再固定 sleep 25 秒
# 等待页面加载一段时间
end_time
=
time
.
time
()
+
30
while
time
.
time
()
<
end_time
:
if
self
.
data
is
not
None
:
return
self
.
page
.
wait_for_timeout
(
500
)
print
(
'等待 niches 响应超时'
)
def
run
(
self
):
def
run
(
self
):
self
.
page
.
goto
(
'https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?'
)
self
.
page
.
goto
(
'https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?'
)
...
@@ -156,28 +166,49 @@ class One688LoginSpider(object):
...
@@ -156,28 +166,49 @@ class One688LoginSpider(object):
self
.
get_category
()
self
.
get_category
()
def
main
(
self
):
def
main
(
self
):
# 启动前杀掉已有 Edge 进程,防止 user_data_dir 被锁
os
.
system
(
"taskkill /F /IM msedge.exe"
)
time
.
sleep
(
2
)
# 初始化
# 初始化
with
sync_playwright
()
as
_playwright
:
with
sync_playwright
()
as
_playwright
:
# _playwright.chromium.launch_persistent_context
browser
=
_playwright
.
chromium
.
launch_persistent_context
(
browser
=
_playwright
.
chromium
.
launch_persistent_context
(
# 指定本机用户缓存地址
# 保持免登录 profile(参考 dow_category_edge.py:71)
user_data_dir
=
r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data"
,
user_data_dir
=
r"C:\Users\FLA主账号客服维权使用\AppData\Local\Microsoft\Edge\User Data"
,
# 指定本机google客户端exe的路径
# 通过 channel 指定使用 Edge,无需写死 exe 路径
executable_path
=
r"C:\Program Files\Google\Chrome\Application\chrome.exe"
,
channel
=
"msedge"
,
# 要想通过这个下载文件这个必然要开 默认是False
accept_downloads
=
True
,
accept_downloads
=
True
,
# 设置不是无头模式
headless
=
False
,
headless
=
False
,
bypass_csp
=
True
,
bypass_csp
=
True
,
locale
=
'en-GB'
,
locale
=
'en-GB'
,
ignore_https_errors
=
True
,
ignore_https_errors
=
True
,
no_viewport
=
True
,
no_viewport
=
True
,
slow_mo
=
10
,
slow_mo
=
10
,
# 跳过检测
args
=
[
args
=
[
'--disable-blink-features=AutomationControlled'
,
'--remote-debugging-port=9222'
]
# 跳过自动化检测
'--disable-blink-features=AutomationControlled'
,
# 选 Default profile(与 user_data_dir 配合)
'--profile-directory=Default'
,
# Windows 下移除 Linux 参数,减少启动崩概率
'--disable-gpu'
,
# 降低"首次运行/恢复弹窗/扩展"对启动的干扰
'--no-first-run'
,
'--no-default-browser-check'
,
'--disable-extensions'
,
'--disable-notifications'
,
# 避免 DevToolsActivePort 问题(让系统随机分配端口)
'--remote-debugging-port=0'
,
# 保留后台节流相关(一般没问题)
'--disable-backgrounding-occluded-windows'
,
'--disable-renderer-backgrounding'
,
'--disable-background-timer-throttling'
,
'--disable-features=CalculateNativeWinOcclusion'
,
'--remote-allow-origins=*'
,
]
)
)
self
.
page
=
browser
.
new_page
()
self
.
page
=
browser
.
new_page
()
# 拦截 graphql 响应:监听器只注册一次(防止累积)
self
.
page
.
on
(
"requestfinished"
,
self
.
print_request_finished
)
js
=
"""
js
=
"""
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
"""
...
...
py_spider/amazon_spider/alipay_account_order_szcygjgs.py
0 → 100644
View file @
067652bd
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment