Commit 067652bd by Peng

no message

parent 8066e738
import os
from playwright.sync_api import sync_playwright
from secure_db_client import get_remote_engine
import pandas as pd
......@@ -12,6 +13,7 @@ class One688LoginSpider(object):
def __init__(self, site='us'):
self.site = site
self.data = None
self.post_url = '/ox-api/graphql'
month = time.strftime("%m")
yaer = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{yaer}-{month}"
......@@ -29,15 +31,17 @@ class One688LoginSpider(object):
return self.engine_us_mysql
def print_request_finished(self, request):
    """Capture the GraphQL ``niches`` payload from a finished request.

    Intended as a ``requestfinished`` page-event handler. Only requests whose
    URL contains ``self.post_url`` are inspected, and only a JSON body shaped
    like ``{"data": {"niches": ...}}`` is stored in ``self.data``.

    Args:
        request: The finished request object; must expose ``url`` and
            ``response()``.
    """
    if self.post_url not in request.url:
        return

    resp = request.response()
    if not resp:
        return

    try:
        body = resp.json()
    except Exception as e:
        # Other calls share this URL and may not be parsable. Log and ignore
        # them; do NOT reset self.data here, otherwise a late failing response
        # would erase a niches payload that was already captured.
        print('拦截url报错:', e, f"\n{traceback.format_exc()}")
        return

    # Accept only the data.niches shape so unrelated GraphQL responses on the
    # same URL cannot overwrite the payload we are waiting for.
    if isinstance(body, dict) and isinstance(body.get('data'), dict) and 'niches' in body['data']:
        self.data = body
        print(self.data)
def select_category_json(self):
sql = 'SELECT category_json,id FROM seller_product_opportunity_syn where state=1'
......@@ -143,11 +147,17 @@ class One688LoginSpider(object):
)
def crawl(self, url, timeout=30, poll_interval_ms=500):
    """Navigate to *url* and wait until a ``niches`` response is captured.

    The ``requestfinished`` listener is expected to be registered once by the
    caller (not here), so repeated crawls do not stack duplicate handlers.

    Args:
        url: Page URL to open.
        timeout: Maximum number of seconds to wait for ``self.data`` to be
            filled in by the response listener.
        poll_interval_ms: Pause between checks, in milliseconds, passed to
            ``page.wait_for_timeout``.

    Returns:
        None. On success ``self.data`` holds the captured payload; on timeout
        it is still ``None`` and a message is printed.
    """
    # Clear the previous result before navigating so stale data from an
    # earlier crawl cannot be mistaken for this page's response.
    self.data = None
    self.page.goto(url)

    # Poll instead of a fixed sleep: return as soon as the payload arrives.
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if self.data is not None:
            return
        self.page.wait_for_timeout(poll_interval_ms)
    print('等待 niches 响应超时')
def run(self):
self.page.goto('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?')
......@@ -156,28 +166,49 @@ class One688LoginSpider(object):
self.get_category()
def main(self):
# 启动前杀掉已有 Edge 进程,防止 user_data_dir 被锁
os.system("taskkill /F /IM msedge.exe")
time.sleep(2)
# 初始化
with sync_playwright() as _playwright:
# _playwright.chromium.launch_persistent_context
browser = _playwright.chromium.launch_persistent_context(
# 指定本机用户缓存地址
user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data",
# 指定本机google客户端exe的路径
executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
# 要想通过这个下载文件这个必然要开 默认是False
# 保持免登录 profile(参考 dow_category_edge.py:71)
user_data_dir=r"C:\Users\FLA主账号客服维权使用\AppData\Local\Microsoft\Edge\User Data",
# 通过 channel 指定使用 Edge,无需写死 exe 路径
channel="msedge",
accept_downloads=True,
# 设置不是无头模式
headless=False,
bypass_csp=True,
locale='en-GB',
ignore_https_errors=True,
no_viewport=True,
slow_mo=10,
# 跳过检测
args=['--disable-blink-features=AutomationControlled', '--remote-debugging-port=9222']
args=[
# 跳过自动化检测
'--disable-blink-features=AutomationControlled',
# 选 Default profile(与 user_data_dir 配合)
'--profile-directory=Default',
# Windows 下移除 Linux 参数,减少启动崩概率
'--disable-gpu',
# 降低"首次运行/恢复弹窗/扩展"对启动的干扰
'--no-first-run',
'--no-default-browser-check',
'--disable-extensions',
'--disable-notifications',
# 避免 DevToolsActivePort 问题(让系统随机分配端口)
'--remote-debugging-port=0',
# 保留后台节流相关(一般没问题)
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-background-timer-throttling',
'--disable-features=CalculateNativeWinOcclusion',
'--remote-allow-origins=*',
]
)
self.page = browser.new_page()
# 拦截 graphql 响应:监听器只注册一次(防止累积)
self.page.on("requestfinished", self.print_request_finished)
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment