Commit 694247a4 by Peng

Updated the request headers used to fetch cookies.

Synced the merchantwords ("me") search terms to pg6 and updated the database connection method.
Merged the AI-type task code into a single ai_analyze_spider.
Optimized the 1688 payment flow; added automatic payment of orders matched by order number against the dedicated account balance ("专属余额").
parent 7597ea9f
@@ -47,25 +47,44 @@ def get_cookie(site='us', zipCode='10010'):
engine_us = get_remote_engine(site, 'mysql')
n = random.randint(120, 130)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
print(ua)
# e.g. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.4929.149 Safari/537.36'
# e.g. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
# e.g. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Device-Memory": "8",
"Downlink": "1.25",
"Dpr": "0.75",
"Ect": "3g",
"Pragma": "no-cache",
"Rtt": "300",
"Sec-Ch-Device-Memory": "8",
"Sec-Ch-Dpr": "0.75",
"Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{ua}", "Google Chrome";v="{ua}"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Ch-Ua-Platform-Version": '"10.0.0"',
"Sec-Ch-Viewport-Width": "2560",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": ua,
"Viewport-Width": "2560",
}
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
k = ""
for i in (0, random.randint(0, 5)):
k += random.choice(alphabet)
# headers[k] = str(uuid.uuid4())
sess = requests.Session()
sess.mount(url_, py_ja3.DESAdapter())
resp_ = sess.get(url_, headers=headers, timeout=15, verify=False)
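Note: Sec-Ch-Ua is expected to carry only the Chrome major version, so it should stay consistent with the generated User-Agent. A minimal sketch of building the two together (the helper name is illustrative, not part of this commit):

import random

def build_ua_headers():
    # Pick one Chrome major version and reuse it in both headers so the
    # User-Agent and the client hints do not contradict each other.
    major = random.randint(120, 130)
    ua = (f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          f'(KHTML, like Gecko) Chrome/{major}.0.0.0 Safari/537.36')
    sec_ch_ua = f'"Not_A Brand";v="8", "Chromium";v="{major}", "Google Chrome";v="{major}"'
    return {"User-Agent": ua, "Sec-Ch-Ua": sec_ch_ua}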
@@ -193,12 +212,10 @@ def get_cookie(site='us', zipCode='10010'):
if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip():
print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
cookies = json.dumps(index_resp_cookies, ensure_ascii=False)
cookies_list=[[cookies,'DB']]
item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies} item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
print(item) print(item)
# 构造 DataFrame # 构造 DataFrame
df = pd.DataFrame([{"cookies": cookies, "type": "DB"}]) df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
# df_data_list = df.values.tolist()
# Store to the database
engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
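For reference, the same append can be done with plain pandas and SQLAlchemy when the remote-engine wrapper is not available; the DSN and sample cookie below are placeholders, not values from this repo:

import json
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://user:password@host:3306/selection")  # placeholder DSN
cookies_json = json.dumps({"session-id": "123-4567890"}, ensure_ascii=False)  # placeholder cookie
df = pd.DataFrame([{"cookies": cookies_json, "type": "DB"}])
# Append one row to the per-site cookie table (us_cookies for site='us').
df.to_sql("us_cookies", con=engine, if_exists="append", index=False)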
@@ -210,13 +227,13 @@ def get_cookie(site='us', zipCode='10010'):
if __name__ == '__main__':
while True:
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
# get_cookie(site='it', zipCode='85')
get_cookie(site='es', zipCode='28001')
get_cookie(site='fr', zipCode='75019')
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
time.sleep(random.uniform(30.5, 70.5))
@@ -123,7 +123,6 @@ class dow_category_Product():
try:
num += 1
Category_name = Category
# _Category = Category.replace('&', '\\\&')
print("Category_name 名称 11111", Category) print("Category_name 名称 11111", Category)
driver.execute_script(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""") driver.execute_script(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""")
time.sleep(1) time.sleep(1)
......
...@@ -74,7 +74,7 @@ class bsr_catgory(BaseUtils): ...@@ -74,7 +74,7 @@ class bsr_catgory(BaseUtils):
except: except:
print(site, 'db_cursor_connect 报错:', sql) print(site, 'db_cursor_connect 报错:', sql)
def db_cursor_connect_msyql_read(self, site=None,select_state1_sql=None): def db_cursor_connect_msyql_read(self, site=None, select_state1_sql=None):
for i in range(3): for i in range(3):
try: try:
if site: if site:
...@@ -115,7 +115,7 @@ class bsr_catgory(BaseUtils): ...@@ -115,7 +115,7 @@ class bsr_catgory(BaseUtils):
self.year_month = f'{self.year}_{self.month}' self.year_month = f'{self.year}_{self.month}'
sele_sql = f"SELECT `week` FROM week_20_to_30 WHERE `year_month`='{self.year}_{self.month}'" sele_sql = f"SELECT `week` FROM week_20_to_30 WHERE `year_month`='{self.year}_{self.month}'"
print(sele_sql) print(sele_sql)
df_year_week = self.db_cursor_connect_msyql_read(site='us',select_state1_sql=sele_sql) df_year_week = self.db_cursor_connect_msyql_read(site='us', select_state1_sql=sele_sql)
self.year_week = list(df_year_week['week'])[-1] self.year_week = list(df_year_week['week'])[-1]
print(self.year_week, '====当前周===1232333') print(self.year_week, '====当前周===1232333')
...@@ -188,19 +188,19 @@ class bsr_catgory(BaseUtils): ...@@ -188,19 +188,19 @@ class bsr_catgory(BaseUtils):
"category_first_id": category_first_id, "category_first_id": category_first_id,
"category_parent_id": category_parent_id "category_parent_id": category_parent_id
} }
# keys_to_check = ['category_id', 'category_first_id', 'category_parent_id']
# # Use a list comprehension to check whether any of these keys is an empty string or None
# empty_or_none_keys = [key for key in keys_to_check if items.get(key) in ('', None)]
# if empty_or_none_keys:
# print('解析失败')
# try:
# account = 'pengyanbing'
# title = self.site_name + ' bsr 榜单'
# content = f' bsr 榜单解析 url 失败 节点数:{nodes_num} \n 解析url:{url}'
# db_class = connect_db(self.site_name)
# db_class.send_mg(account, title, content)
# except:
# pass
return items
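The block disabled above validated the parsed ids before returning; a standalone sketch of that check, without the mail alert:

def has_missing_ids(items):
    # Return the keys whose parsed value is an empty string or None.
    keys_to_check = ['category_id', 'category_first_id', 'category_parent_id']
    return [key for key in keys_to_check if items.get(key) in ('', None)]

items = {'category_id': '1722264031', 'category_first_id': '', 'category_parent_id': None}
missing = has_missing_ids(items)
if missing:
    print('解析失败', missing)  # parsing failed for these keys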
def html_4(self, bum):
@@ -604,7 +604,7 @@ class bsr_catgory(BaseUtils):
order by {self.site_name}_bs_category.category_id, category_parent_id;
"""
print('path_sql:', path_sql)
df_exist_rows = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=path_sql)
exist_rows = df_exist_rows.values.tolist()
group1_id = []
group2_id = []
@@ -668,12 +668,12 @@ class bsr_catgory(BaseUtils):
# Insert if it does not exist
try:
select_sql_id = f'''SELECT id FROM {self.site_name}_bs_category WHERE `path`="{name_num_path[3]}"'''
df_id = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_id)
if not df_id.empty:
save_name_num_list.append(name_num_path)
else:
select_sql_name = f'''SELECT en_name FROM {self.site_name}_bs_category WHERE `path`="{name_num_path[3]}" order by id desc '''
df_en_name = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_name)
print('en_name::', df_en_name.values)
if df_en_name['en_name'][0] == name_num_path[1]:
pass
@@ -725,7 +725,7 @@ class bsr_catgory(BaseUtils):
select id, path,nodes_num from {self.site_name}_bs_category where category_first_id is null and category_parent_id != '0'
and delete_time is null;
"""
df_nodes_num = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=sql)
if not df_nodes_num.empty:
id_path_list = df_nodes_num.values.tolist()
@@ -815,13 +815,13 @@ class bsr_catgory(BaseUtils):
def select_id_1(self):
# Query the top-level parent id of the child nodes
select_sql_1 = f'select id from {self.site_name}_bs_category where nodes_num=2'
df_id = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_1)
df_id_lsit = df_id.values.tolist()
for id in df_id_lsit:
en_name_id_list = []
select_p_id = f"select t3.id,t4.en_name from (select t1.id,t1.parent_id,if(find_in_set(parent_id, @pids) > 0, @pids := concat(@pids, ',',id), 0) as ischild from (select id,p_id as parent_id from {self.site_name}_bs_category t order by p_id,id) t1,(select @pids := {id[0]}) t2) t3 LEFT JOIN {self.site_name}_bs_category t4 on t3.id = t4.id where ischild != 0;"
print('select_p_id::', select_p_id)
df_all_id = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_p_id)
if not df_all_id.empty:
all_id_lsit = df_all_id.values.tolist()
for en_name_id in all_id_lsit:
@@ -894,7 +894,7 @@ class bsr_catgory(BaseUtils):
and delete_time is null
order by category_id,category_first_id
"""
df_id_tuple = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=id_sql)
id_tuple = df_id_tuple.values.tolist()
id_list = []
for id in id_tuple:
@@ -960,9 +960,9 @@ class bsr_catgory(BaseUtils):
_0_days = ((datetime.datetime.now()) + datetime.timedelta(days=0)).strftime("%Y-%m-%d")
select_sql = f"select count(id) FROM {self.site_name}_bs_category_top100_asin WHERE date_info = '{_0_days}';"
print(select_sql)
df_count_data_num = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql)
count_data_num = df_count_data_num['count(id)'][0]
print('count_data_num::', count_data_num)
self.send_ms_count_data_num(self.site_name, count_data_num, _0_days)
break
except Exception as e:
@@ -971,6 +971,22 @@ class bsr_catgory(BaseUtils):
time.sleep(20)
continue
def sele_msyql_category(self, site):
engine_mysql = self.db_engine_us(site, 'mysql')
sql = f'select path, nodes_num,id from {site}_bs_category where nodes_num>1'
df = engine_mysql.read_sql(sql)
values_list = df.values.tolist()
with engine_mysql.begin() as conn_6:
for value in values_list:
print(value)
items = self.parse_url(value[1], value[0])
items['id'] = value[2]
print(items)
# {'category_id': '1722264031', 'category_first_id': 'baby', 'category_parent_id': '60244031', 'id': 67478}
ai_sql1 = f"update {site}_bs_category set category_id = '{items['category_id']}',category_parent_id='{items['category_parent_id']}',category_first_id='{items['category_first_id']}' where id={items['id']}"
print(ai_sql1)
conn_6.execute(ai_sql1)
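sele_msyql_category builds its UPDATE with f-strings. If the remote engine hands back a standard SQLAlchemy connection (an assumption, not verified here), the same update could use bound parameters, which also sidesteps quoting problems in the ids:

from sqlalchemy import text

update_stmt = text(
    "update us_bs_category "
    "set category_id = :category_id, "
    "category_parent_id = :category_parent_id, "
    "category_first_id = :category_first_id "
    "where id = :id"
)
# items comes from parse_url(); its keys match the placeholders above.
items = {'category_id': '1722264031', 'category_first_id': 'baby',
         'category_parent_id': '60244031', 'id': 67478}
# conn_6.execute(update_stmt, items)  # inside `with engine_mysql.begin() as conn_6:`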
def dele_self_real_spider(self):
print('每天晚上定时删除贺哲的抓取表。用户已经取消收藏店铺')
select_sql = 'select data_id from user_collection_syn where data_type =2'
@@ -1013,6 +1029,7 @@ if __name__ == '__main__':
spider_us.run_update_redirect_flag()
spider_us.updata_category_first_id()
spider_us.send_ms()
for site in ['us', 'de', 'uk']:
spider_us = bsr_catgory(site_name=site)
spider_us.updata_category_state()
spider_us.sele_msyql_category(site)
'Store to pg'
'Fetch cookies for the minor-language sites'
import sys
import os
import pandas as pd
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from curl_cffi import requests
import json
from lxml import etree
import re
import random
import pymysql
import time
from secure_db_client import get_remote_engine
from params import DB_CONN_DICT
import urllib3
import uuid
import traceback
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
item = {}
headers_num_int = 0
@@ -38,35 +39,43 @@ def get_cookie(site='us', zipCode='10010'):
elif site == 'it':
url_ = 'https://www.amazon.it'
host = 'www.amazon.it'
engine_us = get_remote_engine(site, 'mysql')
n = random.randint(110, 120)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
print(ua)
us_db = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site,
charset="utf8mb4")
us_cursor = us_db.cursor()
n = random.randint(70, 114)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Device-Memory": "8",
"Downlink": "1.25",
"Dpr": "0.75",
"Ect": "3g",
"Pragma": "no-cache",
"Rtt": "300",
"Sec-Ch-Device-Memory": "8",
"Sec-Ch-Dpr": "0.75",
"Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{n}", "Google Chrome";v="{n}"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Ch-Ua-Platform-Version": '"10.0.0"',
"Sec-Ch-Viewport-Width": "2560",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": ua,
"Viewport-Width": "2560",
}
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
k = ""
for i in (0, random.randint(0, 5)):
k += random.choice(alphabet)
# headers[k] = str(uuid.uuid4())
sess = requests.Session()
sess.mount(url_, py_ja3.DESAdapter())
resp_ = sess.get(url_, headers=headers, timeout=15, verify=False)
cookie = resp_.headers.get('set-cookie')
print("第一步 请求首页", url_)
@@ -76,48 +85,49 @@ def get_cookie(site='us', zipCode='10010'):
print("第一次发送请求,获取邮编:", ingress)
data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
data_modal = json.loads(data_a_modal[0])
# if site != 'us':
# csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
# url_post = url_ + '/privacyprefs/retail/v1/acceptall'
# dada_post = {
# "anti-csrftoken-a2z": csrftoken,
# "accept": "all"
# }
# resp_post = sess.post(url_post, headers=headers, cookies=cookies_dict, timeout=15, data=dada_post,
# verify=False)
# cookie_post = resp_post.headers.get('set-cookie')
# cookies_dict_post = {i.split("=")[0]: i.split("=")[-1] for i in cookie_post.split("; ")}
# cookies_dict_post.update(cookies_dict)
# else:
cookies_dict_post = cookies_dict
# if site == 'us':
# get_token_headers = {
# 'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
# 'referer': url_,
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
# }
# print(get_token_headers, '23232')
# else:
get_token_headers = {
'accept': 'text/html,*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
'cache-control': 'no-cache',
'referer': url_,
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'x-requested-with': 'XMLHttpRequest',
}
data_modal_url = url_ + data_modal['url']
print('第二步 拼接url 点击更改位置:', data_modal_url)
data_modal_resp = sess.get(data_modal_url, headers=get_token_headers, cookies=cookies_dict_post,
timeout=15, verify=False)
data_modal_cookie = data_modal_resp.headers.get('set-cookie')
CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', data_modal_resp.text)[0]
print("CSRF_TOKEN:", CSRF_TOKEN)
try:
data_modal_cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in data_modal_cookie.split("; ")}
data_modal_cookie_dict.update(cookies_dict)
@@ -125,9 +135,10 @@ def get_cookie(site='us', zipCode='10010'):
data_modal_cookie_dict = cookies_dict_post
url_2 = url_ + '/portal-migration/hz/glow/address-change?actionSource=glow'
print('url_2:', url_2)
# {"locationType":"LOCATION_INPUT","zipCode":"10010","deviceType":"web","storeContext":"generic","pageType":"Gateway","actionSource":"glow"}
data = {"locationType": "LOCATION_INPUT", "zipCode": zipCode, "storeContext": "generic", "deviceType": "web",
"pageType": "Gateway", "actionSource": "glow"}
print(data)
post_headers = {
'anti-csrftoken-a2z': CSRF_TOKEN,
@@ -157,13 +168,13 @@ def get_cookie(site='us', zipCode='10010'):
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'TE': 'trailers',
'x-requested-with': 'XMLHttpRequest'
}
print('第三步 发送post 请求 输入 邮编 点击确定')
resp_2 = sess.post(url_2, headers=post_headers, json=data, cookies=data_modal_cookie_dict,
timeout=15, verify=False)
print(resp_2.text)
post_cookies = resp_2.headers.get('set-cookie')
try:
@@ -174,31 +185,29 @@ def get_cookie(site='us', zipCode='10010'):
done_url = url_ + "/portal-migration/hz/glow/get-location-label?storeContext=generic&pageType=Gateway&actionSource=desktop-modal"
print('第四步,点击完成,')
done_resp = sess.get(done_url, headers=headers, cookies=post_cookies_dict, timeout=15, verify=False)
print(done_resp.text, 'done_respdone_respdone_respdone_resp')
done_cookies_dict = sess.cookies.get_dict()
print('done_cookies_dict::', done_cookies_dict)
print("第五步,请求首页,获取邮编,是否修改成功")
index_resp = sess.get(url_, headers=headers, timeout=15, cookies=done_cookies_dict, verify=False)
index_resp_cookies = sess.cookies.get_dict()
print(sess.cookies.get_dict(), '2222222222222222')
index_xpath = etree.HTML(index_resp.text)
ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
print("获取最新邮编:", ingress)
if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip():
print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
cookies = json.dumps(index_resp_cookies, ensure_ascii=False)
item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
print(item)
# Build the DataFrame
df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
# Store to the database
engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
us_cursor.close()
us_db.close()
sess.close()
except Exception as e:
print(f"获取 {site} 站点 cookie 报错,切换下一个站点", e)
print("报错", f"\n{traceback.format_exc()}")
@@ -210,10 +219,10 @@ if __name__ == '__main__':
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
# get_cookie(site='it', zipCode='85')
get_cookie(site='es', zipCode='28001')
get_cookie(site='fr', zipCode='75019')
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
time.sleep(random.uniform(30.5, 70.5))
import pandas as pd
from urllib.parse import quote
from datetime import datetime
import sys
import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.db_connect import BaseUtils
# Function that fetches data from the database
def get_data_from_database(engine_pg, query):
result_df = engine_pg.read_sql(query)
return result_df
def db_read_data(engine_pg):
@@ -22,17 +27,19 @@ def db_read_data(engine_pg):
# Create the initial DataFrame
df_search_term = pd.DataFrame(data=result_list, columns=['search_term', 'url'])
print(df_search_term.shape)
df_search_term['date_info'] = str(datetime.now().strftime("%Y-%m-%d"))
print('date_info::',df_search_term['date_info'])
# Mask of rows whose URL is no longer than 450 characters
long_url_rows = df_search_term['url'].str.len() <= 450
# Keep only the rows whose URL is at most 450 characters
data_df = df_search_term[long_url_rows]
print('pg6 写入数据 merchantwords')
with engine_pg.begin() as conn:
engine_pg.to_sql(data_df, 'us_merchantwords_search_term_month_syn_2025', if_exists="append")
update_sql = f"update us_search_term_month_merchantwords set state =3 where state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from us_merchantwords_search_term_month_syn_2025 where state =3"
print(deletesql)
conn.execute(deletesql)
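The sync is meant to behave as one unit: append the filtered rows, flip the source rows to state=3, then purge synced rows. A sketch of the same flow with a plain SQLAlchemy engine (the project's wrapper signatures are assumed, and the DSN is a placeholder):

import pandas as pd
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://user:password@host:5432/selection")  # placeholder DSN
data_df = pd.DataFrame([{"search_term": "desk lamp", "url": "https://example.com/s?k=desk+lamp", "date_info": "2025-01-01"}])

with engine.begin() as conn:
    # All three statements commit (or roll back) together.
    data_df.to_sql("us_merchantwords_search_term_month_syn_2025", con=conn, if_exists="append", index=False)
    conn.execute(text("update us_search_term_month_merchantwords set state = 3 where state = 1"))
    conn.execute(text("delete from us_merchantwords_search_term_month_syn_2025 where state = 3"))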
@@ -58,6 +65,7 @@ def build_urls(search_term):
]
return [[search_term, url] for url in urls]
if __name__ == '__main__':
# Pass in a database connection
engine_pg = BaseUtils(site_name='us').pg_connect_6()
db_read_data(engine_pg)
\ No newline at end of file
@@ -258,26 +258,7 @@ class Save_asin_detail(BaseUtils):
if asin_not_div_id_dp_list:
self.db_change_state(state=13, asin_list=asin_not_div_id_dp_list)
@func_set_timeout(240)
def save_asin_not_buysales(self, asin_buySales_list):
while True:
try:
if is_internet_available():
pass
else:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
print('错误月销的asin:', asin_buySales_list)
print('错误月销的asin:', len(asin_buySales_list))
df_asin_ = pd.DataFrame(data=asin_buySales_list, columns=['asin', 'buysales', 'date_info'])
self.engine_pg.to_sql(df_asin_, f'{self.site_name}_asin_detail_2025_not_buysales', if_exists='append')
break
except Exception as e:
print("存储 _asin_detail_2025_not_buysales 文本 数据错误", e)
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
time.sleep(random.uniform(10, 20.5))
continue
@func_set_timeout(240)
def save_bs_category_asin_detail(self, bs_category_asin_list_pg):
......
@@ -21,7 +21,6 @@ import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
import ast
class ai_async_asin_pg():
@@ -120,8 +119,6 @@ class ai_async_asin_pg():
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码,。asin---> ', asin)
if self.spider_state == '竞品asin':
@@ -331,41 +328,31 @@ class ai_async_asin_pg():
def read_ai_asin(self):
self.pg_connect()
self.spider_type = True
for module in ['Amazon:asin', 'Amazon:asinList']:
while True:
try:
if module == 'Amazon:asin':
sql = f"SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module='{module}' and spider_status='未开始' for update;"
else:
sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and spider_status='未开始' for update;"""
# sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and task_id=39 for update;"""
print(sql)
df_read = self.engine_pg.read_then_update(
select_sql=sql,
update_table='ai_asin_analyze_log',
set_values={"spider_status": '爬取中'},  # mark claimed rows as in progress
where_keys=["task_id"],  # WHERE task_id = :task_id
)
break
except:
time.sleep(10)
print(f'开始 {module} 任务:', sql)
if not df_read.empty:
# if module == 'Amazon:asin':
# _asin_list = ast.literal_eval(df_read['asin'][0])
# asin_id_list = []
# for _aisn in _asin_list:
# asin_data_list = list(
# _aisn + '|-|' + df_read.task_id.astype(
# "U") + '|-|' + df_read.site_name + '|-|' + module)
# asin_id_list.extend(asin_data_list)
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + module)
# else:
# asin_id_list = list(
# df_read['asin'] + '|-|' + df_read.task_id.astype(
# "U") + '|-|' + df_read.site_name + '|-|' + module)
print(asin_id_list)
for asin_id in asin_id_list:
print(asin_id)
self.queries_asin_queue.put(asin_id)
@@ -385,7 +372,7 @@ class ai_async_asin_pg():
print(asin_id)
self.queries_asin_queue.put(asin_id)
html_thread = []
for i in range(8):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
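The change only raises the number of worker threads draining the shared queue from 5 to 8; the put/consume pattern itself looks like this (names are generic, not the class's real attributes):

import queue
import threading

tasks = queue.Queue()
for n in range(20):
    tasks.put(f"ASIN{n:04d}|-|1|-|us|-|Amazon:asin")

def worker():
    while True:
        try:
            task = tasks.get_nowait()
        except queue.Empty:
            return
        print(threading.current_thread().name, "->", task)

threads = [threading.Thread(target=worker) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()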
@@ -396,32 +383,37 @@ class ai_async_asin_pg():
def select_asin():
for site in ['us', 'de', 'uk']:
while True:
try:
select_sql = f"""select id, site_name, task_id, unique_key as asin,sub_step from ai_asin_analyze_spider where sub_step = 'AsinInfoRepository:详情' and status = '未开始' and site_name='{site}' order by task_id"""
print('select_sql::', select_sql)
engine_pg15 = ai_async_asin_pg(site_name='us').pg_connect()
df_read = engine_pg15.read_then_update(
select_sql=select_sql,
update_table='ai_asin_analyze_spider',
set_values={"status": '爬取中'},  # mark claimed rows as in progress
where_keys=["id", "site_name"],  # WHERE id = :id AND site_name = :site_name
)
if not df_read.empty:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + df_read.id.astype(
"U") + '|-|' + df_read.sub_step)
print(asin_id_list)
ai_async_asin_pg(site_name=site).run_analzye_asin(asin_id_list)
break
except:
time.sleep(10)
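Both claim paths now wrap read_then_update in a retry loop so a transient database error does not kill the scheduler. The shape of that pattern, with a generic claim function standing in for the wrapper call and a bounded retry count instead of the infinite loop used here:

import time

def claim_pending_rows():
    # Stand-in for engine.read_then_update(select ... for update, set status='爬取中').
    raise ConnectionError("simulated transient failure")

def claim_with_retry(attempts=5, wait_seconds=10):
    for attempt in range(attempts):
        try:
            return claim_pending_rows()
        except Exception as exc:
            print(f"claim failed ({exc!r}), retry {attempt + 1}/{attempts}")
            time.sleep(wait_seconds)
    return None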
def run_spider():
time_ip_num = 0
while True:
time_ip_num += 1
select_asin()  # task type: AsinInfoRepository:详情
ai_async_asin_pg().read_ai_asin()  # task types: 'Amazon:asin', 'Amazon:asinList'
time.sleep(20)
print('-----------------------------------------------------------------------------------------')
print()
if 10 <= datetime.now().hour < 22:
......
import os
import sys
import gzip
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.secure_db_client import get_remote_engine
from amazon_spider.VPS_IP import pppoe_ip
@@ -19,7 +19,6 @@ import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
import ast
@@ -206,46 +205,23 @@ class ai_async_asin_pg():
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'], 'review_json_list': items['review_json_list'],
'fbm_delivery_price': items['fbm_delivery_price']
}
print(item)
# a = None
# if result_list_json and module == 'Amazon:asin' and self.spider_state is None:
# is_sp_asin_state = None
# result_list_dict = json.loads(result_list_json)
# print(asin, '判断是否有竞品asin')
# for result_dict in result_list_dict:
# # Based on your recent shopping trends # Frequently purchased items with fast delivery
# # Customers who viewed this item also viewed # Brand in this category on Amazon
# sp_type = 'Based on your recent shopping trends'
# if result_dict.get(sp_type):
# print(asin, '找到有竞品asin。 数量:', len(result_dict[sp_type]))
# for i in result_dict[sp_type]:
# sp_asin = i + '|-|' + task_id + '|-|' + site_name + '|-|' + module
# self.sp_asin_queue.put(sp_asin)
# is_sp_asin_state = 111
# a = 1
# if is_sp_asin_state is None:
# print('没有找到竞品asin')
# self.item_queue.put(item)
# # self.save_data()
# # self.update_ai_asin_analyze_log([int(task_id)], '成功')
# a = 1
self.item_queue.put(item)
response_gzip = self.compress_string(response)
Requests_param_val().send_kafka(html_data=response_gzip, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
# if self.spider_state == '竞品asin':
# self.item_queue.put(item)
# a = 1
#
# if module == 'Amazon:asinList':
# self.item_queue.put(item)
# a = 1
# if a is None:
# self.item_queue.put(item)
else:
print('asin 商品 异常')
# Compress a string
def compress_string(self, input_string):
return gzip.compress(input_string.encode())
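compress_string gzips the raw HTML before it is pushed to Kafka, so the consumer has to reverse it; a round-trip sketch:

import gzip

html = "<html><body>asin detail page</body></html>"
compressed = gzip.compress(html.encode())        # bytes sent to the Kafka topic
restored = gzip.decompress(compressed).decode()  # what the consumer does on its side
assert restored == html
print(len(html.encode()), "->", len(compressed), "bytes")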
def update_ai_asin_analyze_log(self, task_id_list, status):
if task_id_list:
task_id_list = list(set(task_id_list))
......
# -*- coding: utf-8 -*-
import json
import random
import time
import traceback
from datetime import datetime
import redis
import requests
from DrissionPage import ChromiumPage, ChromiumOptions
from lxml import etree
@@ -15,6 +17,8 @@ class get_1688_order_data():
self.pwd = 'aass369874.'
self.redis_db22 = self.redis_db()
self.alipay_data_list = []
self.err_orders_list = []  # order numbers with problems
self.time_num = 0
def redis_db(self):
nums = 0
@@ -29,16 +33,17 @@ class get_1688_order_data():
time.sleep(3)
continue
def get_1688(self, orders_type=None, list_orders=None):
# Configure the Chrome browser - port 9333
chrome_options = ChromiumOptions()
chrome_options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
chrome_options.set_local_port(9333)  # set Chrome's debugging port
chrom_page = ChromiumPage(addr_or_opts=chrome_options)
print(f"Chrome 浏览器运行在端口: {9333}")
self.get_order_tab(chrom_page, list_orders, orders_type)  # request the order page and check whether the session is still logged in
chrom_page.quit()
def get_order_tab(self, chrom_page, list_orders, orders_type):
url = 'https://air.1688.com/app/ctf-page/trade-order-list/buyer-order-list.html?tradeStatus=waitbuyerpay&spm=a260k.home2025.topmenu.dmyorder_popup_ddaifukuan&page=1&pageSize=10'
chrom_page.get(url)
time.sleep(random.randint(3, 6))
@@ -52,9 +57,22 @@ class get_1688_order_data():
self.login_1688(chrom_page)
if '已买到的货品' in chrom_page.html:
print('保持登录状态')
self.get_order_data(chrom_page, list_orders, orders_type)
else:
print('需要人工进行手动确认。是否登录')
self.send_ms_count_data_num()
def send_ms_count_data_num(self):
account = 'pengyanbing'
title = f'1688 登录失败。店铺:{self.account}'
content = f'需要远程手动查看,远程向日葵 188779566'
url = 'http://47.112.96.71:8082/selection/sendMessage'
data = {
'account': account,
'title': title,
'content': content
}
requests.post(url=url, data=data, timeout=15)
def login_1688(self, chrom_page):
chrom_page.get(
@@ -87,15 +105,18 @@ class get_1688_order_data():
'https://air.1688.com/app/ctf-page/trade-order-list/buyer-order-list.html?tradeStatus=waitbuyerpay&spm=a260k.home2025.topmenu.dmyorder_popup_ddaifukuan&page=1&pageSize=10')
time.sleep(random.randint(12, 14))
def get_order_data(self, chrom_page, list_orders, orders_type):
url = 'https://air.1688.com/app/ctf-page/trade-order-list/buyer-order-list.html?tradeStatus=waitbuyerpay&spm=a260k.home2025.topmenu.dmyorder_popup_ddaifukuan&page=1&pageSize=10'
chrom_page.get(url)
len_orders = len(list_orders)
for order_num in list_orders:
len_orders -= 1
print('剩余需要查询订单数量:', len_orders)
order_num = order_num.strip()
print('当前执行查询订单号:', order_num)
print('---------------------------------------------------------------------------------')
time.sleep(random.randint(1, 2))
js = '''
// 定义输入函数
function typeReal(elem, text) {
if (!elem) return false;
elem.focus();
@@ -119,19 +140,16 @@ class get_1688_order_data():
'''
js2 = f'''
// 获取 shadow DOM 深层 input
const realInput = document
.querySelector("body > article > app-root").shadowRoot
.querySelector("div > main > q-theme > order-search").shadowRoot
.querySelector("section > order-search-keywords").shadowRoot
.querySelector("div > q-input").shadowRoot
.querySelector("input");
// 执行输入
typeReal(realInput, "{order_num}");
'''
js = js + js2
chrom_page.run_js(js)  # fill in the order number; locate the search box
time.sleep(random.randint(3, 5))
print('点击搜索')
js_click = """
@@ -164,7 +182,17 @@ class get_1688_order_data():
"value": '待付款中没有查询到该订单'
}
}
if orders_type:
try:
json_data = json.dumps(items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
continue
except:
pass
else:
self.alipay_data_list.append(items)
else:
print('获取订单成功')
js_click_2 = '''
@@ -185,26 +213,143 @@ class get_1688_order_data():
print("新标签页对象:", new_tab)  # get the most recently opened tab object
chrom_page_tab = chrom_page.get_tab(new_tab)
chrom_page_tab.set.activate()  # activate the new tab
self.chrom_new_page_tab = chrom_page_tab
time.sleep(random.randint(2, 5))
print('寻找是否有网银对公支付')
try:
chrom_page_tab.ele('xpath://div[contains(text(),"展开更多付款方式")]', timeout=3).click()
except:
pass
try:
bank_transfer_group = chrom_page_tab.ele('xpath://div[@data-channel="bank_transfer_group"]')
bank_transfer_group.click()
except:
try:
js = '''document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div.channel-card-group.available > div > div:nth-child(2) > div").click()'''
chrom_page_tab.run_js(js)
except:
print('只有一个可选项。点击失败')
time.sleep(random.randint(1, 2))
print('寻找支付方式')
# Corporate (B2B) bank payment
chrom_page_tab.scroll.down(180)
time.sleep(random.randint(1, 2))
channel_name = chrom_page_tab.ele('xpath://span[contains(text(),"对公支付")]', timeout=5)
if channel_name:
print('对公支付')
chrom_page_tab.ele('xpath://span[contains(text(),"对公支付")]', timeout=3).click()
time.sleep(random.randint(2, 4))
no_pay = chrom_page_tab.ele(
'xpath://div[contains(text(),"以下支付方式暂不可用")]/following-sibling::div//span')
if no_pay:
print('暂不可用::', no_pay.text)
if '对公支付' in no_pay.text:
_items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": '对公支付暂不可用'
}
}
elif '支付宝' in no_pay.text:
_items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": '支付宝暂不可用'
}
}
else:
_items = None
if _items:
if orders_type:
try:
json_data = json.dumps(_items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
except:
pass
else:
self.alipay_data_list.append(_items)
self.save_redis()
continue
if orders_type is None:
print(', 点击网银')
try:
js_pay = '''
document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div > div > div:nth-child(4) > div > div.card__face.card__face--front > div.channel-info-container > div.channel-info-content > div.sub-channel-pay-list > div:nth-child(2)").click()
'''
chrom_page_tab.run_js(js_pay)
except:
chrom_page_tab.ele('xpath://div[contains(text(),"网银或柜台转账")]', timeout=3).click()
print('开始付款')
time.sleep(random.randint(1, 3))
self.click_pay(chrom_page_tab)
self.online_banking(chrom_page_tab, order_num)
else:
print('专属账号余额 有, 使用专属账号支付')
try:
js_pay = '''
document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div > div > div:nth-child(4) > div > div.card__face.card__face--front > div.channel-info-container > div.channel-info-content > div.sub-channel-pay-list > div:nth-child(1)").click()
'''
chrom_page_tab.run_js(js_pay)
except:
try:
chrom_page_tab.ele('xpath://div[contains(text(),"专属账号余额:")]', timeout=3).click()
except:
pass
time.sleep(random.randint(1, 3))
if chrom_page_tab.ele('xpath://div[contains(text(),"专属账号余额不足")]'):
resp = chrom_page_tab.html
respons = etree.HTML(resp)
sapn_text_list = respons.xpath('//div[contains(text(),"专属账号余额不足")]//text()')
if sapn_text_list:
span_str = ''.join(sapn_text_list)
else:
span_str = '专属账号余额不足,可用余额: ¥'
print('span_str:::', span_str)
items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": span_str
}
}
print('专属账号余额支付 items:::', items)
json_data = json.dumps(items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
else:
print('开始付款')
time.sleep(random.randint(1, 3))
self.click_pay(chrom_page_tab)
if '支付成功' in chrom_page_tab.html:
print('专属账号余额 支付成功 :', order_num)
else:
items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": '需要手动确认,专属账号余额是否足够支付'
}
}
print('专属账号余额支付 items:::', items)
json_data = json.dumps(items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
try:
print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
except:
print('删除失败:')
pass
chrom_page_tab.close() # 关闭标签页
continue
else:
zfb = chrom_page_tab.ele('xpath://span[contains(text(),"支付宝")]', timeout=3).click()
if zfb:
print('找到支付宝。开始选择支付宝')
self.click_pay(chrom_page_tab)
@@ -221,26 +366,72 @@ class get_1688_order_data():
}
self.alipay_data_list.append(items)
chrom_page_tab.close()  # close the tab
self.save_redis()
# def save_alipay_zszh(self,_items, order_num):
# self.redis_db()
# for i in range(5):
# try:
# json_data = json.dumps(_items, ensure_ascii=False)
# self.redis_db22.rpush('alipay_zszh:response', json_data)
# print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
# self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
# break
# except:
# time.sleep(10)
def save_redis(self):
if self.alipay_data_list:
for alipay_data in self.alipay_data_list:
while True:
try:
json_data = json.dumps(alipay_data, ensure_ascii=False)
print(json_data)
self.redis_db22.rpush('alipay:response', json_data)
break
except:
print('写入redis报错:重试')
time.sleep(5)
# Delete the processed order from the pending list
try:
print('从redis删除订单号:', alipay_data['order_id'])
self.redis_db22.lrem(f"alipay:{self.account}", 1, f"{alipay_data['order_id']}")
except:
print('删除失败:')
pass
self.alipay_data_list = []
time.sleep(random.randint(1, 3))
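save_redis publishes each parsed result to the shared alipay:response list and then drops the matching order id from this account's pending list. The same handoff in isolation (host, db number and values are placeholders):

import json
import redis

r = redis.Redis(host="127.0.0.1", port=6379, db=22, decode_responses=True)
account = "demo_account"
result = {"account": account, "order_id": "123456789", "type": "对公支付", "json": {"price": "100.00"}}

# Publish the parsed result for the downstream consumer ...
r.rpush("alipay:response", json.dumps(result, ensure_ascii=False))
# ... and remove that order id from this account's pending queue.
r.lrem(f"alipay:{account}", 1, result["order_id"])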
def online_zfb(self, chrom_page_tab, order_num):
print('解析支付宝支付页面信息')
if chrom_page_tab.ele('xpath://span[contains(text(),"中国农业银行")]'):
items = {
"account": self.account,
"order_id": order_num,
"type": "支付宝",
"json": {
"merchant_name": '无'
}
"account": self.account,
"order_id": order_num,
"type": "支付宝",
"json": {
"merchant_name": data_dict['商户名称']
} }
} self.alipay_data_list.append(items)
self.alipay_data_list.append(items) else:
chrom_page_tab.ele('xpath://span[contains(text(),"去网商银行付款")]', timeout=5).click()
time.sleep(random.randint(5, 10))
html = etree.HTML(chrom_page_tab.html)
data_list = html.xpath('//div[contains(@class,"order-info-container")]//text()')
print(data_list)
del data_list[0]
data_dict = {data_list[i]: data_list[i + 1] for i in range(0, len(data_list), 2)}
items = {
"account": self.account,
"order_id": order_num,
"type": "支付宝",
"json": {
"merchant_name": data_dict['商户名称']
}
}
self.alipay_data_list.append(items)
time.sleep(random.randint(1, 3))
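`online_zfb()` flattens the order-info container into a list of text nodes, drops the leading header node, and pairs the rest up as label/value entries. A minimal sketch of that pairing step with lxml; the sample HTML is an assumption standing in for the real payment page:

```python
from lxml import etree

html = etree.HTML(
    '<div class="order-info-container">'
    '<span>商户名称</span><span>某某店铺</span>'
    '<span>订单金额</span><span>¥100.00</span>'
    '</div>'
)
texts = [t.strip() for t in
         html.xpath('//div[contains(@class,"order-info-container")]//text()')
         if t.strip()]
# Pair adjacent entries: even index = label, odd index = value.
data_dict = {texts[i]: texts[i + 1] for i in range(0, len(texts) - 1, 2)}
print(data_dict.get('商户名称'))  # -> 某某店铺
```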
def click_pay(self, chrom_page_tab): def click_pay(self, chrom_page_tab):
time.sleep(random.randint(1, 2)) time.sleep(random.randint(1, 2))
print('点击立即付款') print('点击立即付款')
...@@ -251,7 +442,7 @@ class get_1688_order_data(): ...@@ -251,7 +442,7 @@ class get_1688_order_data():
''' '''
# 获取 # 获取
chrom_page_tab.run_js(js_click_cashier) chrom_page_tab.run_js(js_click_cashier)
time.sleep(random.randint(5, 10)) time.sleep(random.randint(6, 15))
def online_banking(self, chrom_page_tab, order_num): def online_banking(self, chrom_page_tab, order_num):
print('解析网银支付账户信息') print('解析网银支付账户信息')
...@@ -262,22 +453,31 @@ class get_1688_order_data(): ...@@ -262,22 +453,31 @@ class get_1688_order_data():
data_list.remove('复制') data_list.remove('复制')
data_dict = {data_list[i]: data_list[i + 1] for i in range(0, len(data_list), 2)} data_dict = {data_list[i]: data_list[i + 1] for i in range(0, len(data_list), 2)}
print(data_dict) print(data_dict)
items = { try:
"account": self.account, items = {
"order_id": order_num, "account": self.account,
"type": "对公支付", "order_id": order_num,
"json": { "type": "对公支付",
"bank_account": data_dict['收款账号'].replace(" ", ""), "json": {
"name": data_dict['收款户名'], "bank_account": data_dict['收款账号'].replace(" ", ""),
"bank_name": data_dict['收款银行'].replace('”', '').replace('“', ''), "name": data_dict['收款户名'],
"bank_loaction": data_dict['所在地'], "bank_name": data_dict['收款银行'].replace('”', '').replace('“', ''),
"price": data_dict['转账金额'].replace('¥', ''), "bank_loaction": data_dict['所在地'],
"bank_clearing_number": data_dict["联行号"].replace('(选填)', '') "price": data_dict['转账金额'].replace('¥', ''),
"bank_clearing_number": data_dict["联行号"].replace('(选填)', '')
}
} }
} if items['json']['bank_name']:
self.alipay_data_list.append(items) if '浙江网商' in items['json']['bank_name']:
items['json']['bank_name'] = '浙江网商银行'
self.alipay_data_list.append(items)
except:
self.err_orders_list.append(order_num)
print('检查订单号:', self.err_orders_list)
self.time_num += 1
time.sleep(random.randint(1, 3))
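`online_banking()` cleans the scraped transfer fields before queuing them: spaces are stripped from the account number, Chinese quotes from the bank name, the currency sign from the amount, the "(选填)" suffix from the clearing number, and 浙江网商 variants are normalized. A minimal sketch of that clean-up as a standalone function; the function name is hypothetical and the input keys are the ones read above:

```python
def normalize_bank_fields(data_dict: dict) -> dict:
    """Clean the raw label/value pairs scraped from the bank-transfer page."""
    bank_name = data_dict['收款银行'].replace('”', '').replace('“', '')
    if '浙江网商' in bank_name:
        bank_name = '浙江网商银行'
    return {
        'bank_account': data_dict['收款账号'].replace(' ', ''),
        'name': data_dict['收款户名'],
        'bank_name': bank_name,
        'price': data_dict['转账金额'].replace('¥', ''),
        'bank_clearing_number': data_dict['联行号'].replace('(选填)', ''),
    }
```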
def get_account(self): def get_account(self):
self.redis_db22 = self.redis_db()
start_index = 0 # 起始索引 start_index = 0 # 起始索引
end_index = -1 # 结束索引,-1 表示获取整个列表 end_index = -1 # 结束索引,-1 表示获取整个列表
list_data_b = self.redis_db22.lrange(f'alipay:{self.account}', start_index, end_index) list_data_b = self.redis_db22.lrange(f'alipay:{self.account}', start_index, end_index)
...@@ -287,26 +487,44 @@ class get_1688_order_data(): ...@@ -287,26 +487,44 @@ class get_1688_order_data():
else: else:
return None return None
def run(self): def get_alipay_zszh(self):
list_orders = self.get_account() # 此键存储 订单号 只用专属账户余额支付。只返回支付失败的信息。其他不返回
print('list_orders:', list_orders) self.redis_db22 = self.redis_db()
start_index = 0 # 起始索引
end_index = -1 # 结束索引,-1 表示获取整个列表
list_data_b = self.redis_db22.lrange(f'alipay_zszh:{self.account}', start_index, end_index)
list_orders = [item.decode('utf-8') for item in list_data_b]
print('需要用专属账号余额支付的订单号: ', list_orders)
print('需要用专属账号余额支付的订单号: ', len(list_orders), '个')
if list_orders: if list_orders:
self.get_1688(list_orders) self.get_1688(orders_type=True, list_orders=list_orders)
else: else:
print(self.account, ' :该店铺下没有可查询的订单') print('没有需要可以支付的订单')
print(self.alipay_data_list) if self.time_num > 500:
for alipay_data in self.alipay_data_list: self.time_num = 0
while True:
try: def run(self):
json_data = json.dumps(alipay_data, ensure_ascii=False) while True:
print(json_data) try:
self.redis_db22.rpush('alipay:response', json_data) list_orders = self.get_account()
break new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
except: print(new_date, ' 需要查询的订单号list_orders:', list_orders)
print('写入redis报错:重试') if list_orders:
time.sleep(5) self.get_1688(list_orders=list_orders)
# 删除 else:
#self.redis_db22.lrem(f"alipay:{self.account}", 1, "dsaofi3232e232328938928") print(self.account, ' :该店铺下没有可查询的订单')
print('开始查询专属账号余额需要支付的订单号')
self.get_alipay_zszh()
except Exception as e:
print("报错xx2222xxx:", e, f"\n{traceback.format_exc()}")
time.sleep(300)
self.chrom_new_page_tab.close()
base_min, base_max = 20, 60
sleep_min = base_min + self.time_num * 2
sleep_max = base_max + self.time_num * 3
time.sleep(random.randint(sleep_min, sleep_max))
self.err_orders_list = []
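`run()` paces itself with a sleep window that widens as `self.time_num` grows (the counter maintained in `online_banking()` and reset in `get_alipay_zszh()` once it passes 500). A minimal sketch of that backoff calculation, extracted as a hypothetical helper:

```python
import random


def next_sleep_seconds(time_num: int, base_min: int = 20, base_max: int = 60) -> int:
    """Widen the base 20-60s wait as the accumulated error count grows."""
    sleep_min = base_min + time_num * 2
    sleep_max = base_max + time_num * 3
    return random.randint(sleep_min, sleep_max)

# e.g. after 10 accumulated errors the loop waits roughly 40-90 seconds.
print(next_sleep_seconds(10))
```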
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -16,6 +16,7 @@ from datetime import datetime ...@@ -16,6 +16,7 @@ from datetime import datetime
import json import json
import threading import threading
import time import time
import gzip
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session() sess = requests.Session()
urllib3.disable_warnings() urllib3.disable_warnings()
...@@ -200,15 +201,20 @@ class ai_async_asin_pg(): ...@@ -200,15 +201,20 @@ class ai_async_asin_pg():
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'], 'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'], 'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'], 'bsr_category_asin_list': items['bs_category_asin_list_pg'],
'review_json_list': items['review_json_list'] 'review_json_list': items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
} }
print(item) print(item)
self.item_queue.put(item) self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html) response_gzip = self.compress_string(response)
Requests_param_val().send_kafka(html_data=response_gzip, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30) Requests_param_val().kafuka_producer_str.flush(timeout=30)
else: else:
print('asin 商品 异常') print('asin 商品 异常')
# 压缩字符串
def compress_string(self, input_string):
return gzip.compress(input_string.encode())
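`compress_string()` gzips the raw page HTML before it is sent to the asin_html Kafka topic, so a consumer is expected to reverse it with `gzip.decompress(...).decode()`. A minimal round-trip sketch with a stand-in payload:

```python
import gzip


def compress_string(input_string: str) -> bytes:
    return gzip.compress(input_string.encode())


payload = compress_string('<html>...asin detail page...</html>')
restored = gzip.decompress(payload).decode()
assert restored.startswith('<html>')
print(len(payload), 'compressed bytes')
```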
def save_data(self): def save_data(self):
self.pg_connect() self.pg_connect()
items_data_list = [] items_data_list = []
...@@ -245,6 +251,7 @@ class ai_async_asin_pg(): ...@@ -245,6 +251,7 @@ class ai_async_asin_pg():
print('存储报错::', e) print('存储报错::', e)
self.pg_connect() self.pg_connect()
time.sleep(10) time.sleep(10)
def init_list(self): def init_list(self):
print("=======清空变量==========") print("=======清空变量==========")
self.asin_not_found_list = [] # 4 self.asin_not_found_list = [] # 4
......
...@@ -75,7 +75,7 @@ class async_asin_pg(): ...@@ -75,7 +75,7 @@ class async_asin_pg():
self.topic_detail_month = f'{self.site_name}_asin_detail_month_2025_{self.month_}' self.topic_detail_month = f'{self.site_name}_asin_detail_month_2025_{self.month_}'
self.topic_asin_html = f'asin_html_2025_{self.month_}' self.topic_asin_html = f'asin_html_2025_{self.month_}'
self.asin_video_list = [] self.asin_video_list = []
self.asin_buySales_list = []
def get_asin(self): def get_asin(self):
while True: while True:
if self.queries_asin_queue.empty() == False: if self.queries_asin_queue.empty() == False:
...@@ -112,7 +112,7 @@ class async_asin_pg(): ...@@ -112,7 +112,7 @@ class async_asin_pg():
sess.mount(self.site_url, py_ja3.DESAdapter()) sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers, resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False) timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f: # with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text) # f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp): if self.reuests_para_val.check_amazon_yzm(resp):
self.yzm_err_total_list.append(1) self.yzm_err_total_list.append(1)
...@@ -216,7 +216,7 @@ class async_asin_pg(): ...@@ -216,7 +216,7 @@ class async_asin_pg():
'bundles_this_asins_json': items['bundles_this_asins_data_json'], 'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'], 'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json'], 'bundle_asin_component_json':items['bundle_asin_component_json'],
'review_json_list':items['review_json_list'],'asin_buySales_list':items['asin_buySales_list'] 'review_json_list':items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
} }
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']: if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val'] item['five_six_val'] = items['five_six_val']
...@@ -286,6 +286,8 @@ class async_asin_pg(): ...@@ -286,6 +286,8 @@ class async_asin_pg():
item['node_id'] = _items["node_id"] item['node_id'] = _items["node_id"]
if item['review_json_list'] is None: if item['review_json_list'] is None:
item['review_json_list'] = _items["review_json_list"] item['review_json_list'] = _items["review_json_list"]
if item['fbm_delivery_price'] is None:
item['fbm_delivery_price'] = _items["fbm_delivery_price"]
except: except:
pass pass
_response_text_var = None _response_text_var = None
...@@ -366,8 +368,6 @@ class async_asin_pg(): ...@@ -366,8 +368,6 @@ class async_asin_pg():
item['img_list'] = json.dumps(items["all_img_video_list"]) item['img_list'] = json.dumps(items["all_img_video_list"])
else: else:
item['img_list'] = None item['img_list'] = None
if item['asin_buySales_list']:
self.asin_buySales_list.extend(item['asin_buySales_list'])
self.item_queue.put(item) self.item_queue.put(item)
if item['img_list'] is None: if item['img_list'] is None:
...@@ -425,7 +425,6 @@ class async_asin_pg(): ...@@ -425,7 +425,6 @@ class async_asin_pg():
def init_list(self): def init_list(self):
print("=======清空变量==========") print("=======清空变量==========")
self.asin_buySales_list = []
self.asin_not_found_list = [] # 4 self.asin_not_found_list = [] # 4
self.asin_not_sure_list = [] # 6 self.asin_not_sure_list = [] # 6
self.asin_not_foot_list = [] # 7 self.asin_not_foot_list = [] # 7
...@@ -478,7 +477,7 @@ class async_asin_pg(): ...@@ -478,7 +477,7 @@ class async_asin_pg():
def run(self): def run(self):
asin_list = self.save_asin_detail.read_db_data() asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B0D663T3W8|2025-01|1|1|null|null'] # asin_list = ['B0CW1ZM991|2025-01|1|1|null|null']
if asin_list: if asin_list:
for asin in asin_list: for asin in asin_list:
self.queries_asin_queue.put(asin) self.queries_asin_queue.put(asin)
...@@ -488,7 +487,7 @@ class async_asin_pg(): ...@@ -488,7 +487,7 @@ class async_asin_pg():
for ck in cookies_dict.values(): for ck in cookies_dict.values():
self.cookies_queue.put(ck) self.cookies_queue.put(ck)
html_thread = [] html_thread = []
for i in range(27): for i in range(26):
thread2 = threading.Thread(target=self.get_asin) thread2 = threading.Thread(target=self.get_asin)
thread2.start() thread2.start()
html_thread.append(thread2) html_thread.append(thread2)
...@@ -500,14 +499,6 @@ class async_asin_pg(): ...@@ -500,14 +499,6 @@ class async_asin_pg():
self.reuests_para_val.kafuka_producer_str.flush(timeout=35) self.reuests_para_val.kafuka_producer_str.flush(timeout=35)
except KafkaTimeoutError as e: except KafkaTimeoutError as e:
print("flush 超时,跳过这次等待:", e) print("flush 超时,跳过这次等待:", e)
while True:
try:
if self.asin_buySales_list:
self.save_asin_detail.save_asin_not_buysales(self.asin_buySales_list)
break
except FunctionTimedOut as e:
print('断网', e)
while True: while True:
try: try:
print('存储 asin bsr 文本 存储pg') print('存储 asin bsr 文本 存储pg')
...@@ -569,4 +560,4 @@ class async_asin_pg(): ...@@ -569,4 +560,4 @@ class async_asin_pg():
pass pass
# if __name__ == '__main__': # if __name__ == '__main__':
# async_asin_pg(month=9, spider_int=1, week=14,site_name='us').run() # async_asin_pg(month=12, spider_int=1, week=14,site_name='de').run()