Commit 694247a4 by Peng

Updated the request headers used when fetching cookies

Synced me (merchantwords) search terms to pg6 and updated the database connection method
Merged the AI-type task code into ai_analyze_spider
Optimized the 1688 payment flow; added automatic payment of the matching order numbers from the dedicated-account balance
parent 7597ea9f
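For reference, a minimal sketch of the dedicated-balance payment queue convention used by the 1688 changes below. The `alipay_zszh:{account}` request list, the `alipay_zszh:response` list, Redis db 22, and the rpush/lrem handling come from this diff; the host/port, shop account name, and sample order number are assumptions.

import json
import redis

# Assumed connection parameters; the real values live in the spider's redis_db() helper.
r = redis.Redis(host='127.0.0.1', port=6379, db=22)

account = 'demo_shop'  # hypothetical 1688 shop account
# Producer side: push order numbers that should be paid from the dedicated-account balance.
r.rpush(f'alipay_zszh:{account}', '1234567890123456')

# Consumer side (what the spider does): read pending orders; after handling each one,
# push only failure payloads back and remove the order number from the request list.
orders = [o.decode('utf-8') for o in r.lrange(f'alipay_zszh:{account}', 0, -1)]
for order_num in orders:
    result = {"account": account, "order_id": order_num, "type": "异常",
              "json": {"value": "专属账号余额不足,可用余额: ¥"}}
    r.rpush('alipay_zszh:response', json.dumps(result, ensure_ascii=False))
    r.lrem(f'alipay_zszh:{account}', 1, order_num)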
......@@ -47,25 +47,44 @@ def get_cookie(site='us', zipCode='10010'):
engine_us = get_remote_engine(site, 'mysql')
n = random.randint(70, 114)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
n = random.randint(120, 130)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
print(ua)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.4929.149 Safari/537.36'
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
headers = {
'connection': 'close',
'authority': host,
'accept': 'text/html,*/*',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'no-cache',
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'origin': url_,
'referer': url_,
'sec-ch-ua-mobile': '?0',
'user-agent': ua
}
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Device-Memory": "8",
"Downlink": "1.25",
"Dpr": "0.75",
"Ect": "3g",
"Pragma": "no-cache",
"Rtt": "300",
"Sec-Ch-Device-Memory": "8",
"Sec-Ch-Dpr": "0.75",
"Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{ua}", "Google Chrome";v="{ua}"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Ch-Ua-Platform-Version": '"10.0.0"',
"Sec-Ch-Viewport-Width": "2560",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": ua,
"Viewport-Width": "2560",
}
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
k = ""
for i in (0, random.randint(0, 5)):
k += random.choice(alphabet)
headers[k] = str(uuid.uuid4())
# headers[k] = str(uuid.uuid4())
sess = requests.Session()
sess.mount(url_, py_ja3.DESAdapter())
resp_ = sess.get(url_, headers=headers, timeout=15, verify=False)
......@@ -193,12 +212,10 @@ def get_cookie(site='us', zipCode='10010'):
if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip():
print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
cookies = json.dumps(index_resp_cookies, ensure_ascii=False)
cookies_list=[[cookies,'DB']]
item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
print(item)
# 构造 DataFrame
df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
# df_data_list = df.values.tolist()
# 存储到数据库
engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
......@@ -210,13 +227,13 @@ def get_cookie(site='us', zipCode='10010'):
if __name__ == '__main__':
while True:
# get_cookie(site='us', zipCode='10010')
# get_cookie(site='de', zipCode='10115')
# get_cookie(site='uk', zipCode='W1S 3PR')
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
# get_cookie(site='it', zipCode='85')
# get_cookie(site='es', zipCode='28001')
# get_cookie(site='fr', zipCode='75019')
# get_cookie(site='us', zipCode='10010')
get_cookie(site='es', zipCode='28001')
get_cookie(site='fr', zipCode='75019')
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
time.sleep(random.uniform(10.5, 55.5))
time.sleep(random.uniform(30.5, 70.5))
......@@ -123,7 +123,6 @@ class dow_category_Product():
try:
num += 1
Category_name = Category
# _Category = Category.replace('&', '\\\&')
print("Category_name 名称 11111", Category)
driver.execute_script(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""")
time.sleep(1)
......
......@@ -74,7 +74,7 @@ class bsr_catgory(BaseUtils):
except:
print(site, 'db_cursor_connect 报错:', sql)
def db_cursor_connect_msyql_read(self, site=None,select_state1_sql=None):
def db_cursor_connect_msyql_read(self, site=None, select_state1_sql=None):
for i in range(3):
try:
if site:
......@@ -115,7 +115,7 @@ class bsr_catgory(BaseUtils):
self.year_month = f'{self.year}_{self.month}'
sele_sql = f"SELECT `week` FROM week_20_to_30 WHERE `year_month`='{self.year}_{self.month}'"
print(sele_sql)
df_year_week = self.db_cursor_connect_msyql_read(site='us',select_state1_sql=sele_sql)
df_year_week = self.db_cursor_connect_msyql_read(site='us', select_state1_sql=sele_sql)
self.year_week = list(df_year_week['week'])[-1]
print(self.year_week, '====当前周===1232333')
......@@ -188,19 +188,19 @@ class bsr_catgory(BaseUtils):
"category_first_id": category_first_id,
"category_parent_id": category_parent_id
}
keys_to_check = ['category_id', 'category_first_id', 'category_parent_id']
# 使用列表推导式检查多个键的值是否为空字符串或None
empty_or_none_keys = [key for key in keys_to_check if items.get(key) in ('', None)]
if empty_or_none_keys:
print('解析失败')
try:
account = 'pengyanbing'
title = self.site_name + ' bsr 榜单'
content = f' bsr 榜单解析 url 失败 节点数:{nodes_num} \n 解析url:{url}'
db_class = connect_db(self.site_name)
db_class.send_mg(account, title, content)
except:
pass
# keys_to_check = ['category_id', 'category_first_id', 'category_parent_id']
# # 使用列表推导式检查多个键的值是否为空字符串或None
# empty_or_none_keys = [key for key in keys_to_check if items.get(key) in ('', None)]
# if empty_or_none_keys:
# print('解析失败')
# try:
# account = 'pengyanbing'
# title = self.site_name + ' bsr 榜单'
# content = f' bsr 榜单解析 url 失败 节点数:{nodes_num} \n 解析url:{url}'
# db_class = connect_db(self.site_name)
# db_class.send_mg(account, title, content)
# except:
# pass
return items
def html_4(self, bum):
......@@ -604,7 +604,7 @@ class bsr_catgory(BaseUtils):
order by {self.site_name}_bs_category.category_id, category_parent_id;
"""
print('path_sql:', path_sql)
df_exist_rows = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=path_sql)
df_exist_rows = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=path_sql)
exist_rows = df_exist_rows.values.tolist()
group1_id = []
group2_id = []
......@@ -668,12 +668,12 @@ class bsr_catgory(BaseUtils):
# 不存在就插入
try:
select_sql_id = f'''SELECT id FROM {self.site_name}_bs_category WHERE `path`="{name_num_path[3]}"'''
df_id = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=select_sql_id)
df_id = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_id)
if not df_id.empty:
save_name_num_list.append(name_num_path)
else:
select_sql_name = f'''SELECT en_name FROM {self.site_name}_bs_category WHERE `path`="{name_num_path[3]}" order by id desc '''
df_en_name = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=select_sql_name)
df_en_name = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_name)
print('en_name::', df_en_name.values)
if df_en_name['en_name'][0] == name_num_path[1]:
pass
......@@ -725,7 +725,7 @@ class bsr_catgory(BaseUtils):
select id, path,nodes_num from {self.site_name}_bs_category where category_first_id is null and category_parent_id != '0'
and delete_time is null;
"""
df_nodes_num = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=sql)
df_nodes_num = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=sql)
if not df_nodes_num.empty:
id_path_list = df_nodes_num.values.tolist()
......@@ -815,13 +815,13 @@ class bsr_catgory(BaseUtils):
def select_id_1(self):
# 查询 子节点的顶级父类id
select_sql_1 = f'select id from {self.site_name}_bs_category where nodes_num=2'
df_id = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=select_sql_1)
df_id = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_1)
df_id_lsit = df_id.values.tolist()
for id in df_id_lsit:
en_name_id_list = []
select_p_id = f"select t3.id,t4.en_name from (select t1.id,t1.parent_id,if(find_in_set(parent_id, @pids) > 0, @pids := concat(@pids, ',',id), 0) as ischild from (select id,p_id as parent_id from {self.site_name}_bs_category t order by p_id,id) t1,(select @pids := {id[0]}) t2) t3 LEFT JOIN {self.site_name}_bs_category t4 on t3.id = t4.id where ischild != 0;"
print('select_p_id::',select_p_id)
df_all_id = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=select_p_id)
print('select_p_id::', select_p_id)
df_all_id = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_p_id)
if not df_all_id.empty:
all_id_lsit = df_all_id.values.tolist()
for en_name_id in all_id_lsit:
......@@ -894,7 +894,7 @@ class bsr_catgory(BaseUtils):
and delete_time is null
order by category_id,category_first_id
"""
df_id_tuple = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=id_sql)
df_id_tuple = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=id_sql)
id_tuple = df_id_tuple.values.tolist()
id_list = []
for id in id_tuple:
......@@ -960,9 +960,9 @@ class bsr_catgory(BaseUtils):
_0_days = ((datetime.datetime.now()) + datetime.timedelta(days=0)).strftime("%Y-%m-%d")
select_sql = f"select count(id) FROM {self.site_name}_bs_category_top100_asin WHERE date_info = '{_0_days}';"
print(select_sql)
df_count_data_num = self.db_cursor_connect_msyql_read(site=None,select_state1_sql=select_sql)
df_count_data_num = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql)
count_data_num = df_count_data_num['count(id)'][0]
print('count_data_num::',count_data_num)
print('count_data_num::', count_data_num)
self.send_ms_count_data_num(self.site_name, count_data_num, _0_days)
break
except Exception as e:
......@@ -971,6 +971,22 @@ class bsr_catgory(BaseUtils):
time.sleep(20)
continue
def sele_msyql_category(self, site):
engine_mysql = self.db_engine_us(site, 'mysql')
sql = f'select path, nodes_num,id from {site}_bs_category where nodes_num>1'
df = engine_mysql.read_sql(sql)
values_list = df.values.tolist()
with engine_mysql.begin() as conn_6:
for value in values_list:
print(value)
items = self.parse_url(value[1], value[0])
items['id'] = value[2]
print(items)
# {'category_id': '1722264031', 'category_first_id': 'baby', 'category_parent_id': '60244031', 'id': 67478}
ai_sql1 = f"update {site}_bs_category set category_id = '{items['category_id']}',category_parent_id='{items['category_parent_id']}',category_first_id='{items['category_first_id']}' where id={items['id']}"
print(ai_sql1)
conn_6.execute(ai_sql1)
def dele_self_real_spider(self):
print('每天晚上定时删除贺哲的抓取表。用户已经取消收藏店铺')
select_sql = 'select data_id from user_collection_syn where data_type =2'
......@@ -1013,6 +1029,7 @@ if __name__ == '__main__':
spider_us.run_update_redirect_flag()
spider_us.updata_category_first_id()
spider_us.send_ms()
for site in ['us','de', 'uk']:
for site in ['us', 'de', 'uk']:
spider_us = bsr_catgory(site_name=site)
spider_us.updata_category_state()
spider_us.sele_msyql_category(site)
'存储到pg'
'获取小语言cookie'
import sys
import os
import pandas as pd
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import requests
from curl_cffi import requests
import json
from lxml import etree
import re
import random
import pymysql
import time
import py_ja3
from params import DB_CONN_DICT
import urllib3
import uuid
from secure_db_client import get_remote_engine
import traceback
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
item = {}
headers_num_int = 0
......@@ -38,35 +39,43 @@ def get_cookie(site='us', zipCode='10010'):
elif site == 'it':
url_ = 'https://www.amazon.it'
host = 'www.amazon.it'
if site == 'us':
us_db = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection", charset="utf8mb4")
else:
us_db = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site,
charset="utf8mb4")
us_cursor = us_db.cursor()
n = random.randint(70, 114)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
engine_us = get_remote_engine(site, 'mysql')
n = random.randint(110, 120)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.0.0 Safari/537.36'
print(ua)
headers = {
'connection': 'close',
'authority': host,
'accept': 'text/html,*/*',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'no-cache',
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'origin': url_,
'referer': url_,
'sec-ch-ua-mobile': '?0',
'user-agent': ua
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Device-Memory": "8",
"Downlink": "1.25",
"Dpr": "0.75",
"Ect": "3g",
"Pragma": "no-cache",
"Rtt": "300",
"Sec-Ch-Device-Memory": "8",
"Sec-Ch-Dpr": "0.75",
"Sec-Ch-Ua": f'"Not_A Brand";v="8", "Chromium";v="{n}", "Google Chrome";v="{n}"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Ch-Ua-Platform-Version": '"10.0.0"',
"Sec-Ch-Viewport-Width": "2560",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": ua,
"Viewport-Width": "2560",
}
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n']
k = ""
for i in (0, random.randint(0, 5)):
k += random.choice(alphabet)
headers[k] = str(uuid.uuid4())
# headers[k] = str(uuid.uuid4())
sess = requests.Session()
sess.mount(url_, py_ja3.DESAdapter())
resp_ = sess.get(url_, headers=headers, timeout=15, verify=False)
cookie = resp_.headers.get('set-cookie')
print("第一步 请求首页", url_)
......@@ -76,48 +85,49 @@ def get_cookie(site='us', zipCode='10010'):
print("第一次发送请求,获取邮编:", ingress)
data_a_modal = html_xpath.xpath("//span[@id='nav-global-location-data-modal-action']/@data-a-modal")
data_modal = json.loads(data_a_modal[0])
if site != 'us':
csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
url_post = url_ + '/privacyprefs/retail/v1/acceptall'
dada_post = {
"anti-csrftoken-a2z": csrftoken,
"accept": "all"
}
resp_post = sess.post(url_post, headers=headers, cookies=cookies_dict, timeout=15, data=dada_post, verify=False)
cookie_post = resp_post.headers.get('set-cookie')
cookies_dict_post = {i.split("=")[0]: i.split("=")[-1] for i in cookie_post.split("; ")}
cookies_dict_post.update(cookies_dict)
else:
cookies_dict_post = cookies_dict
if site == 'us':
get_token_headers = {
'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
'referer': url_,
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}
print(get_token_headers,'23232')
else:
get_token_headers = {
'accept': 'text/html,*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
'cache-control': 'no-cache',
'referer': url_,
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'x-requested-with': 'XMLHttpRequest',
}
# if site != 'us':
# csrftoken = html_xpath.xpath("//input[@name='anti-csrftoken-a2z']/@value")[0]
# url_post = url_ + '/privacyprefs/retail/v1/acceptall'
# dada_post = {
# "anti-csrftoken-a2z": csrftoken,
# "accept": "all"
# }
# resp_post = sess.post(url_post, headers=headers, cookies=cookies_dict, timeout=15, data=dada_post,
# verify=False)
# cookie_post = resp_post.headers.get('set-cookie')
# cookies_dict_post = {i.split("=")[0]: i.split("=")[-1] for i in cookie_post.split("; ")}
# cookies_dict_post.update(cookies_dict)
# else:
cookies_dict_post = cookies_dict
# if site == 'us':
# get_token_headers = {
# 'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
# 'referer': url_,
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
# }
# print(get_token_headers, '23232')
# else:
get_token_headers = {
'accept': 'text/html,*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'anti-csrftoken-a2z': data_modal['ajaxHeaders']['anti-csrftoken-a2z'],
'cache-control': 'no-cache',
'referer': url_,
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'x-requested-with': 'XMLHttpRequest',
}
data_modal_url = url_ + data_modal['url']
print('第二步 拼接url 点击更改位置:',data_modal_url)
print('第二步 拼接url 点击更改位置:', data_modal_url)
data_modal_resp = sess.get(data_modal_url, headers=get_token_headers, cookies=cookies_dict_post,
timeout=15,verify=False)
timeout=15, verify=False)
data_modal_cookie = data_modal_resp.headers.get('set-cookie')
CSRF_TOKEN = re.findall('CSRF_TOKEN : "(.*?)",', data_modal_resp.text)[0]
print("CSRF_TOKEN:",CSRF_TOKEN)
print("CSRF_TOKEN:", CSRF_TOKEN)
try:
data_modal_cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in data_modal_cookie.split("; ")}
data_modal_cookie_dict.update(cookies_dict)
......@@ -125,9 +135,10 @@ def get_cookie(site='us', zipCode='10010'):
data_modal_cookie_dict = cookies_dict_post
url_2 = url_ + '/portal-migration/hz/glow/address-change?actionSource=glow'
print('url_2:',url_2)
print('url_2:', url_2)
# {"locationType":"LOCATION_INPUT","zipCode":"10010","deviceType":"web","storeContext":"generic","pageType":"Gateway","actionSource":"glow"}
data = {"locationType":"LOCATION_INPUT","zipCode":zipCode,"storeContext":"generic","deviceType":"web","pageType":"Gateway","actionSource":"glow"}
data = {"locationType": "LOCATION_INPUT", "zipCode": zipCode, "storeContext": "generic", "deviceType": "web",
"pageType": "Gateway", "actionSource": "glow"}
print(data)
post_headers = {
'anti-csrftoken-a2z': CSRF_TOKEN,
......@@ -157,13 +168,13 @@ def get_cookie(site='us', zipCode='10010'):
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'viewport-width': '1920',
'TE':'trailers',
'TE': 'trailers',
'x-requested-with': 'XMLHttpRequest'
}
print('第三步 发送post 请求 输入 邮编 点击确定')
resp_2 = sess.post(url_2, headers=post_headers, json=data, cookies=data_modal_cookie_dict,
timeout=15,verify=False)
timeout=15, verify=False)
print(resp_2.text)
post_cookies = resp_2.headers.get('set-cookie')
try:
......@@ -174,31 +185,29 @@ def get_cookie(site='us', zipCode='10010'):
done_url = url_ + "/portal-migration/hz/glow/get-location-label?storeContext=generic&pageType=Gateway&actionSource=desktop-modal"
print('第四步,点击完成,')
done_resp = sess.get(done_url, headers=headers, cookies=post_cookies_dict, timeout=15,verify=False)
print(done_resp.text,'done_respdone_respdone_respdone_resp')
done_resp = sess.get(done_url, headers=headers, cookies=post_cookies_dict, timeout=15, verify=False)
print(done_resp.text, 'done_respdone_respdone_respdone_resp')
done_cookies_dict = sess.cookies.get_dict()
print('done_cookies_dict::',done_cookies_dict)
print('done_cookies_dict::', done_cookies_dict)
print("第五步,请求首页,获取邮编,是否修改成功")
index_resp = sess.get(url_, headers=headers, timeout=15,cookies=done_cookies_dict,verify=False)
index_resp = sess.get(url_, headers=headers, timeout=15, cookies=done_cookies_dict, verify=False)
index_resp_cookies = sess.cookies.get_dict()
print(sess.cookies.get_dict(),'2222222222222222')
print(sess.cookies.get_dict(), '2222222222222222')
index_xpath = etree.HTML(index_resp.text)
ingress = index_xpath.xpath("//span[@id='glow-ingress-line2']/text()")
print("获取最新邮编:", ingress)
if zipCode in ingress[0].strip() or "W1S 3" in ingress[0].strip():
print(f"*************** 当前获取 {site} 站点 cookie 邮编 {zipCode} ********************")
cookies = json.dumps(index_resp_cookies)
cookies = json.dumps(index_resp_cookies, ensure_ascii=False)
item = {"site": site, 'zipCode': ingress[0].strip(), 'cookie': cookies}
print(item)
insert_sql = f'insert into {site}_cookies (cookies,type)values (%s,%s)'
print(insert_sql)
us_cursor.execute(insert_sql, (cookies,'DB'))
us_db.commit()
us_cursor.close()
us_db.close()
sess.close()
# 构造 DataFrame
df = pd.DataFrame([{"cookies": cookies, "type": "DB"}])
# 存储到数据库
engine_us.to_sql(df, f"{site}_cookies", if_exists="append")
except Exception as e:
print(f"获取 {site} 站点 cookie 报错,切换下一个站点",e)
print("报错", f"\n{traceback.format_exc()}")
......@@ -210,10 +219,10 @@ if __name__ == '__main__':
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
get_cookie(site='it', zipCode='00185')
# get_cookie(site='it', zipCode='85')
get_cookie(site='es', zipCode='28001')
get_cookie(site='fr', zipCode='75019')
get_cookie(site='us', zipCode='10010')
get_cookie(site='de', zipCode='10115')
get_cookie(site='uk', zipCode='W1S 3PR')
time.sleep(random.uniform(10.5, 35.5))
time.sleep(random.uniform(30.5, 70.5))
import pandas as pd
from urllib.parse import quote
from datetime import datetime
import sys
import os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.db_connect import BaseUtils
# 从数据库获取数据的函数
def get_data_from_database(connection, query):
return pd.read_sql(query, connection)
def get_data_from_database(engine_pg, query):
result_df = engine_pg.read_sql(query)
return result_df
def db_read_data(engine_pg):
......@@ -22,17 +27,19 @@ def db_read_data(engine_pg):
# 创建初始 DataFrame
df_search_term = pd.DataFrame(data=result_list, columns=['search_term', 'url'])
print(df_search_term.shape)
df_search_term['date_info'] = str(datetime.now().strftime("%Y-%m-%d"))
print('date_info::',df_search_term['date_info'])
# 找出超过 450 字符长度的 URL 行的索引
long_url_rows = df_search_term['url'].str.len() <= 450
# 筛选保留不超过 450 字符长度的 URL 行
data_df = df_search_term[long_url_rows]
print('pg6 写入数据 merchantwords')
with engine_pg.begin() as conn:
data_df.to_sql(f'us_search_term_month_syn_merchantwords', con=engine_pg, if_exists="append", index=False)
engine_pg.to_sql(data_df,'us_merchantwords_search_term_month_syn_2025', if_exists="append")
update_sql = f"update us_search_term_month_merchantwords set state =3 where state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from us_search_term_month_syn_merchantwords where state =3"
deletesql = f"DELETE from us_merchantwords_search_term_month_syn_2025 where state =3"
print(deletesql)
conn.execute(deletesql)
......@@ -58,6 +65,7 @@ def build_urls(search_term):
]
return [[search_term, url] for url in urls]
# if __name__ == '__main__':
# # 传一个 数据库链接
# db_read_data(engine_pg)
\ No newline at end of file
if __name__ == '__main__':
# 传一个 数据库链接
engine_pg = BaseUtils(site_name='us').pg_connect_6()
db_read_data(engine_pg)
\ No newline at end of file
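A minimal usage sketch of the pg6 sync path above, assuming the engine returned by BaseUtils(site_name='us').pg_connect_6(); the column names, date_info field, 450-character URL filter, and target table mirror db_read_data, while the sample search terms and URL shapes are hypothetical stand-ins for the output of build_urls.

import pandas as pd
from datetime import datetime

# Hypothetical rows standing in for the [search_term, url] pairs produced by build_urls().
rows = [
    ['wireless earbuds', 'https://example.com/search?q=wireless+earbuds'],
    ['phone case', 'https://example.com/search?q=phone+case'],
]
df = pd.DataFrame(rows, columns=['search_term', 'url'])
df['date_info'] = datetime.now().strftime("%Y-%m-%d")
df = df[df['url'].str.len() <= 450]  # keep only URLs within the 450-character limit, as in db_read_data
print(df)

# Writing to pg6 would then follow db_read_data:
# engine_pg = BaseUtils(site_name='us').pg_connect_6()
# engine_pg.to_sql(df, 'us_merchantwords_search_term_month_syn_2025', if_exists="append")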
......@@ -258,26 +258,7 @@ class Save_asin_detail(BaseUtils):
if asin_not_div_id_dp_list:
self.db_change_state(state=13, asin_list=asin_not_div_id_dp_list)
@func_set_timeout(240)
def save_asin_not_buysales(self, asin_buySales_list):
while True:
try:
if is_internet_available():
pass
else:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
print('错误月销的asin:', asin_buySales_list)
print('错误月销的asin:', len(asin_buySales_list))
df_asin_ = pd.DataFrame(data=asin_buySales_list, columns=['asin', 'buysales', 'date_info'])
self.engine_pg.to_sql(df_asin_, f'{self.site_name}_asin_detail_2025_not_buysales', if_exists='append')
break
except Exception as e:
print("存储 _asin_detail_2025_not_buysales 文本 数据错误", e)
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
time.sleep(random.uniform(10, 20.5))
continue
@func_set_timeout(240)
def save_bs_category_asin_detail(self, bs_category_asin_list_pg):
......
......@@ -21,7 +21,6 @@ import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
import ast
class ai_async_asin_pg():
......@@ -120,8 +119,6 @@ class ai_async_asin_pg():
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
print('出现验证码,。asin---> ', asin)
if self.spider_state == '竞品asin':
......@@ -331,41 +328,31 @@ class ai_async_asin_pg():
def read_ai_asin(self):
self.pg_connect()
self.spider_type=True
self.spider_type = True
for module in ['Amazon:asin', 'Amazon:asinList']:
if module == 'Amazon:asin':
# pass
sql = f"SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module='{module}' and spider_status='未开始' for update;"
else:
sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and spider_status='未开始' for update;"""
# sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and task_id=39 for update;"""
print(sql)
df_read = self.engine_pg.read_then_update(
select_sql=sql,
update_table='ai_asin_analyze_log',
set_values={"spider_status": '爬取中'}, # 把库存清零
where_keys=["task_id"], # WHERE sku = :sku
)
while True:
try:
if module == 'Amazon:asin':
sql = f"SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module='{module}' and spider_status='未开始' for update;"
else:
sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and spider_status='未开始' for update;"""
# sql = f"""SELECT elem->>'asin' AS asin,task_id,site_name FROM ai_asin_analyze_log,LATERAL json_array_elements(input_params) elem WHERE module = '{module}' and task_id=39 for update;"""
print(sql)
df_read = self.engine_pg.read_then_update(
select_sql=sql,
update_table='ai_asin_analyze_log',
set_values={"spider_status": '爬取中'}, # 把库存清零
where_keys=["task_id"], # WHERE sku = :sku
)
break
except:
time.sleep(10)
print(f'开始 {module} 任务:', sql)
if not df_read.empty:
# if module == 'Amazon:asin':
# _asin_list = ast.literal_eval(df_read['asin'][0])
# asin_id_list = []
# for _aisn in _asin_list:
# asin_data_list = list(
# _aisn + '|-|' + df_read.task_id.astype(
# "U") + '|-|' + df_read.site_name + '|-|' + module)
# asin_id_list.extend(asin_data_list)
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + module)
# else:
# asin_id_list = list(
# df_read['asin'] + '|-|' + df_read.task_id.astype(
# "U") + '|-|' + df_read.site_name + '|-|' + module)
print(asin_id_list)
for asin_id in asin_id_list:
print(asin_id)
self.queries_asin_queue.put(asin_id)
......@@ -385,7 +372,7 @@ class ai_async_asin_pg():
print(asin_id)
self.queries_asin_queue.put(asin_id)
html_thread = []
for i in range(5):
for i in range(8):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
......@@ -396,32 +383,37 @@ class ai_async_asin_pg():
def select_asin():
for site in ['us', 'de', 'uk']:
select_sql = f"""select id, site_name, task_id, unique_key as asin,sub_step from ai_asin_analyze_spider where sub_step = 'AsinInfoRepository:详情' and status = '未开始' and site_name='{site}' order by task_id"""
print('select_sql::', select_sql)
engine_pg15 = ai_async_asin_pg(site_name='us').pg_connect()
df_read = engine_pg15.read_then_update(
select_sql=select_sql,
update_table='ai_asin_analyze_spider',
set_values={"status": '爬取中'}, # 把库存清零
where_keys=["id", "site_name"], # WHERE sku = :sku
)
if not df_read.empty:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + df_read.id.astype(
"U") + '|-|' + df_read.sub_step)
print(asin_id_list)
while True:
try:
select_sql = f"""select id, site_name, task_id, unique_key as asin,sub_step from ai_asin_analyze_spider where sub_step = 'AsinInfoRepository:详情' and status = '未开始' and site_name='{site}' order by task_id"""
print('select_sql::', select_sql)
engine_pg15 = ai_async_asin_pg(site_name='us').pg_connect()
df_read = engine_pg15.read_then_update(
select_sql=select_sql,
update_table='ai_asin_analyze_spider',
set_values={"status": '爬取中'}, # 把库存清零
where_keys=["id", "site_name"], # WHERE sku = :sku
)
if not df_read.empty:
asin_id_list = list(
df_read['asin'] + '|-|' + df_read.task_id.astype(
"U") + '|-|' + df_read.site_name + '|-|' + df_read.id.astype(
"U") + '|-|' + df_read.sub_step)
print(asin_id_list)
ai_async_asin_pg(site_name=site).run_analzye_asin(asin_id_list)
ai_async_asin_pg(site_name=site).run_analzye_asin(asin_id_list)
break
except:
time.sleep(10)
def run_spider():
time_ip_num = 0
while True:
time_ip_num += 1
select_asin()
ai_async_asin_pg().read_ai_asin()
time.sleep(5)
select_asin() # 任务类型 AsinInfoRepository:详情
ai_async_asin_pg().read_ai_asin() # 任务类型 'Amazon:asin', 'Amazon:asinList'
time.sleep(20)
print('-----------------------------------------------------------------------------------------')
print()
if 10 <= datetime.now().hour < 22:
......
import os
import sys
import gzip
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.secure_db_client import get_remote_engine
from amazon_spider.VPS_IP import pppoe_ip
......@@ -19,7 +19,6 @@ import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
import ast
......@@ -206,46 +205,23 @@ class ai_async_asin_pg():
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'],'review_json_list':items['review_json_list']
'bsr_category_asin_list': items['bs_category_asin_list_pg'],'review_json_list':items['review_json_list'],
'fbm_delivery_price': items['fbm_delivery_price']
}
print(item)
# a = None
# if result_list_json and module == 'Amazon:asin' and self.spider_state is None:
# is_sp_asin_state = None
# result_list_dict = json.loads(result_list_json)
# print(asin, '判断是否有竞品asin')
# for result_dict in result_list_dict:
# # Based on your recent shopping trends # Frequently purchased items with fast delivery
# # Customers who viewed this item also viewed # Brand in this category on Amazon
# sp_type = 'Based on your recent shopping trends'
# if result_dict.get(sp_type):
# print(asin, '找到有竞品asin。 数量:', len(result_dict[sp_type]))
# for i in result_dict[sp_type]:
# sp_asin = i + '|-|' + task_id + '|-|' + site_name + '|-|' + module
# self.sp_asin_queue.put(sp_asin)
# is_sp_asin_state = 111
# a = 1
# if is_sp_asin_state is None:
# print('没有找到竞品asin')
# self.item_queue.put(item)
# # self.save_data()
# # self.update_ai_asin_analyze_log([int(task_id)], '成功')
# a = 1
self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html)
response_gzip = self.compress_string(response)
Requests_param_val().send_kafka(html_data=response_gzip, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
# if self.spider_state == '竞品asin':
# self.item_queue.put(item)
# a = 1
#
# if module == 'Amazon:asinList':
# self.item_queue.put(item)
# a = 1
# if a is None:
# self.item_queue.put(item)
else:
print('asin 商品 异常')
# 压缩字符串
def compress_string(self, input_string):
return gzip.compress(input_string.encode())
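# A minimal consumer-side sketch (an assumption, not part of this module): whatever reads the
# asin_html topic must now gunzip the payload produced by compress_string above, e.g.
#   import gzip
#   html_text = gzip.decompress(message_bytes).decode('utf-8')
# where message_bytes is the raw Kafka message value.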
def update_ai_asin_analyze_log(self, task_id_list, status):
if task_id_list:
task_id_list = list(set(task_id_list))
......
# -*- coding: utf-8 -*-
import json
import random
import time
import traceback
import json
from datetime import datetime
import redis
import requests
from DrissionPage import ChromiumPage, ChromiumOptions
from lxml import etree
......@@ -15,6 +17,8 @@ class get_1688_order_data():
self.pwd = 'aass369874.'
self.redis_db22 = self.redis_db()
self.alipay_data_list = []
self.err_orders_list = [] # 有问题的订单号
self.time_num = 0
def redis_db(self):
nums = 0
......@@ -29,16 +33,17 @@ class get_1688_order_data():
time.sleep(3)
continue
def get_1688(self, list_orders):
def get_1688(self, orders_type=None, list_orders=None):
# 配置 Chrome 浏览器 - 端口 9333
chrome_options = ChromiumOptions()
chrome_options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
chrome_options.set_local_port(9333) # 设置 Chrome 的调试端口
chrom_page = ChromiumPage(addr_or_opts=chrome_options)
print(f"Chrome 浏览器运行在端口: {9333}")
self.get_order_tab(chrom_page, list_orders) # 请求订单页面。判断是否保持登录状态
self.get_order_tab(chrom_page, list_orders, orders_type) # 请求订单页面。判断是否保持登录状态
chrom_page.quit()
def get_order_tab(self, chrom_page, list_orders):
def get_order_tab(self, chrom_page, list_orders, orders_type):
url = 'https://air.1688.com/app/ctf-page/trade-order-list/buyer-order-list.html?tradeStatus=waitbuyerpay&spm=a260k.home2025.topmenu.dmyorder_popup_ddaifukuan&page=1&pageSize=10'
chrom_page.get(url)
time.sleep(random.randint(3, 6))
......@@ -52,9 +57,22 @@ class get_1688_order_data():
self.login_1688(chrom_page)
if '已买到的货品' in chrom_page.html:
print('保持登录状态')
self.get_order_data(chrom_page, list_orders)
self.get_order_data(chrom_page, list_orders, orders_type)
else:
print('需要人工进行手动确认。是否登录')
self.send_ms_count_data_num()
def send_ms_count_data_num(self):
account = 'pengyanbing'
title = f'1688 登录失败。店铺:{self.account}'
content = f'需要远程手动查看,远程向日葵 188779566'
url = 'http://47.112.96.71:8082/selection/sendMessage'
data = {
'account': account,
'title': title,
'content': content
}
requests.post(url=url, data=data, timeout=15)
def login_1688(self, chrom_page):
chrom_page.get(
......@@ -87,15 +105,18 @@ class get_1688_order_data():
'https://air.1688.com/app/ctf-page/trade-order-list/buyer-order-list.html?tradeStatus=waitbuyerpay&spm=a260k.home2025.topmenu.dmyorder_popup_ddaifukuan&page=1&pageSize=10')
time.sleep(random.randint(12, 14))
def get_order_data(self, chrom_page, list_orders):
def get_order_data(self, chrom_page, list_orders, orders_type):
url = 'https://air.1688.com/app/ctf-page/trade-order-list/buyer-order-list.html?tradeStatus=waitbuyerpay&spm=a260k.home2025.topmenu.dmyorder_popup_ddaifukuan&page=1&pageSize=10'
chrom_page.get(url)
len_orders = len(list_orders)
for order_num in list_orders:
print()
len_orders -= 1
print('剩余需要查询订单数量:', len_orders)
order_num = order_num.strip()
print('当前执行查询订单号:', order_num)
print('---------------------------------------------------------------------------------')
time.sleep(random.randint(1, 2))
js = '''
// 定义输入函数
function typeReal(elem, text) {
if (!elem) return false;
elem.focus();
......@@ -119,19 +140,16 @@ class get_1688_order_data():
'''
js2 = f'''
// 获取 shadow DOM 深层 input
const realInput = document
.querySelector("body > article > app-root").shadowRoot
.querySelector("div > main > q-theme > order-search").shadowRoot
.querySelector("section > order-search-keywords").shadowRoot
.querySelector("div > q-input").shadowRoot
.querySelector("input");
// 执行输入
typeReal(realInput, "{order_num}");
'''
js = js + js2
chrom_page.run_js(js) # 填写订单号。定位搜索框
time.sleep(random.randint(3, 5))
print('点击搜索')
js_click = """
......@@ -164,7 +182,17 @@ class get_1688_order_data():
"value": '待付款中没有查询到该订单'
}
}
self.alipay_data_list.append(items)
if orders_type:
try:
json_data = json.dumps(items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
continue
except:
pass
else:
self.alipay_data_list.append(items)
else:
print('获取订单成功')
js_click_2 = '''
......@@ -185,26 +213,143 @@ class get_1688_order_data():
print("新标签页对象:", new_tab) # 获取最新开标签页对象
chrom_page_tab = chrom_page.get_tab(new_tab)
chrom_page_tab.set.activate() # 激活新标签页
self.chrom_new_page_tab = chrom_page_tab
time.sleep(random.randint(2, 5))
print('寻找是否有网银对公支付')
try:
chrom_page_tab.ele('xpath://div[contains(text(),"展开更多付款方式")]', timeout=3).click()
except:
pass
try:
bank_transfer_group = chrom_page_tab.ele('xpath://div[@data-channel="bank_transfer_group"]')
bank_transfer_group.click()
except:
js = '''document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div.channel-card-group.available > div > div:nth-child(2) > div").click()'''
chrom_page_tab.run_js(js)
try:
js = '''document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div.channel-card-group.available > div > div:nth-child(2) > div").click()'''
chrom_page_tab.run_js(js)
except:
print('只有一个可选项。点击失败')
time.sleep(random.randint(1, 2))
print('寻找支付方式')
# 对公支付
chrom_page_tab.scroll.down(180)
time.sleep(random.randint(1, 2))
channel_name = chrom_page_tab.ele('xpath://span[contains(text(),"对公支付")]', timeout=5)
if channel_name:
chrom_page_tab.ele('xpath://span[contains(text(),"对公支付")]', timeout=5).click()
chrom_page_tab.ele('xpath://div[contains(text(),"网银或柜台转账")]', timeout=5).click()
print('找到网银。开始选择网银')
self.click_pay(chrom_page_tab)
self.online_banking(chrom_page_tab, order_num)
print('对公支付')
chrom_page_tab.ele('xpath://span[contains(text(),"对公支付")]', timeout=3).click()
time.sleep(random.randint(2, 4))
no_pay = chrom_page_tab.ele(
'xpath://div[contains(text(),"以下支付方式暂不可用")]/following-sibling::div//span')
if no_pay:
print('暂不可用::', no_pay.text)
if '对公支付' in no_pay.text:
_items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": '对公支付暂不可用'
}
}
elif '支付宝' in no_pay.text:
_items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": '支付宝暂不可用'
}
}
else:
_items = None
if _items:
if orders_type:
try:
json_data = json.dumps(_items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
except:
pass
else:
self.alipay_data_list.append(_items)
self.save_redis()
continue
if orders_type is None:
print(', 点击网银')
try:
js_pay = '''
document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div > div > div:nth-child(4) > div > div.card__face.card__face--front > div.channel-info-container > div.channel-info-content > div.sub-channel-pay-list > div:nth-child(2)").click()
'''
chrom_page_tab.run_js(js_pay)
except:
chrom_page_tab.ele('xpath://div[contains(text(),"网银或柜台转账")]', timeout=3).click()
print('开始付款')
time.sleep(random.randint(1, 3))
self.click_pay(chrom_page_tab)
self.online_banking(chrom_page_tab, order_num)
else:
print('专属账号余额 有, 使用专属账号支付')
try:
js_pay = '''
document.querySelector("#root > div > div.global-payment-channel > div:nth-child(2) > div > div > div > div:nth-child(4) > div > div.card__face.card__face--front > div.channel-info-container > div.channel-info-content > div.sub-channel-pay-list > div:nth-child(1)").click()
'''
chrom_page_tab.run_js(js_pay)
except:
try:
chrom_page_tab.ele('xpath://div[contains(text(),"专属账号余额:")]', timeout=3).click()
except:
pass
time.sleep(random.randint(1, 3))
if chrom_page_tab.ele('xpath://div[contains(text(),"专属账号余额不足")]'):
resp = chrom_page_tab.html
respons = etree.HTML(resp)
sapn_text_list = respons.xpath('//div[contains(text(),"专属账号余额不足")]//text()')
if sapn_text_list:
span_str = ''.join(sapn_text_list)
else:
span_str = '专属账号余额不足,可用余额: ¥'
print('span_str:::', span_str)
items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": span_str
}
}
print('专属账号余额支付 items:::', items)
json_data = json.dumps(items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
else:
print('开始付款')
time.sleep(random.randint(1, 3))
self.click_pay(chrom_page_tab)
if '支付成功' in chrom_page_tab.html:
print('专属账号余额 支付成功 :', order_num)
else:
items = {
"account": self.account,
"order_id": order_num,
"type": "异常",
"json": {
"value": '需要手动确认,专属账号余额是否足够支付'
}
}
print('专属账号余额支付 items:::', items)
json_data = json.dumps(items, ensure_ascii=False)
self.redis_db22.rpush('alipay_zszh:response', json_data)
try:
print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
except:
print('删除失败:')
pass
chrom_page_tab.close() # 关闭标签页
continue
else:
zfb = chrom_page_tab.ele('xpath://span[contains(text(),"支付宝")]', timeout=5).click()
zfb = chrom_page_tab.ele('xpath://span[contains(text(),"支付宝")]', timeout=3).click()
if zfb:
print('找到支付宝。开始选择支付宝')
self.click_pay(chrom_page_tab)
......@@ -221,26 +366,72 @@ class get_1688_order_data():
}
self.alipay_data_list.append(items)
chrom_page_tab.close() # 关闭标签页
self.save_redis()
# def save_alipay_zszh(self,_items, order_num):
# self.redis_db()
# for i in range(5):
# try:
# json_data = json.dumps(_items, ensure_ascii=False)
# self.redis_db22.rpush('alipay_zszh:response', json_data)
# print(f'从alipay_zszh:{self.account} redis删除订单号:', order_num)
# self.redis_db22.lrem(f'alipay_zszh:{self.account}', 1, order_num)
# break
# except:
# time.sleep(10)
def save_redis(self):
if self.alipay_data_list:
for alipay_data in self.alipay_data_list:
while True:
try:
json_data = json.dumps(alipay_data, ensure_ascii=False)
print(json_data)
self.redis_db22.rpush('alipay:response', json_data)
break
except:
print('写入redis报错:重试')
time.sleep(5)
# 删除
try:
print('从redis删除订单号:', alipay_data['order_id'])
self.redis_db22.lrem(f"alipay:{self.account}", 1, f"{alipay_data['order_id']}")
except:
print('删除失败:')
pass
self.alipay_data_list = []
time.sleep(random.randint(1, 3))
def online_zfb(self, chrom_page_tab, order_num):
print('解析支付宝支付页面信息')
chrom_page_tab.ele('xpath://span[contains(text(),"去网商银行付款")]', timeout=5).click()
time.sleep(random.randint(5, 10))
html = etree.HTML(chrom_page_tab.html)
data_list = html.xpath('//div[contains(@class,"order-info-container")]//text()')
print(data_list)
del data_list[0]
data_dict = {data_list[i]: data_list[i + 1] for i in range(0, len(data_list), 2)}
items = {
"account": self.account,
"order_id": order_num,
"type": "支付宝",
"json": {
"merchant_name": data_dict['商户名称']
if chrom_page_tab.ele('xpath://span[contains(text(),"中国农业银行")]'):
items = {
"account": self.account,
"order_id": order_num,
"type": "支付宝",
"json": {
"merchant_name": '无'
}
}
}
self.alipay_data_list.append(items)
self.alipay_data_list.append(items)
else:
chrom_page_tab.ele('xpath://span[contains(text(),"去网商银行付款")]', timeout=5).click()
time.sleep(random.randint(5, 10))
html = etree.HTML(chrom_page_tab.html)
data_list = html.xpath('//div[contains(@class,"order-info-container")]//text()')
print(data_list)
del data_list[0]
data_dict = {data_list[i]: data_list[i + 1] for i in range(0, len(data_list), 2)}
items = {
"account": self.account,
"order_id": order_num,
"type": "支付宝",
"json": {
"merchant_name": data_dict['商户名称']
}
}
self.alipay_data_list.append(items)
time.sleep(random.randint(1, 3))
def click_pay(self, chrom_page_tab):
time.sleep(random.randint(1, 2))
print('点击立即付款')
......@@ -251,7 +442,7 @@ class get_1688_order_data():
'''
# 获取
chrom_page_tab.run_js(js_click_cashier)
time.sleep(random.randint(5, 10))
time.sleep(random.randint(6, 15))
def online_banking(self, chrom_page_tab, order_num):
print('解析网银支付账户信息')
......@@ -262,22 +453,31 @@ class get_1688_order_data():
data_list.remove('复制')
data_dict = {data_list[i]: data_list[i + 1] for i in range(0, len(data_list), 2)}
print(data_dict)
items = {
"account": self.account,
"order_id": order_num,
"type": "对公支付",
"json": {
"bank_account": data_dict['收款账号'].replace(" ", ""),
"name": data_dict['收款户名'],
"bank_name": data_dict['收款银行'].replace('”', '').replace('“', ''),
"bank_loaction": data_dict['所在地'],
"price": data_dict['转账金额'].replace('¥', ''),
"bank_clearing_number": data_dict["联行号"].replace('(选填)', '')
try:
items = {
"account": self.account,
"order_id": order_num,
"type": "对公支付",
"json": {
"bank_account": data_dict['收款账号'].replace(" ", ""),
"name": data_dict['收款户名'],
"bank_name": data_dict['收款银行'].replace('”', '').replace('“', ''),
"bank_loaction": data_dict['所在地'],
"price": data_dict['转账金额'].replace('¥', ''),
"bank_clearing_number": data_dict["联行号"].replace('(选填)', '')
}
}
}
self.alipay_data_list.append(items)
if items['json']['bank_name']:
if '浙江网商' in items['json']['bank_name']:
items['json']['bank_name'] = '浙江网商银行'
self.alipay_data_list.append(items)
except:
self.err_orders_list.append(order_num)
print('检查订单号:', self.err_orders_list)
self.time_num += 1
time.sleep(random.randint(1, 3))
def get_account(self):
self.redis_db22 = self.redis_db()
start_index = 0 # 起始索引
end_index = -1 # 结束索引,-1 表示获取整个列表
list_data_b = self.redis_db22.lrange(f'alipay:{self.account}', start_index, end_index)
......@@ -287,26 +487,44 @@ class get_1688_order_data():
else:
return None
def run(self):
list_orders = self.get_account()
print('list_orders:', list_orders)
def get_alipay_zszh(self):
# 此键存储 订单号 只用专属账户余额支付。只返回支付失败的信息。其他不返回
self.redis_db22 = self.redis_db()
start_index = 0 # 起始索引
end_index = -1 # 结束索引,-1 表示获取整个列表
list_data_b = self.redis_db22.lrange(f'alipay_zszh:{self.account}', start_index, end_index)
list_orders = [item.decode('utf-8') for item in list_data_b]
print('需要用专属账号余额支付的订单号: ', list_orders)
print('需要用专属账号余额支付的订单号: ', len(list_orders), '个')
if list_orders:
self.get_1688(list_orders)
self.get_1688(orders_type=True, list_orders=list_orders)
else:
print(self.account, ' :该店铺下没有可查询的订单')
print(self.alipay_data_list)
for alipay_data in self.alipay_data_list:
while True:
try:
json_data = json.dumps(alipay_data, ensure_ascii=False)
print(json_data)
self.redis_db22.rpush('alipay:response', json_data)
break
except:
print('写入redis报错:重试')
time.sleep(5)
# 删除
#self.redis_db22.lrem(f"alipay:{self.account}", 1, "dsaofi3232e232328938928")
print('没有需要可以支付的订单')
if self.time_num > 500:
self.time_num = 0
def run(self):
while True:
try:
list_orders = self.get_account()
new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(new_date, ' 需要查询的订单号list_orders:', list_orders)
if list_orders:
self.get_1688(list_orders=list_orders)
else:
print(self.account, ' :该店铺下没有可查询的订单')
print('开始查询专属账号余额需要支付的订单号')
self.get_alipay_zszh()
except Exception as e:
print("报错xx2222xxx:", e, f"\n{traceback.format_exc()}")
time.sleep(300)
self.chrom_new_page_tab.close()
base_min, base_max = 20, 60
sleep_min = base_min + self.time_num * 2
sleep_max = base_max + self.time_num * 3
time.sleep(random.randint(sleep_min, sleep_max))
self.err_orders_list = []
if __name__ == '__main__':
......
......@@ -16,6 +16,7 @@ from datetime import datetime
import json
import threading
import time
import gzip
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
urllib3.disable_warnings()
......@@ -200,15 +201,20 @@ class ai_async_asin_pg():
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': items['bundle_asin_component_json'],
'bsr_category_asin_list': items['bs_category_asin_list_pg'],
'review_json_list': items['review_json_list']
'review_json_list': items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
}
print(item)
self.item_queue.put(item)
Requests_param_val().send_kafka(html_data=response, topic=self.topic_asin_html)
response_gzip = self.compress_string(response)
Requests_param_val().send_kafka(html_data=response_gzip, topic=self.topic_asin_html)
Requests_param_val().kafuka_producer_str.flush(timeout=30)
else:
print('asin 商品 异常')
# 压缩字符串
def compress_string(self, input_string):
return gzip.compress(input_string.encode())
def save_data(self):
self.pg_connect()
items_data_list = []
......@@ -245,6 +251,7 @@ class ai_async_asin_pg():
print('存储报错::', e)
self.pg_connect()
time.sleep(10)
def init_list(self):
print("=======清空变量==========")
self.asin_not_found_list = [] # 4
......
......@@ -75,7 +75,7 @@ class async_asin_pg():
self.topic_detail_month = f'{self.site_name}_asin_detail_month_2025_{self.month_}'
self.topic_asin_html = f'asin_html_2025_{self.month_}'
self.asin_video_list = []
self.asin_buySales_list = []
def get_asin(self):
while True:
if self.queries_asin_queue.empty() == False:
......@@ -112,7 +112,7 @@ class async_asin_pg():
sess.mount(self.site_url, py_ja3.DESAdapter())
resp = sess.get(scraper_url, headers=headers,
timeout=10, verify=False)
# with open(rf'{self.site_name}_22_{asin}.html', 'w', encoding='utf-8')as f:
# with open(rf'D:\新建文件夹\html_selenium_files\{self.site_name}_211123333_{asin}.html', 'w', encoding='utf-8')as f:
# f.write(resp.text)
if self.reuests_para_val.check_amazon_yzm(resp):
self.yzm_err_total_list.append(1)
......@@ -216,7 +216,7 @@ class async_asin_pg():
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json'],
'review_json_list':items['review_json_list'],'asin_buySales_list':items['asin_buySales_list']
'review_json_list':items['review_json_list'],'fbm_delivery_price':items['fbm_delivery_price']
}
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val']
......@@ -286,6 +286,8 @@ class async_asin_pg():
item['node_id'] = _items["node_id"]
if item['review_json_list'] is None:
item['review_json_list'] = _items["review_json_list"]
if item['fbm_delivery_price'] is None:
item['fbm_delivery_price'] = _items["fbm_delivery_price"]
except:
pass
_response_text_var = None
......@@ -366,8 +368,6 @@ class async_asin_pg():
item['img_list'] = json.dumps(items["all_img_video_list"])
else:
item['img_list'] = None
if item['asin_buySales_list']:
self.asin_buySales_list.extend(item['asin_buySales_list'])
self.item_queue.put(item)
if item['img_list'] is None:
......@@ -425,7 +425,6 @@ class async_asin_pg():
def init_list(self):
print("=======清空变量==========")
self.asin_buySales_list = []
self.asin_not_found_list = [] # 4
self.asin_not_sure_list = [] # 6
self.asin_not_foot_list = [] # 7
......@@ -478,7 +477,7 @@ class async_asin_pg():
def run(self):
asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B0D663T3W8|2025-01|1|1|null|null']
# asin_list = ['B0CW1ZM991|2025-01|1|1|null|null']
if asin_list:
for asin in asin_list:
self.queries_asin_queue.put(asin)
......@@ -488,7 +487,7 @@ class async_asin_pg():
for ck in cookies_dict.values():
self.cookies_queue.put(ck)
html_thread = []
for i in range(27):
for i in range(26):
thread2 = threading.Thread(target=self.get_asin)
thread2.start()
html_thread.append(thread2)
......@@ -500,14 +499,6 @@ class async_asin_pg():
self.reuests_para_val.kafuka_producer_str.flush(timeout=35)
except KafkaTimeoutError as e:
print("flush 超时,跳过这次等待:", e)
while True:
try:
if self.asin_buySales_list:
self.save_asin_detail.save_asin_not_buysales(self.asin_buySales_list)
break
except FunctionTimedOut as e:
print('断网', e)
while True:
try:
print('存储 asin bsr 文本 存储pg')
......@@ -569,4 +560,4 @@ class async_asin_pg():
pass
# if __name__ == '__main__':
# async_asin_pg(month=9, spider_int=1, week=14,site_name='us').run()
# async_asin_pg(month=12, spider_int=1, week=14,site_name='de').run()