Commit 4418209b by Peng

no message

parent 859e586e
import pymysql
import datetime
from params import DB_CONN_DICT, PG_CONN_DICT_14
from sqlalchemy import create_engine
import pandas as pd
from lxml import etree
import json
import os
import socket
import requests
import random
import time
"""删除 10天cookie"""
def send_mg(site, report_date, type):
url = 'http://47.112.96.71:8082/selection/sendMessage'
data = {
'account': 'pengyanbing',
'title': site + '站点 搜索词',
'content': report_date + ' 搜索词 ' + type + '页面解析错误。请检查'
}
requests.post(url=url, data=data, timeout=15)
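# send_mg posts an alert to the internal selection/sendMessage endpoint; e.g. (hypothetical
# values) send_mg('us', '2023-05-24', 'zr') reports that the day's 'zr' pages failed to parse.
# init_db(site_name) below opens the MySQL connection for the site's selection database,
# builds the matching PostgreSQL engine, and finishes with the cookie-cleanup housekeeping.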
def init_db(site_name):
if site_name == 'us':
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'],
user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection", charset="utf8mb4")
else:
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'],
user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site_name,
charset="utf8mb4")
cursor = connect.cursor()
# if site_name == 'us':
# db = 'selection'
# else:
# db = f'selection_{site_name}'
# DB_CONN_DICT = {
# "mysql_port": 3306,
# "mysql_db": "selection",
# "mysql_user": "XP_Yswg2025_PY",
# "mysql_pwd": "Xp_Yswg2025_Py300",
# "mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
# }
if site_name == 'us':
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT_14['pg_user']}:{PG_CONN_DICT_14['pg_pwd']}@{PG_CONN_DICT_14['pg_host']}:{PG_CONN_DICT_14['pg_port']}/selection",
encoding='utf-8')
else:
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT_14['pg_user']}:{PG_CONN_DICT_14['pg_pwd']}@{PG_CONN_DICT_14['pg_host']}:{PG_CONN_DICT_14['pg_port']}/selection_{site_name}",
encoding='utf-8')
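# The commented-out blocks that follow are one-off maintenance / backfill snippets kept for
# reference: ASIN-detail export, daily search-term upload, music download, partition creation,
# table truncation and the daily search-term monitoring check.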
#
# sql_read = f"SELECT asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory, category, volume, weight, rank, launch_time, video_url, add_url, material, created_at, updated_at, img_num, img_type, qa_num, brand, ac_name, node_id, sp_num, mpn, online_time, `describe`, one_star, two_star, three_star, four_star, five_star, low_star, asin_type, is_coupon, search_category, weight_str, date_info, site, account_name, other_seller_name, bsr_date_info, account_id FROM us_self_asin_detail where date_info = '2023-05-24'"
#
# cursor.execute(sql_read)
# data_search_term = cursor.fetchall()
# # print(data_search_term)
# df = pd.DataFrame(data=data_search_term,
# columns=['asin', 'img_url', 'title', 'title_len', 'price', 'rating', 'total_comments',
# 'buy_box_seller_type', 'page_inventory', 'category', 'volume', 'weight', 'rank',
# 'launch_time', 'video_url', 'add_url', 'material', 'created_at', 'updated_at', 'img_num',
# 'img_type', 'qa_num', 'brand', 'ac_name', 'node_id', 'sp_num', 'mpn', 'online_time',
# 'describe', 'one_star', 'two_star', 'three_star', 'four_star', 'five_star', 'low_star',
# 'asin_type', 'is_coupon', 'search_category', 'weight_str', 'date_info', 'site',
# 'account_name', 'other_seller_name', 'bsr_date_info', 'account_id'])
# df.loc[df['launch_time'] == '0000-00-00', 'launch_time'] = None
# df.to_sql('us_self_asin_detail_2023', con=engine_pg, if_exists="append", index=False)
'Upload daily search terms (test)'
# days = ((datetime.datetime.now()) + datetime.timedelta(days=-5)).strftime("%d")
# print(days)
# temp_date = datetime.datetime.now()  # current timestamp (Y-m-d H:M:S)
# date2 = (temp_date + datetime.timedelta(days=-5)).strftime("%Y-%m-%d")  # the date five days before today
# day = int(days)
# print(date2, '2222222')
# _yera_time = time.strftime('%Y', time.localtime(time.time()))
# report_date = date2
# print("report_date: ", report_date)
# df = pd.read_csv(rf"C:\Users\ASUS\Desktop\amazon_spider\US_Top_Search_Terms_Simple_Day_2023_04_01.csv", skiprows=1, encoding='UTF-8-SIG', on_bad_lines='skip', encoding_errors='ignore')
# print(df.shape)
# try:
# df.drop(columns=['Top Clicked Product #1: Product Title', 'Top Clicked Product #2: Product Title', 'Top Clicked Product #3: Product Title','Reporting Date'], inplace=True)
# except:
# df.drop(columns=['Top Clicked Product #1: Product Title', 'Top Clicked Product #2: Product Title', 'Top Clicked Product #3: Product Title','Reporting Date'],
# inplace=True)
# print(df.columns)
# df.rename(columns={
# 'Search Term': 'search_term',
# 'Search Frequency Rank': 'rank',
# 'Top Clicked Brand #1': 'top_clicked_brand_1',
# 'Top Clicked Brands #2': 'top_clicked_brand_2',
# 'Top Clicked Brands #3': 'top_clicked_brand_3',
# 'Top Clicked Category #1': 'top_clicked_category_1',
# 'Top Clicked Category #2': 'top_clicked_category_2',
# 'Top Clicked Category #3': 'top_clicked_category_3',
# 'Top Clicked Product #1: ASIN': 'asin1',
# 'Top Clicked Product #1: Click Share': 'click_share1',
# 'Top Clicked Product #1: Conversion Share': 'conversion_share1',
# 'Top Clicked Product #2: ASIN': 'asin2',
# 'Top Clicked Product #2: Click Share': 'click_share2',
# 'Top Clicked Product #2: Conversion Share': 'conversion_share2',
# 'Top Clicked Product #3: ASIN': 'asin3',
# 'Top Clicked Product #3: Click Share': 'click_share3',
# 'Top Clicked Product #3: Conversion Share': 'conversion_share3',
# }, inplace=True)
#
# df['rank'] = df['rank'].apply(lambda x: str(x).replace(",", ""))
# df.search_term = df.search_term.apply(lambda x: str(x)[:150])  # keep only the first 150 characters
# df = df.loc[df.search_term.str.len() > 1]  # keep terms longer than one character
# df.click_share1 = df.click_share1.apply(
# lambda x: float(x) / 100)
# df.conversion_share1 = df.conversion_share1.apply(
# lambda x: float(x) / 100)
# df.click_share2 = df.click_share2.apply(
# lambda x: float(x) / 100)
# df.conversion_share2 = df.conversion_share2.apply(
# lambda x: float(x) / 100)
# df.click_share3 = df.click_share3.apply(
# lambda x: float(x) / 100)
# df.conversion_share3 = df.conversion_share3.apply(
# lambda x: float(x) / 100)
# df['report_date'] = report_date.replace('-', '')
# df['date_info'] = report_date
# y, m, d = report_date.split("-")
# print(y, m, d)
#
# df.to_sql(f"{site_name}_brand_analytics_everyday_{y}", con=engine_pg, if_exists="append",
# index=False)
'Download music'
# scraper_url = 'https://artlist.io/api/Song/List'
# headers = {
# 'accept': 'application/json, text/javascript, */*; q=0.01',
# 'accept-encoding': 'gzip, deflate, br',
# 'accept-language': 'en-US',
# 'authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiI5MzMwNTc3Yy0zNWQyLTQ3NTktYjdhNy00NTgxODIxYTAyNDciLCJzdWIiOiIyNTk2NDMxIiwiZW1haWwiOiJ6b3V3ZWlAeXN3Zy5jb20uY24iLCJodHRwOi8vc2NoZW1hcy54bWxzb2FwLm9yZy93cy8yMDA1LzA1L2lkZW50aXR5L2NsYWltcy9uYW1laWRlbnRpZmllciI6IjI1OTY0MzEiLCJ1bmlxdWVfbmFtZSI6InpvdXdlaUB5c3dnLmNvbS5jbiIsImdpdmVuX25hbWUiOiJ5c3dnMjAyMSAiLCJpc0V4dGVybmFsQmlsbGVkIjoiVHJ1ZSIsInN1YnNjcmlwdGlvbklkIjoiMzY1MDQxMiIsInRlbmFudElkIjoiYXJ0bGlzdCIsImV4cCI6MTY4MDc0MjkxNX0.KqSjuVxA0YUGVr33pRNorE0d9yJLAEBjWHKBkGUA1hI',
# 'cache-control': 'no-cache',
# 'content-length': '204',
# 'content-type': 'application/json',
# 'cookie': 'optimizelyEndUserId=oeu1679536870850r0.7071089853062891; userVisitorNew=c6ba6bbf-0695-4ee2-9103-99cca7c0513d; userSession=6e802db0-2b6f-4547-825b-ef3e538f4e0f; _gid=GA1.2.1803988674.1679536872; _gcl_au=1.1.244746656.1679536872; x-ms-routing-name=self; _rdt_uuid=1679536872203.1d0b064a-18b3-46cb-b171-d5686bc3e0aa; _hjFirstSeen=1; _hjSession_458951=eyJpZCI6ImE4MzQwZmE3LTlkMzYtNGNmMC04YzdhLWY3ZmZmNGQxNjllZiIsImNyZWF0ZWQiOjE2Nzk1MzY4NzIzODgsImluU2FtcGxlIjp0cnVlfQ==; _hjAbsoluteSessionInProgress=0; ln_or=eyI0MTkwMjQ5IjoiZCJ9; _tt_enable_cookie=1; _ttp=QHJvXju2YvJudz1ZCKxM8FgSx2T; TiPMix=82.15806888204298; AntiforgeryCookie=CfDJ8CpxpXmrGSJOvO4Uovren5glom4KYSIyVww8aRLUNlx2XhQnfW0YImGjCwLcWT3-X0phlXgMV_W-17IeiQaIUUTcpRt0UBnBAXzZEyMMAqsvmGmaGM5B4v8TPaU3kBcN7C8gZ3Ok9YuDvBKXb97i39o; _pin_unauth=dWlkPVpXRmlPV0psTnpjdE9XWTNNaTAwTmpsa0xUazBNV1F0T0RjelpUYzJZVEl3WWpObA; _clck=kabt7t|1|fa5|0; _fbp=fb.1.1679536873608.1646885667; ajs_anonymous_id=7578a89b-7b45-431d-9e75-7bebda6da804; PAPVisitorId=MSKl3AqwbgqmjSfC6CMgwrL3GsCxfmu2; accessToken=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiI5MzMwNTc3Yy0zNWQyLTQ3NTktYjdhNy00NTgxODIxYTAyNDciLCJzdWIiOiIyNTk2NDMxIiwiZW1haWwiOiJ6b3V3ZWlAeXN3Zy5jb20uY24iLCJodHRwOi8vc2NoZW1hcy54bWxzb2FwLm9yZy93cy8yMDA1LzA1L2lkZW50aXR5L2NsYWltcy9uYW1laWRlbnRpZmllciI6IjI1OTY0MzEiLCJ1bmlxdWVfbmFtZSI6InpvdXdlaUB5c3dnLmNvbS5jbiIsImdpdmVuX25hbWUiOiJ5c3dnMjAyMSAiLCJpc0V4dGVybmFsQmlsbGVkIjoiVHJ1ZSIsInN1YnNjcmlwdGlvbklkIjoiMzY1MDQxMiIsInRlbmFudElkIjoiYXJ0bGlzdCIsImV4cCI6MTY4MDc0MjkxNX0.KqSjuVxA0YUGVr33pRNorE0d9yJLAEBjWHKBkGUA1hI; _hjSessionUser_458951=eyJpZCI6ImY5YmVjOTQ3LTBlNDktNWNlMi1hNGI1LWUzMmI5NzI5YTMyMyIsImNyZWF0ZWQiOjE2Nzk1MzY4NzIzODIsImV4aXN0aW5nIjp0cnVlfQ==; _uetsid=95f97ab0c91e11edb428876e0544e20e; _uetvid=95f9bd40c91e11ed972a0f49c6b1bcfe; _ga=GA1.2.1417359823.1679536872; _hjHasCachedUserAttributes=true; ajs_user_id=2596431; ab.storage.deviceId.a8859136-009c-49a9-9457-9554c055e10d=%7B%22g%22%3A%22393578a1-b15d-0701-34c0-e379b26c246f%22%2C%22c%22%3A1679536874640%2C%22l%22%3A1679537488302%7D; ab.storage.userId.a8859136-009c-49a9-9457-9554c055e10d=%7B%22g%22%3A%222596431%22%2C%22c%22%3A1679537488300%2C%22l%22%3A1679537488302%7D; _hjDonePolls=870089; _clsk=12o55qj|1679538654837|9|1|j.clarity.ms/collect; _ga_65CXCH03KJ=GS1.1.1679536871.1.1.1679538655.60.0.0; __cf_bm=N0uk7HKoRw7.Yl20zvtKhsmiFg.zdvqiUEyPWxqFYiA-1679538653-0-AWRCasJ/UULh1Y2l+KyYnyiM7NHOsHEjWJjQVSOK+tn+zpZuX1Up3+ZY24+qG80V7rzDrfKAaEPGYg4uRH9Moxk=; al-artlist=CfDJ8CpxpXmrGSJOvO4Uovren5h2HxTWKTLxo7pe0W0TpW4BA26iJ9LWrAcqMvsoMxYF3Qr4g4dQkKDmwTZ9He2OXeZdu7AgPMLG-VeaKWblPywDfySGBAaibbBrNk-Saaq35MhgCUcEDnRpJNGCjBtKQ_Gn84GwN5r-9Kj8KIZR6or-Qiu-M1-QEdCClb2g4vszUNAeW3qNbPAD1rRUmLZXe8kzBWR8jWRPv0sO28b4x5exeBpH06bS8MVEY04pszLdSsREbNxzXK2qYV2B99abOYbYgXTK4TjXPZghZlXCwcmnROOYCufBbgfT6ERShqTuW7R1ur589Rsf8yveczDGJXUX3oIL0hB7KB2VDKVQC8A_PZJK0Z8CbBbsS1HeGnxpffHJSAYK8Bd7c8GpmY-HukokRiaFBl6VosJYXXPRggOTmaNgLF8954nW2dv-U1h1HzvYuWLrjhoPO9t-FlxvnugUgEYVm3MMQ9CiEQXf5kjp0aYKDRuQgxk-YcMZJMJ0QNPZnahnGvkfq5ZBnjMwmOaEMd_bRVlw0vnQwLZ3IU57N8blWmJ136dHph2_QLYerA; _gat=1; ab.storage.sessionId.a8859136-009c-49a9-9457-9554c055e10d=%7B%22g%22%3A%22bba6a8c9-c6f9-a513-22ef-8cbf0900a5da%22%2C%22e%22%3A1679541023985%2C%22c%22%3A1679537488301%2C%22l%22%3A1679539223985%7D; XSRF-TOKEN=CfDJ8CpxpXmrGSJOvO4Uovren5jFWko4Xo2CZ9fhETQd-duDCd9qoi7VhHOTyD1zW2_GBC8W1z7_TZ2CIhyn51Zkt8uEFrqjiDnCTto_ZqsHgRgsGkRGcMx1InB6MMevmpRWvToPdPBgnIrl8gs-yKzbj7ud-i6ikrPTvRq8tS8VGri6P4QGN_EdQbnTtbTY8qjNDw',
# 'origin': 'https://artlist.io',
# 'pragma': 'no-cache',
# 'referer': 'https://artlist.io/category/5&311/uplifting&epic/',
# 'sec-ch-ua': '"Google Chrome";v="110", "Not(A:Brand";v="8", "Chromium";v="111"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"',
# 'sec-fetch-dest': 'empty',
# 'sec-fetch-mode': 'cors',
# 'sec-fetch-site': 'same-origin',
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
# 'x-requested-with': 'XMLHttpRequest',
# 'x-xsrf-token': 'CfDJ8CpxpXmrGSJOvO4Uovren5jFWko4Xo2CZ9fhETQd-duDCd9qoi7VhHOTyD1zW2_GBC8W1z7_TZ2CIhyn51Zkt8uEFrqjiDnCTto_ZqsHgRgsGkRGcMx1InB6MMevmpRWvToPdPBgnIrl8gs-yKzbj7ud-i6ikrPTvRq8tS8VGri6P4QGN_EdQbnTtbTY8qjNDw',
# }
# import requests
# import time
# id_list = []
# categoryid_lsit = ['50', '33', '52', '9', '49', '48', '84', '85', '68', '83', '96', '42', '14', '101', '5', '36', '24', '18', '32', '19', '98', '8', '22', '10', '105', '17', '311', '21', '31', '57', '25', '34', '78', '322', '320', '65', '28', '39', '13', '37', '40', '46', '41', '26', '6', '20', '7', '38', '16', '35', '72', '92', '12', '62', '15', '51', '93', '23', '79', '69']
# for i in categoryid_lsit:
# data = {"searchTerm": '', "categoryIDs": f"{i}", "excludedIDs": "", "songSortID": "1", "page": "1",
# "durationMin": "0", "durationMax": "0", "onlyVocal": '', "bpmMin": "0", "bpmMax": "0",
# "geoClusterToCollectionIdMapping": '', "vocalId": "0"}
# resp = requests.post(scraper_url, headers=headers, timeout=30, verify=False, json=data)
# json_dict = json.loads(resp.text)
# for i in json_dict['songs']:
# print()
# print(i['songName'])
# print(i['sitePlayableFilePath'])
# print(i['albumName'])
# songName = i['songName']
# sitePlayableFilePath = i['sitePlayableFilePath']
# albumName = i['albumName']
# sql = f'INSERT into artlist_music(songName,albumName,music_url)values("{songName}","{albumName}","{sitePlayableFilePath}")'
# print(sql)
# cursor.execute(sql)
# connect.commit()
# print(i['collections']['staffPicks'][0]['categoryId'])
# id_list.append(i['collections']['staffPicks'][0]['categoryId'])
# time.sleep(10)
# print(id_list)
' Insert ASINs; existing ASINs are not updated.'
# self_asin_list = [['B0018Z8D64']]
# cursor.executemany(
# f"insert into {site_name}_self_asin_pyb (asin) values (%s) ON DUPLICATE KEY UPDATE asin = values(asin)",
# self_asin_list)
# connect.commit()
' Query table row counts in MySQL ========================'
# # # sql = f'SELECT count(*) FROM {site_name}_search_term WHERE state=1'
# sql = f"SELECT week,count(*) FROM {site_name}_all_syn_st where state=1"
# # sql = f"SELECT `week`,count(1) FROM {site_name}_all_syn_st GROUP BY `week`"
# # # sql = f"-- UPDATE {site_name}_all_syn_st set state=5 WHERE state=1"
# # # # # # # # # # # # # # # # # # # # # # # # # sql = f"-- SELECT `week`,count(1) FROM {site_name}_all_syn_st_history_2022 GROUP BY `week`"
# # # # # # # # # # # # # # # # # sql = f" SELECT `week`,count(1) FROM {site_name}_search_term where state=1 GROUP BY `week`"
# print(sql)
# cursor.execute(sql)
# connect.commit()
# count_list = cursor.fetchall()
# print(site_name,count_list)
'=============== Remove duplicate rows: truncate the table and re-insert ================================'
# # 读取状态不是1 的asin
# # sql = f"SELECT count(DISTINCT(asin)) FROM us_st_everyday_syn_2023_02_13"
# sql = f"SELECT asin, state,asin_is_variation FROM uk_st_everyday_syn_2023_02_12 where state !=1;"
# df_12 = pd.read_sql(sql, con=engine_pg)
# # df_12.drop_duplicates('asin', inplace=True) # 根据asin去重
# print(df_12.shape)
# # 读取状态 1 的asin
# sql = f"SELECT asin, state,asin_is_variation FROM uk_st_everyday_syn_2023_02_12 where state =1;"
# df_st_1 = pd.read_sql(sql, con=engine_pg)
# print(df_st_1.shape)
# print(333333333)
# df_st_1.drop_duplicates('asin', inplace=True) # 根据asin去重
# print(df_st_1.shape)
# # 剔除状态 7,9 的id
# print(2222222222)
# # 找出 df_st_1 的asin 不再 df_12 里面 然后进行合并,。asin 状态不变
# df = df_st_1.loc[~(df_st_1.asin.isin(df_12.asin))]
# print(df.shape)
# df_save = pd.concat([df_12, df])
# print(212)
# print(df_save.shape)
# print(df_save.values)
# # with engine_pg.begin() as conn:
# # sql = 'truncate uk_st_everyday_syn_2023_02_12;'
# # conn.execute(sql)
# # df_save['date_info'] = '2023-02-12'
# # print('cun')
# # df_save.to_sql("uk_st_everyday_syn_2023_02_12", con=engine_pg, if_exists='append', index=False)
' ================== Create yearly table partitions; query the dates ======================='
# sql = 'select date from date_20_to_30 where year=2023'
# cursor.execute(sql)
# count_list = cursor.fetchall()
# v = 0
# # print(count_list[1])
# date_list = []
# for i in count_list:
# date_list.append(i[0])
# i = 0
# for data in date_list:
# i+=1
# # # 按年
# sql1 = f"create table us_self_product_detail_2023_{str(data).replace('-','_')} partition of us_self_product_detail_2023 for values from ('{str(data)}') to ('{str(date_list[i])}');"
# print(sql1)
# #
# # 'create table us_aba_profit_gross_day_2023_05_01 partition of us_aba_profit_gross_day_2023_05 for values from ('2023-05-01') to ('2023-05-02');'
# # moth = str(data).split('-')[1] 按 月分表
# # print(moth)
# sql1 = f"-- create table us_aba_profit_gross_day_{str(data).replace('-','_')} partition of us_aba_profit_gross_day_2023_{moth} for values from ('{str(data)}') to ('{str(date_list[i])}');"
# print(sql1)
# #
' ===================== Delete the search-term data for each site ==================='
# for i in range(40, 45):
# for type in ['sp', 'sb', 'bs', 'tr', 'er']:
# sql = f'TRUNCATE {site_name}_search_term_rank_{type}_2022_{i}'
# print(sql)
# cursor.execute(sql)
# connect.commit()
# zr_sql = f'delete from {site_name}_search_term_rank_zr_2022_{i} where page >= 2 ;'
# print(zr_sql)
# cursor.execute(zr_sql)
# connect.commit()
' Delete-cookie query'
select_all_cookie = f"SELECT * FROM {site_name}_cookies"
cursor.execute(select_all_cookie)
cookie_all_lsit = cursor.fetchall()
print(f"{site_name} all cookie 总数量:", len(cookie_all_lsit))
day = ((datetime.datetime.now()) + datetime.timedelta(days=-6)).strftime("%Y-%m-%d")
select_sql = f"SELECT * FROM {site_name}_cookies WHERE updated_time < '{day}'"
print(select_sql)
cursor.execute(select_sql)
cookie_lsit = cursor.fetchall()
print("删除数量 10 天前的 cookie :",len(cookie_lsit))
delect_sql = f"DELETE FROM {site_name}_cookies WHERE updated_time < '{day}'"
print(delect_sql)
cursor.execute(delect_sql)
connect.commit()
if site_name in ('us','uk','de'):
print(site_name)
with engine_pg.begin() as conn:
conn.execute(delect_sql)
' Search-term monitoring: every morning around 9 am, check that day\'s crawl volume ==========================='
# if site_name == 'us':
# sql_read = f"SELECT DISTINCT(report_date) FROM {site}_st_everyday where state in (1,2) GROUP BY report_date"
# print(sql_read)
# df = pd.read_sql(sql_read, con=engine_pg)
# print(list(df.report_date))
# if list(df.report_date):
# report_date = list(df.report_date)[0]
# print("report_date: ", report_date)
# ymd = report_date.replace('-', '_')
# for i in ['zr', 'sp', 'sb', 'ac', 'er', 'tr', 'bs']:
# sql_count = f"select count(id) from {site}_st_{i}_everyday_{ymd} limit 1"
# print(sql_count)
# df_count = pd.read_sql(sql_count, con=engine_pg)
# count = list(df_count.values)[0][0]
# print(count)
# if i == 'er':
# if count < 10000:
# print('数据解析错误')
# send_mg(site, ymd, i)
# elif i == 'ac' or i == 'tr':
# if count < 90000:
# print('数据解析错误')
# send_mg(site, ymd, i)
# else:
# if count < 200000:
# print('数据解析错误')
# send_mg(site, ymd, i)
# cursor.close()
# connect.close()
if __name__ == '__main__':
site_list = ['us','de','uk','fr','es','it']
# site_list = ['us']
for site in site_list:
spider_us = init_db(site_name=site)
import time
from playwright.sync_api import sync_playwright
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from lxml import etree
import random
import traceback
import pandas as pd
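# One688LoginSpider drives Seller Central's Category Insights page through a persistent Chrome
# profile (Playwright), walks the Category / Product Type / Item Type Keyword radio buttons and
# captures the getPerformanceDashboard responses the page fires for each selection.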
class One688LoginSpider(object):
def __init__(self, site):
self.site_name = site
self.data = None
def mysql_connect(self, site='us'):
if site == 'us':
db = 'selection'
else:
db = f'selection_{site}'
DB_CONN_DICT = {
"mysql_port": 3306,
"mysql_user": "XP_Yswg2025_PY",
"mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
"mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
}
url = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database=db,
query={"charset": "utf8mb4"}
)
self.engine_mysql = create_engine(url)
url_us = URL.create(
drivername="mysql+pymysql",
username=DB_CONN_DICT["mysql_user"],
password=DB_CONN_DICT["mysql_pwd"], # 原始密码,含 @ 也没问题
host=DB_CONN_DICT["mysql_host"],
port=int(DB_CONN_DICT["mysql_port"]),
database='selection',
query={"charset": "utf8mb4"}
)
self.engine_us_mysql = create_engine(url_us)
self.engine_pg = create_engine(
f"postgresql+psycopg2://postgres:F9kL2sXe81rZq@113.100.143.162:5432/{db}",
encoding='utf-8')
self.num = 0
week = time.strftime("%W")
year = time.strftime('%Y', time.localtime(time.time()))
self.y_w = f"{year}-{week}"
def crawl(self):
self.page.wait_for_timeout(10000)
page_content = self.page.content()
html = etree.HTML(page_content)
Category_list = html.xpath('//h2[contains(text(),"Category")]/following-sibling::div/div')
for Category in Category_list:
Category_name = Category.xpath('.//@label')
print("Category_name 名称 11111", Category_name)
self.page.evaluate_handle(
f"""document.querySelector("kat-radiobutton[label='{Category_name[0]}']").click()""")
self.page.wait_for_timeout(5000)
page_content = self.page.content()
html = etree.HTML(page_content)
Product_Type_list = html.xpath(
'//h2[contains(text(),"Product Type")]/following-sibling::div/div')
del Product_Type_list[0]
for Product_Type in Product_Type_list:
Product_name = Product_Type.xpath('./@id')
self.page.evaluate_handle(f"document.querySelector('#{Product_name[0]} > kat-radiobutton').click()")
self.page.wait_for_timeout(3000)
page_content = self.page.content()
html = etree.HTML(page_content)
Item_Type_Keyword_id_list = html.xpath(
'//h2[contains(text(),"Item Type Keyword")]/following-sibling::div/div')
kw_list = []
for Item_Type_Keyword_id in Item_Type_Keyword_id_list:
Keyword_id = Item_Type_Keyword_id.xpath('./@id')
print("Keyword_id:", Keyword_id)
Keyword = html.xpath(f"//div[@id='{Keyword_id[0]}']/kat-radiobutton/@label")
print('Keyword', Keyword)
self.page.query_selector(f"xpath=//kat-radiobutton[@value='{Keyword_id[0]}']").click()
self.page.wait_for_timeout(6000)
self.page.on("requestfinished", self.print_request_finished)
self.page.wait_for_timeout(6000)
time.sleep(15)
if self.data:
kw_list.append(self.data)
self.data = None
print('\n')
self.page.wait_for_timeout(3500)
print('\n')
self.data = None
# break
self.save_data(kw_list)
def print_request_finished(self, request):
# intercept finished requests to capture the dashboard response data
post_url = 'https://sellercentral.amazon.com/next/v2/getPerformanceDashboard'
if post_url in request.url:
if request.response():
try:
self.data = request.response().json()
print('self.data:: ',self.data)
except Exception as e:
print('error while intercepting url:', e, f"\n{traceback.format_exc()}")
self.data = None
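# get_category_data(): the same walk as crawl(), but driven by the category list read from
# seller_category_insights_syn (see read_category below).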
def get_category_data(self, Category_list):
print('Category_list:::', Category_list)
num = 0
for Category in Category_list:
print(Category, ' 22222222222222222222222222222222222222')
num += 1
print("Category_name 名称 11111", Category)
self.page.evaluate(f"""document.querySelector("kat-radiobutton[label='{Category}']").click()""")
time.sleep(5)
html = etree.HTML(self.page.content())
Product_Type_list = html.xpath(
'//h2[contains(text(),"Product Type")]/following-sibling::div/div')
for Product_Type in Product_Type_list:
time.sleep(5)
Product_name = Product_Type.xpath('./@id')
print("Product_name3222222222::", Product_name[0].upper())
self.page.evaluate(f"document.querySelector('#{Product_name[0]} > kat-radiobutton').click()")
time.sleep(2)
html = etree.HTML(self.page.content())
Item_Type_Keyword_id_list = html.xpath(
'//h2[contains(text(),"Item Type Keyword")]/following-sibling::div/div')
kw_list = []
for Item_Type_Keyword_id in Item_Type_Keyword_id_list:
Keyword_id = Item_Type_Keyword_id.xpath('./@id')
print("Keyword_id:", Keyword_id)
Keyword = html.xpath(f"//div[@id='{Keyword_id[0]}']/kat-radiobutton/@label")
print('Keyword', Keyword)
self.page.locator(f'//kat-radiobutton[@value="{Keyword_id[0]}"]').click()
self.page.wait_for_timeout(6000)
self.page.on("requestfinished", self.print_request_finished)
if self.data:
kw_list.append(self.data)
self.data = None
print('\n')
self.page.wait_for_timeout(3500)
print('\n')
self.data = None
# break
self.save_data(kw_list)
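# save_data(): opens data.json for appending but currently only prints the demand / competition
# metrics of each captured payload (the actual file write is commented out).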
def save_data(self, kw_list):
with open(r'data.json', 'a', encoding='utf-8') as f:
for data_dict in kw_list:
# f.write(json.dumps(data_dict)+'\n')
print(data_dict['demand']['unitSold'], 'units sold')
print(data_dict['demand']['glanceViews'], 'glance views')
print(data_dict['demand']['yearOnYearUnitSold'], 'year-on-year units sold')
print('\n')
print(data_dict['demand']['yearOnYearGlanceViews'], 'year-on-year glance views')
print('\n')
print(data_dict['demand']['unitSoldYOY'], 'units sold YoY')
print('\n')
print(data_dict['demand']['glanceViewsYOY'], 'glance views YoY')
print('\n')
print(data_dict['demand']['mostPopularKeywords'], 'most popular keywords')
print('\n')
print(data_dict['demand']['searchToPurchaseRatio'], 'search-to-purchase ratio')
print('\n')
print(data_dict['demand']['returnRatio'], 'return ratio')
print('\n')
print(data_dict['demand']['returnReasons'], 'return reasons')
print('\n')
print(data_dict['demand']['priceConversionRates'], 'price conversion rates')
print('\n')
print(data_dict['demand']['netShippedGMS'], 'net shipped GMS')
print('\n')
print(data_dict['demand']['searchVolume'], 'search volume')
print('\n')
print(data_dict['demand']['clickCount'], 'click count')
print('\n')
print(data_dict['demand']['buyBoxPrice'], 'buy box price')
print('\n')
print(data_dict['competition']['starRatings'])
print(data_dict['competition']['sellerCount'])
print(data_dict['competition']['asinCount'])
print(data_dict['competition']['offersPerAsin'])
print(data_dict['competition']['medianAdSpendPerClick'])
print(data_dict['competition']['avgAdSpendPerClick'])
print(data_dict['competition']['majorityAdSpendPerClick'])
print(data_dict['competition']['newAsinCount'])
print(data_dict['competition']['newBrandCount'])
def read_category(self):
# resume from where the last run left off
print('resuming from where the last run left off')
self.mysql_connect(site=self.site_name)
select_sql = 'select category from seller_category_insights_syn where state =1'
df = pd.read_sql(select_sql, con=self.engine_pg)
category_list = list(df.category)
if category_list:
return category_list
else:
workflow_everyday_list = [
[self.site_name, self.y_w, '类目分析抓取完成', 3, f'{self.site_name}_aba_profit_category_insights', 'week',
'类目分析', '是']]
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['site_name', 'date_info', 'status', 'status_val',
'table_name', 'date_type', 'page', 'is_end'])
df_seller_asin_account.to_sql('workflow_progress', con=self.engine_us_mysql, if_exists='append',
index=False)
def run(self):
for i in range(2):
try:
self.page.goto('https://sellercentral.amazon.com/selection/category-insights')
self.page.wait_for_timeout(2000)
time.sleep(random.uniform(10, 25.25))
break
except:
time.sleep(5)
if self.site_name == 'us':
self.page.evaluate(
'document.querySelector("#ATVPDKIKX0DER > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()')
elif self.site_name == 'uk':
self.page.evaluate(
'document.querySelector("#A1F83G8C2ARO7P > kat-radiobutton").shadowRoot.querySelector("div > div.text > slot > kat-label:nth-child(1)").click()')
def main(self):
# initialization
with sync_playwright() as _playwright:
# _playwright.chromium.launch_persistent_context
browser = _playwright.chromium.launch_persistent_context(
# local Chrome user-data (profile cache) directory
user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data",
# path to the local Chrome executable
executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
# must be enabled in order to download files (defaults to False)
accept_downloads=True,
# run with a visible browser window (not headless)
headless=False,
bypass_csp=True,
locale='en-GB',
ignore_https_errors=True,
no_viewport=True,
slow_mo=10,
# bypass automation detection
args=['--disable-blink-features=AutomationControlled', '--remote-debugging-port=9222']
)
self.page = browser.new_page()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
self.page.add_init_script(js)
self.page.evaluate_handle('''() =>{ window.chrome = { runtime: {}, }; }''')
self.page.evaluate_handle(
'''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
# spoof normal browser properties
self.page.locator("body").click()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
self.page.add_init_script(js)
time.sleep(3)
Category_list = self.read_category()
if Category_list:
self.run()
self.get_category_data(Category_list)
if __name__ == '__main__':
One688 = One688LoginSpider('us')
One688.main()
import json
import random
import time
import pandas as pd
import redis
from lxml import html
from playwright.sync_api import sync_playwright
from secure_db_client import get_remote_engine
def mysql_connect():
engine_us_mysql = get_remote_engine(
site_name='us', # -> database "selection"
db_type='mysql',  # -> server-side alias "mysql"
)
return engine_us_mysql
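# run(asin_list): opens a persistent Chrome session via Playwright, queries Seller Central's
# varwiz search endpoint for each ASIN and stores the parsed JSON (or an error marker) through
# save_asin_var_data.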
def run(asin_list):
print('asin_list:::',asin_list)
print('asin_list:::',len(asin_list))
if asin_list:
# initialization
with sync_playwright() as _playwright:
# _playwright.chromium.launch_persistent_context
browser = _playwright.chromium.launch_persistent_context(
# local Chrome user-data (profile cache) directory
user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data",
# path to the local Chrome executable
executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
# must be enabled in order to download files (defaults to False)
accept_downloads=True,
# headless=False opens a visible window; True runs headless
headless=False,
bypass_csp=True,
locale='en-GB',
ignore_https_errors=True,
no_viewport=True,
slow_mo=10,
# bypass automation detection
args=['--disable-blink-features=AutomationControlled', '--remote-debugging-port=9222']
)
page = browser.new_page()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page.add_init_script(js)
page.evaluate_handle('''() =>{ window.chrome = { runtime: {}, }; }''')
page.evaluate_handle(
'''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
# spoof normal browser properties
page.locator("body").click()
js = """
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
page.add_init_script(js)
print('Opening the browser to request ASINs:')
page = browser.new_page()
try:
page.goto('https://sellercentral.amazon.com')
time.sleep(random.uniform(2, 5))
except:
save_asin_var_data(asin_list[0], json.dumps({"content": "网络有问题 登录账号失败。远程账号电脑检查"}), '失败')
for asin in asin_list:
time.sleep(random.uniform(1, 3))
try:
print('请求asin', asin)
url = f"https://sellercentral.amazon.com/listing/varwiz/search?searchText={asin}"
print('url:', url)
page.goto(url)
time.sleep(random.uniform(3, 8))
print()
print(page.content())
html_string = page.content()
time.sleep(0.5)
if 'The ASIN you searched for is not part of any variation' not in html_string:
doc = html.fromstring(html_string)
# take the text content of the first <pre> (inner HTML tags are stripped automatically)
pre_nodes = doc.xpath('//pre')
if not pre_nodes:
raise ValueError("找不到 <pre> 节点")
pre_text = pre_nodes[0].text_content().strip()
# try to parse directly (works when the <pre> holds the whole JSON payload)
data_json = json.loads(pre_text)
print(data_json) # dict / list
print('获取完成', asin)
save_asin_var_data(asin, data_json, '成功')
else:
print('没有该asin,', asin)
save_asin_var_data(asin, json.dumps(
{"content": "The ASIN you searched for is not part of any variation family"}), '成功')
except Exception as e:
print('error 23232323232323232323:', e)
save_asin_var_data(asin, json.dumps({"content": "下载失败。远程账号电脑检查"}), '失败')
continue
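# redis_get_asin(): polls Redis db 10 for ASIN keys in batches of up to 10, passes the
# deduplicated list to run(), then deletes the processed keys and keeps looping.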
def redis_get_asin():
asin_list = []
random_key_list = []
redis_client = redis.Redis(host='113.100.143.162', port=6379, db=10, password='fG7#vT6kQ1pX')
while True:
try:
print('轮询redis 查询,')
for i in range(10):
# grab a random key
random_key = redis_client.randomkey()
if random_key:
random_key_list.append(random_key)
# fetch the value stored under that key
value = redis_client.get(random_key)
value = value.decode('utf-8')
print('redis取出asin: ', value)
if value not in asin_list:
asin_list.append(value)
else:
break
if asin_list:
_asin_lis = list(set(asin_list))
print("_asin_lis:::",_asin_lis, )
print("_asin_lis::: len ", len(_asin_lis))
run(_asin_lis)  # pass the ASIN list to the crawler
asin_list = []
for _key in random_key_list:
print(' deleting asin key from redis:', _key)
redis_client.delete(_key)  # remove the processed ASIN key from redis
random_key_list = []
else:
time.sleep(3)
continue
# redis_client.close()  # close the redis connection
except Exception as e:
print('查询redis报错', e)
redis_client.close()
redis_client = redis.Redis(host='192.168.10.224', port=6379, db=10, password='fG7#vT6kQ1pX')
time.sleep(5)
continue
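# save_asin_var_data(): writes one row (asin, asin_var_data, spider_value) into us_asin_var_info
# through the remote MySQL engine.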
def save_asin_var_data(asin, data_json, spider_value):
engine_us_mysql = mysql_connect()
workflow_everyday_list = [[asin, data_json, spider_value]]
print('存储数据:', len(workflow_everyday_list))
df_seller_asin_account = pd.DataFrame(data=workflow_everyday_list,
columns=['asin', 'asin_var_data', 'spider_value'])
engine_us_mysql.to_sql(df_seller_asin_account, 'us_asin_var_info')
if __name__ == '__main__':
redis_get_asin()
import requests
data = {
"username":"pengyanbing",
'password':"15112376559"
}
# url = 'http://192.168.2.28:5000/login'
# resp = requests.post(url,json=data)
url = 'http://192.168.2.28:5000/user/members/index'
resp = requests.get(url,headers={'inventory-token':'fFZ7P4XpSA6nxaH7Xw7aHQ'})
print(resp.content.decode('utf-8'))
DB_CONFIG = {
'host': '120.77.232.73',
'port': 3306,
'user': 'yswg_it_cangchu',
'password': 'Yswg@inv-cangchu241011420',
'db': 'inventory'
}
# REDIS_CONN = {
# "redis_host": "120.77.232.73",
# "redis_port": 6379,
# "redis_pwd": "yswgInventoryTest@202307#",
# "redis_db": 1
#
# }
REDIS_CONN = {
"redis_host": "113.100.143.162",
"redis_port": 6379,
"redis_pwd": "fG7#vT6kQ1pX",
"redis_db": 14
}
\ No newline at end of file
import sys
import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
import pandas as pd
from utils.db_connect import BaseUtils
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
import time
from difflib import SequenceMatcher
from threading_spider.db_connectivity import connect_db
print('Query the self_asin_detail table to update erp_asin anomaly flags')
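# Save_asin_self compares crawled {site}_self_asin_detail rows against the ERP listing copy in
# {site}_erp_asin_syn and writes per-ASIN error flags back to {site}_erp_asin.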
class Save_asin_self(BaseUtils):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name  # site code
self.time_strftime = time.strftime("%Y-%m-%d", time.localtime())
def check_contain_chinese(self, check_str):
"""
Check whether the given text contains Chinese characters.
"""
for c in check_str:
if '\u4e00' <= c <= '\u9fa5':
print('--contains Chinese--')
return True
def init_db_names(self):
self.engine = self.mysql_connect()
self.db_erp_asin = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_us_erp_asin'][2:]
self.db_self_asin_detail = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_self_asin_detail'][2:]
sql_read = "SELECT text_name FROM censored_thesaurus WHERE data_type='负面词汇'"
print(sql_read)
df = pd.read_sql(sql_read, con=self.engine)
self.text_list = list(df.text_name)
print('negative words:', self.text_list)
# asin_sql = f"SELECT asin,sku,erp_seller,{self.site_name}_upload_info,title,`describe` as describe_str ,selling from {self.site_name}_erp_asin_syn WHERE created_at>='2023-05-11' and asin_type=1;"
asin_sql = f"SELECT asin,sku,erp_seller,{self.site_name}_upload_info,title,`describe` as describe_str ,selling,is_variation,fulFillable from {self.site_name}_erp_asin_syn WHERE created_at>='2023-05-11';"
print('asin_sql::', asin_sql)
df_asin = pd.read_sql(asin_sql, con=self.engine)
self.asin_list = list(df_asin.asin)
print(len(self.asin_list))
df_asin[f'{self.site_name}_upload_info'].fillna('N/A', inplace=True)
# replace None values in each field with an empty string / default
df_asin['title'] = df_asin['title'].fillna('')
df_asin['fulFillable'] = df_asin['fulFillable'].fillna('0')
df_asin['is_variation'] = df_asin['is_variation'].fillna('2')
df_asin['describe_str'] = df_asin['describe_str'].fillna('') # Use the correct column name here
df_asin['selling'] = df_asin['selling'].fillna('')
if self.site_name == 'us':
asin_data_list = list(
df_asin.asin + '|+|' + df_asin.sku + '|+|' + df_asin.erp_seller + '|+|' +
df_asin['us_upload_info'].fillna('').astype(str) + '|+|' +
df_asin.title + '|+|' + df_asin.describe_str + '|+|' + df_asin.selling + '|+|' + df_asin.fulFillable.astype(str) + '|+|' + df_asin.is_variation.astype(str)
)
elif self.site_name == 'uk':
asin_data_list = list(
df_asin.asin + '|+|' + df_asin.sku + '|+|' + df_asin.erp_seller + '|+|' +
df_asin['uk_upload_info'].fillna(
'') + '|+|' + df_asin.title + '|+|' + df_asin.describe_str + '|+|' + df_asin.selling + '|+|' + df_asin.fulFillable.astype(str) + '|+|' + df_asin.is_variation.astype(str)
)
self.item_asin = {}
for data in asin_data_list:
if isinstance(data, str): # 检查数据是否为字符串
data_list = data.split('|+|')
self.item_asin[data_list[0]] = data
self.select_self_asin_detail()
def select_self_asin_detail(self):
sava_data = []
asin_tuple = tuple(self.asin_list)
# first exclude ASINs whose state is 4 (listing has turned into a "dog"/dead page)
self_all_syn_sql = f'SELECT asin from {self.site_name}_self_all_syn WHERE asin in {asin_tuple} and state=4 and updated_at>="{self.time_strftime}"'
# print(self_all_syn_sql)
self_all_syn_sql_1 = f'SELECT asin from {self.site_name}_self_real_spider WHERE asin in {asin_tuple} and state=4 and updated_at>="{self.time_strftime}"'
# print(self_all_syn_sql_1)
df_asin_error = pd.read_sql(self_all_syn_sql, con=self.engine)
df_asin_error_1 = pd.read_sql(self_all_syn_sql_1, con=self.engine)
asin_error_ = list(df_asin_error.asin)
asin_error_1 = list(df_asin_error_1.asin)
asin_error_1.extend(asin_error_)  # extend() mutates in place and returns None
asin_error_list = asin_error_1
if asin_error_list:
print("asin_error_list::", asin_error_list)
for asin in list(set(asin_error_list)):
asin_data = self.item_asin.get(asin)
err_4_list = []
if asin_data:
asin_erp_data_list = asin_data.split('|+|')
sku = asin_erp_data_list[1]
erp_seller = asin_erp_data_list[2]
err_4_list.append(asin)
err_4_list.append(sku)
err_4_list.append(erp_seller)
err_4_list.append(2)
sava_data.append(err_4_list)
if asin in self.asin_list:
self.asin_list.remove(asin)
df = pd.DataFrame(data=sava_data,
columns=['asin', "sku", 'erp_seller', 'page_error'])
df.to_sql(f'{self.site_name}_erp_asin', con=self.engine, if_exists="append", index=False)
sava_data = []
asin_tuple = tuple(self.asin_list)
asin__detail_sql = f"SELECT asin,title,img_num,`describe`,category,page_inventory,search_category,product_description,img_type from {self.site_name}_self_asin_detail WHERE site='{self.site_name}' and created_at>='{self.time_strftime}' and asin in {asin_tuple};"
df_asin_detail = pd.read_sql(asin__detail_sql, con=self.engine)
fields_list = df_asin_detail.values.tolist()
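# Error flags written below (roughly, as inferred from the inline comments): 1 = OK; other values
# mark problems such as missing / too-short text (2), Chinese characters (3), category mismatch
# with the ERP upload info (4), censored / negative words (5) or text that differs from the ERP
# copy (6). Exact meanings vary slightly per field.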
for asin_data in fields_list:
data_list = []
asin = asin_data[0]
title = asin_data[1]
img = asin_data[2]
describe = asin_data[3]
category = asin_data[4]
img_type_str = asin_data[8]
if img_type_str:
img_type = img_type_str.split(',')[-1]
else:
img_type = '-1'
asin_erp_data = self.item_asin.get(asin)
if asin_erp_data:
asin_erp_data_list = asin_erp_data.split('|+|')
sku = asin_erp_data_list[1]
erp_seller = asin_erp_data_list[2]
category_upload_info = asin_erp_data_list[3]
syn_title = asin_erp_data_list[4]  # title
if syn_title is None:
syn_title = ''
syn_describe = asin_erp_data_list[5]  # bullet points
if syn_describe is None:
syn_describe = ''
syn_selling = asin_erp_data_list[6]  # product description
if syn_selling is None:
syn_selling = ''
fulFillable = asin_erp_data_list[7]  # fulFillable value from the ERP
is_variation = asin_erp_data_list[8]  # whether the ASIN is a variation
else:
continue
page_inventory = asin_data[5]
search_category = asin_data[6]
product_description = asin_data[7]
if title and self.check_contain_chinese(title):
title_error = 3
else:
if title is not None:
if len(title) < 35:
title_error = 2
else:
title_error = 1
else:
title_error = 2
if title_error == 1:
for i in self.text_list:
wrods1 = f" {i},"
wrods2 = f" {i} "
wrods3 = f", {i} "
if wrods1.lower() in title.lower():
title_error = 5
break
elif wrods2.lower() in title.lower():
title_error = 5
break
elif wrods3.lower() in title.lower():
title_error = 5
break
if title_error == 1 and len(syn_title) > 5:
print(syn_title, 333333333333333333333333333, title)
# Compare_str returns a SequenceMatcher ratio in [0, 1]; treat anything below 0.85
# as a mismatch (same threshold as the description checks)
if self.Compare_str(title, syn_title) < 0.85:
title_error = 6  # the page title differs from the title in the system
elif title_error == 1 and len(syn_title) < 5:
title_error = 1  # no ERP title to compare against; keep as OK
if img:
if img <= 4:
img_error = 2
else:
img_error = 1
else:
img_error = 2
bullet_list = []
selling_error = 1
if describe:
bullets_list = describe.split('|-|')
for bullets in bullets_list:
bullet = bullets.strip()
if bullet and self.check_contain_chinese(bullet):
selling_error = 3
break
if selling_error == 3:
pass
else:
for bullets in bullets_list:
bullet_1 = bullets.strip()
bullet_list.append(bullet_1)
str_bullet = ('').join(bullet_list)
if len(str_bullet) < 35:
selling_error = 2
if selling_error == 1:
bullets_join = ''.join(bullets_list)
for i in self.text_list:
wrods1 = f" {i},"
wrods2 = f" {i} "
wrods3 = f", {i} "
if wrods1.lower() in bullets_join.lower():
selling_error = 5
break
elif wrods2.lower() in bullets_join.lower():
selling_error = 5
break
elif wrods3.lower() in bullets_join.lower():
selling_error = 5
break
if selling_error == 1 and len(syn_describe) > 10:
if self.Compare_str(describe, syn_describe) < 0.85:
selling_error = 6  # the page bullet points differ from those in the system
elif selling_error == 1 and len(syn_describe) < 10:
selling_error = 1  # no ERP bullet points to compare against; keep as OK
else:
selling_error = 2
print('search_category:', search_category)
if category and search_category:
category_str = category.replace(' ', '')
print('category::', category_str)
if 'Clothing,Shoes&Jewelry›' in category_str and 'All' in search_category:
search_ccategory_error = 1
elif "Health&Household›" in category_str and 'Health, Household & Baby Care' in search_category:
search_ccategory_error = 1
elif "Patio,Lawn&Garden›" in category_str and 'Home & Kitchen' in search_category:
search_ccategory_error = 1
elif "Patio,Lawn&Garden›" in category_str and 'Garden & Outdoor' in search_category:
search_ccategory_error = 1
elif "BabyProducts›" in category_str and 'Baby' in search_category:
search_ccategory_error = 1
elif "Arts,Crafts&Sewing›" in category_str and 'Home & Kitchen' in search_category:
search_ccategory_error = 1
elif "Tools&HomeImprovement›" in category_str and 'Home & Kitchen' in search_category:
search_ccategory_error = 1
elif 'ALL' in search_category:
search_ccategory_error = 1
elif 'Health&Household›' in category_str:
search_ccategory_error = 1
else:
va = search_category.split(' ')
s_va = ''.join(va)
nav_search_label = s_va.replace(" ", "")
print('nav_search_label', nav_search_label, category)
if category.startswith(nav_search_label):
search_ccategory_error = 1
else:
search_ccategory_error = 2
if search_ccategory_error == 2:
va_s = search_category.split(' ')
nav_search_label_s = va_s[0].replace(" ", "")
print(nav_search_label_s, '2222222222222222222', category)
if category.startswith(nav_search_label_s):
search_ccategory_error = 1
else:
if search_category:
search_ccategory_error = 1
else:
search_ccategory_error = 2
if category:
if self.check_contain_chinese(category):
ccategory_error = 2
else:
if len(category_upload_info) > 5:
category_erp = category_upload_info.replace(' ', '').replace('>', '›')
print('category_erp::', category_erp)
category = category.replace(' ', '')
print("category_asin", category)
if category != category_erp:
ccategory_error = 4
else:
ccategory_error = 1
else:
ccategory_error = 1
else:
ccategory_error = 3
if page_inventory:
buy_now_error = 1
else:
buy_now_error = 2
if product_description and len(syn_selling) > 10 and img_type != '3':
if self.Compare_str(product_description, syn_selling) < 0.85:
describe_error = 2  # the product description at the bottom of the page differs from the one in the system
else:
describe_error = 1
else:
describe_error = 1
if int(float(is_variation)) == 1 and int(float(fulFillable)) == 0:  # ASIN is a variation and fulFillable == 0, so skip all copywriting checks
describe_error = 1  # bottom product description
title_error = 1  # title
selling_error = 1  # bullet points
data_list.append(asin)
data_list.append(title_error)
data_list.append(img_error)
data_list.append(selling_error)
data_list.append(search_ccategory_error)
data_list.append(ccategory_error)
data_list.append(buy_now_error)
data_list.append(sku)
data_list.append(erp_seller)
data_list.append(describe_error)
sava_data.append(data_list)
print(sava_data)
df = pd.DataFrame(data=sava_data,
columns=['asin', "title_error", 'img_error', 'selling_error', 'search_ccategory_error',
'ccategory_error', 'buy_now_error', 'sku', 'erp_seller', 'describe_error'])
df.to_sql(f'{self.site_name}_erp_asin', con=self.engine, if_exists="append", index=False)
# def Compare_str(self,str1, str2):
# # 找出两个字符串中的最短长度
# min_length = min(len(str1), len(str2))
# # 初始化计数器
# difference_count = 0
# # 比较字符并计算不同字符的数量
# for i in range(min_length):
# if str1[i] != str2[i]:
# difference_count += 1
# # 考虑字符串长度不同的情况,将多余的字符数添加到差异计数中
# difference_count += abs(len(str1) - len(str2))
# return difference_count
def Compare_str(self, str1, str2):
# strip the '|-|' separators that would skew the comparison; the check stays case-sensitive
str1 = str1.replace('|-|', '')
str2 = str2.replace('|-|', '')
# use SequenceMatcher to compute a similarity ratio in [0, 1]
similarity_ratio = SequenceMatcher(None, str1, str2).ratio()
print(similarity_ratio)
return similarity_ratio
def run(self):
self.init_db_names()
account = 'pengyanbing'
time_strftime = time.strftime('%Y %m %d %H:%M:%S', time.localtime(time.time()))
title = f'{self.site_name} 站点 每日 erp asin'
content = f'{time_strftime} 更新 erp asin 异常分类更新。正常'
connect_db(None).send_mg(account, title, content)
if __name__ == '__main__':
Save_asin_self(site_name='us').run()
Save_asin_self(site_name='uk').run()
import pymysql
from params import DB_CONN_DICT,PG_CONN_DICT_14
import pandas as pd
import traceback
from sqlalchemy import create_engine
import time
"""
Every Wednesday, reset the feedback and product sync tables to state 1 for all six sites.
"""
def run(site):
if site == 'us':
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection", charset="utf8mb4")
else:
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site, charset="utf8mb4")
if site == 'us':
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT_14['pg_user']}:{PG_CONN_DICT_14['pg_pwd']}@{PG_CONN_DICT_14['pg_host']}:{PG_CONN_DICT_14['pg_port']}/selection",
encoding='utf-8')
else:
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT_14['pg_user']}:{PG_CONN_DICT_14['pg_pwd']}@{PG_CONN_DICT_14['pg_host']}:{PG_CONN_DICT_14['pg_port']}/selection_{site}",
encoding='utf-8')
cursor = connect.cursor()
# reset the feedback syn (distinct) table state to 1
update_feedback_sql = f"update {site}_seller_account_syn_distinct set state = 1, product_state = 1 where state != 12"
print(update_feedback_sql)
cursor.execute(update_feedback_sql)
connect.commit()
# reset the seller product syn table state to 1
update_product_sql = f"update {site}_seller_account_product_syn set state = 1"
print(update_product_sql)
cursor.execute(update_product_sql)
connect.commit()
update_feedback_sql = f"update {site}_seller_account_syn set state = 1, product_state=1"
print(update_feedback_sql)
cursor.execute(update_feedback_sql)
connect.commit()
connect.close()
cursor.close()
if site in ('us',):
with engine_pg.begin() as conn:
conn.execute(update_feedback_sql)
conn.execute(update_product_sql)
conn.execute(update_feedback_sql)
if __name__ == '__main__':
run('us')
run('de')
run('uk')
run('fr')
run('es')
run('it')
\ No newline at end of file
from secure_db_client import get_remote_engine
import platform
from sqlalchemy import create_engine
import pandas as pd
import datetime
from params import DB_CONN_DICT, PG_CONN_DICT_21, REDIS_CONN, PG_CONN_DICT
import redis
import time
from collections import Counter
import json
import codecs
def update_state(site):
if site == 'us':
@@ -17,7 +20,7 @@ def update_state(site):
engine = create_engine(
f'mysql+pymysql://{DB_CONN_DICT["mysql_user"]}:' + f'{DB_CONN_DICT["mysql_pwd"]}@{DB_CONN_DICT["mysql_host"]}:{DB_CONN_DICT["mysql_port"]}/{db}?charset=utf8mb4')  # , pool_recycle=3600
engine_spider = create_engine(
f"postgresql+psycopg2://postgres:F9kL2sXe81rZq@61.145.136.61:54328/{db}",
encoding='utf-8')
engine_pg6 = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT['pg_user']}:{PG_CONN_DICT['pg_pwd']}@{PG_CONN_DICT['pg_host']}:{PG_CONN_DICT['pg_port']}/{db}",
@@ -62,12 +65,12 @@ def update_state(site):
sql_update_month = f"update {site}_search_term_month_syn set state = 1 where state = 2 and updated_time <'{up_time}'"
print(sql_update_month)
conn_14.execute(sql_update_month)
# seller_sql = f"update {site}_seller_account_syn_distinct set state = 1 where state = 2 and updated_at <'{up_time}';"
# print('update seller-info crawl table state:', seller_sql)
# conn_14.execute(seller_sql)
# seller_sql_product = f"update {site}_seller_account_syn_distinct set product_state = 1 where product_state = 2 and updated_at <'{up_time}';"
# print('update seller-ASIN crawl table state', seller_sql_product)
# conn_14.execute(seller_sql_product)
select_sql = f"select count(id) from {site}_all_syn_st_month_2025_{min_month} where state=1"
df = pd.read_sql(select_sql, con=engine_spider)
id_count_min = df.iloc[0, 0]
@@ -150,7 +153,7 @@ def get_redis_data(site_name, engine_pg6, engine_spider):
list_data.append(count)
print(list_data)
new_date_hour = f'{site_name}_' + str(new_date) + ':0-23'
print(new_date_hour, '13223')
list_hour_data = redis14.lrange(new_date_hour, start_index, end_index)
# use Counter to count how often each element appears
element_counts_hour = Counter(list_hour_data)
@@ -204,19 +207,40 @@ def get_redis_data(site_name, engine_pg6, engine_spider):
print(asin_column_json)
# captcha = 1, error = 2, success = 3, total requests = 4
with engine_pg6.begin() as conn:
sql_upsert = f"""
INSERT INTO {site_name}_count_request_data (
date_info,
asin_request_err_total,
code_err_total,
success_asin_total,
request_total_count,
hour_asin_total,
asin_column_json,
remain_asin_total
) VALUES (
'{new_date}',
{list_data[1]},
{list_data[0]},
{list_data[2]},
{list_data[3]},
'{hour_data_json}',
'{asin_column_json}',
{remain_asin_total}
)
ON CONFLICT (date_info) DO UPDATE SET
asin_request_err_total = EXCLUDED.asin_request_err_total,
code_err_total = EXCLUDED.code_err_total,
success_asin_total = EXCLUDED.success_asin_total,
request_total_count = EXCLUDED.request_total_count,
hour_asin_total = EXCLUDED.hour_asin_total,
asin_column_json = EXCLUDED.asin_column_json,
remain_asin_total = EXCLUDED.remain_asin_total;
"""
print(sql_upsert)
conn.execute(sql_upsert)
if __name__ == '__main__':
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from secure_db_client import get_remote_engine
from curl_cffi import requests
from utils.db_connect import BaseUtils
import re
from lxml import etree
os.environ['NO_PROXY'] = 'amazon.com'
import json
from urllib.parse import urlparse
class Amazon_reviewer():
def __init__(self, site_name='us'):
if site_name == "us":
self.site_url = 'https://www.amazon.com'
self.host = 'www.amazon.com'
elif site_name == 'uk':
self.site_url = 'https://www.amazon.co.uk'  # site URL
self.host = 'www.amazon.co.uk'
elif site_name == 'de':
self.site_url = 'https://www.amazon.de'
self.host = 'www.amazon.de'
elif site_name == 'fr':
self.site_url = 'https://www.amazon.fr'
self.host = 'www.amazon.fr'
elif site_name == 'es':
self.site_url = 'https://www.amazon.es'
self.host = 'www.amazon.es'
elif site_name == 'it':
self.site_url = 'https://www.amazon.it'
self.host = 'www.amazon.it'
def pg_connect(self):
engine_pg15 = get_remote_engine(
site_name='us', # -> database "selection"
db_type='postgresql_15_outer',  # -> server-side alias
)
return engine_pg15
def redis_db(self):
redis14_ = BaseUtils().redis_db()
headers_json = redis14_.get('amaozn_login_dict')
self.cookeis_dict = json.loads(headers_json)
redis14_.close()
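# get_asin_reviewer(): requests one product-reviews page (ASIN currently hard-coded) with the
# cookies cached in Redis (amaozn_login_dict) and XPath-parses each review block into an items
# dict (printed only, not stored anywhere yet).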
def get_asin_reviewer(self, asin='0740303090'):
headers = {
'authority': urlparse(self.site_url).hostname,
'host': self.host,
"x-requested-with": "XMLHttpRequest",
"accept": "text/html,*/*",
"content-type": "application/x-www-form-urlencoded;charset=UTF-8",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
"origin": self.site_url,
"accept-language": "zh-CN,zh;q=0.9",
}
url = f'{self.site_url}/product-reviews/B00CX547FE/ref=cm_cr_getr_d_paging_btm_next_1?sortBy=recent&pageNumber=1'
response = requests.get(url, headers=headers, cookies=self.cookeis_dict)
resp = etree.HTML(response.text)
with open(r'C:\Users\ASUS\Desktop\text.html', 'w', encoding='utf-8')as f:
f.write(response.text)
div_list = resp.xpath("//div[@id='cm_cr-review_list']/ul/li")
for div in div_list:
user_href_list = div.xpath(".//div[@class='a-row a-spacing-mini']/a/@href")
user_href = self.site_url + user_href_list[0] if user_href_list else None
user_img_list = div.xpath(".//div[@class='a-row a-spacing-mini']//img/@data-src")
user_img = self.site_url + user_img_list[0] if user_img_list else None
user_name_list = div.xpath(".//div[@class='a-row a-spacing-mini']//span[@class='a-profile-name']/text()")
user_name = user_name_list[0] if user_name_list else None
review_star_rating_list = div.xpath(".//div[@class='a-row']//i[@data-hook='review-star-rating']//text()")
review_star_rating = review_star_rating_list[0] if review_star_rating_list else None
review_title_list = div.xpath(".//div[@class='a-row']//a/span/text()")
review_title = review_title_list[0] if review_title_list else None
review_date_list = div.xpath(".//span[@data-hook='review-date']/text()")
review_date = review_date_list[0] if review_date_list else None
review_href_list = div.xpath(".//div[@class='a-row']//a/@href")
review_href = self.site_url + review_href_list[0] if review_href_list else None
var_data_list = div.xpath(".//div[@class='a-row a-spacing-mini review-data review-format-strip']//a/text()")
var_data = '||'.join(var_data_list) if var_data_list else None
var_asin_list = div.xpath(".//div[@class='a-row a-spacing-mini review-data review-format-strip']//a/@href")
if var_asin_list:
varasin_list = re.findall(r'reviews/(.*)/ref', var_asin_list[0])
var_asin = varasin_list[0] if varasin_list else None
else:
var_asin = None
vp_list = div.xpath(".//a[contains(@aria-label,'Verified Purchase')]//span/text()")
verified_purchase = vp_list[0] if vp_list else None
review_data_list = div.xpath(
".//div[@class='a-row a-spacing-small review-data']/span[@data-hook='review-body']//text()")
review_data_list = ''.join(review_data_list).strip()
review_data = review_data_list if review_data_list else None
review_img_list = div.xpath(".//img[@data-hook='review-image-tile']/@src")
print('review_img_list::', review_img_list)
if review_img_list:
review_img = ','.join(review_img_list).strip()
else:
review_img = None
items = {'user_name': user_name, 'user_img': user_img, "user_href": user_href,
'review_star_rating': review_star_rating,
'review_title': review_title, "review_date": review_date, "review_href": review_href,
"var_data": var_data,
'var_asin': var_asin, "is_vp": verified_purchase, "review_data": review_data,
"review_data_img": review_img}
print(items)
def run(self):
self.redis_db()
self.get_asin_reviewer()
if __name__ == '__main__':
Amazon_reviewer().run()
# from lxml import etree
#
# with open(r'C:\Users\ASUS\Desktop\text.html','r',encoding='utf-8')as f:
# resp = f.read()
#
# html = etree.HTML(resp)
# h2_str = html.xpath('//h2[contains(@class,"a-spacing-medium")]/text()')
# print(h2_str)
# data_asin_list = html.xpath(f"//h2[contains(text(),'{h2_str[0]}')]/parent::div/parent::div//@data-asin")
# print(data_asin_list)
import curl_cffi
headers = {
"Referer": "https://depatisnet.dpma.de/DepatisNet/depatisnet",
"Origin": "https://depatisnet.dpma.de",
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
}
resp = curl_cffi.get('https://depatisnet.dpma.de/DepatisNet/depatisnet?window=1&space=main&content=treffer&action=textpdf&docid=CN000119456546B',headers=headers)
print(resp.text)
DEFAULT_USER = "fangxingjun"
DEFAULT_USER_TOKEN = "fxj_token_123"
import time
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
from sqlalchemy import text
from sqlalchemy.orm import sessionmaker
import platform
import traceback
import json
import uuid
from sqlalchemy.exc import SQLAlchemyError
class ConnectSpider:
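    """DB helper shared by the stock-image spiders: PostgreSQL engines (public and intranet hosts) plus MySQL-protocol engines for the Aliyun RDS and 192.168.10.151 instances."""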
def __init__(self):
self.pg_port = 54328
self.pg_db = "selection"
self.pg_user = "postgres"
self.pg_pwd = "F9kL2sXe81rZq"
self.pg_host = "61.145.136.61"
self.db_engine = create_engine(f"postgresql://{self.pg_user}:{self.pg_pwd}@{self.pg_host}:{self.pg_port}/{self.pg_db}")
        # intranet PG host used by db_engine192
        pg_host = "192.168.10.223"
        self.db_engine192 = create_engine(
            f"postgresql://{self.pg_user}:{self.pg_pwd}@{pg_host}:{self.pg_port}/{self.pg_db}")
# mysql
self.sql_port = 3306
self.sql_db = "selection"
self.sql_user = "adv_yswg"
self.sql_pwd = "Gd1pGJog1ysLMLBdML8w81"
self.sql_host = "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com"
self.charset = 'utf8mb4'
        # Build the database connection string
connection_string_mysql = f"mysql+pymysql://{self.sql_user}:{self.sql_pwd}@{self.sql_host}:{self.sql_port}/{self.sql_db}?charset={self.charset}"
self.mysql_engine = create_engine(connection_string_mysql)
# mysql
sql_port = 19030
sql_db = "test"
sql_user = "fangxingjun"
sql_pwd = "fangxingjun12345"
sql_host = "192.168.10.151"
wai_host = "113.100.143.162"
        # Build the database connection string
connection_string_mysql = f"mysql+pymysql://{sql_user}:{sql_pwd}@{wai_host}:{sql_port}/{sql_db}"
self.mysql_test = create_engine(connection_string_mysql)
# mysql
sql_port = 19030
sql_db = "selection"
sql_user = "fangxingjun"
sql_pwd = "fangxingjun12345"
sql_host = "192.168.10.151"
wai_host = "113.100.143.162"
        # Build the database connection string
connection_string_mysql = f"mysql+pymysql://{sql_user}:{sql_pwd}@{sql_host}:{sql_port}/{sql_db}"
self.mysql_selection = create_engine(connection_string_mysql)
    def con(self):
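        """Return a selection PG engine, picking the intranet host on Windows and the public host otherwise; keeps retrying until the engine is created."""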
# pg_port = 5433
# pg_db = "selection"
# pg_user = "postgres"
# pg_pwd = "fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS"
# pg_host = "61.145.136.61"
# db_engine = create_engine(f"postgresql://{pg_user}:{pg_pwd}@{pg_host}:{pg_port}/{pg_db}")
# return db_engine
if platform.system().lower() == 'windows':
PG_CONN_DICT = {
"pg_port": 5433,
"pg_db": "selection",
"pg_user": "postgres",
"pg_pwd": "fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS",
"pg_host": "192.168.10.223",
}
else:
PG_CONN_DICT = {
"pg_port": 5433,
"pg_db": "selection",
"pg_user": "postgres",
"pg_pwd": "fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS",
"pg_host": "61.145.136.61",
}
nums = 0
while True:
try:
db = 'selection'
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT['pg_user']}:{PG_CONN_DICT['pg_pwd']}@{PG_CONN_DICT['pg_host']}:{PG_CONN_DICT['pg_port']}/{db}",
encoding='utf-8', connect_args={"connect_timeout": 10}, poolclass=NullPool)
return engine_pg
except Exception as e:
print("pg_connect 14 t11111111111111111111111:", e, f"\n{traceback.format_exc()}")
continue
def mysql(self):
sql_port = 3306
sql_db = "us_spider"
sql_user = "adv_yswg"
sql_pwd = "Gd1pGJog1ysLMLBdML8w81"
sql_host = "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com"
charset = 'utf8mb4'
        # Build the database connection string
connection_string_mysql = f"mysql+pymysql://{sql_user}:{sql_pwd}@{sql_host}:{sql_port}/{sql_db}?charset={charset}"
mysql_engine = create_engine(connection_string_mysql)
return mysql_engine
def mysql_us_spider(self):
sql_port = 19030
# sql_db = "us_spider"
sql_db = "test"
sql_user = "fangxingjun"
sql_pwd = "fangxingjun12345"
sql_host = "192.168.10.151"
        # Build the database connection string
connection_string_mysql = f"mysql+pymysql://{sql_user}:{sql_pwd}@{sql_host}:{sql_port}/{sql_db}"
mysql_us_spider_engine = create_engine(connection_string_mysql)
return mysql_us_spider_engine
def save_stock_img_id(self, items):
"""批量保存数据到数据库。"""
table_name = "stock_image_id_wj"
        # Define the DataFrame columns
columns = ['account_id', 'image_id', 'state', 'created_at']
df = pd.DataFrame(items, columns=columns)
        # Manage the DB connection with a with statement
with self.db_engine192.connect() as connection:
df.to_sql(
name=table_name,
con=connection,
if_exists='append',
index=False
)
# print("保存成功!")
# def get_account_id(self,item_id):
# with self.db_engine192.connect() as connection:
# table_name = "stock_image_summary_wj"
    #         # Adjust the query to match your table and column names
# # query = f"""-- SELECT account_id,id FROM {table_name} where id = 27 ;"""
# query = text(f"SELECT account_id,id FROM {table_name} where id = :item_id")
# result = connection.execute(query, {"item_id": item_id})
# print(query)
    #         # Run the query and read the result into a DataFrame
# df_status = pd.read_sql(query, con=connection)
# accounts = df_status.account_id.iloc[0]
    #         print(f'account: {accounts}')
# # success_id = tuple(df_status.id)
# # sql_update = text(f"UPDATE {table_name} SET state = 2 WHERE id IN :success_id")
# # result = connection.execute(sql_update, {"success_id": success_id})
    #         # print('Successfully updated to 2')
# connection.close()
# return accounts
def get_account_id(self, item_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_summary_wj"
query = text(f"SELECT account_id, id FROM {table_name} WHERE id = :item_id")
result = connection.execute(query, {"item_id": item_id})
df_status = pd.DataFrame(result.fetchall())
df_status.columns = result.keys()
try:
accounts = df_status.account_id.iloc[0]
except IndexError:
accounts = None # 或者处理不存在的情况
return accounts
def update_id_to_3(self,account_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_summary_wj"
success_id = tuple(account_id)
sql_update = text(f"UPDATE {table_name} SET state = 3 WHERE account_id IN :success_id")
result = connection.execute(sql_update, {"success_id": success_id})
            print('Successfully updated state to 3')
connection.close()
def save_account(self,items):
"""批量保存数据到数据库。"""
table_name = "stock_image_summary_wj"
        # Define the DataFrame columns
columns = ['account_id', 'account_secret', 'year_month', 'spider_date','state','created_time']
df = pd.DataFrame(items, columns=columns)
        # Manage the DB connection with a with statement
with self.db_engine192.connect() as connection:
df.to_sql(
name=table_name,
con=connection,
if_exists='append',
index=False
)
print("保存成功!")
def delet_datails(self, image_id_list):
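        """Delete detail rows of the zhouweiqing account whose image_id is no longer present in image_id_list."""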
with self.db_engine192.connect() as connection:
table_name = "stock_image_detail_wj"
            # Build the SQL statement with SQLAlchemy's text()
query = text(f"SELECT image_id FROM {table_name} WHERE account_id = 'zhouweiqing@yswg.com.cn';")
            # Run the query via connection.execute()
result = connection.execute(query).fetchall()
            # Collect the image_id values currently in the table
db_image_ids = [row[0] for row in result]
            # Find the image_ids that are not in image_id_list
non_existent_image_ids = set(db_image_ids) - set(image_id_list)
            # Delete the records that are not in image_id_list
for image_id in non_existent_image_ids:
delete_query = text(
f"DELETE FROM {table_name} WHERE account_id = 'zhouweiqing@yswg.com.cn' AND image_id = '{image_id}';")
connection.execute(delete_query)
            # Commit the changes
connection.commit()
def get_datails_image_id(self, account_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_detail_wj"
sql_query = text(f"SELECT image_id FROM {table_name} WHERE account_id = :account_id and created_time < '2024-09-02 00:00:00'")
result = connection.execute(sql_query, {"account_id": account_id})
image_id_list = [int(row[0]) for row in result.fetchall()]
            # Commit the changes
# connection.commit()
return image_id_list
# 1111111111111
def save_stock_detail(self, item):
"""批量保存数据到数据库。"""
table_name = "stock_image_detail_wj"
        # Wrap the single item in a list
items_list = [item]
        # Define the DataFrame columns
columns = ['account_id', 'image_id', 'image_size_info', 'image_title', 'image_type', 'image_url', 'created_time']
df = pd.DataFrame(items_list, columns=columns)
with self.db_engine192.connect() as connection:
df.to_sql(
name=table_name,
con=connection,
if_exists='append',
index=False
)
# print("保存成功!")
def get_stock_image_id(self):
with self.db_engine192.connect() as connection:
table_name = "stock_image_id_wj"
            # Adjust the query to match your table and column names
query = f"""SELECT image_id,id FROM {table_name} where account_id = 'daiting@yswg.com.cn' and state = 1;"""
print(query)
            # Run the query and read the result into a DataFrame
df_status = pd.read_sql(query, con=connection)
df_status['id'] = df_status['id'].astype(str)
            # Now the two columns can be concatenated safely
image_id_id_pairs = list(df_status['image_id'] + '||-||' + df_status['id'])
success_id = tuple(df_status.id)
connection.close()
return image_id_id_pairs
#111111111111111111111
def get_stock_images_id(self,account_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_id_wj"
            # Adjust the query to match your table and column names
query = text(f"""SELECT image_id,id FROM {table_name} where account_id = :account_id and state = 1""")
print(query)
result = connection.execute(query, {'account_id': account_id})
df_status = pd.DataFrame(result.fetchall())
df_status.columns = result.keys()
df_status['id'] = df_status['id'].astype(str)
image_id_id_pairs = list(df_status['image_id'] + '||-||' + df_status['id'])
            print(f'account {account_id}: {len(image_id_id_pairs)} images to crawl')
connection.close()
return image_id_id_pairs
def get_kong_images_id(self, account_id):
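        """Return 'image_id||image_type||image_url' strings for rows of this account whose image_size_info is still the empty JSON object."""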
with self.db_engine192.connect() as connection:
table_name = "stock_image_detail_wj"
            # Use a subquery to exclude image_ids that already have image_size_info
query = text(
f"""SELECT image_id, image_type, image_url
FROM {table_name}
WHERE account_id = :account_id
AND image_id NOT IN (
SELECT image_id
FROM {table_name}
WHERE account_id = :account_id
AND image_size_info != '{{}}'
)
AND image_size_info = '{{}}'"""
)
print(query)
result = connection.execute(query, {'account_id': account_id})
df_status = pd.DataFrame(result.fetchall(), columns=result.keys())
if df_status.empty:
return []
data_list = list(df_status['image_id'] + '||' + df_status['image_type'] + '||' + df_status['image_url'])
connection.close()
return data_list
def get_stock_image_detail(self, account_id):
with self.mysql_selection.connect() as connection:
table_name = "stock_image_detail"
query = text(
f"""SELECT account_id, image_id, image_size_info, image_title, image_type, image_url, created_time FROM {table_name} WHERE account_id = :account_id""")
print(query)
result = connection.execute(query, {'account_id': account_id})
df_status = pd.DataFrame(result.fetchall())
df_status.columns = result.keys()
            # Convert the Timestamp column to string format
df_status['created_time'] = df_status['created_time'].dt.strftime('%Y-%m-%d %H:%M:%S')
            # Concatenate the columns into one string per row
detail_datas = list(
df_status['account_id'] + '||-||' +
df_status['image_id'] + '||-||' +
df_status['image_size_info'] + '||-||' +
df_status['image_title'] + '||-||' +
df_status['image_type'] + '||-||' +
df_status['image_url'] + '||-||' +
df_status['created_time']
)
            print(f'account {account_id}: {len(detail_datas)} rows in total')
return detail_datas
def save_stock_detail_move(self, data_list):
table_name = "stock_image_detail_wj"
        # Define the DataFrame columns
columns = ['account_id', 'image_id', 'image_size_info', 'image_title', 'image_type', 'image_url',
'created_time']
df = pd.DataFrame(data_list, columns=columns)
with self.db_engine192.connect() as connection:
df.to_sql(
name=table_name,
con=connection,
if_exists='append',
index=False
)
print("保存成功!")
# 11111111111
def update_image_id_to_3(self, item_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_id_wj"
sql_update = text(f"UPDATE {table_name} SET state = 3 WHERE id = :item_id")
result = connection.execute(sql_update, {"item_id": item_id})
connection.close()
# 11111111111
def update_image_id_to_4(self, item_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_id_wj"
sql_update = text(f"UPDATE {table_name} SET state = 4 WHERE id = :item_id")
result = connection.execute(sql_update, {"item_id": item_id})
connection.close()
def save_stock_cookie(self,item):
table_name = "stock_cookie_wj"
        # Wrap the single item in a list
items_list = [item]
        # Define the DataFrame columns
columns = ['account_id', 'cookie', 'state', 'created_at']
df = pd.DataFrame(items_list, columns=columns)
with self.db_engine192.connect() as connection:
df.to_sql(
name=table_name,
con=connection,
if_exists='append',
index=False
)
print("保存成功!")
# def get_stock_cookie(self, account):
# with self.db_engine192.connect() as connection:
# table_name = "stock_cookie_wj"
    #         # Use a parameterized query
# query = text(f"""SELECT cookie FROM {table_name} WHERE account_id = :account_id AND state = :state;""")
# result = connection.execute(query, {'account_id': account, 'state': 1})
# df_status = pd.DataFrame(result.fetchall())
# df_status.columns = result.keys()
# cookie_list = df_status['cookie'].tolist() if 'cookie' in df_status.columns else []
# return cookie_list
def get_stock_cookie(self, account):
with self.db_engine192.connect() as connection:
table_name = "stock_cookie_wj"
            # Use a parameterized query
query = text(f"""SELECT cookie FROM {table_name} WHERE account_id = :account_id AND state = :state ; """)
result = connection.execute(query, {'account_id': account, 'state': 1})
df_status = pd.DataFrame(result.fetchall())
df_status.columns = result.keys()
cookie_list = df_status['cookie'].tolist() if 'cookie' in df_status.columns else []
return cookie_list
def get_cookie_account(self,item_id):
with self.db_engine192.connect() as connection:
table_name = "stock_image_summary_wj"
            # Adjust the query to match your table and column names
query = text(f"""SELECT account_id,account_secret FROM {table_name} where id = :item_id ;""")
result = connection.execute(query, {'item_id': item_id})
# print(query)
df_status = pd.DataFrame(result.fetchall())
df_status.columns = result.keys()
account_id = df_status.account_id.iloc[0]
account_secret = df_status.account_secret.iloc[0]
account_list = [account_id, account_secret]
# print(account_list)
# print(111111111111)
connection.close()
return account_list
def get_stock_test_id(self,username):
with self.db_engine192.connect() as connection:
table_name = "stock_image_id_wj"
            # Adjust the query to match your table and column names
query = text(f"""SELECT image_id FROM {table_name} WHERE account_id = :username LIMIT 1;""")
result = connection.execute(query, {'username': username})
df_status = pd.DataFrame(result.fetchall())
df_status.columns = result.keys()
image_id = df_status.image_id.iloc[0]
connection.close()
return image_id
def upload_data(self, account_id, image_id, upload_time, err_msg):
with self.db_engine192.connect() as connection:
table_name = "stock_image_detail_wj"
            # Force image_id to a string
image_id = str(image_id)
err_msg_json = json.dumps(err_msg)
sql_update = text(
f"UPDATE {table_name} SET upload_time = :upload_time, err_msg = :err_msg WHERE account_id = :account_id AND image_id = :image_id AND created_time < '2024-09-02 00:00:00'")
result = connection.execute(sql_update, {
"upload_time": upload_time,
"err_msg": err_msg_json,
"account_id": account_id,
"image_id": image_id
})
def upload_success_data(self, account_id, image_id, upload_time):
with self.db_engine192.connect() as connection:
table_name = "stock_image_detail_wj"
            # Force image_id to a string
image_id = str(image_id)
sql_update = text(
f"UPDATE {table_name} SET upload_time = :upload_time WHERE account_id = :account_id AND image_id = :image_id AND created_time < '2024-09-02 00:00:00'")
result = connection.execute(sql_update, {
"upload_time": upload_time,
"account_id": account_id,
"image_id": image_id
})
def get_all_image_id(self):
with self.db_engine192.connect() as connection:
table_name = "stock_image_detail_wj"
sql_query = f"SELECT image_id FROM {table_name} "
df_status = pd.read_sql(sql_query, con=connection)
image_id = list(df_status['image_id'].astype(str))
connection.close()
return image_id
import json
import re
from datetime import datetime
import os
os.environ['NO_PROXY'] = 'stackoverflow.com'
import logging
logging.captureWarnings(True)
from all_connect import ConnectSpider
Con = ConnectSpider()
import random
import requests
import time
from to_upload_id import aa_496775268
class Stock_Detail():
def __init__(self):
self.headers = {
'accept': 'application/json',
'accept-language': 'zh-CN,zh;q=0.9',
# 'cookie': 'stck_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; sstk_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; _gcl_au=1.1.874517867.1723276566; __ssid=c064478ee3926370e6737c3582c60d0; _ga=GA1.1.675721154.1723276566; did=02a1c297-357e-400b-87f5-41457a394697; accts_customer=903859535; accts_customer_sso1=331378495-undefined; next.sid=s%3Agy3BgikznI1m6vIui-5gGcFYcjWzDFIK.OXZW%2FtHJ8HVG%2Fx9eXhfAilGM0zicWdtakg1aacGy8OM; _4c_=%7B%22_4c_mc_%22%3A%222a68f713-f2b5-49b7-8e45-07549f8a66d8%22%7D; FPID=FPID2.2.eJMnQEWeDZwuxnkWmESNukKGwRAIKHd9t3eewKvzeQQ%3D.1723276566; visitor_id=73458769441; htjs_user_id=331378495; htjs_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%227c4b5046-d8eb-4ee5-8bac-893a170fa766%22%7D; slireg=https://scout.us1.salesloft.com; extole_access_token=BV1L1U32MH7O4IOEUHOH6F88AJ; sliguid=2259d959-8b66-4544-8a26-ee6f9e0d872c; slirequested=true; n_v=2d4faa42e64; locale=zh; NEXT_LOCALE=zh; hl=zh; sstk_session_start=2024-08-12T00%3A50%3A01.185Z; stck_session_id=eb218863-1d7b-453e-b68f-826b54a4f567; sstk_session_id=eb218863-1d7b-453e-b68f-826b54a4f567; downlink=slow; visit_id=81252382944; gtm_monit_roll=99; OptanonConsent=consentId=5cff2293-6adc-4e35-960b-a301a65fc215&datestamp=Mon+Aug+12+2024+08%3A50%3A04+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202403.2.0&interactionCount=0&isAnonUser=1&isGpcEnabled=0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0005%3A1%2CC0004%3A1%2CC0007%3A1&AwaitingReconsent=false; OptanonCachedGroups=,C0001,C0003,C0002,C0005,C0004,C0007,; FPLC=xYngCOYQalQCTnXJSrL5MkPtTggGURkuZLMfzokC8ITFBHfwhBl4NrpZuNrp44Zw2eHfL22tL9Q1ejmZot6AEMzltj2Hvtapbeu8iPj20h1y4yHKQZLIWATPUg4j1A%3D%3D; datadome=1Q5Il~6ANKiyum9UFnRoJyruo7C1U0TXOsxV_tOU4rYdEk5a6dO3mjUEFLs5FgsAbSj2xGvoSUp3jYNLxrjs~XHUOwCU9L~8nimhKtYEznVW_AeNZ23e2zxJLf2GHikU; Authorization=1%2FeyJjbGllbnRfaWQiOiJhMGI3Ni1hN2Y1ZS1mZWRlYy1iYmViZS1mYTY1Yi04NTcxOSIsInJlYWxtIjoiY3VzdG9tZXIiLCJzY29wZSI6InVzZXIudmlldyB1c2VyLmVtYWlsIHVzZXIuYWRkcmVzcyB1c2VyLmVkaXQgb3JnYW5pemF0aW9uLnZpZXcgb3JnYW5pemF0aW9uLmFkZHJlc3MgY29sbGVjdGlvbnMudmlldyBjb2xsZWN0aW9ucy5lZGl0IGxpY2Vuc2VzLnZpZXcgbGljZW5zZXMuY3JlYXRlIG1lZGlhLnVwbG9hZCBtZWRpYS5zdWJtaXQgbWVkaWEuZWRpdCBwdXJjaGFzZXMudmlldyBwdXJjaGFzZXMuY3JlYXRlIiwidXR2IjoicUhpNCIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJ1c2VybmFtZSI6IjkwMzg1OTUzNSIsInVzZXJfaWQiOjMzMTM3ODQ5NSwib3JnYW5pemF0aW9uX2lkIjpudWxsLCJwYXJlbnRfb3JnYW5pemF0aW9uX2lkcyI6W10sImN1c3RvbWVyX2lkIjozMzEzNzg0OTUsImV4cCI6MTcyMzQyNzMyMX0.QcAGSAHi_1fLXJQReHBAtupw2qzn6DXiX1nsLVOoxg6OEe277kv3zt85EsJHen0_DmoHdMS35s4Fh418lXWR-w; _uetsid=d11e0420584411efb988e71c7217a51d|17zp06m|2|fo9|0|1685; _uetvid=58dc29704a6511ef8f5e5754465f397d|1dj9d84|1723425007889|3|1|bat.bing.com/p/insights/c/i; _ga_H22ZZQTXLV=GS1.1.1723423806.2.1.1723425008.0.0.0; _ga_SSGTMSSTK=GS1.1.1723423806.2.1.1723425008.0.0.1890544395; _ga_5JRYE4Y8J9=GS1.1.1723423806.2.1.1723425008.59.0.0',
'if-modified-since': 'Sat, 10 Aug 2024 08:09:24 GMT',
'if-none-match': 'W/"17t8cdyfo1s1un"',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiMTU5ZDAyYjQ0YTdjN2VlZiIsInRyIjoiNTEzYjdiODMwNGFkODE4N2Y5YzMwMzg0OTc4YWYwOWUiLCJ0aSI6MTcyMzQyNTAxNzYyOH19',
'priority': 'u=1, i',
'referer': 'https://www.shutterstock.com/zh/catalog/licenses',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'traceparent': '00-513b7b8304ad8187f9c30384978af09e-159d02b44a7c7eef-01',
'tracestate': '967232@nr=0-1-967232-1588632792-159d02b44a7c7eef----1723425017628',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'x-end-app-name': 'next-web',
'x-end-app-version': '2d4faa42e64',
'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
'x-request-id': 'db700cde-0361-46f8-8ff3-d7234616ab34',
}
def random_ua(self):
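        """Rotate a randomly generated Chrome-style user-agent into self.headers."""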
first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)
os_type = [
'(Windows NT 6.1; WOW64)',
'(Windows NT 10.0; WOW64)',
'(X11; Linux x86_64)',
'(Macintosh; Intel Mac OS X 10_12_6)'
]
chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
ua = ' '.join(['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
'(KHTML, like Gecko)', chrome_version, 'Safari/537.36']
)
self.headers['user-agent'] = ua
def transmission_api(self, data):
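        """POST one image-detail record to the internal saveImageDetail API, retrying up to max_retries times."""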
# url = 'http://192.168.2.97:6661/microservice-visual/visual/fileSystem/saveImageDetail?token=dacce869-0471-4ec7-ac50-3b3b1ec22c87'
url = 'http://wx.yswg.com.cn:8000/microservice-visual/visual/fileSystem/saveImageDetail?token=dacce869-0471-4ec7-ac50-3b3b1ec22c87'
data_json = json.dumps(data)
max_retries = 3
retries = 0
while retries <= max_retries:
try:
response = requests.post(url, data=data_json)
if response.status_code == 200:
return response.json()
else:
                    print(f'Request failed, status code: {response.status_code}, retry ({retries}/{max_retries})')
retries += 1
except requests.exceptions.RequestException as e:
                print(f'Request exception: {e}, retry ({retries}/{max_retries})')
retries += 1
        raise Exception(f'Request failed after reaching the maximum of {max_retries} retries')
def get_pic(self,account_id, image_id, item_id, cookie_list,retry=0):
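        # For one image id: look up its licence record, fall back to the images endpoint if that fails,
        # request a fresh download URL (jpg first, png as fallback) and push the detail to the internal API.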
while retry <3:
try:
start_time = datetime.now().strftime("%m-%d %H:%M:%S")
start_times = datetime.now()
cookie = random.choice(cookie_list)
self.random_ua()
item = {}
transmission_data = {}
                # Two different request shapes are possible here, so check which one applies
headers = {
'accept': 'application/json',
'accept-language': 'zh-CN,zh;q=0.9',
'if-modified-since': 'Sat, 10 Aug 2024 08:09:24 GMT',
'if-none-match': 'W/"17t8cdyfo1s1un"',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiMTU5ZDAyYjQ0YTdjN2VlZiIsInRyIjoiNTEzYjdiODMwNGFkODE4N2Y5YzMwMzg0OTc4YWYwOWUiLCJ0aSI6MTcyMzQyNTAxNzYyOH19',
'priority': 'u=1, i',
'referer': 'https://www.shutterstock.com/zh/catalog/licenses',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'traceparent': '00-513b7b8304ad8187f9c30384978af09e-159d02b44a7c7eef-01',
'tracestate': '967232@nr=0-1-967232-1588632792-159d02b44a7c7eef----1723425017628',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'x-end-app-name': 'next-web',
'x-end-app-version': '2d4faa42e64',
'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
'x-request-id': 'db700cde-0361-46f8-8ff3-d7234616ab34',
}
params = {
'fields[images]': 'alt,aspect,channels,content_tier,contributor_id,description,displays,height,image_type,is_editorial,creativeInsights,link,model_release_info,sizes,src,title,width,has_model_release,has_property_release,status',
'fields[licenses]': 'content_id,content_size,content_type,global_accounts_user_id,insert_time,license_name,media-item,metadata',
'include': 'media-item.categories',
'filter[media_type]': 'photo',
'filter[licensee_type]': 'all',
'page[number]': '1',
'page[size]': '50',
'filter[media_id]': f'{image_id}',
'sort': 'newest',
'v': '2d4faa42e64',
}
response = requests.get('https://www.shutterstock.com/napi/user/licenses', params=params, cookies=cookie,headers=headers, timeout=600)
if response.status_code == 200:
                    # Extract the title and size info
try:
data = json.loads(response.text)
image_title = data['included'][0]['attributes']['alt']
image_size_info = {}
try:
hugeJpg = data['included'][0]['attributes']['sizes']['hugeJpg']
image_size_info['hugeJpg'] = hugeJpg
except:
hugeJpg = None
try:
mediumJpg = data['included'][0]['attributes']['sizes']['mediumJpg']
image_size_info['mediumJpg'] = mediumJpg
except:
mediumJpg = None
try:
smallJpg = data['included'][0]['attributes']['sizes']['smallJpg']
image_size_info['smallJpg'] = smallJpg
except:
smallJpg = None
image_size_info = json.dumps(image_size_info)
except:
                        # If the first request fails to return a title, try the second endpoint
params = {
'recordActivity': 'false',
'fields[images]': 'description,keywords,sizes',
'include': 'contributor',
'v': '76c84077d03',
}
response = requests.get(f'https://www.shutterstock.com/napi/images/{image_id}', params=params, cookies=cookie,headers=headers)
if response.status_code == 200:
                            # Extract the title and size info
try:
data = json.loads(response.text)
image_title = data['data']['attributes']['description']
image_size_info = {}
try:
largeJpg = data['data']['attributes']['sizes']['largePng']
image_size_info['largeJpg'] = largeJpg
except:
largeJpg = None
try:
largePsd = data['data']['attributes']['sizes']['largePsd']
image_size_info['largePsd'] = largePsd
except:
largePsd = None
image_size_info = json.dumps(image_size_info)
except:
image_size_info = None
image_title = None
                    # Fetch the image download URL
if image_title:
try:
if '@' in account_id:
file_name = account_id.split('@')[0]
else:
file_name = account_id
save_folder = f'/home/wangjing/picture_material/stock_summery/all_pic/{file_name}'
# save_folder= f'D:\公司备用文件\\all_images\\{file_name}'
pic_name = f'{str(image_id)}+{image_title}'.replace(':', '_').replace('\t', '').replace('\r', '').replace('|','').replace( '/', '').replace('"', "'").replace(' ', '_').replace(',', '_').replace('.', '_').replace('\n', '_')
pic_name = re.sub(r'[\\/*?:"<>|]', '_', pic_name)[:160] + '.jpg'
json_data = {
'required_cookies': '',
'content': [
{
'content_id': f'{image_id}',
'content_type': 'photo',
'content_size': 'huge',
'content_format': 'jpg',
'license_name': 'standard',
'show_modal': True,
},
],
}
response = requests.post('https://www.shutterstock.com/napi/licensees/current/redownload',cookies=cookie, headers=self.headers,json=json_data,timeout =600)
image_url = json.loads(response.text)['meta']['licensedContent'][0]['downloadUrl']
image_type = 'jpg'
res = requests.get(image_url,timeout=600)
                            # Build the full file path
file_path = os.path.join(save_folder, pic_name)
# with open(file_path, "wb") as f:
# f.write(res.content)
item['account_id'] = account_id
item['image_id'] = image_id
item['image_size_info'] = image_size_info
item['image_title'] = image_title
item['image_type'] = image_type
item['image_url'] = image_url
item['created_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
transmission_data['accountId'] = account_id
transmission_data['imageId'] = image_id
transmission_data['imageSizeInfo'] = image_size_info
transmission_data['imageTitle'] = image_title
transmission_data['imageType'] = image_type
transmission_data['imageUrl'] = image_url
# Con.save_stock_detail(item)
# Con.update_image_id_to_3(item_id)
self.transmission_api(transmission_data)
now_time = datetime.now().strftime("%m-%d %H:%M:%S")
now_times = datetime.now()
                            print(f'pic_name:{pic_name[:38]}, time:{start_time} - {now_time} downloaded successfully')
time_difference = (now_times - start_times).total_seconds()
                            # If the request finished in under ~5 seconds
if time_difference < 4.8:
                                # wait a random 2-5 seconds
wait_time = random.uniform(2, 5)
time.sleep(wait_time)
break
except Exception as e:
                            print('not a jpg', e)
if 'meta' in str(e):
try:
                                    # If the jpg format fails, try png instead
pic_name = f'{str(image_id)}+{image_title}'.replace(':', '_').replace('\t', '').replace('\r', '').replace('|', '').replace('/', '').replace('"', "'").replace(' ', '_').replace(',', '_').replace('.', '_').replace('\n', '_')
pic_name = re.sub(r'[\\/*?:"<>|]', '_', pic_name)[:160] + '.png'
json_data = {
'required_cookies': '',
'content': [
{
'content_id': f'{image_id}',
'content_type': 'photo',
'content_size': 'large',
'content_format': 'png',
'include_shadows': True,
'angle': 'G03',
'license_name': 'standard',
'show_modal': True,
},
],
}
response = requests.post('https://www.shutterstock.com/napi/licensees/current/redownload',cookies=cookie,headers=self.headers,json=json_data,timeout=600)
image_url = json.loads(response.text)['meta']['licensedContent'][0]['downloadUrl']
image_type = 'png'
res = requests.get(image_url,timeout=600)
                                    # Build the full file path
file_path = os.path.join(save_folder, pic_name)
# with open(file_path, "wb") as f:
# f.write(res.content)
item['account_id'] = account_id
item['image_id'] = image_id
item['image_size_info'] = image_size_info
item['image_title'] = image_title
item['image_type'] = image_type
item['image_url'] = image_url
item['created_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
transmission_data['accountId'] = account_id
transmission_data['imageId'] = image_id
transmission_data['imageSizeInfo'] = image_size_info
transmission_data['imageTitle'] = image_title
transmission_data['imageType'] = image_type
transmission_data['imageUrl'] = image_url
# Con.save_stock_detail(item)
# Con.update_image_id_to_3(item_id)
self.transmission_api(transmission_data)
now_time = datetime.now().strftime("%m-%d %H:%M:%S")
now_times = datetime.now()
                                    print(f'pic_name:{pic_name[:38]}, time:{start_time} - {now_time} downloaded successfully')
time_difference = (now_times - start_times).total_seconds()
                                    # If the request finished in under ~5 seconds
if time_difference < 4.8:
                                        # wait a random 2-5 seconds
wait_time = random.uniform(2, 5)
time.sleep(wait_time)
break
except Exception as e:
                                    print('not a png', e)
raise
else:
raise
else:
Con.update_image_id_to_4(item_id)
elif response.status_code == 403 and retry < 3:
                    print(f'retry attempt {retry + 1}')
                    # Use a different sleep time depending on the retry count
if retry == 0:
sleep_time = random.randint(3, 6)
elif retry == 1:
sleep_time = random.randint(63, 66)
elif retry == 2:
sleep_time = random.randint(360, 366)
time.sleep(sleep_time)
                    # Drop the blocked cookie and retry with the remaining ones
                    cookie_list.remove(cookie)
                    print(f'{len(cookie_list)} cookies left')
                    return self.get_pic(account_id, image_id, item_id, cookie_list, retry + 1)
else:
raise
                print(image_id, 'status code =', response.status_code)
except Exception as e:
if 'image_title' in str(e):
Con.update_image_id_to_4(item_id)
                    print(f'{image_id} expired, state set to 4')
break
else:
print(f"网络错误 {e}, 尝试重新连接第{retry}次...")
                    # Use a different sleep time per retry
sleep_time = [random.randint(3, 6), random.randint(63, 66), random.randint(360, 366)][retry]
time.sleep(sleep_time)
return self.get_pic(account_id, image_id, item_id, cookie_list, retry + 1)
if retry >= 3:
            print('Retries exhausted')
raise
def run(self):
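        # Walk the summary rows: fetch each account's pending image ids and cookies, then download the details one by one.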
for item_id in range(1, 2):
account_id = Con.get_account_id(item_id)
            # Fetch the image ids
image_id_id_pairs = Con.get_stock_images_id(account_id)
counts_start = 0
counts_last = len(image_id_id_pairs)
cookie_list = Con.get_stock_cookie(account_id)
for count in range(counts_start, counts_last):
image_id = image_id_id_pairs[count].split('||-||')[0]
item_id = image_id_id_pairs[count].split('||-||')[1]
                print(f'processing {account_id}', image_id, item_id, count)
self.get_pic(account_id, image_id, item_id, cookie_list)
count += 1
# break
if count == len(image_id_id_pairs):
                    print(f'{account_id}: all images downloaded')
break
if __name__ == '__main__':
stock_detail = Stock_Detail()
stock_detail.run()
from pprint import pprint
import logging
logging.captureWarnings(True)
import requests
import re
import json
import os
os.environ['NO_PROXY'] = 'stackoverflow.com'
from curl_cffi import requests
from scrapy.selector import Selector
from datetime import datetime
from all_connect import ConnectSpider
Con = ConnectSpider()
import time
import random
class GetCookie(object):
def __init__(self):
pass
def get_cookie(self,username,password,test_id):
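        # Log in at accounts.shutterstock.com, follow the OAuth / iframe-landing redirect chain while merging the
        # cookies from every hop, then verify the merged cookie set with a test redownload request for test_id.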
# url = "https://accounts.shutterstock.com/oauth/authorize"
# response = requests.get(url, headers=headers, allow_redirects=False)
# # print(response)
# cookies = dict(response.cookies)
# # pprint(cookies)
headers = {
'accept': 'application/json',
'accept-language': 'zh-CN,zh;q=0.9',
'content-type': 'application/json',
# 'cookie': 'stck_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; sstk_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; _gcl_au=1.1.612940543.1725355186; __ssid=ba410212754b3a62d2687a791f52c1a; _ga=GA1.1.200858045.1725355186; did=9852f99c-ae8d-41a7-9263-cb68b8d6164d; accts_customer=chenxiaowei70; accts_customer_sso1=401642949-undefined; next.sid=s%3ARXkBpmFV18H84ZTlhxC5HA1MCXcGbWEz.x4av90eE2p0gIegIgxmNo9%2BholArtlFKZTXEdm3OQvc; _4c_=%7B%22_4c_mc_%22%3A%22ff9520d4-fc5e-4b9c-892e-2c0bf39ef51a%22%7D; FPID=FPID2.2.5SpZVlHvrtXYz0M8C0t7GGlMlT9hUzgKcD4FRT%2F0rAM%3D.1725355186; visitor_id=73568925800; htjs_user_id=401642949; htjs_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; sliguid=5f80ac9c-7468-44a5-a464-cb1ceb0439c7; slirequested=true; extole_access_token=Q1U7HGT00HC3CMFJQ2PAD0KUDQ; _ga_5JRYE4Y8J9=GS1.1.1726103781.4.0.1726103781.60.0.0; _ga_H22ZZQTXLV=GS1.1.1726103781.4.0.1726103781.0.0.0; locale=zh; NEXT_LOCALE=zh; hl=zh; downlink=slow; n_v=RCV2DuhCEsefvAEwPNczi; sstk_session_start=2024-09-14T01%3A19%3A23.838Z; stck_session_id=60b0982a-606d-4ca2-a683-d4631097d614; sstk_session_id=60b0982a-606d-4ca2-a683-d4631097d614; visit_id=81448935507; fullstory_enabled=false; OptanonCachedGroups=,C0001,C0003,C0002,C0005,C0004,; FPLC=MhJwfs4em%2FvHDCW22rk0s6n1fC%2BjDF32oKEW8ev5%2BwH3z%2FbGwyI39m5EenAU5F24ndYkdWHJfoKB2YvNe7T30dlv1wxAZeuDJc%2FvYwCXt%2FSDS5%2B7wApbhWzat%2FmWdQ%3D%3D; slireg=https://scout.us1.salesloft.com; datadome=T68kQIp1WWnyUdSsbqVYAp__d5Tvbp8XDguDFBeK2BNzhMv0sffU_Vdnu0SaNYa0M4bW7gHlYK~7t6ehtmmC46QV5Ocv0aJ6YO3MK39J~UF4giZ4l84Xkgd74SCJeW7e; _ga_SSGTMSSTK=GS1.1.1726276775.3.1.1726276929.0.0.2058912936; Authorization=1%2FeyJjbGllbnRfaWQiOiJhMGI3Ni1hN2Y1ZS1mZWRlYy1iYmViZS1mYTY1Yi04NTcxOSIsInJlYWxtIjoiY3VzdG9tZXIiLCJzY29wZSI6InVzZXIudmlldyB1c2VyLmVtYWlsIHVzZXIuYWRkcmVzcyB1c2VyLmVkaXQgb3JnYW5pemF0aW9uLnZpZXcgb3JnYW5pemF0aW9uLmFkZHJlc3MgY29sbGVjdGlvbnMudmlldyBjb2xsZWN0aW9ucy5lZGl0IGxpY2Vuc2VzLnZpZXcgbGljZW5zZXMuY3JlYXRlIG1lZGlhLnVwbG9hZCBtZWRpYS5zdWJtaXQgbWVkaWEuZWRpdCBwdXJjaGFzZXMudmlldyBwdXJjaGFzZXMuY3JlYXRlIiwidXR2IjoidE9VSyIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJ1c2VybmFtZSI6ImNoZW54aWFvd2VpNzAiLCJ1c2VyX2lkIjo0MDE2NDI5NDksIm9yZ2FuaXphdGlvbl9pZCI6bnVsbCwicGFyZW50X29yZ2FuaXphdGlvbl9pZHMiOltdLCJjdXN0b21lcl9pZCI6NDAxNjQyOTQ5LCJleHAiOjE3MjYyODAzMzV9.Jm_9UkRn9HFFPt3N9SkgfswEE6zKcKCLLtANgYEV-oizIbLwVceEpQt6OhS40OCHWy8BtnYVqD0FywVO0D7MYg; _uetsid=69272480723711ef81e4916cada775b8|6fkmu5|2|fp6|0|1718; _uetvid=58dc29704a6511ef8f5e5754465f397d|1unxtqr|1726276930405|2|1|bat.bing.com/p/insights/c/z; OptanonAlertBoxClosed=2024-09-14T01:26:47.115Z; OptanonConsent=consentId=0f52ec73-6a09-4521-a9f6-656d9345d77a&datestamp=Sat+Sep+14+2024+09%3A26%3A47+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202403.2.0&interactionCount=2&isAnonUser=1&isGpcEnabled=0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0005%3A1%2CC0004%3A1&AwaitingReconsent=false&intType=3; _ga_VEW1GCS46P=GS1.1.1726276775.1.1.1726277225.37.0.0',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiOGM2ZmE0ZjI1NjBjMzNkYyIsInRyIjoiOWE1MTY5ODE0MDdjYzU4ODkxNDEwMzBhYTY0YjljYWQiLCJ0aSI6MTcyNjI3NzIzMjc2M319',
'origin': 'https://www.shutterstock.com',
'priority': 'u=1, i',
'referer': 'https://www.shutterstock.com/zh/catalog/licenses?q=2115440474',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'traceparent': '00-9a516981407cc5889141030aa64b9cad-8c6fa4f2560c33dc-01',
'tracestate': '967232@nr=0-1-967232-1588632792-8c6fa4f2560c33dc----1726277232763',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'x-end-app-name': 'next-web',
'x-end-app-version': 'RCV2DuhCEsefvAEwPNczi',
'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
'x-request-id': '796bfe4c-8f04-46d7-8d08-3fd822e90caf',
}
cookies = {"session": "s%3AI9ziS788AduDlnYyY0e6UZdoGVzIsSP5.exVKScJJaVstpM2QMk0XT1fceF9QproiKiBCL6HNY9I",}
url = "https://accounts.shutterstock.com/login"
data = {
"username": f"{username}",
"password": f"{password}",
"next": "/oauth/authorize?state=3bd2f73742d0b1cb84c3c015d034e291&redirect_uri=https%3A%2F%2Fwww.shutterstock.com%2Fnext-oauth%2Fcallback%3Flanding_page%3D%252Fiframe-landing%253Fpreset%253Dcvrt839%26realm%3Dcustomer&scope=licenses.create%20licenses.view%20organization.view%20purchases.view%20purchases.create%20user.edit%20user.email%20user.view%20user.address%20organization.address%20collections.view%20collections.edit%20media.upload%20media.submit%20media.edit&site=image&embedded=true&preset=cvrt839&client_id=a0b76-a7f5e-fedec-bbebe-fa65b-85719",
"g-recaptcha-response": "03AFcWeA7Y3BMoHRIe-_m4-BWBpyWg2kmp8pydIOMsAk4Fd7djxmnMq4D2sSsJsbAg5rnpwYhvVw0i0x_n5_pitX76Ju0EdRD9dBwpbyUN1CvMGyS_hY9yA0QdSSElvgh9GAAXvy1sIIFymSDMpoQIG0CAB_ybvRAlnJwfb6oifrb2Yi3UBW9HkMe2cPvZBdD4ApitfP7UVk3BOk7ct2UFbPx9uvtSUeliCTFr8C4l-VrjOBR11j1rRSHvD9vLJPrne7O2I1Bp3Fu1mLCpefaVq1t1MElQWXYD99M9gp2zi5GAUTTzf4wnWDbAHmgSfHws0yXxO3okGlddQug9d4XDEkaGjY2LMP_zG53e_ZCL1hQqh3u1MpSpZblJBNJGKokyb-2p_bt1Y3_VK0jAJxnbPCD4ysRTHk8EZXNRrSKFSE7OcUIIElLyS8C8DnHvq4HoNV5FzK6m7corCJcO3n8fKHcNf_aYz2bE5dKfiFXff6iN6cmRZxBV4KxKfMj9v-H9jbBHabBhPbm-1x4Iyjq7-ihsOVRANWb1g1NJwTxZaYbiDvwVXoN7IX3OKAaLwbR26OCx7-kAQss-jpHQj3EOGM3hRAGAPLicp45p3Mo80UQRzvVEztjFuOnmEfxeucFkYHnBSB-kNHzraeC11VC7C9Dva7zDnDjlXtadGyvvY4K5xZMTADm3Evdxl23riqVbYUgkYMUtBXf92vRR388AN5XYEOjYJ6j7AhUV0E8lO3SJ828sO7q6W0NcRmG20zb1KKOAnisZ7pZtnuLXwbIzkTJcXNKnOJf-ntPTTL3_RlDu7mXa9HVFVM5xIXOSKwqIEDuzeWBvj3kNtn7-NJs2VmvRWZkKNbx80rdADHKfM6FaQ6Rm50vZhtnn9BVVlu0i7kT5f5CpbXfWAb9mkpsT81lly4sfo2QrAFDq_VlGYTWA-CaMOOsS6w7HaqoAF2Kd5OcNEN_ATP7szvuoe-dgNO4RasDMbA2qJr3lwU2CUp-hkZSqpln0X8omt3Pd1dF-pTtRrRkZfvXtFXDcesVBZNDerJHWySTiSakqfkHBmC4gQMl0goknn4EDCCkaekaxnSog9zcgn8IIFwEoArzXWlXarLTXNnGOAs2SrjyjMryQvZIHsnuBsW-WcmEn62zoXJStepG2xxTHrvADL8FCjgjEWrFP8FhZrTIa8jMh6uP-NTBMaTz0W72u7d95vb-4xE5BHeM-KUcXPavjLdwfYwatz6CByOpsyARgWE9XrjFPueMXi7HJ8Wp9rMNtpueYDYADriIPTz9pmjQTjzjPznx6nYrzhZsBIcWQrHVnsRckx9gkSwq6qwdOZ-njpxG6pvoF64gSzZG2hYN8dsbWRog9r3I86ncLAzUfPCD3ww1mlQMRB6Lzj2yhu8MFyQpjlb_H-wrp9scdswH6ajV-kna7W4lmoTa12iH_nBuAncLy7I0RxxQgrUadjswJ9s4TOQF9chmX6Pat8WR79gZiGeQ-yBaGCrMM0iHs0ALZveRPmiaaKuD2kv8V7y9TmT_Ex3DIS6IeiLnXOBZ2hJBEWnEuYzg3GxXVNsDzdNEmejK8TB7Jb783rh9y2WEOp00fqdvtRl-yFJX9Q6HtejWge3ttoasR2rF4m1J0wSl8jNcd0PEyvp4IeHQrknF_cB5CWDxZkM6RJNQhQ8OiC5UDmJ3lVGMP-rcukp_ap4ZDVUxRQbtLN9pYPYnWMEPMTAEp_yMoobguxLWWbV37sIq610SSE7IXeR8eDa9iYTfDCAMNA-RK0nOVEg-yd3_A1VGBjVgApj7Z8GL09XRuvk0Av8vuz7G9B3GFyUxk6CZb-JdreekV3G_DAMhHolgxQkwECvSvIJL51pD4SBCp5IUWi3GmLHCbjFx74_Q1xs9cYchOu6V01kziNoyx2Vf4ThHJpaBY1-5K1H_MiA28IlkJPax8NwTYaY0eHs61hcqGZRx9ioG1lBKs3MprHiykvwleWj_ff4v_v7JrNVKdCSt4GMKnOcAtx19e413rtzBi50bwmFRU9btX3KGFMWw1JQ7y6skbwm8XqtslyAKm_KpbyVdTTqRwt-rBdyMmhpaKe1EzeFDwCOTrUVUEbiuCrOTa512PILKXKtx2g9Rk3k7Vea-QYg2bqtLCsQ"
}
# data = {
# "username": "danqingmei30",
# "password": "yswg2468.",
# "next": "/oauth/authorize?state=655089be46ef190d6e41a83c1f6514ef&redirect_uri=https%3A%2F%2Fwww.shutterstock.com%2Fnext-oauth%2Fcallback%3Flanding_page%3D%252Fiframe-landing%253Fpreset%253Dcvrt839%26realm%3Dcustomer&scope=licenses.create%20licenses.view%20organization.view%20purchases.view%20purchases.create%20user.edit%20user.email%20user.view%20user.address%20organization.address%20collections.view%20collections.edit%20media.upload%20media.submit%20media.edit&site=image&embedded=true&preset=cvrt839&client_id=a0b76-a7f5e-fedec-bbebe-fa65b-85719",
# "g-recaptcha-response": ""
# }
# data = {
# "username": "771783160@qq.com",
# "password": "XUEYi15999866920!",
# "next": "/oauth/authorize?state=28d2a63c10744eb7b3d9cd97a6fd295f&redirect_uri=https%3A%2F%2Fwww.shutterstock.com%2Fnext-oauth%2Fcallback%3Flanding_page%3D%252Fiframe-landing%253Fpreset%253Dcvrt839%26realm%3Dcustomer&scope=licenses.create%20licenses.view%20organization.view%20purchases.view%20purchases.create%20user.edit%20user.email%20user.view%20user.address%20organization.address%20collections.view%20collections.edit%20media.upload%20media.submit%20media.edit&site=image&embedded=true&preset=cvrt839&client_id=a0b76-a7f5e-fedec-bbebe-fa65b-85719",
# # "g-recaptcha-response": "03AFcWeA6Ql-DumMbMjl0JWU1I9Xv-p1YQ5R1gGs9lFRCpym1ofD6E0esdR3HPH6JL-ZkhgmtyFs8Ez502BVBaVSbwjp3unfcMzXC3GwTkbt4nU9BSCihmrvllR7-XtpWiOV8WnM4MtxU5tHmTBqFg-lWmWikR2uRbuikt4NMJ6YLd3b-Q39IIf1wqyhUC3USGuAUMOZCEJMnIbMDv0p0g2MM0hiuF4KYVliks1WoeXyFKGEsRXB3ZDfImf1YTXQQFuEMZ4D7NIESvnlzuRXQRcXWE2DIC6WieuWDT_mEQBDcPTQP9z9un_7Xhwtku1CegdqRzPekG839z89ei_6uj881i5eeWcfkuysUa-6H-HJm37-jW6_-lTymFf5X_tFpAo-IEoeS7IaXKBmYiDG5xhjGpZQY--kKNS6Ygj9p_HfKzS5Lz6Nah6D4OXDmZXtkZcpVKNhPwIQ3m_FaYNCKxNRAbsTF3suPGNVxl61qg-1cG-09PGJ5o2uSx9Spa4CtNRC-jhmyjfXTCp5vv3AuRSKgONstBbZyAq6FJhrxUIMjTZ9rWUXnOVhUAaJKwb8ARlqXgEZOWyin7PhQcs6TyHBACifU7Nx5hoL-ujFD9BdynbdIcD1bOZWJ0zbg0cT-8l7MYs7Ao98FrfvjpP1C_oHvksrh7m3yvRaKzO4856CqJQgqFQCZ7nx8vKBnorrl5VHp4im9tZippfUjdDdGMPa5us8z0OY7sQkn79ybkmf-gqhTUHsGmGdfWs-kX3VJUln6PwOO1X23VYlzrnKiyEs68Mhu5z2Fijgikcg6TtA0SYmUk4lzvFx14zVlkZdWQ8BQvtA_M0dNh1EOBAJfNuuBwJD_ypG9MgYSbWH6MacnAHY_K2R2Z7NYVpzW2BP5PGeeiAEJuHeLs-A-2N_Yo2mlciLxp24fiUSEnuGtfAosKwqNn-id52NOywF82DLuIJsmX5xdGepYDVY3bn_I-pur5Kh3oqo8svzDaxjDrouS_Rqg0PROVRxF91DmMPh2kSoUMXzTH3ILeaXbKMMFbAcvLl5vnyJAzxHDrncdPHZEFiIkENi25aP2B0eDTMsE4_BFyhwqGMGFXtXC4vzfpKx2qLAC7I9D2Oi2KAi5cmMFealURmtWCDGB6fnwalmm7Fhqt8wDO-8W0OIy8qJdgysxGxRTaIQ0A4D9HMJNxITm-ZbihA6RvmI6b-0mI2KDe9jRCHqUybWdxHBwvv1rMtY6VtlhEIyYeQX3Y1xqzrOn7bNszg7WPyOFULu29dxNEJeN84HdTdqpFgR36CET8_Byq3l-xSHkhItR_1FvXaW58QFoKoC2iaFpL8GPlf5_N_fdG9Y4zVob_qPgYa7kpr6j7NF7RPghOBvNUEAYnBcHzezuIjBqwEoaRrCaVFjiliZ_zDCWRdAYkNKzcSJbXiOuB3BaEQyKVFQ6KjRfhUKZMS7efK5KKhs0YPaMbB5E1MdgZDv5QhnpI_PhjNTkfChx0o3bXdoyhGGLPuxjn6DPRhxnm6wF4KqITT9t3qrzzxKUSCZ-4GrHfBRCTbaBvtLcLU4UTEZfDtm-BLD7cW-g6uyGgSDiJiqTAJEwm0zRHqApweZHPZIjchFBCcY1HkqFecklXnqZ9RaSG6wnE-7sKVGtvON5eEA4yuZf0JflFx3vmX8BrLq4SFBUTSn3IaNOSZVe9-NbAo_4DQg3YZfuYsD8CMt8xMjbDLu4AEww2RfJ-sll6pN10KLZAZYgEfu7ZcFoZBn9J8zLdgQRrUBTLIj14rB4ZtRVKG9Yu26zER49OV-UI043XU8qkr13u_Vvhe1SA4e0dhUxvMxHtpTxISzcfx1HcBW4BrN1JU-qnJWKBBGpDu9JoDsYC5s8wDHDTbMOXKRbQBzzMiTgAUJmpZWCkvcV8Rhtb_b0Pf7vsCtel-Iqm6lZbB9Owiw71BWzqFaBLH_z_FYCn2KNnAfPIhMsW1rRW1NinMSgT-sFtmDX3CHQ5yCOY61L_18DsfY_AOClx3mu23m4aKt19Mq8TcXkTi-EDWjLuYBi_ofGJpCq4Sy_BzGJ4VEbqf7zzl713yDbajntTwZqV6tcc-TEWLUS5ImxNwhrrUKqQpR-i2xf9IBr6J4N9dEa_UsIqCuLN6P7Hva0qtw"
# }
# data = {
# "username": "yuanminghui@yswg.com.cn",
# "password": "yswg2024",
# "next": "/oauth/authorize?state=b6ab9ef1819945a0b22b1dc0875be823&redirect_uri=https%3A%2F%2Fwww.shutterstock.com%2Fnext-oauth%2Fcallback%3Flanding_page%3D%252Fiframe-landing%253Fpreset%253Dcvrt839%26realm%3Dcustomer&scope=licenses.create%20licenses.view%20organization.view%20purchases.view%20purchases.create%20user.edit%20user.email%20user.view%20user.address%20organization.address%20collections.view%20collections.edit%20media.upload%20media.submit%20media.edit&site=image&embedded=true&preset=cvrt839&client_id=a0b76-a7f5e-fedec-bbebe-fa65b-85719",
# "g-recaptcha-response": ""
# }
# data = {
# "username": "496775268@qq.com",
# "password": "Chen2436326542",
# "next": "/oauth/authorize?state=8da6aa890810d3c91856e371873cfe11&redirect_uri=https%3A%2F%2Fwww.shutterstock.com%2Fnext-oauth%2Fcallback%3Flanding_page%3D%252Fiframe-landing%253Fpreset%253Dcvrt839%26realm%3Dcustomer&scope=licenses.create%20licenses.view%20organization.view%20purchases.view%20purchases.create%20user.edit%20user.email%20user.view%20user.address%20organization.address%20collections.view%20collections.edit%20media.upload%20media.submit%20media.edit&site=image&embedded=true&preset=cvrt839&client_id=a0b76-a7f5e-fedec-bbebe-fa65b-85719",
# "g-recaptcha-response": "03AFcWeA4n5lvMh7UbNUvMRRYzU63dN_1ApJHWNeQeUPMuLWCDVkWHHMha4GlA9Dj4ZG1mAwNAEcn_AeuOLmZV3AF0MYf0OcJlmDyVhGaLaAmsm7IunwyqMOX9hkFG68CT4ErCbjWm5iJ7r2S5BErp0jSHUdmtz9Nzsr6RMEnct0JBeh3jYWqrYNIYWrlKRTs1e80I_gjA5RqURA22Q5f5WXFVr374XhZrJMclI7bpMUBFjSzYJbVuMNCMlRZ-zY-aqgjUFf0A3p3cyTqlob2juP1UrrGnaPvHyj5UJA3uEWfX6MySYJs2MSSOyz8AtOdnhKBEINIQDOSoBIhKi-ErrKpY-EdWEnjkjbW6tcw87u8ol5BEbV1CCBh-oRXOCbuzr8_QSFC0uhrUKdf1lqJfq1wkYLca6O4SYMH0IFT7AiImMaXXqvu18efQ8clJZV7mjyyue8DMHWqsBpjOr2qG8_vjYTReU2FY0D3BzLMbZcuEw3-SgQiIxaLUIe0atkZkLkgwwz6ZxHUzzaSl8MI_vTxhi0VxVk26oB-6KLnED5LvvgX5SFyYkw-spwrDa977MhiQbDJRjxfGRrnl6dLDwk8Rw4fovHEWxHobaqIE_oEmFPQDBDJXgCQ8UCec1lgwsAZAFU26VJdgUheSbKpTkN4Ch33uWCVFlbg9blLusOZjV6KTrbj72Ob_F7yQu5Z05IExUYvwKkPOqv1ypFNeRkV6Mz_iN8C6HGkNSLsQAJlWAQbuV4yDU6y7TfJVG1PR5pkSWtBDn4kQ9TXZCgahfCFLPmbIcIr3eNQ0hft_Kx93N4GFsopoYnW0xmva4LzFyuAo0EMcZrdu_lOgEiiScCyW4CnoybVs3Ywwrm0pkzsb49uXYXudjnIRAE6TdmhRoAi6PXZiCYZMMvctsf6XUSoJD4VhNkuq2ihNENBnNf3s-BPktn9kOtdPVw46mhcWgcvGSjj5_LWhbyREOtlF1xn0dPezbqkLMq6HElXQlglv6oJRL0xSSHdkLlv2xyJIPYzW7ep90JTeWJD3hVwX-bEY0ulNOp2ALjV40m-uqBYmhGMqCTwwivQ0Czkp_oyuakwty2t-hXDhTeL5cQuJxlLZrO0twOZWIgmvkmFwpCPJaFXqRSmcRtOgCaGfDSQXWHsFd30vsbfNUvv8IYRUacRdUvVAmitaxNM3_mfN-iunIlSrhxklOk2eTQIYRl2lVTM_VdO-WwkLBedMF1lZd4cbsJiERq2Kz9j80LCAmxpvFcrJ_o9IF6bY3QOniqXKKBJjEeJXJ4YNLtwi6U7zKxtbRdpg5i9GYSRg9V2JFszcZT_eB2rfHOP3u6LzfkkTZD77LOrApcrmY9oCQ07g579OB2kgpTQpQukW8sxU_yO60t-yclL6Xp_4BfCQdSClCkuNQ7ZNpmh5g9HW-ztyaGqhONh4MAmSw2kOLUNVawz3Ylc63fJls7fImMz8GSiKZCOSSV34PHePf5o2tuvjtTVGlTFl-I7rRBQlxM6lt3NMxEmqUVSzKw-zQbRzHNFW85soTD8T9dIAkZXd7x9dFTlf4s4ZKMTavm0-V5VZdkfW8HRcpgIZ0sFXfwFAj6JNIJwMeCQYarRV2Ucu1Hcf3z1xJ919ro884c4GNcSjwtWmY4Z4iE08qDZ46T-NmyR0b5iPGSmjoMQdWFV1R1IyeNZD55A2CWuzwkEbPv7a3p5FJ-aiyMTo-eOmWsuC92tDt0erobuDS7FRAgyVRJuAE_0Y1wWz4oO7FWrEam90zCprirm_Cpp6o8GqbT8WnAi-ZK4PVui84dU916niVTNihGMG44_qDDpvw-2CAKLpGTiNSF3K69GnpkELDxmopSRRwxDCh4Nxs3EOlVeYHfFgqXuP8eGRrObPpk-BYsyXRUWiYk9bJqsUyb9ifFvgXQl5mfO8KfuQQmlZqIUsgg4X7g9fiFNzHlw3aWynlz-ZQFz8MZMW2OTSnwPn6XTaBFsBvJ920ttmBfyN1ZAjRrHEIgE0it0HzRsGGueFlhwmzKXpGaxzy-Xjh5mMujhY68ZznvoVvrJkCVtdTMZau31i0YNsig6WNny9dA"
# }
time.sleep(random.randint(1,3))
data = json.dumps(data, separators=(',', ':'))
response = requests.post(url, headers=headers, cookies=cookies, data=data,verify=False, allow_redirects=False)
print(111111111111)
print(response.status_code)
print(response.text)
print("\n")
cookies1 = dict(response.cookies)
# pprint(cookies1)
url = response.json()['next_url']
time.sleep(random.randint(1, 3))
headers = {
"Host": "www.shutterstock.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"referer": "https://accounts.shutterstock.com/",
"accept-language": "zh-CN,zh;q=0.9",
}
response = requests.get(url, headers=headers, cookies=cookies1,verify=False, allow_redirects=False)
print(2222222222222)
print(url)
print(response.status_code)
print(response.text)
print("\n")
# <p>Found. Redirecting to <a href="/iframe-landing?preset=cvrt839">/iframe-landing?preset=cvrt839</a></p
cookies2 = dict(response.cookies)
cookies3 = {**cookies1, **cookies2}
time.sleep(random.randint(1, 3))
headers = {
"Host": "www.shutterstock.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"referer": "https://accounts.shutterstock.com/",
"accept-language": "zh-CN,zh;q=0.9",
}
url = "https://www.shutterstock.com/iframe-landing"
params = {
"preset": "cvrt839"
}
response = requests.get(url, headers=headers, cookies=cookies3, params=params,verify=False, allow_redirects=False)
# print(3333333333)
# print(url)
# print(response.status_code)
# print(response.text)
# print("\n")
cookies4 = dict(response.cookies)
cookies5 = {**cookies3, **cookies4}
time.sleep(random.randint(1, 3))
sel = Selector(text=response.text, type="html")
url = sel.xpath(".//a/@href").get()
headers = {
"Host": "accounts.shutterstock.com",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"referer": "https://accounts.shutterstock.com/",
"accept-language": "zh-CN,zh;q=0.9",
}
response = requests.get(url, headers=headers, cookies=cookies5,verify=False, allow_redirects=False)
# print(444444444)
# print(url)
# print(response.status_code)
# print(response.text)
# print("\n")
cookies6 = dict(response.cookies)
# pprint(cookies6)
cookies7 = {**cookies5, **cookies6}
time.sleep(random.randint(1, 3))
sel = Selector(text=response.text, type="html")
url = sel.xpath(".//a/@href").get()
headers = {
"Host": "www.shutterstock.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"referer": "https://accounts.shutterstock.com/",
"accept-language": "zh-CN,zh;q=0.9",
}
response = requests.get(url, headers=headers, cookies=cookies7,verify=False, allow_redirects=False)
# print(55555555)
# print(url)
# print(response.status_code)
# print(response.text)
# print("\n")
cookies8 = dict(response.cookies)
cookies9 = {**cookies7, **cookies8}
# pprint(cookies9)
time.sleep(random.randint(1, 3))
headers = {
"Host": "www.shutterstock.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"referer": "https://accounts.shutterstock.com/",
"accept-language": "zh-CN,zh;q=0.9",
"priority": "u=0, i"
}
url = "https://www.shutterstock.com/iframe-landing"
params = {
"preset": "cvrt839"
}
        response = requests.get(url, headers=headers, cookies=cookies, params=params, verify=False, allow_redirects=False)
# print(666666666)
# print(url)
# print(response.status_code)
# print(response.text)
# print("\n")
cookies10 = dict(response.cookies)
cookies11 = {**cookies9, **cookies10}
# pprint(cookies11)
time.sleep(random.randint(1, 3))
headers = {
"Host": "www.shutterstock.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"referer": "https://accounts.shutterstock.com/",
# "priority": "u=0, i"
}
cookies = {}
url = "https://www.shutterstock.com/zh/iframe-landing"
params = {
"preset": "cvrt839"
}
response = requests.get(url, headers=headers, cookies=cookies, params=params,verify=False, allow_redirects=False)
# print(77777777)
# print(url)
# print(response.status_code)
# print(response.text)
# print("\n")
cookies12 = dict(response.cookies)
cookies13 = {**cookies11, **cookies12}
# pprint(cookies13)
time.sleep(random.randint(1, 3))
headers = {
'accept': 'application/json',
'accept-language': 'zh-CN,zh;q=0.9',
'content-type': 'application/json',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiNDFhZjVlNmU1Zjc3MTcyOCIsInRyIjoiMWY1OTI1N2E0ODM4Y2Q5ZmRiYTZkZTQ0YmI0NjVjNmUiLCJ0aSI6MTcyMzUxMTc2NDcwOX19',
'origin': 'https://www.shutterstock.com',
'priority': 'u=1, i',
'referer': 'https://www.shutterstock.com/zh/catalog/licenses',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'traceparent': '00-1f59257a4838cd9fdba6de44bb465c6e-41af5e6e5f771728-01',
'tracestate': '967232@nr=0-1-967232-1588632792-41af5e6e5f771728----1723511764709',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'x-end-app-name': 'next-web',
'x-end-app-version': '52ce3cb0535',
'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
'x-request-id': '3712d198-d49f-43e6-b813-0ad30b2034bc',
}
params = {
'v': '52ce3cb0535',
}
json_data = {
'required_cookies': '',
'content': [
{
'content_id': f'{test_id}',
'content_type': 'photo',
'content_size': 'huge',
'content_format': 'jpg',
'license_name': 'standard',
'show_modal': True,
},
],
}
response = requests.post('https://www.shutterstock.com/napi/licensees/current/redownload',params=params,cookies=cookies13,headers=headers,json=json_data,verify=False, allow_redirects=False)
        # print('test')
# print(response.status_code)
# print(response.text)
return response,cookies13
def save_cookies(self,username,cookie13):
item = {}
item['account_id'] = username
item['cookie'] = json.dumps(cookie13)
item['state'] = 1
item['created_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
Con.save_stock_cookie(item)
#
def run(self):
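        # For one summary row: load the account credentials and a test image id, then collect and store up to 20 working cookie sets.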
# for item_id in range(4,30):
item_id = 1
account_list = Con.get_cookie_account(item_id)
username = account_list[0]
password = account_list[1]
test_id = Con.get_stock_test_id(username)
# print(username,password)
# test_id = '458951890'
print(item_id,username,password,test_id)
count = 0
while count < 20:
response,cookie13 = self.get_cookie(username,password,test_id)
if response.status_code == 200:
self.save_cookies(username,cookie13)
                print(f'{username}, saved cookie #{count + 1}')
time.sleep(random.randint(3,6))
count += 1
else:
print(response.status_code)
print(response.text)
if __name__ == '__main__':
get_cookie = GetCookie()
get_cookie.run()
import json
import requests
import re
import random
from datetime import datetime
import threading
import time
import os
os.environ['NO_PROXY'] = 'stackoverflow.com'
import logging
logging.captureWarnings(True)
from all_connect import ConnectSpider
Con = ConnectSpider()
from lxml import etree
from image_id_pairs_9038595 import image_id_id_pairs
# from curl_cffi import requests
from cookies import cookie_list_9038595
cookie_list = cookie_list_9038595
import curl_cffi
class Stock_Detail():
def __init__(self):
self.headers = {
'accept': 'application/json',
'accept-language': 'zh-CN,zh;q=0.9',
# 'cookie': 'stck_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; sstk_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; _gcl_au=1.1.874517867.1723276566; __ssid=c064478ee3926370e6737c3582c60d0; _ga=GA1.1.675721154.1723276566; did=02a1c297-357e-400b-87f5-41457a394697; accts_customer=903859535; accts_customer_sso1=331378495-undefined; next.sid=s%3Agy3BgikznI1m6vIui-5gGcFYcjWzDFIK.OXZW%2FtHJ8HVG%2Fx9eXhfAilGM0zicWdtakg1aacGy8OM; _4c_=%7B%22_4c_mc_%22%3A%222a68f713-f2b5-49b7-8e45-07549f8a66d8%22%7D; FPID=FPID2.2.eJMnQEWeDZwuxnkWmESNukKGwRAIKHd9t3eewKvzeQQ%3D.1723276566; visitor_id=73458769441; htjs_user_id=331378495; htjs_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%227c4b5046-d8eb-4ee5-8bac-893a170fa766%22%7D; slireg=https://scout.us1.salesloft.com; extole_access_token=BV1L1U32MH7O4IOEUHOH6F88AJ; sliguid=2259d959-8b66-4544-8a26-ee6f9e0d872c; slirequested=true; n_v=2d4faa42e64; locale=zh; NEXT_LOCALE=zh; hl=zh; sstk_session_start=2024-08-12T00%3A50%3A01.185Z; stck_session_id=eb218863-1d7b-453e-b68f-826b54a4f567; sstk_session_id=eb218863-1d7b-453e-b68f-826b54a4f567; downlink=slow; visit_id=81252382944; gtm_monit_roll=99; OptanonConsent=consentId=5cff2293-6adc-4e35-960b-a301a65fc215&datestamp=Mon+Aug+12+2024+08%3A50%3A04+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202403.2.0&interactionCount=0&isAnonUser=1&isGpcEnabled=0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0005%3A1%2CC0004%3A1%2CC0007%3A1&AwaitingReconsent=false; OptanonCachedGroups=,C0001,C0003,C0002,C0005,C0004,C0007,; FPLC=xYngCOYQalQCTnXJSrL5MkPtTggGURkuZLMfzokC8ITFBHfwhBl4NrpZuNrp44Zw2eHfL22tL9Q1ejmZot6AEMzltj2Hvtapbeu8iPj20h1y4yHKQZLIWATPUg4j1A%3D%3D; datadome=1Q5Il~6ANKiyum9UFnRoJyruo7C1U0TXOsxV_tOU4rYdEk5a6dO3mjUEFLs5FgsAbSj2xGvoSUp3jYNLxrjs~XHUOwCU9L~8nimhKtYEznVW_AeNZ23e2zxJLf2GHikU; Authorization=1%2FeyJjbGllbnRfaWQiOiJhMGI3Ni1hN2Y1ZS1mZWRlYy1iYmViZS1mYTY1Yi04NTcxOSIsInJlYWxtIjoiY3VzdG9tZXIiLCJzY29wZSI6InVzZXIudmlldyB1c2VyLmVtYWlsIHVzZXIuYWRkcmVzcyB1c2VyLmVkaXQgb3JnYW5pemF0aW9uLnZpZXcgb3JnYW5pemF0aW9uLmFkZHJlc3MgY29sbGVjdGlvbnMudmlldyBjb2xsZWN0aW9ucy5lZGl0IGxpY2Vuc2VzLnZpZXcgbGljZW5zZXMuY3JlYXRlIG1lZGlhLnVwbG9hZCBtZWRpYS5zdWJtaXQgbWVkaWEuZWRpdCBwdXJjaGFzZXMudmlldyBwdXJjaGFzZXMuY3JlYXRlIiwidXR2IjoicUhpNCIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJ1c2VybmFtZSI6IjkwMzg1OTUzNSIsInVzZXJfaWQiOjMzMTM3ODQ5NSwib3JnYW5pemF0aW9uX2lkIjpudWxsLCJwYXJlbnRfb3JnYW5pemF0aW9uX2lkcyI6W10sImN1c3RvbWVyX2lkIjozMzEzNzg0OTUsImV4cCI6MTcyMzQyNzMyMX0.QcAGSAHi_1fLXJQReHBAtupw2qzn6DXiX1nsLVOoxg6OEe277kv3zt85EsJHen0_DmoHdMS35s4Fh418lXWR-w; _uetsid=d11e0420584411efb988e71c7217a51d|17zp06m|2|fo9|0|1685; _uetvid=58dc29704a6511ef8f5e5754465f397d|1dj9d84|1723425007889|3|1|bat.bing.com/p/insights/c/i; _ga_H22ZZQTXLV=GS1.1.1723423806.2.1.1723425008.0.0.0; _ga_SSGTMSSTK=GS1.1.1723423806.2.1.1723425008.0.0.1890544395; _ga_5JRYE4Y8J9=GS1.1.1723423806.2.1.1723425008.59.0.0',
'if-modified-since': 'Sat, 10 Aug 2024 08:09:24 GMT',
'if-none-match': 'W/"17t8cdyfo1s1un"',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiMTU5ZDAyYjQ0YTdjN2VlZiIsInRyIjoiNTEzYjdiODMwNGFkODE4N2Y5YzMwMzg0OTc4YWYwOWUiLCJ0aSI6MTcyMzQyNTAxNzYyOH19',
'priority': 'u=1, i',
'referer': 'https://www.shutterstock.com/zh/catalog/licenses',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'traceparent': '00-513b7b8304ad8187f9c30384978af09e-159d02b44a7c7eef-01',
'tracestate': '967232@nr=0-1-967232-1588632792-159d02b44a7c7eef----1723425017628',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'x-end-app-name': 'next-web',
'x-end-app-version': '2d4faa42e64',
'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
'x-request-id': 'db700cde-0361-46f8-8ff3-d7234616ab34',
}
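# Build a randomized Chrome-style user agent and write it into self.headers.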
def random_ua(self):
first_num = random.randint(55, 62)
third_num = random.randint(0, 3200)
fourth_num = random.randint(0, 140)
os_type = [
'(Windows NT 6.1; WOW64)',
'(Windows NT 10.0; WOW64)',
'(X11; Linux x86_64)',
'(Macintosh; Intel Mac OS X 10_12_6)'
]
chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
ua = ' '.join(['Mozilla/5.0', random.choice(os_type), 'AppleWebKit/537.36',
'(KHTML, like Gecko)', chrome_version, 'Safari/537.36']
)
self.headers['user-agent'] = ua
def get_pic(self, account_id, image_id, item_id, cookie, retry=0):
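# Look up the license record for image_id, then re-download the asset: try the
# huge jpg first and fall back to a large png; on success the record is saved via
# the Con helper and the source row is flagged via update_image_id_to_3.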
start_time = datetime.now().strftime("%m-%d %H:%M:%S")
try:
self.random_ua()
item = {}
headers = {
'accept': 'application/json',
'accept-language': 'zh-CN,zh;q=0.9',
# 'cookie': 'stck_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; sstk_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; _gcl_au=1.1.874517867.1723276566; __ssid=c064478ee3926370e6737c3582c60d0; _ga=GA1.1.675721154.1723276566; did=02a1c297-357e-400b-87f5-41457a394697; accts_customer=903859535; accts_customer_sso1=331378495-undefined; next.sid=s%3Agy3BgikznI1m6vIui-5gGcFYcjWzDFIK.OXZW%2FtHJ8HVG%2Fx9eXhfAilGM0zicWdtakg1aacGy8OM; _4c_=%7B%22_4c_mc_%22%3A%222a68f713-f2b5-49b7-8e45-07549f8a66d8%22%7D; FPID=FPID2.2.eJMnQEWeDZwuxnkWmESNukKGwRAIKHd9t3eewKvzeQQ%3D.1723276566; visitor_id=73458769441; htjs_user_id=331378495; htjs_anonymous_id=7c4b5046-d8eb-4ee5-8bac-893a170fa766; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%227c4b5046-d8eb-4ee5-8bac-893a170fa766%22%7D; slireg=https://scout.us1.salesloft.com; extole_access_token=BV1L1U32MH7O4IOEUHOH6F88AJ; sliguid=2259d959-8b66-4544-8a26-ee6f9e0d872c; slirequested=true; n_v=2d4faa42e64; locale=zh; NEXT_LOCALE=zh; hl=zh; sstk_session_start=2024-08-12T00%3A50%3A01.185Z; stck_session_id=eb218863-1d7b-453e-b68f-826b54a4f567; sstk_session_id=eb218863-1d7b-453e-b68f-826b54a4f567; downlink=slow; visit_id=81252382944; gtm_monit_roll=99; OptanonConsent=consentId=5cff2293-6adc-4e35-960b-a301a65fc215&datestamp=Mon+Aug+12+2024+08%3A50%3A04+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202403.2.0&interactionCount=0&isAnonUser=1&isGpcEnabled=0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0005%3A1%2CC0004%3A1%2CC0007%3A1&AwaitingReconsent=false; OptanonCachedGroups=,C0001,C0003,C0002,C0005,C0004,C0007,; FPLC=xYngCOYQalQCTnXJSrL5MkPtTggGURkuZLMfzokC8ITFBHfwhBl4NrpZuNrp44Zw2eHfL22tL9Q1ejmZot6AEMzltj2Hvtapbeu8iPj20h1y4yHKQZLIWATPUg4j1A%3D%3D; datadome=1Q5Il~6ANKiyum9UFnRoJyruo7C1U0TXOsxV_tOU4rYdEk5a6dO3mjUEFLs5FgsAbSj2xGvoSUp3jYNLxrjs~XHUOwCU9L~8nimhKtYEznVW_AeNZ23e2zxJLf2GHikU; Authorization=1%2FeyJjbGllbnRfaWQiOiJhMGI3Ni1hN2Y1ZS1mZWRlYy1iYmViZS1mYTY1Yi04NTcxOSIsInJlYWxtIjoiY3VzdG9tZXIiLCJzY29wZSI6InVzZXIudmlldyB1c2VyLmVtYWlsIHVzZXIuYWRkcmVzcyB1c2VyLmVkaXQgb3JnYW5pemF0aW9uLnZpZXcgb3JnYW5pemF0aW9uLmFkZHJlc3MgY29sbGVjdGlvbnMudmlldyBjb2xsZWN0aW9ucy5lZGl0IGxpY2Vuc2VzLnZpZXcgbGljZW5zZXMuY3JlYXRlIG1lZGlhLnVwbG9hZCBtZWRpYS5zdWJtaXQgbWVkaWEuZWRpdCBwdXJjaGFzZXMudmlldyBwdXJjaGFzZXMuY3JlYXRlIiwidXR2IjoicUhpNCIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJ1c2VybmFtZSI6IjkwMzg1OTUzNSIsInVzZXJfaWQiOjMzMTM3ODQ5NSwib3JnYW5pemF0aW9uX2lkIjpudWxsLCJwYXJlbnRfb3JnYW5pemF0aW9uX2lkcyI6W10sImN1c3RvbWVyX2lkIjozMzEzNzg0OTUsImV4cCI6MTcyMzQyNzMyMX0.QcAGSAHi_1fLXJQReHBAtupw2qzn6DXiX1nsLVOoxg6OEe277kv3zt85EsJHen0_DmoHdMS35s4Fh418lXWR-w; _uetsid=d11e0420584411efb988e71c7217a51d|17zp06m|2|fo9|0|1685; _uetvid=58dc29704a6511ef8f5e5754465f397d|1dj9d84|1723425007889|3|1|bat.bing.com/p/insights/c/i; _ga_H22ZZQTXLV=GS1.1.1723423806.2.1.1723425008.0.0.0; _ga_SSGTMSSTK=GS1.1.1723423806.2.1.1723425008.0.0.1890544395; _ga_5JRYE4Y8J9=GS1.1.1723423806.2.1.1723425008.59.0.0',
'if-modified-since': 'Sat, 10 Aug 2024 08:09:24 GMT',
'if-none-match': 'W/"17t8cdyfo1s1un"',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiMTU5ZDAyYjQ0YTdjN2VlZiIsInRyIjoiNTEzYjdiODMwNGFkODE4N2Y5YzMwMzg0OTc4YWYwOWUiLCJ0aSI6MTcyMzQyNTAxNzYyOH19',
'priority': 'u=1, i',
'referer': 'https://www.shutterstock.com/zh/catalog/licenses',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'traceparent': '00-513b7b8304ad8187f9c30384978af09e-159d02b44a7c7eef-01',
'tracestate': '967232@nr=0-1-967232-1588632792-159d02b44a7c7eef----1723425017628',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'x-end-app-name': 'next-web',
'x-end-app-version': '2d4faa42e64',
'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
'x-request-id': 'db700cde-0361-46f8-8ff3-d7234616ab34',
}
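# Query parameters for the licenses listing, filtered down to this single media_id.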
params = {
'fields[images]': 'alt,aspect,channels,content_tier,contributor_id,description,displays,height,image_type,is_editorial,creativeInsights,link,model_release_info,sizes,src,title,width,has_model_release,has_property_release,status',
'fields[licenses]': 'content_id,content_size,content_type,global_accounts_user_id,insert_time,license_name,media-item,metadata',
'include': 'media-item.categories',
'filter[media_type]': 'photo',
'filter[licensee_type]': 'all',
'page[number]': '1',
'page[size]': '50',
'filter[media_id]': f'{image_id}',
'sort': 'newest',
'v': '2d4faa42e64',
}
response = requests.get('https://www.shutterstock.com/napi/user/licenses', params=params, cookies=cookie, headers=headers,timeout=600)
if response.status_code == 200:
try:
data = json.loads(response.text)
image_title = data['included'][0]['attributes']['alt']
image_size_info = {}
try:
hugeJpg = data['included'][0]['attributes']['sizes']['hugeJpg']
image_size_info['hugeJpg'] = hugeJpg
except:
hugeJpg = None
try:
mediumJpg = data['included'][0]['attributes']['sizes']['mediumJpg']
image_size_info['mediumJpg'] = mediumJpg
except:
mediumJpg = None
try:
smallJpg = data['included'][0]['attributes']['sizes']['smallJpg']
image_size_info['smallJpg'] = smallJpg
except:
smallJpg = None
image_size_info = json.dumps(image_size_info)
except:
image_size_info = None
image_title = None
if image_title:
# Get the image download url
try:
save_folder = r'D:\公司备用文件\all_images\9038595@qq'
pic_name = (f'{str(image_id)}+{image_title}'
            .replace(':', '_').replace('\t', '').replace('\r', '').replace('|', '')
            .replace('/', '').replace('"', "'").replace(' ', '_')
            .replace(',', '_').replace('.', '_').replace('\n', '_'))
pic_name = re.sub(r'[\\/*?:"<>|]', '_', pic_name)[:160] + '.jpg'
json_data = {
'required_cookies': '',
'content': [
{
'content_id': f'{image_id}',
'content_type': 'photo',
'content_size': 'huge',
'content_format': 'jpg',
'license_name': 'standard',
'show_modal': True,
},
],
}
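# Ask for a fresh download URL for the already-licensed huge jpg.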
response = requests.post('https://www.shutterstock.com/napi/licensees/current/redownload',cookies=cookie, headers=self.headers,json=json_data,timeout =600)
image_url = json.loads(response.text)['meta']['licensedContent'][0]['downloadUrl']
image_type = 'jpg'
res = requests.get(image_url,timeout=600)
# Build the full file path
file_path = os.path.join(save_folder, pic_name)
with open(file_path, "wb") as f:
f.write(res.content)
item['account_id'] = account_id
item['image_id'] = image_id
item['image_size_info'] = image_size_info
item['image_title'] = image_title
item['image_type'] = image_type
item['image_url'] = image_url
item['created_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# print(item)
Con.save_stock_detail(item)
Con.update_image_id_to_3(item_id)
now_time = datetime.now().strftime("%m-%d %H:%M:%S")
print(f'pic_name:{pic_name[:38]},time:{start_time}——{now_time}下载成功')
except Exception as e:
print('不是jpg',e)
try:
# If the jpg download failed, retry in png format
pic_name = (f'{str(image_id)}+{image_title}'
            .replace(':', '_').replace('\t', '').replace('\r', '').replace('|', '')
            .replace('/', '').replace('"', "'").replace(' ', '_')
            .replace(',', '_').replace('.', '_').replace('\n', '_'))
pic_name = re.sub(r'[\\/*?:"<>|]', '_', pic_name)[:160] + '.png'
json_data = {
'required_cookies': '',
'content': [
{
'content_id': f'{image_id}',
'content_type': 'photo',
'content_size': 'large',
'content_format': 'png',
'include_shadows': True,
'angle': 'G03',
'license_name': 'standard',
'show_modal': True,
},
],
}
response = requests.post('https://www.shutterstock.com/napi/licensees/current/redownload',cookies=cookie,headers=self.headers,json=json_data,timeout=600)
image_url = json.loads(response.text)['meta']['licensedContent'][0]['downloadUrl']
image_type = 'png'
res = requests.get(image_url,timeout=600)
# Build the full file path
file_path = os.path.join(save_folder, pic_name)
with open(file_path, "wb") as f:
f.write(res.content)
item['account_id'] = account_id
item['image_id'] = image_id
item['image_size_info'] = image_size_info
item['image_title'] = image_title
item['image_type'] = image_type
item['image_url'] = image_url
item['created_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
Con.save_stock_detail(item)
Con.update_image_id_to_3(item_id)
now_time = datetime.now().strftime("%m-%d %H:%M:%S")
print(f'pic_name:{pic_name},time:{start_time}——{now_time}下载成功')
except Exception as e:
print('不是png', e)
else:
Con.update_image_id_to_4(item_id)
elif response.status_code == 403 and retry < 3:
# Sleep longer on each successive retry
if retry == 0:
sleep_time = random.randint(3, 6)
elif retry == 1:
sleep_time = random.randint(63, 66)
elif retry == 2:
sleep_time = random.randint(360, 366)
time.sleep(sleep_time)
cookie = cookie_list[random.randint(0, len(cookie_list) - 1)]
return self.get_pic(account_id, image_id, item_id, cookie, retry + 1)
else:
print(image_id, '状态码=', response.status_code)
except curl_cffi.requests.errors.RequestsError as e:
if retry < 3:
print(f"网络错误 {e}, 尝试重新连接...")
# Sleep longer on each successive retry
if retry == 0:
sleep_time = random.randint(3, 6)
elif retry == 1:
sleep_time = random.randint(63, 66)
elif retry == 2:
sleep_time = random.randint(360, 366)
time.sleep(sleep_time)
cookie = cookie_list[random.randint(0, len(cookie_list) - 1)]
return self.get_pic(account_id, image_id, item_id, cookie, retry + 1)
else:
print('重试次数用完')
raise
def run(self):
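# counts_start / counts_last are a manual resume window into image_id_id_pairs,
# whose entries are 'image_id||-||item_id' strings (presumably preloaded from the
# image_id_pairs_9038595 dump referenced below). The hard-coded cookie dict below
# overrides the random pick from cookie_list_9038595.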
account_id = '9038595@qq.com'
counts_start = 4061
counts_last = 5956
cookie = cookie_list_9038595[random.randint(0, len(cookie_list_9038595) - 1)]
cookie = {
'stck_anonymous_id': '7c4b5046-d8eb-4ee5-8bac-893a170fa766',
'sstk_anonymous_id': '7c4b5046-d8eb-4ee5-8bac-893a170fa766',
'stck_session_id': 'e581e2e7-1933-4913-b239-1761b434c44d',
'sstk_session_id': 'e581e2e7-1933-4913-b239-1761b434c44d',
'gtm_monit_roll': '28',
'_gcl_au': '1.1.771816893.1723647478',
'__ssid': '28f72d1846262242ca0b17aa9194143',
'_ga': 'GA1.1.2131351330.1723647478',
'did': 'a0286b86-d979-4399-b62c-533c763645c1',
'accts_customer': '903859535',
'accts_customer_sso1': '331378495-undefined',
'n_v': 'aed496c2095',
'next.sid': 's%3AVTU7VfLR6gI-BMo7OEwnGlwAVztcHQ-3.%2BW35URwHTuB345Gl7VSTYbPWj3rb2gqF4gMZ%2BDTxx9M',
'Authorization': '1%2FeyJjbGllbnRfaWQiOiJhMGI3Ni1hN2Y1ZS1mZWRlYy1iYmViZS1mYTY1Yi04NTcxOSIsInJlYWxtIjoiY3VzdG9tZXIiLCJzY29wZSI6InVzZXIudmlldyB1c2VyLmVtYWlsIHVzZXIuYWRkcmVzcyB1c2VyLmVkaXQgb3JnYW5pemF0aW9uLnZpZXcgb3JnYW5pemF0aW9uLmFkZHJlc3MgY29sbGVjdGlvbnMudmlldyBjb2xsZWN0aW9ucy5lZGl0IGxpY2Vuc2VzLnZpZXcgbGljZW5zZXMuY3JlYXRlIG1lZGlhLnVwbG9hZCBtZWRpYS5zdWJtaXQgbWVkaWEuZWRpdCBwdXJjaGFzZXMudmlldyBwdXJjaGFzZXMuY3JlYXRlIiwidXR2IjoicUhpNCIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJ1c2VybmFtZSI6IjkwMzg1OTUzNSIsInVzZXJfaWQiOjMzMTM3ODQ5NSwib3JnYW5pemF0aW9uX2lkIjpudWxsLCJwYXJlbnRfb3JnYW5pemF0aW9uX2lkcyI6W10sImN1c3RvbWVyX2lkIjozMzEzNzg0OTUsImV4cCI6MTcyMzY1MTAwM30.D8ssA57f7afzvpKkmArZLxjaGzPS-5LcNKbc6DfInBxNS_q4lIjRdT-cucxoJAbnNX4-XmGZyu4nZ_XlCyuxNg',
'locale': 'zh',
'NEXT_LOCALE': 'zh',
'hl': 'zh',
'downlink': 'low',
'_4c_': '%7B%22_4c_mc_%22%3A%22b6a47d96-66fe-482e-b706-8f755b50caa9%22%7D',
'FPID': 'FPID2.2.khIsa35ATNFYOdJwHRZS%2FQaz8pwpup%2BtFNxS00ynRAY%3D.1723647478',
'FPLC': 'dZnYmOYJ%2BSHKtun3yMjrhnLPM4vCY1SUAyypJkfVjo7651SS97oXiQYDgpH%2BWb1TBK6DO7WPKW7UxiW10o5kyLNP44atbgljRI6MDZ08b7GAdyibKwAjERwMLhBc3w%3D%3D',
'visit_id': '81268690780',
'visitor_id': '73477854695',
'htjs_user_id': '331378495',
'htjs_anonymous_id': '7c4b5046-d8eb-4ee5-8bac-893a170fa766',
'OptanonCachedGroups': ',C0001,C0003,C0002,C0005,C0004,C0007,',
'__rtbh.uid': '%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%227c4b5046-d8eb-4ee5-8bac-893a170fa766%22%7D',
'slireg': 'https://scout.us1.salesloft.com',
'extole_access_token': 'GBMAGUNJKCVHN6MQD2I3O10R7M',
'sliguid': 'f3718707-e12a-406c-bb72-4c6e11818ed5',
'slirequested': 'true',
'datadome': 'Qg34pBp0XktPTUP6O3VyAkUqSLrvf2a_ujVv06D9jHkrnwyKLAKZhmuJGINfosdXN5IReXFJ3vcOaPlr2M3PjStlds9k~IEQWAmgypcs4YRs7yeIgB7nQtymGIxzC1O_',
'OptanonConsent': 'consentId=bd069a57-0565-4fa3-bbab-314279d2a296&datestamp=Wed+Aug+14+2024+22%3A58%3A27+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202403.2.0&interactionCount=1&isAnonUser=1&isGpcEnabled=0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=https%3A%2F%2Fwww.shutterstock.com%2Fzh%2Fcatalog%2Flicenses&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0005%3A1%2CC0004%3A1%2CC0007%3A1',
'_ga_SSGTMSSTK': 'GS1.1.1723647477.1.1.1723647508.0.0.939777019',
'_ga_H22ZZQTXLV': 'GS1.1.1723644993.3.1.1723647508.0.0.0',
'_ga_5JRYE4Y8J9': 'GS1.1.1723647478.1.1.1723647508.30.0.0',
'_uetsid': 'd11e0420584411efb988e71c7217a51d|17zp06m|2|fob|0|1685',
'_uetvid': '58dc29704a6511ef8f5e5754465f397d|th0n3t|1723647509267|16|1|bat.bing.com/p/insights/c/x',
}
for count in range(counts_start, counts_last):
image_id = image_id_id_pairs[count].split('||-||')[0]
item_id = image_id_id_pairs[count].split('||-||')[1]
print('执行', image_id, item_id, count)
self.get_pic(account_id, image_id, item_id, cookie)
# The original 'count += 1' and '== 5956' break were redundant: reassigning the
# loop variable has no effect and range() already stops at counts_last.
print('全部下载完成')
if __name__ == '__main__':
stock_detail = Stock_Detail()
stock_detail.run()
# account = '9038595@qq.com'
# cookie_list = Con.get_stock_cookie(account)
# print(len(cookie_list))
# print(cookie_list)
# image_id_id_pairs = Con.get_stock_image_id()
# print(len(image_id_id_pairs))
# with open('image_id_pairs_9038595.py','w',encoding='utf-8') as f:
# f.write(str(image_id_id_pairs))
import os
os.environ['NO_PROXY'] = 'stackoverflow.com'
import logging
logging.captureWarnings(True)
from DrissionPage import ChromiumPage, ChromiumOptions
import time
from datetime import datetime, timedelta
from time import sleep
from random import randint
import requests
import math
import pandas as pd
import redis
import json
from pathlib import Path
import re
from sqlalchemy import create_engine
import random
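# TikTok Business Suite video-analytics exporter: drives a logged-in browser with
# DrissionPage, downloads the video report as xlsx, then pushes the rows into Redis
# and sends a WeCom (企业微信) notification.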
class TkVideo():
def __init__(self):
# Request headers
self.headers = {
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9', # 'en-US,en;q=0.9'
'cache-control': 'no-cache',
'content-type': 'application/json; charset=UTF-8',
'origin': 'https://www.tiktok.com',
'pragma': 'no-cache',
'priority': 'u=1, i',
'referer': 'https://www.tiktok.com/',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'cross-site',
'sec-fetch-storage-access': 'active',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
self.key = "Sumind_Home"
# Configure your own parameters
# Shop account
# Download folder path
self.download_folder = r"D:\Downloads"
self.receiver_name = 'pengyanbing'
# Redis connection settings
self.REDIS_CONFIG = {
'host': '120.79.147.190',
'port': 6379,
'password': 'fG7#vT6kQ1pX',
'db': 13,
'decode_responses': True
}
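# Attach to the default local Chromium debugging session; the browser is assumed
# to be already logged in to TikTok Business Suite.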
self.page_chrome = ChromiumPage()
# # Configure the Chrome browser - port 9222
# chrome_options = ChromiumOptions()
# chrome_options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
# chrome_options.set_local_port(9333) # set Chrome's debugging port
# self.page_chrome = ChromiumPage(addr_or_opts=chrome_options)
# print(f"Chrome 浏览器运行在端口: {9333}")
def get_datetime(self):
"""获取当前日期,并计算前2天的完整日期(年-月-日),并按照指定格式输出"""
today = datetime.now()
self.deadline = today - timedelta(days=2)
# 提取年、月、日
self.deadline_year = self.deadline.year
self.deadline_month = self.deadline.month # 自动是整数,不带前导零
self.deadline_day = self.deadline.day
# 输出年-月-日格式
print(f'{self.deadline_year}-{self.deadline_month}-{self.deadline_day}')
# 新增:输出 年_月 格式,月份带前导零(例如 06)
self.deadline_year_month = self.deadline.strftime("%Y_%m")
print(self.deadline_year_month) # 输出示例:2025_06
def get_day(self):
try:
self.page_chrome.get("https://www.tiktok.com/business-suite/insight/video")
self.page_chrome.set.window.max()
# Wait for the page's initial load
time.sleep(random.randint(6, 10))
export_orders = self.page_chrome.ele('xpath://span[text()="自定义"]', timeout=10)
export_orders.click()
print('点击自定义')
time.sleep(random.randint(5, 10))
# First click the start date: 1 July 2024
self.page_chrome.ele(
f"xpath=//div[@class='tiktok-datepicker-month-title' and contains(text(), '2024 7 月')]"
f"/following-sibling::div[@class='tiktok-datepicker-day-wrapper'][1]"
f"//div[@class='tiktok-datepicker-day valid in-this-month']"
f"//span[text()='1']/parent::div"
).click()
print('已输入开始时间2024 7 月 1 日')
time.sleep(random.randint(3, 5))
# Start from the deadline date (which may already fall in the previous month)
current_date = self.deadline
max_attempts = 31
while max_attempts > 0:
year = current_date.year
month = current_date.month
day = current_date.day
try:
xpath = (
f"//div[@class='tiktok-datepicker-month-title' and contains(text(), '{year} {month} 月')]"
f"/following-sibling::div[@class='tiktok-datepicker-day-wrapper']"
f"//div[@class='tiktok-datepicker-day valid in-this-month']"
f"//span[text()='{day}']/parent::div"
)
ele = self.page_chrome.ele(f"xpath={xpath}", timeout=5)
ele.click()
print(f'✅ 成功点击日期:{year}-{month}-{day}')
# self.day = str(day)
self.day = f"{day:02d}"
# Update self.deadline_year_month to the date that was actually selected
self.deadline_year_month = current_date.strftime("%Y_%m")
self.get_data()
time.sleep(random.randint(3, 5))
return True # success
except Exception as e:
print(f'❌ 无法点击 {year}-{month}-{day},错误:{e}')
# Step back one day and try again
current_date -= timedelta(days=1)
max_attempts -= 1
time.sleep(random.randint(3, 5))
print('⛔ 连续尝试失败,未找到可点击的日期,请检查页面状态或网络连接。')
return False
except Exception as e:
print(f"get_day出现错误: {e}")
self.send_error_notification_via_wechat(e) # notify via WeCom on failure
def get_data(self):
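# Click through 更新 -> 下载数据 -> Xlsx -> 下载数据 to export the report, then open
# 首页 to read the shop name before saving everything to Redis.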
try:
self.page_chrome.ele('xpath://div[text()="更新"]', timeout=10).click()
print('已点击更新')
sleep(randint(5, 10))
self.page_chrome.ele('xpath://span[text()="下载数据"]', timeout=10).click()
print('已点击下载数据')
sleep(randint(5, 10))
self.page_chrome.ele('xpath://span[text()="Xlsx"]', timeout=10).click()
print('已点击Xlsx')
sleep(randint(5, 10))
self.page_chrome.ele('xpath://button[text()="下载数据"]', timeout=10).click()
print('已点击下载数据')
sleep(randint(5, 10))
# Click 首页 (home) to read the shop name
self.page_chrome.ele('xpath://span[text()="首页"]', timeout=10).click()
print('已点击首页')
sleep(randint(5, 10))
self.shop_name = self.page_chrome.ele('xpath://div[@class="text-H6-Bold"]').text
print(f'已获取店铺名: {self.shop_name}')
sleep(randint(5, 10))
self.save_to_redis()
self.send_success_message_via_wechat()
time.sleep(5)
# self.page_chrome.quit()
except Exception as e:
print(f"get_data出现错误: {e}")
self.send_error_notification_via_wechat(e)
def connect_redis(self):
"""建立 Redis 连接"""
self.r = redis.StrictRedis(**self.REDIS_CONFIG)
try:
self.r.ping() # 测试连接
print("✅ 成功连接到 Redis")
except redis.exceptions.ConnectionError as e:
print(f"❌ 无法连接到 Redis: {e}")
raise
def read_excel(self, file_path):
"""读取 Excel 文件内容,并防止数值被转为科学计数法"""
print(f"📄 正在读取文件:{file_path}")
# 定义列名映射(中文 → 英文)
column_mapping = {
'视频标题': 'video_title',
'视频链接': 'video_url',
'发布时间': 'publish_date',
'视频观看次数': 'views',
'点赞数': 'likes',
'评论数': 'comments',
'分享次数': 'shares',
'添加到收藏': 'favorites'
}
# Force every column to string to avoid scientific notation
df = pd.read_excel(file_path, dtype=str)
# Rename the columns to English
df.rename(columns=column_mapping, inplace=True)
data = df.to_dict(orient='records') # convert to a list of dicts
print(f"📊 已读取 {len(data)} 条记录")
return data
def process_data(self, data, account):
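# Derive content_id from the tail of each video URL, stamp account/update_time,
# and blank out NaN/None values before the records go to Redis.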
processed_data = []
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
for record in data:
# Extract content_id from the video URL
video_link = record.get('video_url', '')
if video_link:
content_id = video_link.split('/')[-1]
else:
content_id = ''
# Add the account and update_time fields
processed_record = {
'account': account,
'content_id': content_id,
'update_time': current_time,
**record # merge in the original record
}
# Replace NaN and None with empty strings
cleaned_record = {
key: ("" if pd.isna(value) or value is None or str(value).strip().lower() == "nan" else value)
for key, value in processed_record.items()
}
processed_data.append(cleaned_record)
return processed_data
def store_data_in_redis(self, r, data):
"""将数据存储到 Redis 中,对相同的 shop_code 清除旧数据后写入新数据"""
key = f"tk_video_data_test:{self.key}:order:list"
# ⚠️ 先删除旧数据(实现“覆盖”)
if r.exists(key):
r.delete(key)
print(f"🗑️ 已清除旧数据: {key}")
# 写入新数据
pipe = r.pipeline()
for record in data:
value = json.dumps(record, ensure_ascii=False)
pipe.rpush(key, value)
pipe.execute() # 批量执行,提高效率
print(f"💾 已写入新数据到键: {key},共 {len(data)} 条记录")
def find_specific_file(self):
download_path = Path(self.download_folder)
# Build the base filename prefix (using the full-width parentheses in the real download name)
base_prefix = f"视频(2024_07_01-{self.deadline_year_month}_{self.day})"
# Build the regex: starts with base_prefix, anything may follow
pattern = re.escape(base_prefix) + r'.*$'
print("匹配模式:", pattern)
for file in download_path.iterdir():
if file.is_file() and re.fullmatch(pattern, file.name):
return str(file)
raise FileNotFoundError(f"未找到匹配 {base_prefix} 的文件")
def save_to_redis(self):
EXCEL_FILE = self.find_specific_file()
print(f'保存文件:{EXCEL_FILE}')
# Read the Excel data
data = self.read_excel(EXCEL_FILE)
processed_data = self.process_data(data, self.shop_name)
self.store_data_in_redis(self.r, processed_data)
def send_success_message_via_wechat(self):
webhook_url = 'http://47.112.96.71:8082/selection/sendMessage' # replace with your WeCom bot webhook URL
data = {
"account": self.receiver_name,
"title": '【TK视频数据下载成功提醒】',
"content": f'账号: {self.key}, 文件:视频(2024_07_01-{self.deadline_year_month}_{self.day}), 时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'
}
response = requests.post(url=webhook_url, data=data, timeout=15)
if response.status_code == 200:
print("已成功发送通知到企业微信")
else:
print(f"发送通知失败: {response.text}")
def send_error_notification_via_wechat(self,error_message):
webhook_url = 'http://47.112.96.71:8082/selection/sendMessage' # replace with your WeCom bot webhook URL
data = {
"account": self.receiver_name,
'title':'【TK视频数据下载异常提醒】',
'content':f'账号:{self.key},错误信息:{error_message}, 时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'
}
response = requests.post(url=webhook_url, data=data,timeout=15)
if response.status_code == 200:
print("已成功发送错误通知到企业微信")
else:
print(f"发送错误通知失败: {response.text}")
def run(self):
self.connect_redis()
self.get_datetime()
self.get_day()
print('完成关闭浏览器')
time.sleep(3)
# Works whether ChromiumPage keeps the underlying browser object
# or is backed by a Selenium WebDriver
self.page_chrome.quit()
if __name__ == '__main__':
TkVideo().run()
# -*- coding: utf-8 -*-
import os
os.environ['NO_PROXY'] = 'stackoverflow.com'
import logging
logging.captureWarnings(True)
from DrissionPage import ChromiumPage, ChromiumOptions
import time
from datetime import datetime, timedelta
from time import sleep
from random import randint
import requests
import math
import pandas as pd
import redis
import json
from pathlib import Path
import re
from sqlalchemy import create_engine
import random
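# Second-account variant of the exporter above (key "LilyRose_sharing"): identical
# flow, but it launches Edge through ChromiumOptions on a fixed debug port instead
# of attaching to the default Chrome session.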
class TkVideo():
def __init__(self):
# Request headers
self.headers = {
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9', # 'en-US,en;q=0.9'
'cache-control': 'no-cache',
'content-type': 'application/json; charset=UTF-8',
'origin': 'https://www.tiktok.com',
'pragma': 'no-cache',
'priority': 'u=1, i',
'referer': 'https://www.tiktok.com/',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'cross-site',
'sec-fetch-storage-access': 'active',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
self.key = "LilyRose_sharing"
# Configure your own parameters
# Shop account
# Download folder path
self.download_folder = r"D:\Downloads"
self.receiver_name = 'pengyanbing'
# Redis connection settings
self.REDIS_CONFIG = {
'host': '120.79.147.190',
'port': 6379,
'password': 'fG7#vT6kQ1pX',
'db': 13,
'decode_responses': True
}
edge_options = ChromiumOptions()
edge_options.set_browser_path(r'C:\Program Files\Edge\App\msedge.exe')
edge_options.set_local_port(9333)
# Reduce the chance of being detected as an automated script
edge_options.set_argument('--disable-blink-features=AutomationControlled')
edge_options.set_argument('--disable-infobars')
edge_options.set_argument('--start-maximized')
edge_options.set_argument('--disable-popup-blocking')
edge_options.set_argument('--disable-notifications')
edge_options.set_argument('--disable-gpu')
edge_options.set_argument('--no-sandbox')
edge_options.set_argument('--disable-dev-shm-usage')
edge_options.set_argument('--ignore-certificate-errors')
# # Use the local user profile data (optional)
# edge_options.set_user_data_path(r'C:\Users\YourUsername\AppData\Local\Microsoft\Edge\User Data')
self.page_edge = ChromiumPage(addr_or_opts=edge_options)
print(f"Edge 浏览器运行在端口: {9333}")
def get_datetime(self):
"""获取当前日期,并计算前2天的完整日期(年-月-日),并按照指定格式输出"""
today = datetime.now()
self.deadline = today - timedelta(days=2)
# 提取年、月、日
self.deadline_year = self.deadline.year
self.deadline_month = self.deadline.month # 自动是整数,不带前导零
self.deadline_day = self.deadline.day
# 输出年-月-日格式
print(f'{self.deadline_year}-{self.deadline_month}-{self.deadline_day}')
# 新增:输出 年_月 格式,月份带前导零(例如 06)
self.deadline_year_month = self.deadline.strftime("%Y_%m")
print(self.deadline_year_month) # 输出示例:2025_06
def get_day(self):
try:
self.page_edge.get("https://www.tiktok.com/business-suite/insight/video")
self.page_edge.set.window.max()
# Wait for the page's initial load
time.sleep(random.randint(6, 10))
export_orders = self.page_edge.ele('xpath://span[text()="自定义"]', timeout=10)
export_orders.click()
print('点击自定义')
time.sleep(random.randint(5, 10))
# First click the start date: 1 July 2024
self.page_edge.ele(
f"xpath=//div[@class='tiktok-datepicker-month-title' and contains(text(), '2024 7 月')]"
f"/following-sibling::div[@class='tiktok-datepicker-day-wrapper'][1]"
f"//div[@class='tiktok-datepicker-day valid in-this-month']"
f"//span[text()='1']/parent::div"
).click()
print('已输入开始时间2024 7 月 1 日')
time.sleep(random.randint(3, 5))
# Start from the deadline date (which may already fall in the previous month)
current_date = self.deadline
max_attempts = 31
while max_attempts > 0:
year = current_date.year
month = current_date.month
day = current_date.day
try:
xpath = (
f"//div[@class='tiktok-datepicker-month-title' and contains(text(), '{year} {month} 月')]"
f"/following-sibling::div[@class='tiktok-datepicker-day-wrapper']"
f"//div[@class='tiktok-datepicker-day valid in-this-month']"
f"//span[text()='{day}']/parent::div"
)
ele = self.page_edge.ele(f"xpath={xpath}", timeout=5)
ele.click()
print(f'✅ 成功点击日期:{year}-{month}-{day}')
# self.day = str(day)
self.day = f"{day:02d}"
# Update self.deadline_year_month to the date that was actually selected
self.deadline_year_month = current_date.strftime("%Y_%m")
self.get_data()
time.sleep(random.randint(3, 5))
return True # success
except Exception as e:
print(f'❌ 无法点击 {year}-{month}-{day},错误:{e}')
# Step back one day and try again
current_date -= timedelta(days=1)
max_attempts -= 1
time.sleep(random.randint(3, 5))
print('⛔ 连续尝试失败,未找到可点击的日期,请检查页面状态或网络连接。')
return False
except Exception as e:
print(f"get_day出现错误: {e}")
self.send_error_notification_via_wechat(e) # notify via WeCom on failure
def get_data(self):
try:
self.page_edge.ele('xpath://div[text()="更新"]', timeout=10).click()
print('已点击更新')
sleep(randint(5, 10))
self.page_edge.ele('xpath://span[text()="下载数据"]', timeout=10).click()
print('已点击下载数据')
sleep(randint(5, 10))
self.page_edge.ele('xpath://span[text()="Xlsx"]', timeout=10).click()
print('已点击Xlsx')
sleep(randint(5, 10))
self.page_edge.ele('xpath://button[text()="下载数据"]', timeout=10).click()
print('已点击下载数据')
sleep(randint(5, 10))
# Click 首页 (home) to read the shop name
self.page_edge.ele('xpath://span[text()="首页"]', timeout=10).click()
print('已点击首页')
sleep(randint(5, 10))
self.shop_name = self.page_edge.ele('xpath://div[@class="text-H6-Bold"]').text
print(f'已获取店铺名: {self.shop_name}')
sleep(randint(5, 10))
self.save_to_redis()
self.send_success_message_via_wechat()
time.sleep(5)
self.page_edge.quit()
except Exception as e:
print(f"get_data出现错误: {e}")
self.send_error_notification_via_wechat(e)
def connect_redis(self):
"""建立 Redis 连接"""
self.r = redis.StrictRedis(**self.REDIS_CONFIG)
try:
self.r.ping() # 测试连接
print("✅ 成功连接到 Redis")
except redis.exceptions.ConnectionError as e:
print(f"❌ 无法连接到 Redis: {e}")
raise
def read_excel(self, file_path):
"""读取 Excel 文件内容,并防止数值被转为科学计数法"""
print(f"📄 正在读取文件:{file_path}")
# 定义列名映射(中文 → 英文)
column_mapping = {
'视频标题': 'video_title',
'视频链接': 'video_url',
'发布时间': 'publish_date',
'视频观看次数': 'views',
'点赞数': 'likes',
'评论数': 'comments',
'分享次数': 'shares',
'添加到收藏': 'favorites'
}
# Force every column to string to avoid scientific notation
df = pd.read_excel(file_path, dtype=str)
# Rename the columns to English
df.rename(columns=column_mapping, inplace=True)
data = df.to_dict(orient='records') # convert to a list of dicts
print(f"📊 已读取 {len(data)} 条记录")
return data
def process_data(self, data, account):
processed_data = []
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
for record in data:
# Extract content_id from the video URL
video_link = record.get('video_url', '')
if video_link:
content_id = video_link.split('/')[-1]
else:
content_id = ''
# Add the account and update_time fields
processed_record = {
'account': account,
'content_id': content_id,
'update_time': current_time,
**record # merge in the original record
}
# Replace NaN and None with empty strings
cleaned_record = {
key: ("" if pd.isna(value) or value is None or str(value).strip().lower() == "nan" else value)
for key, value in processed_record.items()
}
processed_data.append(cleaned_record)
return processed_data
def store_data_in_redis(self, r, data):
"""将数据存储到 Redis 中,对相同的 shop_code 清除旧数据后写入新数据"""
key = f"tk_video_data_test:{self.key}:order:list"
# ⚠️ 先删除旧数据(实现“覆盖”)
if r.exists(key):
r.delete(key)
print(f"🗑️ 已清除旧数据: {key}")
# 写入新数据
pipe = r.pipeline()
for record in data:
value = json.dumps(record, ensure_ascii=False)
pipe.rpush(key, value)
pipe.execute() # 批量执行,提高效率
print(f"💾 已写入新数据到键: {key},共 {len(data)} 条记录")
def find_specific_file(self):
download_path = Path(self.download_folder)
# Build the base filename prefix (using the full-width parentheses in the real download name)
base_prefix = f"视频(2024_07_01-{self.deadline_year_month}_{self.day})"
# Build the regex: starts with base_prefix, anything may follow
pattern = re.escape(base_prefix) + r'.*$'
print("匹配模式:", pattern)
for file in download_path.iterdir():
if file.is_file() and re.fullmatch(pattern, file.name):
return str(file)
raise FileNotFoundError(f"未找到匹配 {base_prefix} 的文件")
def save_to_redis(self):
EXCEL_FILE = self.find_specific_file()
print(f'保存文件:{EXCEL_FILE}')
# Read the Excel data
data = self.read_excel(EXCEL_FILE)
processed_data = self.process_data(data, self.shop_name)
self.store_data_in_redis(self.r, processed_data)
def send_success_message_via_wechat(self):
webhook_url = 'http://47.112.96.71:8082/selection/sendMessage' # replace with your WeCom bot webhook URL
data = {
"account": self.receiver_name,
"title": '【TK视频数据下载成功提醒】',
"content": f'账号: {self.key}, 文件:视频(2024_07_01-{self.deadline_year_month}_{self.day}), 时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'
}
response = requests.post(url=webhook_url, data=data, timeout=15)
if response.status_code == 200:
print("已成功发送通知到企业微信")
else:
print(f"发送通知失败: {response.text}")
def send_error_notification_via_wechat(self,error_message):
webhook_url = 'http://47.112.96.71:8082/selection/sendMessage' # replace with your WeCom bot webhook URL
data = {
"account": self.receiver_name,
'title':'【TK视频数据下载异常提醒】',
'content':f'账号:{self.key},错误信息:{error_message}, 时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'
}
response = requests.post(url=webhook_url, data=data,timeout=15)
if response.status_code == 200:
print("已成功发送错误通知到企业微信")
else:
print(f"发送错误通知失败: {response.text}")
def run(self):
self.connect_redis()
self.get_datetime()
self.get_day()
print('完成关闭浏览器')
time.sleep(3)
# Works whether ChromiumPage keeps the underlying browser object
# or is backed by a Selenium WebDriver
self.page_edge.quit()
if __name__ == '__main__':
TkVideo().run()