Commit badb8f44 by Peng

no message

parent a8be8ef1
import json
import pandas as pd
from secure_db_client import get_remote_engine
import time
from sqlalchemy import create_engine
class ConnectSpider:
    """Database access layer for the stock-image crawler.

    Wraps the remote engines returned by ``get_remote_engine`` and exposes the
    reads and writes performed on the ``stock_image_summary_wj``,
    ``stock_image_id_wj`` and ``stock_image_detail_wj`` tables.

    Most methods run their database call through :meth:`_retry`, which makes
    up to ``RETRY_ATTEMPTS`` attempts and waits ``RETRY_DELAY_SECONDS``
    between them; when every attempt fails they return ``None``.

    NOTE(review): the UPDATE/SELECT statements interpolate their arguments
    into the SQL text.  Only pass trusted values; switch to bound parameters
    if the remote engine supports them for these calls.
    """

    # Retry policy shared by every retried database call.
    RETRY_ATTEMPTS = 5
    RETRY_DELAY_SECONDS = 30

    def __init__(self):
        """Create the remote engine handles used by the other methods."""
        self.db_engine = get_remote_engine(
            site_name='us',  # -> database "selection"
            db_type="postgresql_14_outer",
        )
        # Second handle created with the same settings; the query and update
        # methods of this class all go through this one.
        self.db_engine192 = get_remote_engine(
            site_name='us',  # -> database "selection"
            db_type="postgresql_14_outer",
        )

    def _retry(self, action, report):
        """Call ``action`` until it succeeds or the attempts are used up.

        Args:
            action: zero-argument callable performing one attempt.
            report: one-argument callable receiving the exception raised by
                a failed attempt (used to print the method-specific message).

        Returns:
            The value returned by ``action``, or ``None`` when every attempt
            raised.
        """
        for attempt in range(1, self.RETRY_ATTEMPTS + 1):
            try:
                return action()
            except Exception as e:
                # Report first so the failure is visible before the wait.
                report(e)
                # No point in waiting once there is no attempt left.
                if attempt < self.RETRY_ATTEMPTS:
                    time.sleep(self.RETRY_DELAY_SECONDS)
        return None

    def mysql(self):
        """Return a freshly created remote engine for the "mysql" db type."""
        mysql_engine = get_remote_engine(
            site_name='us',  # -> database "selection"
            db_type="mysql",  # -> server-side alias "mysql"
        )
        return mysql_engine

    def save_stock_img_id(self, items):
        """Upsert image-id rows into ``stock_image_id_wj``.

        Args:
            items: iterable of dicts with the keys ``account_id``,
                ``image_id``, ``state`` and ``created_at``; ``title`` and
                ``sizes`` are optional and default to ``''`` and ``{}``.
        """
        # Upsert keyed on (account_id, image_id); image_size_info is sent as
        # a JSON string and cast to jsonb by the statement.
        sql = """
            INSERT INTO stock_image_id_wj
            (account_id, image_id, state, created_at, image_title, image_size_info)
            VALUES (%s, %s, %s, %s, %s, %s::jsonb)
            ON CONFLICT (account_id, image_id) DO UPDATE SET
            state = EXCLUDED.state,
            created_at = EXCLUDED.created_at,
            image_title = EXCLUDED.image_title,
            image_size_info = EXCLUDED.image_size_info;
        """
        params = [
            (
                item['account_id'],
                item['image_id'],
                item['state'],
                item['created_at'],
                item.get('title', ''),
                json.dumps(item.get('sizes', {})),
            )
            for item in items
        ]

        def attempt():
            with self.db_engine192.begin() as conn:
                conn.execute(sql, params)
            print('存储更新成功')

        self._retry(attempt, lambda e: print('save_stock_img_id 报错。', e))

    def update_id_to_3(self, account_id):
        """Set ``state = 3`` on the summary row(s) of ``account_id``."""
        table_name = "stock_image_summary_wj"
        sql_update = f"UPDATE {table_name} SET state = 3 WHERE account_id='{account_id}'"

        def attempt():
            with self.db_engine192.begin() as connection:
                print(account_id)
                connection.execute(sql_update)
                # Announce success only after the statement actually ran.
                print(sql_update, '成功更新为3')

        self._retry(attempt, lambda e: print('update_id_to_3 报错。', e))

    def update_all_states_to_1(self, state=1, item_id=None):
        """Update ``state`` in ``stock_image_summary_wj``.

        With ``state == 3`` only the row whose ``id`` equals ``item_id`` is
        updated; with any other value every row of the table is updated.

        Raises:
            ValueError: if ``state == 3`` and ``item_id`` is ``None`` (the
                statement would otherwise contain ``id=None`` and fail on
                every attempt).
        """
        table_name = "stock_image_summary_wj"
        if state == 3:
            if item_id is None:
                raise ValueError("item_id is required when state == 3")
            sql_update = f"UPDATE {table_name} SET state = {state} where id={item_id}"
        else:
            sql_update = f"UPDATE {table_name} SET state = {state}"

        def attempt():
            # begin() commits on success and rolls back on error.
            with self.db_engine192.begin() as connection:
                print(sql_update)
                connection.execute(sql_update)

        self._retry(attempt, lambda e: print(f'更新状态失败:{e}'))

    def save_stock_detail(self, item):
        """Append one image-detail record to ``stock_image_detail_wj``.

        Args:
            item: dict providing the columns listed in ``columns`` below.
        """
        table_name = "stock_image_detail_wj"
        columns = ['account_id', 'image_id', 'image_size_info', 'image_title',
                   'image_type', 'image_url', 'state', 'created_time']
        # A single-row frame so the engine's to_sql helper can be reused.
        df = pd.DataFrame([item], columns=columns)

        def attempt():
            self.db_engine192.to_sql(df, table_name, if_exists='append')
            print("保存成功!")

        self._retry(attempt, lambda e: print(f'save_stock_detail 报错:{e}'))

    def get_stock_images_id(self, account_id):
        """Return the pending (``state = 1``) images of ``account_id``.

        Returns:
            A list of strings of the form
            ``image_id||-||id||-||image_title||-||image_size_info``;
            ``False`` when the fetched rows cannot be turned into such
            strings; ``None`` when every read attempt failed.
        """
        table_name = "stock_image_id_wj"
        query = f""" SELECT image_id,id,image_title,image_size_info FROM {table_name} where account_id ='{account_id}' and state = 1"""

        def attempt():
            print(query)
            df_status = self.db_engine192.read_sql(query)
            try:
                # Both id columns are cast to str so the concatenation also
                # works when the database returns them as numbers.
                image_ids = df_status['image_id'].astype(str)
                row_ids = df_status['id'].astype(str)
                image_id_id_pairs = list(
                    image_ids + '||-||' + row_ids + '||-||'
                    + df_status['image_title'] + '||-||'
                    + df_status['image_size_info'])
                print(f'账号:{account_id}需爬取{len(image_id_id_pairs)}张')
                return image_id_id_pairs
            except Exception as e:
                # A formatting problem is not retried.
                print(e)
                return False

        return self._retry(attempt, lambda e: print(f'get_stock_images_id 报错:{e}'))

    def _set_image_id_state(self, item_id, state, report):
        """Set ``state`` on the ``stock_image_id_wj`` row with id ``item_id``."""
        table_name = "stock_image_id_wj"
        sql_update = f"UPDATE {table_name} SET state = {state} WHERE id = {item_id}"

        def attempt():
            with self.db_engine192.begin() as connection:
                connection.execute(sql_update)

        self._retry(attempt, report)

    def update_image_id_to_3(self, item_id):
        """Set ``state = 3`` on the ``stock_image_id_wj`` row ``item_id``."""
        self._set_image_id_state(
            item_id, 3, lambda e: print(f'update_image_id_to_3 报错:{e}'))

    def update_image_id_to_4(self, item_id):
        """Set ``state = 4`` on the ``stock_image_id_wj`` row ``item_id``."""
        self._set_image_id_state(
            item_id, 4, lambda e: print(f'update_image_id_to_4 报错:{e}'))

    def get_cookie_account(self, item_id):
        """Return ``[account_id, account_secret]`` for summary row ``item_id``.

        Only rows with ``state = 1`` are considered.

        Returns:
            The two-element list, or ``None`` when no such row exists or
            every read attempt failed.
        """
        table_name = "stock_image_summary_wj"
        query = f"""SELECT account_id,account_secret FROM {table_name} where id = {item_id} and state= 1;"""

        def attempt():
            print(query)
            df_status = self.db_engine192.read_sql(query)
            if len(df_status) == 0:
                return None
            account_id = df_status.account_id.iloc[0]
            account_secret = df_status.account_secret.iloc[0]
            # Log the account only; the secret must not end up in the output.
            print(account_id, '232323====32')
            return [account_id, account_secret]

        return self._retry(attempt, lambda e: print(f'get_cookie_account 报错:{e}'))

    def get_all_image_id(self):
        """Return every ``image_id`` in ``stock_image_detail_wj`` as strings.

        Returns:
            A list of str, or ``None`` when every read attempt failed.
        """
        table_name = "stock_image_detail_wj"
        sql_query = f"SELECT image_id FROM {table_name} "

        def attempt():
            df_status = self.db_engine192.read_sql(sql_query)
            return list(df_status['image_id'].astype(str))

        return self._retry(attempt, lambda e: print(f'get_all_image_id 报错:{e}'))

    def update_url_state_to_3(self, image_id):
        """Set ``state = 3`` on detail rows of ``image_id`` still in state 1."""
        table_name = "stock_image_detail_wj"
        sql_update = f"UPDATE {table_name} SET state = 3 WHERE image_id ='{image_id}' and state = 1"

        def attempt():
            with self.db_engine192.begin() as connection:
                connection.execute(sql_update)

        self._retry(attempt, lambda e: print(f'update_url_state_to_3 报错:{e}'))

    def get_pic_urls(self, account_id):
        """Return the pending detail rows of ``account_id`` as strings.

        Each entry has the form ``image_url||image_id||image_title``.  This
        method performs a single attempt (no retry).

        Returns:
            The list of strings, or ``False`` when there are no rows or the
            query failed.
        """
        table_name = "stock_image_detail_wj"
        query = f"""select image_url, image_id, image_title from {table_name} where account_id = '{account_id}' and state = 1"""
        try:
            result_df = self.db_engine192.read_sql(query)
            pic_data_list = [
                f"{row[0]}||{row[1]}||{row[2]}"
                for row in result_df.values.tolist()
            ]
            # An empty result is reported as False rather than [].
            return pic_data_list if pic_data_list else False
        except Exception as e:
            print(f"An error occurred: {e}")
            return False
# Manual check when the module is run directly: query the account stored in
# summary row 10.
if __name__ == '__main__':
    ConnectSpider().get_cookie_account(10)
# -*- coding: utf-8 -*-
# Module setup for the Shutterstock crawler script: imports interleaved with
# a few import-time side effects (sys.path tweak, DB helper construction,
# environment variable, warning capture).
import os
import sys
# Presumably needed so that `all_connect` (imported below) can be found.
sys.path.append(os.path.dirname(sys.path[0])) # parent directory
from time import sleep
from random import randint
from all_connect import ConnectSpider
import traceback
# Shared database helper used by the classes below; built at import time.
Con = ConnectSpider()
import imaplib
import email
import os
import time
import requests
import hashlib
# Sets NO_PROXY for this process.
os.environ['NO_PROXY'] = 'stackoverflow.com'
import logging
# Route warnings issued through the `warnings` module to the logging system.
logging.captureWarnings(True)
from DrissionPage import ChromiumPage,ChromiumOptions
import json
# NOTE: this rebinds the name `requests`; from here on `requests` refers to
# curl_cffi's module, not the package imported by `import requests` above.
from curl_cffi import requests
import re
import random
import time
from datetime import datetime, timedelta
import calendar
import sys
class GetStockImgId(object):
    """Collect the ids of images licensed last month for one account.

    Pages through the Shutterstock holdings search endpoint and stores every
    returned item through the module-level ``Con`` database helper.
    """

    def __init__(self):
        """Prepare the request headers sent with every holdings query."""
        self.headers = {
            'accept': 'application/json',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/json',
            'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiMjgzNzAxYzA5ODljNWI4YiIsInRyIjoiMDYwYTQwMzI4MjhiMGNlM2ZkZmJlYzAxNDU5NTVhZDUiLCJ0aSI6MTczNTg4NTk5ODcxOX19',
            'origin': 'https://www.shutterstock.com',
            'priority': 'u=1, i',
            'referer': 'https://www.shutterstock.com/zh/catalog/licenses?startDate=2024-12-01&endDate=2024-12-31',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'traceparent': '00-060a4032828b0ce3fdfbec0145955ad5-283701c0989c5b8b-01',
            'tracestate': '967232@nr=0-1-967232-1588632792-283701c0989c5b8b----1735885998719',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'x-end-app-name': 'next-web',
            'x-end-app-version': '837034fdc61',
            'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
            'x-request-id': 'c3a36b63-ff03-4c2f-9a94-5381cd4485a7',
        }

    def random_ua(self):
        """Replace the ``user-agent`` header with a randomly built Chrome UA."""
        major = random.randint(55, 62)
        build = random.randint(0, 3200)
        patch = random.randint(0, 140)
        platforms = [
            '(Windows NT 6.1; WOW64)',
            '(Windows NT 10.0; WOW64)',
            '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)',
        ]
        chrome_version = f'Chrome/{major}.0.{build}.{patch}'
        self.headers['user-agent'] = ' '.join([
            'Mozilla/5.0', random.choice(platforms), 'AppleWebKit/537.36',
            '(KHTML, like Gecko)', chrome_version, 'Safari/537.36',
        ])

    def get_url_month(self, page, cookie, start_date, last_date):
        """Request one page of holdings licensed in the given date range.

        Args:
            page: page number to request (50 items per page).
            cookie: cookies of the logged-in session.
            start_date: first day of the range, ``YYYY-MM-DD``.
            last_date: last day of the range, ``YYYY-MM-DD``.

        Returns:
            The response object returned by ``requests.get``.
        """
        url = f"https://www.shutterstock.com/napi/s/dam/holdings/search?include=media-item%2Cmedia-item.track-assets%2Cmedia-item.cms-entry&sort=-licensedAt&useMms=true&channel=shutterstock&page[size]=50&filter[licensedSince]={start_date}T00%3A00%3A00Z&filter[licensedUntil]={last_date}T23%3A59%3A59Z&page[number]={page}&filter[assetStatus]=comped%2Clicensed&language=zh"
        print('url:', url)
        response = requests.get(url, headers=self.headers, cookies=cookie)
        print(response)
        print(response.url)
        return response

    def get_img_id(self, response, account_id, page):
        """Store the items of one result page.

        Args:
            response: response returned by :meth:`get_url_month`.
            account_id: account the items belong to.
            page: page number, used only in the progress message.

        Returns:
            ``True`` when the page contained items and they were handed to
            ``Con.save_stock_img_id``; ``False`` when the page was empty or
            could not be processed.

        NOTE(review): any processing error is reported the same way as an
        empty page, so the caller will stop paging; confirm this is intended
        for errors other than a missing ``included`` key.
        """
        try:
            data = response.json()['included']
            if not data:
                print('最后一页,全部保存成功')
                return False
            created_at = None
            data_list = []
            for item in data:
                attributes = item['attributes']
                created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                data_list.append({
                    'account_id': account_id,
                    'image_id': int(item['id']),
                    'title': attributes['title'],
                    'sizes': attributes['sizes'],
                    'state': 1,
                    'created_at': created_at,
                })
            print('准备保存:')
            Con.save_stock_img_id(data_list)
            print(f"{account_id}第{page}页保存id成功,")
            return True
        except Exception as e:
            print(e)
            # A response without the 'included' key marks the end of paging.
            if 'included' in str(e):
                print('最后一页,全部保存成功')
            return False

    def get_last_month_start_end(self):
        """Return the first and last day of the previous calendar month.

        Returns:
            ``(start_date, last_date)`` as ``YYYY-MM-DD`` strings.
        """
        first_of_this_month = datetime.today().replace(day=1)
        # The day before the 1st of this month is the last day of last month.
        last_of_last_month = first_of_this_month - timedelta(days=1)
        first_of_last_month = last_of_last_month.replace(day=1)
        return (first_of_last_month.strftime('%Y-%m-%d'),
                last_of_last_month.strftime('%Y-%m-%d'))

    def run(self, account_id, cookie):
        """Page through last month's holdings of ``account_id`` and store them.

        When the last page is reached the account is marked with
        ``Con.update_id_to_3``.

        Raises:
            RuntimeError: when a request returns a status code other than 200.
            Exception: any other error raised while fetching or storing a
                page is printed and re-raised.
        """
        page = 1
        start_date, last_date = self.get_last_month_start_end()
        print(f"Start Date: {start_date}")
        print(f"Last Date: {last_date}")
        while True:
            try:
                response = self.get_url_month(page, cookie, str(start_date), str(last_date))
                if response.status_code != 200:
                    # Raise an explicit error carrying the status code; a
                    # bare ``raise`` here has no active exception to re-raise.
                    raise RuntimeError(f'状态码为{response.status_code}, 请求失败')
                is_continue = self.get_img_id(response, account_id, page)
                print('is_continue:', is_continue)
                if not is_continue:
                    # Nothing more to fetch: mark the account as finished.
                    Con.update_id_to_3(account_id)
                    break
                # Pause before requesting the next page.
                time.sleep(random.randint(3, 6))
                page += 1
            except Exception as e:
                print(e)
                # Propagate so the caller can stop processing.
                raise
class GetSS_details():
    """Log into Shutterstock accounts and collect download URLs of images.

    Drives a Chromium browser through DrissionPage to log in, reads the
    e-mailed verification code over IMAP when the site asks for one, requests
    re-download URLs for the licensed images and forwards the result to the
    database (module-level ``Con``) and to an internal HTTP service.

    NOTE(review): the mailbox password, the service secret and the service
    token are hard-coded below; consider loading them from configuration.
    """

    def __init__(self):
        """Start the browser, build request headers and fetch a service token."""
        self.account = ''
        self.pwd = ''
        # Defined up front so the attribute exists even if the token request
        # below fails on every attempt.
        self.token = ''
        # Chrome with a fixed remote-debugging port.
        chrome_options = ChromiumOptions()
        chrome_options.set_browser_path(r'C:\Program Files\Google\Chrome\Application\chrome.exe')
        chrome_options.set_local_port(9333)
        self.page = ChromiumPage(addr_or_opts=chrome_options)
        print(f"Chrome 浏览器运行在端口: {9333}")
        self.headers = {
            'accept': 'application/json',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/json',
            'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6Ijk2NzIzMiIsImFwIjoiMTU4ODYzMjc5MiIsImlkIjoiMDdjNDZhYTI3ZTBlMTAyZiIsInRyIjoiOGI4ODQ3MzNiNjFjNDNlY2YxMGEzOTQ2MzQ4MDE2NzQiLCJ0aSI6MTczNTk5NzEzNjEyOH19',
            'origin': 'https://www.shutterstock.com',
            'priority': 'u=1, i',
            'referer': 'https://www.shutterstock.com/zh/catalog/',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'traceparent': '00-8b884733b61c43ecf10a394634801674-07c46aa27e0e102f-01',
            'tracestate': '967232@nr=0-1-967232-1588632792-07c46aa27e0e102f----1735997136128',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'x-end-app-name': 'next-web',
            'x-end-app-version': '5ca4a4c05d8',
            'x-newrelic-id': 'XQAAU1VRGwIEVVhaBgYGUlI=',
            'x-request-id': '15754a73-f152-4983-99b4-6af058379880',
        }
        # Mailbox that receives the verification codes.
        self.email_value_config = {
            'imap_server': 'imap.exmail.qq.com',
            'username': 'pengyanbing@yswg.com.cn',
            'password': 'Python3.8',
        }
        self.get_microservice_token()

    def get_ck(self):
        """Open the licenses page and return the browser cookies.

        Returns:
            A ``{name: value}`` dict, or ``None`` when reading the cookies
            failed.
        """
        print('获取登录后的cookie')
        try:
            self.page.get('https://www.shutterstock.com/zh/catalog/licenses')
            sleep(randint(2, 4))
            original_cookie_dict = {
                cookie['name']: cookie['value'] for cookie in self.page.cookies()
            }
            print('original_cookie_dict::', original_cookie_dict)
            return original_cookie_dict
        except Exception as e:
            print('获取cookie出错:', e)

    def login_out(self):
        """Log out through the avatar menu, if an avatar element is found."""
        # The avatar's generated class differs between page variants; use
        # the first selector that matches.
        avatar_selectors = (
            '.MuiAvatar-root MuiAvatar-circular MuiAvatar-colorDefault mui-9jj0tt-avatarSize',
            '.MuiAvatar-root MuiAvatar-circular MuiAvatar-colorDefault mui-1jeofke',
            '.MuiAvatar-root MuiAvatar-circular MuiAvatar-colorDefault mui-1ki7tcg',
        )
        for selector in avatar_selectors:
            avatar = self.page.ele(selector)
            if avatar:
                avatar.click()
                sleep(randint(2, 4))
                self.page.ele('@text()=登出').click()
                return

    def decode_body(self, body):
        """Decode an e-mail payload, trying several encodings in turn.

        Args:
            body: payload bytes, or ``None`` when the message part has no
                decodable payload.

        Returns:
            The decoded text; ``''`` for a ``None`` payload.
        """
        if body is None:
            return ''
        for encoding in ('utf-8', 'gb18030', 'iso-8859-1', 'latin1'):
            try:
                return body.decode(encoding)
            except UnicodeDecodeError:
                continue
        # Last resort: latin1 with replacement of undecodable bytes.
        return body.decode('latin1', errors='replace')

    def extract_verification_code(self, text_body):
        """Return the six-digit code found in ``text_body``, or ``None``."""
        patterns = [
            r'以验证您的身份:(\d{6})',
            r'一次性密码:(\d{6})',
            r'验证码:(\d{6})',
            r'(\d{6})\s*此密码仅可使用一次',
        ]
        for pattern in patterns:
            match = re.search(pattern, text_body)
            if match:
                return match.group(1)
        return None

    def _code_from_message(self, email_message):
        """Return the verification code contained in one message, or ``None``."""
        if email_message.is_multipart():
            # Only the plain-text parts are inspected.
            bodies = [
                part.get_payload(decode=True)
                for part in email_message.walk()
                if part.get_content_type() == 'text/plain'
            ]
        else:
            bodies = [email_message.get_payload(decode=True)]
        for body in bodies:
            text_body = self.decode_body(body)
            if 'shutterstock' in text_body:
                verification_code = self.extract_verification_code(text_body)
                if verification_code:
                    return verification_code
        return None

    def fetch_verification_code(self, email_value_config):
        """Read the verification code from the mailbox.

        Messages from ``noreply@shutterstock.com`` are scanned starting with
        the last id returned by the search, and the first code found is
        returned.

        Args:
            email_value_config: dict with ``imap_server``, ``username`` and
                ``password``.

        Returns:
            The code as a string, or ``None`` when none was found or an
            error occurred.
        """
        mail = None
        try:
            mail = imaplib.IMAP4_SSL(email_value_config['imap_server'])
            mail.login(email_value_config["username"], email_value_config["password"])
            mail.select('inbox')
            result, data = mail.search(None, '(FROM "noreply@shutterstock.com")')
            if result != 'OK':
                print("没有找到邮件")
                return None
            for email_id in reversed(data[0].split()):
                result, data = mail.fetch(email_id, "(RFC822)")
                email_message = email.message_from_bytes(data[0][1])
                verification_code = self._code_from_message(email_message)
                if verification_code:
                    print("验证码是:", verification_code)
                    return verification_code
            print("没有找到符合条件的邮件")
            return None
        except imaplib.IMAP4.error as e:
            print(f"IMAP4 error: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        finally:
            # Single cleanup point.  close() and logout() are attempted
            # independently so a failing close() does not skip the logout.
            if mail is not None:
                for cleanup in (mail.close, mail.logout):
                    try:
                        cleanup()
                    except Exception:
                        pass
        return None

    def yxyzm(self):
        """Wait for the verification e-mail and submit the code in the login frame."""
        print('需要输入邮箱验证码 等待2分钟')
        sleep(randint(62, 140))
        iframe = self.page.get_frame('#login-iframe')
        sleep(randint(2, 4))
        yzm = self.fetch_verification_code(self.email_value_config)
        try:
            print('验证码输入')
            yzm_input = iframe.ele('@text()=输入代码')
            sleep(randint(2, 4))
            yzm_input.input(yzm)
        except Exception:
            # Fall back to locating the input by its class names.
            yzm_input = iframe.ele(
                '.MuiInputBase-input MuiInput-input MuiInputBase-inputSizeSmall css-186x7cf')
            sleep(randint(2, 4))
            yzm_input.input(yzm)
        print('点击验证')
        iframe.ele('@text()=验证').click()

    def get_microservice_token(self):
        """Request a service token and store it in ``self.token``.

        Up to five attempts are made, 20 seconds apart.  ``self.token`` is
        left unchanged when every attempt fails.
        """
        for i in range(5):
            try:
                url = "http://wx.yswg.com.cn:8000/microservice-system/system/admin/getToken"
                timestamp = str(int(time.time()))
                secret = "dafa17fb-0e97-4246-a6b3-d574e44d212d"
                # The request carries md5(secret + timestamp), not the secret.
                md5_value = hashlib.md5((secret + timestamp).encode("utf-8")).hexdigest()
                response = requests.post(url, json={
                    "module": "spider",
                    "weChatId": "pengyanbing",
                    "secret": md5_value,
                    "timestamp": timestamp
                })
                res = response.json()
                print(res)
                if res['code'] != 200:
                    raise Exception(res['msg'])
                userinfo = res['data']
                self.token = userinfo['token']
                expireTime = userinfo['expireTime']
                print(self.token, expireTime)
                break
            except Exception as e:
                print('get_microservice_token, 报错', e)
                time.sleep(20)

    def login(self):
        """Log in with ``self.account`` / ``self.pwd`` and return the cookies.

        Returns:
            The cookie dict from :meth:`get_ck` (possibly ``None``), or
            ``False`` when filling in the login form failed.
        """
        try:
            self.page.get('https://www.shutterstock.com/zh/catalog/')
            sleep(randint(12, 24))
            try:
                # Dismiss the "No thanks" prompt when it is shown.
                print('click No thanks')
                login_button = self.page.ele('xpath://a[@id="continue"]', timeout=15)
                login_button.click()
            except Exception:
                print('No thanks 错误')
            # The password is deliberately not written to the output.
            print('开始登录。', self.account)
            login_button = self.page.ele('xpath://a[@data-automation="loginButton"]', timeout=15)
            login_button.click()
            sleep(randint(12, 24))
            # The login form lives inside an iframe.
            iframe = self.page.get_frame('#login-iframe')
            print('已切换到 login-iframe')
            print("正在等待邮箱输入框...")
            sleep(15)
            email_input = iframe.ele('xpath://input[@name="username"]')
            email_input.clear()  # drop any pre-filled content
            email_input.input(self.account)
            print("已输入账号到邮箱输入框")
            sleep(randint(2, 4))
            print("正在等待密码输入框...")
            password_input = iframe.ele(
                '.MuiInputBase-input MuiInput-input MuiInputBase-inputSizeSmall MuiInputBase-inputAdornedEnd css-186x7cf')
            password_input.clear()  # drop any pre-filled content
            password_input.input(self.pwd)
            print("已输入密码到密码输入框")
            sleep(randint(3, 5))
            print('查找并点击登录按钮')
            try:
                submit_button = iframe.ele('.LoginForm_bottomSpacingMd__e2Mnm')
                submit_button.click()
            except Exception:
                # Fall back to the button's full class list.
                print('切换点击')
                sleep(randint(3, 4))
                iframe.ele('.MuiButtonBase-root MuiButton-root MuiButton-contained MuiButton-containedPrimary MuiButton-sizeMedium MuiButton-containedSizeMedium MuiButton-disableElevation MuiButton-fullWidth css-1is1osn').click()
            print('已点击登录...')
            sleep(randint(8, 15))
        except Exception as e:
            print(f"出现错误: {e}", f"\n{traceback.format_exc()}")
            return False
        try:
            print(33333333333)
            iframe = self.page.get_frame('#login-iframe')
            sleep(randint(4, 8))
            # Several probes for the "enter verification code" screen.
            h3_element = iframe.ele(
                '.FormHeader_root__fHtRy wrapper-component_center__zG6GW')
            h3_ = iframe.ele('@text()=输入验证代码')
            h3_1 = iframe.ele('xpath://h3[contains(text(),"输入验证代码")]')
            P_text1 = iframe.ele('xpath://p[contains(text(),"未收到代码?单击")]', timeout=15)
            P_text2 = iframe.ele('xpath://p[contains(text(),"电子邮件中")]', timeout=15)
            if h3_element or h3_ or h3_1 or P_text1 or P_text2 or '输入验证代码' in iframe.html or '输入验证代码' in self.page.html:
                self.yxyzm()
            else:
                print('不需要验证码')
            sleep(10)
            self.page.refresh()
            sleep(randint(5, 10))
            self.page.get('https://www.shutterstock.com/zh/catalog/licenses')
            sleep(randint(4, 8))
            return self.get_ck()
        except Exception as e:
            # Treated as "no verification needed": just read the cookies.
            print(e)
            print('不需要验证码11111111111')
            sleep(randint(5, 8))
            return self.get_ck()

    def transmission_api(self, account_id, image_id, image_size_info, image_title, image_type, image_url):
        """Send one image record to the internal ``saveImageDetail`` service.

        One initial attempt plus up to three retries are made; after a failed
        attempt that will be retried, the service token is refreshed.

        Returns:
            The decoded JSON body of the first response with status 200.

        Raises:
            Exception: when no attempt returned status 200.
        """
        url = 'http://wx.yswg.com.cn:8000/microservice-visual/visual/fileSystem/saveImageDetail?token=dacce869-0471-4ec7-ac50-3b3b1ec22c87'
        data_json = json.dumps({
            'accountId': account_id,
            'imageId': image_id,
            'imageSizeInfo': image_size_info,
            'imageTitle': image_title,
            'imageType': image_type,
            'imageUrl': image_url,
        })
        max_retries = 3
        for retries in range(max_retries + 1):
            # Rebuilt on every attempt because the token may have changed.
            headers = {
                "authorization": self.token
            }
            try:
                response = requests.post(url, data=data_json, headers=headers)
                if response.status_code == 200:
                    return response.json()
                print(url, '2323')
                print(f'请求失败,状态码: {response.status_code},重试 ({retries}/{max_retries})')
            except requests.exceptions.RequestException as e:
                print(f'请求异常: {e},重试 ({retries}/{max_retries})')
            if retries < max_retries:
                self.get_microservice_token()
        raise Exception(f'请求失败,已达到最大重试次数:{max_retries} 次')

    def _redownload_url(self, cookies, content):
        """POST one re-download request and return its ``downloadUrl``.

        A response without the expected keys raises ``KeyError``; callers
        rely on the missing key name appearing in the error text.
        """
        json_data = {
            'required_cookies': '',
            'content': [content],
        }
        response = requests.post(
            'https://www.shutterstock.com/napi/licensees/current/redownload',
            cookies=cookies,
            headers=self.headers,
            json=json_data,
            timeout=600
        )
        return json.loads(response.text)['meta']['licensedContent'][0]['downloadUrl']

    def get_jpg(self, cookies, image_id):
        """Return the download URL of the "huge" JPG rendition of ``image_id``."""
        return self._redownload_url(cookies, {
            'content_id': f'{image_id}',
            'content_type': 'photo',
            'content_size': 'huge',
            'content_format': 'jpg',
            'license_name': 'standard',
            'show_modal': True,
        })

    def get_png(self, cookie, image_id):
        """Return the download URL of the "large" PNG rendition of ``image_id``."""
        return self._redownload_url(cookie, {
            'content_id': f'{image_id}',
            'content_type': 'photo',
            'content_size': 'large',
            'content_format': 'png',
            'include_shadows': True,
            'angle': 'G03',
            'license_name': 'standard',
            'show_modal': True,
        })

    def get_pic(self, account_id, image_id, item_id, image_title, image_size_info, cookie, wait_time):
        """Fetch the download URL of one image and record it.

        The JPG rendition is requested first; when that fails with an error
        mentioning ``meta`` the PNG rendition is requested instead.  The
        record is saved through ``Con``, the id row is set to state 3 and the
        record is sent to :meth:`transmission_api`.

        Args:
            account_id: account the image belongs to.
            image_id: Shutterstock image id.
            item_id: id of the row in the image-id table.
            image_title: title stored with the image.
            image_size_info: size information stored with the image.
            cookie: cookies of the logged-in session.
            wait_time: seconds to sleep after a successful fetch.

        Returns:
            ``True`` on success; ``False`` when the image was marked as
            state 4 or the retries were used up.
        """
        retry = 0
        max_retries = 3
        # Waits (seconds) before the 1st, 2nd and 3rd retry of a generic error.
        backoff_ranges = ((60, 180), (180, 240), (1800, 1900))
        while retry <= max_retries:
            try:
                start_time = datetime.now().strftime("%m-%d %H:%M:%S")
                all_image_id = Con.get_all_image_id()
                if str(image_id) in all_image_id:
                    print(f'{image_id}已上传过')
                    state = 3
                else:
                    state = 1
                try:
                    image_url = self.get_jpg(cookie, image_id)
                    image_type = 'jpg'
                except Exception as e:
                    if 'meta' not in str(e):
                        raise
                    # JPG not available: try PNG.
                    image_url = self.get_png(cookie, image_id)
                    image_type = 'png'
                item = {
                    'account_id': account_id,
                    'image_id': image_id,
                    'image_size_info': image_size_info,
                    'image_title': image_title,
                    'image_type': image_type,
                    'image_url': image_url,
                    'state': state,
                    'created_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                Con.save_stock_detail(item)
                Con.update_image_id_to_3(item_id)
                self.transmission_api(account_id, image_id, image_size_info, image_title, image_type, image_url)
                now_time = datetime.now().strftime("%m-%d %H:%M:%S")
                print(f'pic_name:{image_title[:38]},time:{start_time}——{now_time}爬取成功')
                time.sleep(wait_time)
                return True
            except Exception as e:
                logging.error(f"发生错误: {e}")
                retry += 1
                message = str(e)
                if 'image_title' in message:
                    Con.update_image_id_to_4(item_id)
                    print(f'{image_id}过期 修改为4')
                    return False
                if retry > max_retries:
                    logging.warning("超过重试次数,跳过该图片")
                    return False
                if 'meta' in message:
                    logging.warning(f"meta 错误,等待两小时刷新页面 第{retry}次重试...")
                    time.sleep(7200)
                    self.page.get('https://www.shutterstock.com/zh/catalog/')
                else:
                    low, high = backoff_ranges[retry - 1]
                    sleep_time = random.randint(low, high)
                    logging.warning(f"未知错误,等待{sleep_time}s 第{retry}次重试...")
                    time.sleep(sleep_time)

    def run_get_stock_img_id(self, account, cookie):
        """Run ``GetStockImgId.run`` and report success as a boolean.

        Returns:
            ``True`` when the run finished, ``False`` when it raised.
        """
        try:
            get_img_id = GetStockImgId()
            get_img_id.run(account, cookie)
            return True
        except Exception as e:
            logging.error(f"Error occurred in GetStockImgId.run(): {e}")
            return False

    def run(self):
        """Process summary rows 1..32: log in, collect ids, fetch every image.

        The program exits with status 1 when collecting the image ids of an
        account fails, and stops iterating when :meth:`get_pic` reports
        failure.
        """
        day = time.strftime("%d")
        for item_id in range(1, 33):
            print(f"开始抓取 item_id: {item_id}")
            # Clear cache and session data so the next account can log in
            # even if the previous one was never logged out.
            self.page.clear_cache()
            if item_id == 1 and int(day) < 2:
                Con.update_all_states_to_1(state=2)
            wait_time = random.uniform(6, 10)
            account_list = Con.get_cookie_account(item_id)
            if not account_list:
                continue
            self.account = account_list[0]
            self.pwd = account_list[1]
            cookie = self.login()
            if not self.run_get_stock_img_id(self.account, cookie):
                logging.critical("Stopping the entire program due to critical error.")
                sys.exit(1)
            image_id_id_pairs = Con.get_stock_images_id(self.account)
            if not image_id_id_pairs:
                print(f'{self.account} 已全部爬取完成')
                Con.update_all_states_to_1(state=3, item_id=item_id)
                continue
            counts_last = len(image_id_id_pairs)
            stop_flag = False
            for count in range(counts_last):
                image_id, item_id_str, image_title, image_size_info = image_id_id_pairs[count].split('||-||')
                print(f'执行 {self.account}: {image_id}, {item_id_str}, 计数: {count}')
                try:
                    chong_shi = self.get_pic(self.account, image_id, item_id_str, image_title,
                                             image_size_info, cookie, wait_time)
                    if not chong_shi:
                        stop_flag = True
                        break
                except Exception as e:
                    if 'Expected axis has 0 elements, new values have 2 elements' in str(e):
                        print(f'{self.account} 已全部爬取完成')
                        time.sleep(10)
                    else:
                        logging.error(f'发生错误: {e}, 停止循环')
                        break
                if count == counts_last - 1:
                    # Last image handled: mark the account as finished.
                    print(f'{self.account} 全部爬取完成1122==')
                    Con.update_all_states_to_1(state=3, item_id=item_id)
            if stop_flag:
                print('超过重试次数,暂停')
                logging.warning('超过重试次数,暂停')
                break
# Script entry point: constructing GetSS_details starts the browser and
# requests a service token; run() then processes the accounts.
if __name__ == '__main__':
    GetSS_details().run()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment