Commit a5677f43 by Peng

no message

parent 000d315d
......@@ -7,6 +7,12 @@ from flask import Flask, request, jsonify
app = Flask(__name__)
logging.getLogger('ppocr').setLevel(logging.ERROR)
# 全局只实例化一次
ocr_instances = {}
def get_ocr(language):
if language not in ocr_instances:
ocr_instances[language] = PaddleOCR(use_angle_cls=True, lang=language)
return ocr_instances[language]
def OCR(language, url):
......@@ -14,7 +20,8 @@ def OCR(language, url):
# language = 'en'
# url = 'http://soundasia.oss-cn-shenzhen.aliyuncs.com/product_img/2024/12/05/EQ6368_1733363300_%E7%94%BB%E6%9D%BF%203.jpg'
# path = r'D:\新建文件夹\requests_files'
ocr = PaddleOCR(use_angle_cls=True, lang=language)
# ocr = PaddleOCR(use_angle_cls=True, lang=language)
ocr = get_ocr(language)
result = ocr.ocr(url, cls=True)
data_list = []
for idx in range(len(result)):
......
......@@ -316,6 +316,10 @@ def junglescout_spider(db_base):
# cursor = db.cursor(cursor=pymysql.cursors.DictCursor) # 以字典的格式输出
print(category_name['name'], category_name['c_id'])
name_rnak_list = []
up_sql = f"UPDATE all_site_category set state=2 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'"
print('更新状态:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
rank_list = [1, 10, 30, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500,
1600,
1700,
......@@ -350,12 +354,12 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1752031904,1752460043,1752653436,1753353401; HMACCOUNT=800EBCCFB4C6BBFB; 894cdd1d9741ce0c9757=827b7d3d13ed7bd6b4b1b24d0246b3dc; 3d854e1bcd61963fdf05=38fcb3b742a48aa345ddfd7136bc60ee; _gaf_fp=f297033bfe53aa9891ffe2842271566b; _gcl_au=1.1.420472597.1749119222.1054917286.1753685435.1753685437; rank-guest-user=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9lihUCQIaVmrHXjbpSRP/Ca0F; rank-login-user=6303473571KK6FnhfedvWg9tSSyk3xj2GRIc/8HSm4vuPYVHI5vKLXnssgei5ccK1dG8fkQSFI; rank-login-user-info=eyJuaWNrbmFtZSI6IuW4heWTpSIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTgzKioqKjczNDciLCJ0b2tlbiI6IjYzMDM0NzM1NzFLSzZGbmhmZWR2V2c5dFNTeWszeGoyR1JJYy84SFNtNHZ1UFlWSEk1dktMWG5zc2dlaTVjY0sxZEc4ZmtRU0ZJIn0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJKc2pZSlZWeFZzTVptVWFvMzgtZ3RRIiwiaWF0IjoxNzUzNjg1NDM2LCJleHAiOjE3NTM3NzE4MzYsIm5iZiI6MTc1MzY4NTM3Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIxNSwicGkiOm51bGwsIm5uIjoi5biF5ZOlIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxODMwNzk2NzM0NyIsImVtIjoiMzE1OTk4MDg5MkBxcS5jb20iLCJtbCI6IkcifQ.EaQ7Md7iVOpjZDogkiS2DlndhFPt3GzL2t33LXnh9Z5Itr3A8scFM_tzrYuzXqF6a-BDIMFe90SdDtU18zs9WTTl6_Phv3AEqcDe6WDfPAhB_KMa15VYAE5-b9d3lgIukKR8ZZyAMpiJzcmIWShmqxrhCNQD0ER3b7idaJpSrJiKnwV-tj6La52WJ6BmVRAk8gst0p5h-SYVnNz9iNaSXLc2Dx-hHZvMVNU27yfbJgKPpzRxgh7TOD7O-cT0WrEoKvTSw9e81gG9bgvKuA_bD-z3ePhgM6prUfceWszD88KH8PcXua9s_8ZM4bgrMyKMHswLtwyLhWePcvtHUp6yyQ; ao_lo_to_n=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9liibP9br/hwQ1Dlb4xDZyVPrTQIst5JCVz4PpnUIlDMGE07YVPYBWOm3Hrx4PaVkgaQ=; _ga_38NCVF2XST=GS2.1.s1753685428$o61$g1$t1753685444$j44$l0$h984121357; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1753685445; _ga_CN0F80S6GL=GS2.1.s1753685429$o59$g1$t1753685445$j44$l0$h0; JSESSIONID=F09543D3A3D6F890BAD0F422FCA49942',
'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754014538; HMACCOUNT=9F9252C9CBCC28DF; _gcl_au=1.1.1089500616.1754014538; _ga=GA1.1.420464702.1754014538; MEIQIA_TRACK_ID=30fNCDIKt41VFprESpAsdxA93ss; MEIQIA_VISIT_ID=30fNCEDPMElClEdkflJq4o3vq1u; ecookie=vR8uQMtRUHf2GPuw_CN; e6d47b7933e377ecd062=54dab66cb6737167d2467f75c45d482f; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=909522d879232499acd9b4ddab672d19; current_guest=hsba8eOK1Dg5_250801-107899; rank-login-user=4412704571jC8vqVc/Rw3YJBDFuUDJtYCCpovzEIJtbd/qlmC8t917Mll118BEKfWZetMkVyfW; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjQ0MTI3MDQ1NzFqQzh2cVZjL1J3M1lKQkRGdVVESnRZQ0Nwb3Z6RUlKdGJkL3FsbUM4dDkxN01sbDExOEJFS2ZXWmV0TWtWeWZXIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJHOWQwOEtTcnVjdEgwcXZWZm1XNnlBIiwiaWF0IjoxNzU0MDE0NTQ0LCJleHAiOjE3NTQxMDA5NDQsIm5iZiI6MTc1NDAxNDQ4NCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Cwkc0tf7KniQbUgRyiZw8UED5dm3y8dOrK04ejg4a45H-W3FEBpQ6ERU8V7TTy2qKOJf8j1swyVxRIqJDrGRSwe4FBr8EKLsoZtxRe6DR0LYGx8xMmWmfUmVmwcBHR2M62RZlDO-fjvVPBuZwcLyUuslq2PZen2ugOUzdfQDHQJV8UMmWUvt1zHjjQZrRlda1tK0_TuHt8dBCZ-sC_CIooCAvXYYfMUSMeT2w_QmgFPc_EIozNKvv7EDzqisT4pR5AWKDdfoVUSWFBIVNwoulIMdtKLsVrlL8Xiq_2l3mG9NCfE0recVIGCRhV52lwWD3vT1O3bpCT-usWv0hXVgZA; ao_lo_to_n="4412704571jC8vqVc/Rw3YJBDFuUDJtakIKSNY+NxiJnARSLNieFEjr7klDXJb6hxls+GbtooUDUaltNvx27xhoy1Atnktnv2cJc/ZHsk63L1rTYoE0Cs="; rank-guest-user=5412704571Cjn566YjkeJT1oMWIEZzHnz3datXkhqNVFcNKbPORkfvRQdV46mvORTmyl8ul2JV; _ga_38NCVF2XST=GS2.1.s1754014538$o1$g1$t1754014551$j47$l0$h1907321182; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754014552; _ga_CN0F80S6GL=GS2.1.s1754014539$o1$g1$t1754014552$j47$l0$h0; JSESSIONID=A5AEB0B286BCE5A27600AD3BD1DD6445',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": "DE",
"station": "UK",
"cid": category_name['c_id'], # 分类id
"bsr": f"{i}" # 排名
}
......@@ -431,7 +435,7 @@ def save_site_category(site_bsr_dict=None):
def run():
# get_cid()
junglescout_spider('de')
junglescout_spider('uk')
if __name__ == '__main__':
......
......@@ -296,40 +296,40 @@ else:
"pg_host": "61.145.136.61",
}
# 连接pg6数据库参数 113.100.143.162:5432
if platform.system().lower() == 'windows':
PG_CONN_DICT_6 = {
"pg_port": 5432,
"pg_db": "selection",
"pg_user": "postgres",
"pg_pwd": "F9kL2sXe81rZq",
"pg_host": "113.100.143.162",
}
else:
PG_CONN_DICT_6 = {
"pg_port": 5432,
"pg_db": "selection",
"pg_user": "postgres",
"pg_pwd": "F9kL2sXe81rZq",
"pg_host": "113.100.143.162",
}
# 连接pg12数据库参数 113.100.143.162:5443
if platform.system().lower() == 'windows':
PG_CONN_DICT_21 = {
"pg_port": 5443,
"pg_db": "selection",
"pg_user": "postgres",
"pg_pwd": "F9kL2sXe81rZq",
"pg_host": "113.100.143.162",
}
else:
PG_CONN_DICT_21 = {
"pg_port": 5443,
"pg_db": "selection",
"pg_user": "postgres",
"pg_pwd": "F9kL2sXe81rZq",
"pg_host": "113.100.143.162",
}
# 连接pg6数据库参数 113.100.143.162:5432 弃用
# if platform.system().lower() == 'windows':
# PG_CONN_DICT_6 = {
# "pg_port": 5432,
# "pg_db": "selection",
# "pg_user": "postgres",
# "pg_pwd": "F9kL2sXe81rZq",
# "pg_host": "113.100.143.162",
# }
# else:
# PG_CONN_DICT_6 = {
# "pg_port": 5432,
# "pg_db": "selection",
# "pg_user": "postgres",
# "pg_pwd": "F9kL2sXe81rZq",
# "pg_host": "113.100.143.162",
# }
# 连接pg12数据库参数 113.100.143.162:5443 弃用
# if platform.system().lower() == 'windows':
# PG_CONN_DICT_21 = {
# "pg_port": 5443,
# "pg_db": "selection",
# "pg_user": "postgres",
# "pg_pwd": "F9kL2sXe81rZq",
# "pg_host": "113.100.143.162",
# }
# else:
# PG_CONN_DICT_21 = {
# "pg_port": 5443,
# "pg_db": "selection",
# "pg_user": "postgres",
# "pg_pwd": "F9kL2sXe81rZq",
# "pg_host": "113.100.143.162",
# }
# doris
if platform.system().lower() == 'windows':
......
......@@ -477,7 +477,7 @@ class async_asin_pg():
def run(self):
asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B0BPKK2BMN|2025-01|1|1|null|null']
# asin_list = ['B0DSBTYG6W|2025-01|1|1|null|null']
if asin_list:
for asin in asin_list:
self.queries_asin_queue.put(asin)
......
......@@ -30,7 +30,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class search_temp_pg(BaseUtils):
def __init__(self, site_name='us', read_size=300, proxy_name=None, week=None, month=None):
def __init__(self, site_name='uk', read_size=300, proxy_name=None, week=None, month=None):
super().__init__()
self.site_name = site_name # 站点
self.month = month
......@@ -237,7 +237,7 @@ class search_temp_pg(BaseUtils):
self.kafuka_producer.close()
def run_pol(self):
search_term_list = ["1|-|srs|-|https://www.amazon.com/s?me=AI6OQ4YYFMZUE&marketplaceID=ATVPDKIKX0DER"]
search_term_list = ["1|-|srs|-|https://www.amazon.co.uk/s?k=power%20shower&page=1"]
if search_term_list:
if self.cookies_queue.empty():
cookies_dict = self.reuests_para_val.get_cookie()
......@@ -247,7 +247,7 @@ class search_temp_pg(BaseUtils):
for search_url in search_term_list:
self.search_term_queue.put(search_url)
html_thread = []
for i in range(15):
for i in range(1):
thread2 = threading.Thread(target=self.get_search_kw, args=(i,))
html_thread.append(thread2)
for ti in html_thread:
......
import pandas as pd
import requests, time
from typing import List
# -------- 映射字典 --------
site_name_db_dict = {
"us": "selection",
"uk": "selection_uk",
"de": "selection_de",
"es": "selection_es",
"fr": "selection_fr",
"it": "selection_it",
}
db_type_alias_map = {
"mysql": "mysql", # 阿里云mysql
"postgresql_14": "postgresql_14", # pg14爬虫库-内网
"postgresql_14_outer": "postgresql_14_outer", # pg14爬虫库-外网
"postgresql_15": "postgresql_15", # pg15正式库-内网
"postgresql_15_outer": "postgresql_15_outer", # pg15正式库-外网
"postgresql_cluster": "postgresql_cluster", # pg集群-内网
"postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
"doris": "doris", # pg集群-内网
}
DEFAULT_SERVERS = [
"http://192.168.200.210:7777",
"http://192.168.1.102:7777",
]
# ---------------------------
class RemoteTransaction:
def __init__(self, db: str, database: str,
session: requests.Session, urls: List[str]):
self.db = db
self.database = database
self.session = session
self.urls = urls
self.sql_queue = []
def execute(self, sql: str):
self.sql_queue.append(sql)
def __enter__(self): return self
def __exit__(self, exc_type, exc, tb):
for url in self.urls:
try:
self.session.post(
url + "/transaction",
json={"db": self.db,
"sql_list": self.sql_queue,
"site_name": self.database}, # site_name not needed on server, kept for clarity
timeout=15,
).raise_for_status()
return
except Exception as e:
print(f"[WARN] 事务失败 {url}: {e}")
raise RuntimeError("All servers failed for transaction")
class RemoteEngine:
def __init__(self, db: str, database: str,
server_urls: List[str], retries: int = 2):
self.db = db
self.database = database
self.urls = [u.rstrip("/") for u in server_urls]
self.session = requests.Session()
self.retries = retries
def _request(self, endpoint: str, payload):
for url in self.urls:
for _ in range(self.retries):
try:
r = self.session.post(f"{url}/{endpoint}",
json=payload, timeout=10)
r.raise_for_status()
return r.json()
except Exception as e:
print(f"[WARN] {endpoint} fail @ {url}: {e}")
time.sleep(1)
raise RuntimeError(f"All servers failed for {endpoint}")
# ---------- 公共 API ----------
def read_sql(self, sql: str) -> pd.DataFrame:
data = self._request("query",
{"db": self.db,
"sql": sql,
"site_name": self.database})
return pd.DataFrame(data["result"])
def to_sql(self, df: pd.DataFrame, table: str, if_exists="append"):
return self._request("insert",
{"db": self.db,
"table": table,
"if_exists": if_exists,
"data": df.to_dict("records"),
"site_name": self.database})
def begin(self):
return RemoteTransaction(self.db, self.database,
self.session, self.urls)
# ---------------------------------
def get_remote_engine(site_name: str, db_type: str,
servers: List[str] = None) -> RemoteEngine:
if site_name not in site_name_db_dict:
raise ValueError(f"Unknown site_name: {site_name}")
if db_type not in db_type_alias_map:
raise ValueError(f"Unknown db_type: {db_type}")
return RemoteEngine(
db=db_type_alias_map[db_type],
database=site_name,
server_urls=servers or DEFAULT_SERVERS,
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment