Commit cd1e5c51 by Peng

no message

parent 814f1b7b
......@@ -354,12 +354,12 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754014538; HMACCOUNT=9F9252C9CBCC28DF; _gcl_au=1.1.1089500616.1754014538; _ga=GA1.1.420464702.1754014538; MEIQIA_TRACK_ID=30fNCDIKt41VFprESpAsdxA93ss; MEIQIA_VISIT_ID=30fNCEDPMElClEdkflJq4o3vq1u; ecookie=vR8uQMtRUHf2GPuw_CN; e6d47b7933e377ecd062=54dab66cb6737167d2467f75c45d482f; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=909522d879232499acd9b4ddab672d19; current_guest=hsba8eOK1Dg5_250801-107899; rank-login-user=4412704571jC8vqVc/Rw3YJBDFuUDJtYCCpovzEIJtbd/qlmC8t917Mll118BEKfWZetMkVyfW; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjQ0MTI3MDQ1NzFqQzh2cVZjL1J3M1lKQkRGdVVESnRZQ0Nwb3Z6RUlKdGJkL3FsbUM4dDkxN01sbDExOEJFS2ZXWmV0TWtWeWZXIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJHOWQwOEtTcnVjdEgwcXZWZm1XNnlBIiwiaWF0IjoxNzU0MDE0NTQ0LCJleHAiOjE3NTQxMDA5NDQsIm5iZiI6MTc1NDAxNDQ4NCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Cwkc0tf7KniQbUgRyiZw8UED5dm3y8dOrK04ejg4a45H-W3FEBpQ6ERU8V7TTy2qKOJf8j1swyVxRIqJDrGRSwe4FBr8EKLsoZtxRe6DR0LYGx8xMmWmfUmVmwcBHR2M62RZlDO-fjvVPBuZwcLyUuslq2PZen2ugOUzdfQDHQJV8UMmWUvt1zHjjQZrRlda1tK0_TuHt8dBCZ-sC_CIooCAvXYYfMUSMeT2w_QmgFPc_EIozNKvv7EDzqisT4pR5AWKDdfoVUSWFBIVNwoulIMdtKLsVrlL8Xiq_2l3mG9NCfE0recVIGCRhV52lwWD3vT1O3bpCT-usWv0hXVgZA; ao_lo_to_n="4412704571jC8vqVc/Rw3YJBDFuUDJtakIKSNY+NxiJnARSLNieFEjr7klDXJb6hxls+GbtooUDUaltNvx27xhoy1Atnktnv2cJc/ZHsk63L1rTYoE0Cs="; rank-guest-user=5412704571Cjn566YjkeJT1oMWIEZzHnz3datXkhqNVFcNKbPORkfvRQdV46mvORTmyl8ul2JV; _ga_38NCVF2XST=GS2.1.s1754014538$o1$g1$t1754014551$j47$l0$h1907321182; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754014552; _ga_CN0F80S6GL=GS2.1.s1754014539$o1$g1$t1754014552$j47$l0$h0; JSESSIONID=A5AEB0B286BCE5A27600AD3BD1DD6445',
'Cookie': 'Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1754303192; HMACCOUNT=B44A3AA867DA8C05; _gcl_au=1.1.1206959369.1754303192; _ga=GA1.1.747881747.1754303192; MEIQIA_TRACK_ID=30ooGiCl3FRjp9OxyAufmK3iejx; MEIQIA_VISIT_ID=30ooGnIqlsOe6yn29kfqf4EnQuF; ecookie=xzfwf4ZvXd9I4bOT_CN; c57b73d8686537c32dea=36830b46c328d771081e3a79f5c51e04; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=9ac0984901990f4b1772551060468cf0; rank-login-user=7970634571pnkdYV6Hfb0IWfvuV/gc88fclJAm6p5JWeQpD30JCgc6kY0X2uN+iF7vjvKtVgBU; rank-login-user-info="eyJuaWNrbmFtZSI6IuW4heWTpSIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTgzKioqKjczNDciLCJ0b2tlbiI6Ijc5NzA2MzQ1NzFwbmtkWVY2SGZiMElXZnZ1Vi9nYzg4ZmNsSkFtNnA1SldlUXBEMzBKQ2djNmtZMFgydU4raUY3dmp2S3RWZ0JVIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJvQjhGc25vZ0ludmp5S3luRmlsSjdnIiwiaWF0IjoxNzU0MzAzMTk3LCJleHAiOjE3NTQzODk1OTcsIm5iZiI6MTc1NDMwMzEzNywic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIxNSwicGkiOm51bGwsIm5uIjoi5biF5ZOlIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxODMwNzk2NzM0NyIsImVtIjoiMzE1OTk4MDg5MkBxcS5jb20iLCJtbCI6IkcifQ.OOEsxsBWHf6J1ta8ueS0i-8fVxuxstNOtoJ2gWSxcJwr6UbRMiHiXqo3fNwkwzYrBjp75oz7xbdaui3LPu90-VZCUyh5lXoiFBjZD-iVJcQNTqkfYbV3siHtjRS27LBBh4UJLRRdSAfxP5iZscz640WHj9PupOXYUDPbljOsWOC4jBYSY3Ek3ikxH70BFluOvrD8kpwfQvbhmue_0fZAqu-rACr3ed5cpDUc3YQiFH7sDRkV0FJv4SLLm1qxLvSo4RmNftfYUBggsLl7qM0tQyBQh2BooUIt8ZBldTmtUdJiz9shLu1kYyv_zzoXtgfMmpdNADM85W0INKp1u5DGAg; ao_lo_to_n="7970634571pnkdYV6Hfb0IWfvuV/gc85dImjms+dJ7IrpjIs0CNJBquIGSx1xPUHU/OAMezoHKbqvLvZZuXrKHmPj6PK6OtV+0hL1+N+4daHAf8FeCzWg="; JSESSIONID=421BD32330EB1F2A12E2571E4D00CE8F; _ga_38NCVF2XST=GS2.1.s1754303191$o1$g1$t1754303203$j48$l0$h314662329; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1754303203; _ga_CN0F80S6GL=GS2.1.s1754303192$o1$g1$t1754303204$j48$l0$h0',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": "UK",
"station": "US",
"cid": category_name['c_id'], # 分类id
"bsr": f"{i}" # 排名
}
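For orientation, a minimal sketch of how this payload is presumably submitted to the sales-estimator endpoint; the requests.post call and the JSON handling below are assumptions, since the actual request code is collapsed out of this hunk:

import requests

def fetch_bsr_estimate(headers: dict, data: dict, timeout: int = 30) -> dict:
    # Assumed call shape: POST the form payload built above (station, cid, bsr)
    # and read the JSON body with the estimated sales.
    url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    resp.raise_for_status()
    return resp.json()  # exact response schema is not shown in this diff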
......@@ -399,7 +399,7 @@ def junglescout_spider(db_base):
print(inset_sql)
cursor_mysql_db.executemany(inset_sql, name_rnak_list)
db.commit()
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=1 and c_id='{category_name['c_id']}'"
up_sql = f"UPDATE all_site_category set state=3 WHERE site='{db_base}' and state=2 and c_id='{category_name['c_id']}'"
print('Update state:', up_sql)
cursor_us_mysql_db.execute(up_sql)
db_us.commit()
......@@ -435,7 +435,7 @@ def save_site_category(site_bsr_dict=None):
def run():
# get_cid()
junglescout_spider('uk')
junglescout_spider('us')
if __name__ == '__main__':
......
......@@ -18,8 +18,6 @@ class count_all_syn_st_id(BaseUtils):
if engine_db_num == 14:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
else:
self.engine_pg = self.pg_connect_6()
self.engine_db_num = engine_db_num
if site_name == "us":
self.site_url = 'https://www.amazon.com/'
......@@ -48,8 +46,8 @@ class count_all_syn_st_id(BaseUtils):
print(self.site_name, ' querying min and max id')
query = f"SELECT MIN(id) AS min_id, MAX(id) AS max_id FROM {self.site_name}_all_syn_st_month_{self.table_data_info} where state in (1,2)"
print(query)
result = pd.read_sql(query, self.engine_pg)
# result = pd.read_sql(query, self.engine_pg)
result = self.engine_pg.read_sql(query)
if result.shape[0] > 0:
min_id = result['min_id'].values[0]
max_id = result['max_id'].values[0]
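The read path changed from pd.read_sql(query, self.engine_pg) to self.engine_pg.read_sql(query), which implies engine_pg is now a wrapper object rather than a bare SQLAlchemy engine (likely the RemoteEngine further down in this commit). A minimal sketch of that call shape, assuming the wrapper simply delegates to pandas; the class name below is hypothetical:

import pandas as pd

class PgEngineWrapper:
    # Hypothetical facade matching the engine_pg.read_sql(query) call above.
    def __init__(self, sqlalchemy_engine):
        self._engine = sqlalchemy_engine

    def read_sql(self, query: str) -> pd.DataFrame:
        # Delegate to pandas; a remote engine would instead ship the query over HTTP.
        return pd.read_sql(query, self._engine)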
......@@ -72,58 +70,7 @@ class count_all_syn_st_id(BaseUtils):
delete_sql = f'DELETE from {self.site_name}_syn_asin_all_minid_maxid where state <4'
print('delete_sql::',delete_sql)
conn.execute(delete_sql)
df_asin_img_video.to_sql(f'{self.site_name}_syn_asin_all_minid_maxid', con=self.engine,
if_exists='append',
index=False)
def search_term_syn(self):
# Initialize an empty result list (later turned into a DataFrame)
result_list = []
if self.engine_db_num == 6:
query = f"SELECT search_term FROM {self.site_name}_search_term_month_merchantwords WHERE state=1 and id <5000001"
elif self.engine_db_num == 14:
query = f"SELECT search_term FROM {self.site_name}_search_term_month WHERE month={self.month} and state in (1,2)"
print(query)
result_df = self.get_data_from_database(self.engine_pg, query)
result_df.drop_duplicates(['search_term'], inplace=True)
print('_search_term_month::',result_df.shape)
# Generate URLs for each search term and append them to the result list
for search_term in result_df['search_term']:
urls = self.build_urls(search_term)
result_list.extend(urls)
# Build the initial DataFrame
df_search_term = pd.DataFrame(data=result_list, columns=['search_term', 'url'])
print(df_search_term.shape)
# Boolean mask of rows whose URL is at most 450 characters long
long_url_rows = df_search_term['url'].str.len() <= 450
# Keep only the rows whose URL does not exceed 450 characters
data_df = df_search_term[long_url_rows]
data_df['month'] = f'{self.month}'
data_df['date_info'] = self.data_info
print(data_df)
print(data_df.shape)
with self.engine_pg.begin() as conn:
if self.engine_db_num == 14:
data_df.to_sql(f'{self.site_name}_search_term_month_syn',con=self.engine_pg, if_exists="append",
index=False)
update_sql = f"update {self.site_name}_search_term_month set state =3 where date_info='2025-{self.month}' and state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from {self.site_name}_search_term_month_syn where date_info < '{self.data_info}'"
print(deletesql)
conn.execute(deletesql)
elif self.engine_db_num == 6:
print('pg6: writing merchantwords data')
data_df.to_sql(f'{self.site_name}_search_term_month_syn_merchantwords', con=self.engine_pg, if_exists="append",
index=False)
update_sql = f"update us_search_term_month_merchantwords set state =3 where state=1"
print(update_sql)
conn.execute(update_sql)
deletesql = f"DELETE from {self.site_name}_search_term_month_syn_merchantwords where state =3"
print(deletesql)
conn.execute(deletesql)
self.engine.to_sql(df_asin_img_video,f'{self.site_name}_syn_asin_all_minid_maxid',if_exists='append')
# Function that fetches data from the database
def get_data_from_database(self, connection, query):
......@@ -161,5 +108,4 @@ if __name__ == '__main__':
for site in ['us']:
time.sleep(0)
count_all_syn_st_id(site_name=site,month=month).get_minid_maxid()
# count_all_syn_st_id(site_name=site,month=month,engine_db_num=engine_db_num).search_term_syn()
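For context on how the min/max ids written to {site}_syn_asin_all_minid_maxid are consumed: read_db_data2 further below splits 'minid-maxid' strings on '-'. A purely illustrative sketch of chunking such a range into batch strings; the batch size is an assumption, not a value from this repository:

def split_id_range(min_id, max_id, batch_size: int = 100000):
    # Yield 'minid-maxid' strings covering [min_id, max_id] in fixed-size slices,
    # matching the "minid-maxid" format that is later split on '-'.
    start = int(min_id)
    while start <= int(max_id):
        end = min(start + batch_size - 1, int(max_id))
        yield f"{start}-{end}"
        start = end + 1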
......@@ -11,6 +11,7 @@ import traceback
import time
import random
from amazon_spider.VPS_IP import is_internet_available
from redis.exceptions import LockError
print('Saving asin to the pg database')
......@@ -27,7 +28,7 @@ class Save_asin_detail(BaseUtils):
self.reuests_para_val = Requests_param_val(site_name=self.site_name, proxy_name=proxy_name)
self.init_db_names()
self.cols = self.reuests_para_val.db_column(site_name)
self.redis_client = self.redis_db()
def init_db_names(self):
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect() # when updating variations, the variation table is written via self.engine
......@@ -78,6 +79,9 @@ class Save_asin_detail(BaseUtils):
else:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
lock_key = f"{self.db_syn}_{self.month}_lock"
lock = self.redis_client.lock(lock_key, timeout=25) # lock auto-expires after 25 seconds
with self.engine_pg.begin() as conn:
sql_read = f"SELECT asin, id, date_info, asin_is_variation,data_type,volume,weight_str FROM {self.db_syn}_{self.month} WHERE STATE = 1 ORDER BY id FOR UPDATE SKIP LOCKED LIMIT {self.read_size}"
print(sql_read)
......@@ -97,9 +101,15 @@ class Save_asin_detail(BaseUtils):
self.df_read.asin + '|' + self.df_read.date_info + '|' + self.df_read.asin_is_variation.astype(
"U") + '|' + self.df_read.data_type.astype("U") + '|' + self.df_read.volume.astype(
"U") + '|' + self.df_read.weight_str.astype("U"))
if lock.locked():
lock.release()
return asin_list
else:
if lock.locked():
lock.release()
return []
except LockError:
print("获取锁失败1111,其他程序正在查询")
except Exception as e:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
......@@ -116,6 +126,8 @@ class Save_asin_detail(BaseUtils):
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
if self.minid_maxid_list:
lock_key = f"{self.db_syn}_{self.month}_lock"
lock = self.redis_client.lock(lock_key, timeout=25) # lock auto-expires after 25 seconds
minid, maxid = self.minid_maxid_list[0].split('-')
with self.engine_pg.begin() as conn:
# sql_read = f"-- SELECT asin, id, date_info, asin_is_variation,data_type,volume,weight_str FROM {self.db_syn}_{self.month} WHERE STATE = 1 and id BETWEEN {minid} AND {maxid} limit {self.read_size} for update"
......@@ -137,20 +149,27 @@ class Save_asin_detail(BaseUtils):
self.df_read.asin + '|' + self.df_read.date_info + '|' + self.df_read.asin_is_variation.astype(
"U") + '|' + self.df_read.data_type.astype("U") + '|' + self.df_read.volume.astype(
"U") + '|' + self.df_read.weight_str.astype("U"))
if lock.locked():
lock.release()
return asin_list
else:
if lock.locked():
lock.release()
print('Re-fetching', self.minid_maxid_list[0], 'no data')
self.minid_maxid_list = self.reuests_para_val.get_minid_maxid(site_name=self.site_name,
state=3,
minid_maxid=
self.minid_maxid_list[0],
month=self.month)
else:
asin_list = self.read_db_data2()
if asin_list:
return asin_list
else:
return []
except LockError:
print("获取锁失败,其他程序正在查询")
except Exception as e:
self.engine = self.mysql_connect()
self.engine_pg = self.pg_connect()
......
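The new pattern in Save_asin_detail pairs a Redis lock (redis_client.lock(lock_key, timeout=25)) with Postgres FOR UPDATE SKIP LOCKED so concurrent workers do not pull the same asin rows. A condensed, self-contained sketch of that pattern, assuming a redis-py client and a SQLAlchemy engine; the table name, batch size and the state=2 update are simplified placeholders, not the exact code from this commit:

import pandas as pd
from redis import Redis
from redis.exceptions import LockError
from sqlalchemy import text

def claim_asin_batch(redis_client: Redis, engine, table: str, batch: int = 200):
    # Cross-process guard: the Redis lock keeps workers from querying at the same time,
    # while FOR UPDATE SKIP LOCKED hands each worker a disjoint set of rows.
    lock = redis_client.lock(f"{table}_lock", timeout=25)  # auto-expires after 25 s
    try:
        if not lock.acquire(blocking=False):
            raise LockError("another worker holds the lock")
        with engine.begin() as conn:
            df = pd.read_sql(
                f"SELECT asin, id FROM {table} WHERE state = 1 "
                f"ORDER BY id FOR UPDATE SKIP LOCKED LIMIT {batch}",
                conn,
            )
            if not df.empty:
                # Mark the claimed rows so other workers skip them after the lock is released.
                conn.execute(
                    text(f"UPDATE {table} SET state = 2 WHERE id = ANY(:ids)"),
                    {"ids": [int(i) for i in df["id"]]},
                )
        return df["asin"].tolist()
    except LockError:
        print("Failed to acquire lock, another process is querying")
        return []
    finally:
        if lock.owned():
            lock.release()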
......@@ -27,7 +27,9 @@ db_type_alias_map = {
}
DEFAULT_SERVERS = [
# "http://192.168.10.217:7777", # 内网
# "http://192.168.200.210:7777", # 内网
# "http://192.168.10.217:7777", # 内网-h7
# "http://61.145.136.61:7777", # 外网
"http://61.145.136.61:7779", # 外网
]
# ---------------------------
......@@ -121,7 +123,7 @@ class RemoteTransaction:
json={"db": self.db,
"sql_list": self.sql_queue,
"site_name": self.database}, # site_name not needed on server, kept for clarity
timeout=15,
timeout=3000,
).raise_for_status()
return
except Exception as e:
......@@ -146,7 +148,7 @@ class RemoteEngine:
r = self.session.post(f"{url}/{endpoint}",
data=json_bytes,
headers={"Content-Type": "application/json"},
timeout=60)
timeout=3000)
# r = self.session.post(f"{url}/{endpoint}",
# json=payload, timeout=10)
......
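Both HTTP timeouts above were raised (15 s and 60 s to 3000 s), which suggests the remote SQL service can take a long time to execute a batched sql_list. A compact sketch of the client call shape seen in RemoteTransaction, assuming a requests.Session; the endpoint path and any retry policy are not shown in this diff and are left as parameters:

import requests

def post_sql_batch(session: requests.Session, server: str, endpoint: str,
                   db: str, sql_list: list, site_name: str, timeout: int = 3000):
    # Mirrors the commit call above: one POST carrying the whole SQL batch,
    # with a long timeout because the server executes the statements synchronously.
    resp = session.post(
        f"{server}/{endpoint}",  # endpoint path is collapsed out of this hunk
        json={"db": db, "sql_list": sql_list, "site_name": site_name},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp

With DEFAULT_SERVERS above, server would currently be "http://61.145.136.61:7779".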
......@@ -657,7 +657,7 @@ class ConnectSpider:
account_id = df_status.account_id.iloc[0]
account_secret = df_status.account_secret.iloc[0]
account_list = [account_id, account_secret]
# print(account_list)
print(account_list,'232323====32')
# print(111111111111)
connection.close()
return account_list
......
......@@ -563,9 +563,9 @@ class GetSS_details():
def run(self):
day = time.strftime("%d")
for item_id in range(1, 33):
print(f"item_id: {item_id}")
print(f"开始抓取 item_id: {item_id}")
if item_id == 1 and int(day)<2:
Con.update_all_states_to_1(state=1)
Con.update_all_states_to_1(state=2)
wait_time = random.uniform(6, 10)
account_list = Con.get_cookie_account(item_id)
......@@ -604,7 +604,8 @@ class GetSS_details():
logging.error(f'Error occurred: {e}, stopping loop')
break
if count == counts_last - 1:
print(f'{self.account} all crawling finished')
print(f'{self.account} all crawling finished 1122==')
Con.update_all_states_to_1(state=3, item_id=item_id)
if stop_flag:
print('Retry limit exceeded, pausing')
......
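Despite its name, update_all_states_to_1 is now called with state=2 at the start of a new day and with state=3 (scoped to item_id) once an account finishes, so it presumably sets whatever state it is given. A sketch of that assumed behaviour; the table and column names are hypothetical placeholders, not taken from this repository:

from typing import Optional
from sqlalchemy import text

def update_all_states_to_1(engine, state: int, item_id: Optional[int] = None,
                           table: str = "ss_account_state"):
    # Assumed behaviour inferred from the call sites above: set rows to the given
    # state, optionally restricted to a single item_id.
    sql = f"UPDATE {table} SET state = {int(state)}"
    if item_id is not None:
        sql += f" WHERE item_id = {int(item_id)}"
    with engine.begin() as conn:
        conn.execute(text(sql))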