Commit 58220d8c by Peng

1. Optimized the Kafka connection. 2. Added new parameters to the ASIN request. 3. H10: added automated operations for the Chinese page. 4. The bs-type parsing in search-term scraping changed; re-parse accordingly. 5. ASIN detail: added review parsing. 6. ASIN detail: star-rating parsing changed.

parent c8524b22
......@@ -54,7 +54,7 @@ class CalculateMean(BaseUtils):
print(f"读取 {self.site_name}_one_category")
sql = f"select * from {self.site_name}_one_category where state!=4 and name = 'Health & Household' and `year_month`='{_year_month}';"
print('Querying source table:', sql)
self.df_sum = pd.read_sql(sql, con=self.engine)
self.df_sum = self.engine.read_sql(sql)
# # sort
self.df_sum.sort_values(by=['name', 'rank'], inplace=True)
......@@ -66,7 +66,7 @@ class CalculateMean(BaseUtils):
self.cate_list = list(set(self.df_sum.name))
sql_select = f"SELECT `year_month` from selection.week_20_to_30 WHERE `week`={int(self.week)} and `year`={self.year}"
print(sql_select, 'sql_select:')
df = pd.read_sql(sql_select, con=self.engine)
df = self.engine.read_sql(sql_select)
self.year_month = list(df['year_month'])[0] if list(df['year_month']) else ''
print("self.year_month:", self.year_month)
......@@ -120,11 +120,11 @@ class CalculateMean(BaseUtils):
# sql = f'select en_name as name,category_id from {self.site_name}_bs_category WHERE nodes_num =2 and delete_time is NULL'
sql = f"select en_name as name, category_id from {self.site_name}_bs_category where 1 = 1 and nodes_num = 2 group by en_name, category_id"
df_en_name = pd.read_sql(sql, con=self.engine)
df_en_name = self.engine.read_sql(sql)
# use merge to check whether the two name columns match
self.df_repeat = pd.merge(self.df_repeat, df_en_name, on='name', how='left')
self.df_repeat = self.df_repeat.loc[self.df_repeat.orders >= 1]  # keep rows with positive monthly sales (orders >= 1)
self.df_repeat.to_sql(f"{self.site_name}_one_category_report_pyb", con=self.engine, if_exists="append", index=False)
self.engine.to_sql(self.df_repeat, f"{self.site_name}_one_category_report_pyb", if_exists="append")
def run(self):
self.send_mes(self.site_name)
......
......@@ -678,11 +678,20 @@ class bsr_catgory(BaseUtils):
if df_en_name['en_name'][0] == name_num_path[1]:
pass
else:
_strftime_ = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
update_name_sql = f'''update {self.site_name}_bs_category set delete_time = '2023-06-19 00:00:00' WHERE `path`="{name_num_path[3]}" and delete_time is null'''
print('Updating en_name:', update_name_sql)
self.db_cursor_connect_update(update_name_sql, self.site_name)
save_name_num_list.append(name_num_path)
select_sql_name_1 = f'''SELECT en_name,id FROM {self.site_name}_bs_category WHERE `path`="{name_num_path[3]}" and delete_time is null'''
df_en_name_1 = self.db_cursor_connect_msyql_read(site=None, select_state1_sql=select_sql_name_1)
print('en_name::', df_en_name_1.values)
if df_en_name_1['en_name'][0] == name_num_path[1]:
pass
else:
update_name_sql_1 = f'''update {self.site_name}_bs_category set delete_time = '2023-06-19 00:00:00' WHERE id={df_en_name_1['id'][0]} and delete_time is null'''
print('Updating en_name:', update_name_sql_1)
self.db_cursor_connect_update(update_name_sql_1, self.site_name)
break
except Exception as e:
print(e)
......@@ -820,7 +829,10 @@ class bsr_catgory(BaseUtils):
id_tuple = tuple(en_name_id_list)
print(len(id_tuple))
try:
update_sql = f'update {self.site_name}_bs_category set one_category_id={id[0]} where id in {id_tuple}'
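# a one-element Python tuple formats as ('x',) and its trailing comma is invalid SQL,
# so the single-id IN clause is built by hand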
if len(id_tuple) == 1:
update_sql = f"""UPDATE {self.site_name}_bs_category set one_category_id={id[0]} where id in ('{id_tuple[0]}')"""
else:
update_sql = f'update {self.site_name}_bs_category set one_category_id={id[0]} where id in {id_tuple}'
self.db_cursor_connect_update(update_sql, self.site_name)
except Exception as e:
print(e)
......
......@@ -34,13 +34,13 @@ class Save_asin_self(BaseUtils):
self.db_self_asin_detail = self.site_name + DB_REQUESTS_ASIN_PARAMS['db_self_asin_detail'][2:]
sql_read = "SELECT text_name FROM censored_thesaurus WHERE data_type='负面词汇'"
print(sql_read)
df = pd.read_sql(sql_read, con=self.engine)
df = self.engine.read_sql(sql_read)
self.text_list = list(df.text_name)
print('Negative keywords:', self.text_list)
# asin_sql = f"SELECT asin,sku,erp_seller,{self.site_name}_upload_info,title,`describe` as describe_str ,selling from {self.site_name}_erp_asin_syn WHERE created_at>='2023-05-11' and asin_type=1;"
asin_sql = f"SELECT asin,sku,erp_seller,{self.site_name}_upload_info,title,`describe` as describe_str ,selling,is_variation,fulFillable from {self.site_name}_erp_asin_syn WHERE created_at>='2023-05-11';"
print('asin_sql::', asin_sql)
df_asin = pd.read_sql(asin_sql, con=self.engine)
df_asin = self.engine.read_sql(asin_sql)
self.asin_list = list(df_asin.asin)
print(len(self.asin_list))
df_asin[f'{self.site_name}_upload_info'].fillna('N/A', inplace=True)
......@@ -78,10 +78,15 @@ class Save_asin_self(BaseUtils):
# print(self_all_syn_sql)
self_all_syn_sql_1 = f'SELECT asin from {self.site_name}_self_real_spider WHERE asin in {asin_tuple} and state=4 and updated_at>="{self.time_strftime}"'
# print(self_all_syn_sql_1)
df_asin_error = pd.read_sql(self_all_syn_sql, con=self.engine)
df_asin_error_1 = pd.read_sql(self_all_syn_sql_1, con=self.engine)
asin_error_ = list(df_asin_error.asin)
asin_error_1 = list(df_asin_error_1.asin)
df_asin_error = self.engine.read_sql(self_all_syn_sql)
df_asin_error_1 = self.engine.read_sql(self_all_syn_sql_1)
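# either query may match nothing, so default both lists to empty before reading the asin column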
asin_error_1 = []
asin_error_ = []
if not df_asin_error_1.empty:
asin_error_1 = list(df_asin_error_1.asin)
if not df_asin_error.empty:
asin_error_ = list(df_asin_error.asin)
asin_error_1.extend(asin_error_)  # extend() mutates in place and returns None
asin_error_list = asin_error_1
if asin_error_list:
print("asin_error_list::", asin_error_list)
......@@ -101,12 +106,12 @@ class Save_asin_self(BaseUtils):
self.asin_list.remove(asin)
df = pd.DataFrame(data=sava_data,
columns=['asin', "sku", 'erp_seller', 'page_error'])
df.to_sql(f'{self.site_name}_erp_asin', con=self.engine, if_exists="append", index=False)
self.engine.to_sql(df, f'{self.site_name}_erp_asin', if_exists="append")
sava_data = []
asin_tuple = tuple(self.asin_list)
asin__detail_sql = f"SELECT asin,title,img_num,`describe`,category,page_inventory,search_category,product_description,img_type from {self.site_name}_self_asin_detail WHERE site='{self.site_name}' and created_at>='{self.time_strftime}' and asin in {asin_tuple};"
df_asin_detail = pd.read_sql(asin__detail_sql, con=self.engine)
df_asin_detail = self.engine.read_sql(asin__detail_sql)
fields_list = df_asin_detail.values.tolist()
for asin_data in fields_list:
data_list = []
......@@ -313,7 +318,7 @@ class Save_asin_self(BaseUtils):
df = pd.DataFrame(data=sava_data,
columns=['asin', "title_error", 'img_error', 'selling_error', 'search_ccategory_error',
'ccategory_error', 'buy_now_error', 'sku', 'erp_seller', 'describe_error'])
df.to_sql(f'{self.site_name}_erp_asin', con=self.engine, if_exists="append", index=False)
self.engine.to_sql(df, f'{self.site_name}_erp_asin', if_exists="append")
# def Compare_str(self,str1, str2):
# # find the shorter of the two strings' lengths
......
import pymysql
from params import DB_CONN_DICT,PG_CONN_DICT_14
import pandas as pd
import traceback
from sqlalchemy import create_engine
import time
import sys
import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.secure_db_client import get_remote_engine
"""
Every Wednesday, a scheduled job resets the feedback and product sync tables to state 1 across the six sites
"""
def run(site):
if site == 'us':
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection", charset="utf8mb4")
else:
connect = pymysql.connect(host=DB_CONN_DICT['mysql_host'], port=DB_CONN_DICT['mysql_port'], user=DB_CONN_DICT['mysql_user'],
password=DB_CONN_DICT['mysql_pwd'], database="selection_" + site, charset="utf8mb4")
if site == 'us':
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT_14['pg_user']}:{PG_CONN_DICT_14['pg_pwd']}@{PG_CONN_DICT_14['pg_host']}:{PG_CONN_DICT_14['pg_port']}/selection",
encoding='utf-8')
else:
engine_pg = create_engine(
f"postgresql+psycopg2://{PG_CONN_DICT_14['pg_user']}:{PG_CONN_DICT_14['pg_pwd']}@{PG_CONN_DICT_14['pg_host']}:{PG_CONN_DICT_14['pg_port']}/selection_{site}",
encoding='utf-8')
cursor = connect.cursor()
engine_mysql = get_remote_engine(
    site_name=site,  # -> database "selection"
    db_type="mysql",  # -> server-side alias "mysql"
)
engine_pg = get_remote_engine(
    site_name=site,  # -> database "selection"
    db_type="postgresql_14_outer",  # -> server-side alias "postgresql_14_outer"
)
# cursor = connect.cursor()
# set the feedback syn table state to 1
update_feedback_sql = f"update {site}_seller_account_syn_distinct set state = 1, product_state=1 and state!=12"
print(update_feedback_sql)
cursor.execute(update_feedback_sql)
connect.commit()
# set the seller product syn table state to 1
update_product_sql = f"update {site}_seller_account_product_syn set state = 1"
print(update_product_sql)
cursor.execute(update_product_sql)
connect.commit()
update_feedback_sql = f"update {site}_seller_account_syn set state = 1, product_state=1"
print(update_feedback_sql)
cursor.execute(update_feedback_sql)
connect.commit()
cursor.close()
connect.close()
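# reset the sync-table states through the remote engine inside a single transaction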
with engine_mysql.begin() as conn_mysql:
# presumably a WHERE filter was intended here: skip rows already in terminal state 12
update_feedback_sql = f"update {site}_seller_account_syn_distinct set state = 1, product_state=1 where state!=12"
conn_mysql.execute(update_feedback_sql)
# set the seller product syn table state to 1
update_product_sql = f"update {site}_seller_account_product_syn set state = 1"
print(update_product_sql)
conn_mysql.execute(update_product_sql)
update_feedback_sql = f"update {site}_seller_account_syn set state = 1, product_state=1"
print(update_feedback_sql)
conn_mysql.execute(update_feedback_sql)
if site == 'us':
with engine_pg.begin() as conn:
conn.execute(update_feedback_sql)
......
import sys
import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from amazon_params import py_ja3
from amazon_save_db.save_asin_detail_pg import Save_asin_detail
from utils.asin_parse import ParseAsinUs
from queue import Queue
import time
import re
from lxml import etree
import requests
......@@ -17,7 +16,7 @@ import traceback
from datetime import datetime
import gzip
import json
from kafka.errors import KafkaError, KafkaTimeoutError
from kafka.errors import KafkaTimeoutError
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sess = requests.Session()
......@@ -214,7 +213,9 @@ class async_asin_pg():
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json']
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json': None, 'review_json_list': items['review_json_list'],
'asin_buySales_list': items['asin_buySales_list']
}
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val']
......@@ -379,7 +380,6 @@ class async_asin_pg():
def on_send_error(self, excp):
print("Kafka message send failed:", excp)
def send_kafka(self, items=None, html_data=None, topic=None, num=3):
print('Sending data to Kafka')
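# up to 3 attempts per message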
for i in range(3):
......@@ -459,13 +459,52 @@ class async_asin_pg():
# total requests: 4
self.request_total_count_list = []
def run(self):
# asin_list = self.save_asin_detail.read_db_data()
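# hardcoded test batch; entries use the pipe-delimited 'asin|date_info|...' layout that
# read_db_data() would normally supply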
asin_list = ['B0F1TKH4C1|2025-01|1|1|null|null',
'B0DQ413NZ3|2025-01|1|1|null|null',
'B0DYPCJMDZ|2025-01|1|1|null|null',
]
asin_list = ['B0FHPBN5BD|2025-01|1|1|null|null',
'B0F98H3X25|2025-01|1|1|null|null',
'B0FC5C8LYB|2025-01|1|1|null|null',
'B0F1C95Y7Z|2025-01|1|1|null|null',
'B0F1XPL81W|2025-01|1|1|null|null',
'B0FH6HXXKG|2025-01|1|1|null|null',
'B0FCDXQNKW|2025-01|1|1|null|null',
'B0FB8D2RZX|2025-01|1|1|null|null',
'B0F4QJ2PKL|2025-01|1|1|null|null',
'B0FTTSTYBH|2025-01|1|1|null|null',
'B0F1X7Y6HG|2025-01|1|1|null|null',
'B0FK4RJ8BQ|2025-01|1|1|null|null',
'B0FB31NQ6C|2025-01|1|1|null|null',
'B0F1XBNK8N|2025-01|1|1|null|null',
'B0F4R31W9G|2025-01|1|1|null|null',
'B0F2RZ7SQY|2025-01|1|1|null|null',
'B0FJL52XZL|2025-01|1|1|null|null',
'B0F1S7FC9Z|2025-01|1|1|null|null',
'B0FB3CGNWF|2025-01|1|1|null|null',
'B0F2SLP2JM|2025-01|1|1|null|null',
'B0FJ7YWTBC|2025-01|1|1|null|null',
'B0F1C95998|2025-01|1|1|null|null',
'B0FMRKGK1B|2025-01|1|1|null|null',
'B0F1NCNGCY|2025-01|1|1|null|null',
'B0FGHHZRDB|2025-01|1|1|null|null',
'B0FH6CRWJ3|2025-01|1|1|null|null',
'B0F4CGG71T|2025-01|1|1|null|null',
'B0F93LS2X4|2025-01|1|1|null|null',
'B0F8B343WJ|2025-01|1|1|null|null',
'B0F1CCJ6T8|2025-01|1|1|null|null',
'B0FPFKLV4W|2025-01|1|1|null|null',
'B0FB82RRNJ|2025-01|1|1|null|null',
'B0FBG8BNWR|2025-01|1|1|null|null',
'B0F1XD9PP4|2025-01|1|1|null|null',
'B0F1X9GPV4|2025-01|1|1|null|null',
'B0F4R1RKG7|2025-01|1|1|null|null',
'B0CM8VHPPG|2025-01|1|1|null|null',
'B0FPKC3VXL|2025-01|1|1|null|null',
'B0F9P17QZB|2025-01|1|1|null|null',
'B0FRLL5FRD|2025-01|1|1|null|null',
'B0FPX6QGC7|2025-01|1|1|null|null',
'B0FP97HMR6|2025-01|1|1|null|null',
]
if asin_list:
for asin in asin_list:
self.queries_asin_queue.put(asin)
......@@ -495,38 +534,29 @@ class async_asin_pg():
break
except FunctionTimedOut as e:
print('Network down:', e)
# if __name__ == '__main__':
#     async_asin_pg(month=9, spider_int=1, week=14,site_name='us').run()
#
from datetime import datetime, timedelta
# reference date
today = datetime(2025, 7, 8)
# if __name__ == '__main__':
#     async_asin_pg(month=9, spider_int=1, week=14, site_name='us').run()
# go back 119 days
delta = timedelta(days=119)
result_date = today - delta
#
# # print the result
print("Date 119 days ago:", result_date.strftime("%Y-%m-%d"))
#
# from datetime import datetime
#
# # current date
# today = datetime.today()
from datetime import datetime
# current date
today = datetime.today()
#
# # start date
# start_date = datetime(2025, 1, 24)
start_date = datetime(2025, 5, 7)
#
# # days elapsed
# delta_days = (today - start_date).days
# print('Days elapsed:', delta_days)
delta_days = (today - start_date).days
print('Days elapsed:', delta_days)
# # divide by 30
# result = delta_days / 30
# print('Daily sales:', result)
result = delta_days / 30
print('Daily sales:', result)
print('Cumulative sales:', result * 9)  # daily rate * BSR monthly sales
# print('Daily sales * monthly sales volume:', result*110)
# # print the result
# print(f"Days from the start date to today: {delta_days}")
# # print(1426*20.99)
\ No newline at end of file
print(f"Days from the start date to today: {delta_days}")
# # print(1426*20.99)
......@@ -250,7 +250,7 @@ class search_temp_pg(BaseUtils):
for search_url in search_term_list:
self.search_term_queue.put(search_url)
html_thread = []
for i in range(16):
for i in range(17):
thread2 = threading.Thread(target=self.get_search_kw, args=(i,))
html_thread.append(thread2)
for ti in html_thread:
......@@ -307,17 +307,23 @@ class search_temp_pg(BaseUtils):
self.engine_pg = self.pg_connect()
sql_read = f"""SELECT id, search_term, url FROM {self.db_search_term} where state=1 and month={self.month} LIMIT {self.read_size} for update;"""
print(sql_read)
self.df_read = self.engine.read_sql(sql_read)
# self.df_read = self.engine_pg.read_sql(sql_read)
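# read_then_update claims the selected rows and flips them to state=2 in one transaction,
# replacing the separate UPDATE block commented out below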
self.df_read = self.engine_pg.read_then_update(
select_sql=sql_read,
update_table=self.db_search_term,
    set_values={"state": 2},  # mark the claimed rows as state=2
    where_keys=["id"],  # WHERE id = :id
)
if self.df_read.shape[0] > 0:
self.id_tuple = tuple(self.df_read.id)
# self.id_tuple = tuple(self.df_read.id)
self.date_info = f'2025-{self.month}'
print('date_info::', self.date_info, ' month:', self.month)
with self.engine_pg.begin() as conn:
if len(self.id_tuple) == 1:
sql_update = f'UPDATE {self.db_search_term} set state=2 where id in ({self.id_tuple[0]});'
else:
sql_update = f'UPDATE {self.db_search_term} set state=2 where id in {self.id_tuple};'
conn.execute(sql_update)
# with self.engine_pg.begin() as conn:
# if len(self.id_tuple) == 1:
# sql_update = f'UPDATE {self.db_search_term} set state=2 where id in ({self.id_tuple[0]});'
# else:
# sql_update = f'UPDATE {self.db_search_term} set state=2 where id in {self.id_tuple};'
# conn.execute(sql_update)
search_term_list = list(
self.df_read.id.astype("U") + '|-|' + self.df_read.search_term + '|-|' + self.df_read.url)
return search_term_list
......
......@@ -68,6 +68,9 @@ def select_sate_mysql(site, num=None, month=None, week=None):
engine_us_mysql = db_engine('us', 'mysql')
df = engine_us_mysql.read_sql(sql_select_)
if int(df.status_val[0]) in (1, 2):
redis_client = BaseUtils().redis_db()
lock_key = "ALL站点-asin同步-pg-api_lock"
lock = redis_client.lock(lock_key, timeout=15)  # 15-second timeout
update_workflow_progress = f"update workflow_progress set status_val=3,status='抓取结束' where page='反查搜索词' and date_info='2025-{week}' and site_name='{site}' and date_type='week'"
print('update_workflow_progress: set status to 3 ', update_workflow_progress)
db_cursor_connect_update(update_workflow_progress, site)
......@@ -83,9 +86,7 @@ def select_sate_mysql(site, num=None, month=None, week=None):
ii += 1
if ii > 8:
break
redis_client = BaseUtils().redis_db()
lock_key = "ALL站点-asin同步-pg-api_lock"
lock = redis_client.lock(lock_key, timeout=5)  # 5-second timeout
if id_tuple is None:
DolphinschedulerHelper.start_process_instance_common(
project_name="big_data_selection",
......@@ -127,7 +128,7 @@ def long_time_task(site, proxy_name, month):
if __name__ == '__main__':
pppoe_ip()
site_list = ['us', 'de', 'uk']
site_list = ['us','de','uk']
month = int(sys.argv[1])
week = int(sys.argv[2])
proxy_name = None
......
import time
import sys
import os
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from redis.exceptions import LockError
from threading_spider.db_connectivity import connect_db
from threading_spider.post_to_dolphin import DolphinschedulerHelper
from utils.db_connect import BaseUtils
......@@ -84,7 +86,7 @@ def select_sate_mysql(site=None, num=None, page=None, month=None, week=None, spi
# define the lock key
redis_client = BaseUtils().redis_db()
lock_key = f"{year_week}_{site}_lock"
lock = redis_client.lock(lock_key, timeout=5)  # 5-second timeout
lock = redis_client.lock(lock_key, timeout=55)  # 55-second timeout
select_sql = f"select status_val from workflow_progress WHERE date_info='{year_week}' and date_type='week' and site_name='{site}' and page='ASIN详情'"
print(select_sql)
df_state = db_cursor_connect_msyql_read(select_sql)
......@@ -154,6 +156,7 @@ def select_sate_mysql(site=None, num=None, page=None, month=None, week=None, spi
update_workflow_progress = f"update workflow_progress set status_val=3,status='ASIN爬取完成',up_spider_state=3 where page='ASIN详情' and date_info='{year_week}' and site_name='{site}' and date_type='week'"
print(update_workflow_progress)
db_cursor_connect_update(update_workflow_progress, site)
db_class.send_mg('pengyanbing', 'update progress table', update_workflow_progress)
ii = 0
for i in range(10):
time.sleep(180)
......@@ -164,6 +167,7 @@ def select_sate_mysql(site=None, num=None, page=None, month=None, week=None, spi
update_month_asin_state = f"update workflow_progress set status_val=3,status='月ASIN抓取完成' WHERE site_name='{site}' and page='asin详情' and date_type='month' and status_val=1 and status='月ASIN导出完成' and date_info='{year_month}'"
print(update_month_asin_state)
db_cursor_connect_update(update_month_asin_state, site)
db_class.send_mg('pengyanbing', 'update month-level progress table', update_month_asin_state)
update_month_spider_state = f"update workflow_progress set kafka_flow_state=1,spider_state=3,spider_int={spider_int} WHERE site_name='{site}' and date_type='month' and date_info='{year_month}' and page='ASIN详情'"
db_cursor_connect_update(update_month_spider_state, site)
DolphinschedulerHelper.start_process_instance_common(
......@@ -216,8 +220,8 @@ def select_sate_mysql(site=None, num=None, page=None, month=None, week=None, spi
# project_name="big_data_selection",
# process_df_name='ALL站点-启动30day/月流程',
# startParams={
# "site_name": "us",
# "site_name": "uk",
# "date_type": "month",
# "date_info": '2025-07'
# "date_info": '2025-10'
# }
# )
\ No newline at end of file
......@@ -222,11 +222,16 @@ DE_ASIN_XPATH = {
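# the star-rating entries below fall back from the legacy @title links to the newer
# aria-label links (href containing 'five_star' through 'one_star'), then to the plain histogram text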
"brand2": ["//a[@id='amznStoresBylineLogoImageContainer']/following-sibling::a/text()"],
"ac_name": ["//span[@class='ac-keyword-link']/a/text()", "//span[@class='ac-for-text']/span/text()"],
"reviews": ['//*[@id="acrCustomerReviewText"]/text()', '//div[@class="a2s-pdd-reviews"]//a/span/text()'],
"star5": ["//a[@class='a-link-normal 5star']/@title|//a[contains(@href,'ref=acr_dp_hist_5')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star4": ["//a[@class='a-link-normal 4star']/@title|//a[contains(@href,'ref=acr_dp_hist_4')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star3": ["//a[@class='a-link-normal 3star']/@title|//a[contains(@href,'ref=acr_dp_hist_3')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star2": ["//a[@class='a-link-normal 2star']/@title|//a[contains(@href,'ref=acr_dp_hist_2')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star1": ["//a[@class='a-link-normal 1star']/@title|//a[contains(@href,'ref=acr_dp_hist_1')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star5": ["//a[@class='a-link-normal 5star']/@title|//a[contains(@href,'ref=acr_dp_hist_5')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()","//a[contains(@href,'five_star')]/@aria-label",
"//a[contains(@href,'ref=acr_dp_hist_5')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star4": ["//a[@class='a-link-normal 4star']/@title|//a[contains(@href,'ref=acr_dp_hist_4')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()","//a[contains(@href,'four_star')]/@aria-label",
"//a[contains(@href,'ref=acr_dp_hist_4')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star3": ["//a[@class='a-link-normal 3star']/@title|//a[contains(@href,'ref=acr_dp_hist_3')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()","//a[contains(@href,'three_star')]/@aria-label",
"//a[contains(@href,'ref=acr_dp_hist_3')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star2": ["//a[@class='a-link-normal 2star']/@title|//a[contains(@href,'ref=acr_dp_hist_2')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()","//a[contains(@href,'two_star')]/@aria-label",
"//a[contains(@href,'ref=acr_dp_hist_2')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"star1": ["//a[@class='a-link-normal 1star']/@title|//a[contains(@href,'ref=acr_dp_hist_1')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()","//a[contains(@href,'one_star')]/@aria-label",
"//a[contains(@href,'ref=acr_dp_hist_1')]/div[@class='a-section a-spacing-none a-text-right aok-nowrap']/text()"],
"material": ["//span[text()='Material']/ancestor-or-self::td/following-sibling::td/span/text()"],
"package_quantity": ["//label[contains(text(),'Package Quantity:')]/following-sibling::span/text()"],
"pattern_name": ["//span[contains(text(),'Pattern Name:')]/following-sibling::span/text()"],
......@@ -269,6 +274,9 @@ DE_ASIN_XPATH = {
"best_sellers_herf": ['//span[contains(text(),"Amazon Bestseller-Rang")]/parent::span//a/@href',
'//th[contains(text(),"Amazon Bestseller-Rang")]/following-sibling::td//a/@href'],
"best_sellers_text": ['//span[contains(text(),"Amazon Bestseller-Rang")]/parent::span//a/text()',
'//th[contains(text(),"Amazon Bestseller-Rang")]/following-sibling::td//a/text()'],
"Best_rank": ['//th[contains(text(),"Amazon Bestseller-Rang")]/following-sibling::td//text()',
'//span[contains(text(),"Amazon Bestseller-Rang")]/parent::span//text()'],
"Best_rank2": ['//th[contains(text(),"Amazon Bestseller")]/following-sibling::td//text()',
......
......@@ -488,12 +488,29 @@ class ParseSearchTermUs(object):
def parse_bs(self):
try:
asin_list = self.etree_html.xpath(
"//span[contains(text(),'estseller')]/parent::span//parent::span[contains(@id,'best-seller')]/@id|//span[contains(text(),'Seller')]/parent::span//parent::span[contains(@id,'best-seller')]/@id")
print('############## bsr_asin::', asin_list)
if len(asin_list):
asin_list = [asin.split("-")[0] for asin in asin_list if len(asin.split("-")[0]) >= 9]
self.bs_list.extend(self.parse_type_common(asin_list=asin_list, cate_type='sb'))
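# BSR badges are now located via data-csa-c-content-id='BEST_SELLER'; the surrounding markup
# varies, so several ancestor layouts are tried in order, and the matched attribute may be a
# bare ASIN or an 'amzn1.asin....' item id that gets unwrapped below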
bsr_asin_xpath_list = [
"//div[@data-csa-c-content-id='BEST_SELLER']/parent::div/parent::div/parent::div/parent::span/parent::div//@data-csa-c-asin",
"//div[@data-csa-c-content-id='BEST_SELLER']/parent::div/parent::div/parent::div/parent::div/parent::div//@data-csa-c-asin",
"//div[@data-csa-c-content-id='BEST_SELLER']/parent::div/parent::div/parent::div//@data-csa-c-item-id",
"//div[@data-csa-c-content-id='BEST_SELLER']/parent::div/parent::div/parent::div/parent::span/parent::div//@data-csa-c-item-id",
"//div[@data-csa-c-content-id='BEST_SELLER']/parent::div/parent::div/parent::div/parent::div/parent::div//@data-csa-c-item-id"]
for bsr_asin_xpath in bsr_asin_xpath_list:
asin_list = self.etree_html.xpath(bsr_asin_xpath)
print('############## bsr_asin::', asin_list)
bsr_asin_list = []
if len(asin_list):
asin_list = [asin.split("-")[0] for asin in asin_list if len(asin.split("-")[0]) >= 9]
for asin in asin_list:
    if len(asin) > 10:
        # item ids look like 'amzn1.asin.B0XXXXXXXX'; extract the bare 10-character ASIN
        pattern = re.compile(r'(?<=amzn1\.asin\.)[A-Z0-9]{10}', re.I)
        asins = pattern.findall(asin)
        bsr_asin_list.extend(asins)  # findall returns a list; extend keeps the result flat
    else:
        bsr_asin_list.append(asin)
print('############## bsr_asin::', bsr_asin_list)
self.bs_list.extend(self.parse_type_common(asin_list=bsr_asin_list, cate_type='sb'))
break
except Exception as e:
pass
......@@ -592,3 +609,12 @@ class ParseSearchTermUs(object):
"https://www.amazon.co.uk/dp/B09FLQD7VN?pd_rd_i=B09FLQD7VN&pd_rd_w=GwsFh&pf_rd_p=88aa1216-6e73-4bd1-9903-e6883ff8dae3&pd_rd_wg=2kZM8&pf_rd_r=P8P1KCGMPXS9XWH1NFQV&pd_rd_r=a7c81c84-a2aa-47ad-8bd9-055c75c99a28"
return (self.zr_list, self.sp_list, self.sb_list, self.ac_list,
self.bs_list, self.er_list, self.tr_list, self.sold_list, self.buy_text_list, self.hr_list)
# if __name__ == '__main__':
# with open(r'C:\Users\ASUS\Downloads\python2.html','r',encoding='utf-8')as f:
# response = f.read()
# parse_search_term = ParseSearchTermUs(page_source=response, driver=None, search_term='keywords',
# page=1, site_name='us')
# st_list = parse_search_term.run()
# zr_list, sp_list, sb_list, ac_list, bs_list, er_list, tr_list, sort_list, buy_text_list, hr_list = st_list
# print( zr_list, sp_list, sb_list, ac_list, bs_list, er_list, tr_list, sort_list, buy_text_list, hr_list )
\ No newline at end of file