Commit 16efe939 by Peng

update: utils 模块多项优化与功能扩展

[asin_parse.py]
- ParseAsinUs 新增 response_s 参数,支持外部传入 etree 解析树,避免重复解析
- 新增搜索框分类(search_category)字段解析
- 分类路径新增 all_nodeid(全路径节点 ID 拼接)及空值保护
- initialSeenAsins 解析新增空列表保护,防止 IndexError
- AI 评论按钮解析改用 data-testid 定位,过滤无效短文本
- 修复 es 站品牌解析变量名冲突 bug
- 修复评论数去括号逻辑及德站 Sternebewertung 判断错误
- 修复日期解析变量名 data_time 误用 bug
- 移除优惠券动态 XPATH 追加逻辑,防止重复追加

[db_connect.py]
- BaseUtils.__init__ 新增 site_name 参数(默认 us),不再硬编码
- 新增 doris_connect_adv() 连接 Doris 广告库(advertising_manager)
- 新增 doris_adv_direct_connect() pymysql 直连 Doris 广告库
- 清理旧版 SQLAlchemy 注释代码及调试 print 语句
- Kafka 超时 30000→40000ms,linger_ms 150→350,重试 10→5 次

[requests_param.py]
- 新增 next_page_lock 线程锁和 headers_num_int_s 属性,支持多线程分页
- 修复中文检测 check_str 为 None 时报错的 bug
- cookie 加载上限 300→350 条
- 修正请求头 accept-Encodin 拼写错误,移除 authority 字段
- 请求超时 10→30 秒,提升慢速页面成功率

[params_asin_xpath.py]
- 全站点 review_ai_list / review_button_list 新增 data-testid XPath,兼容新版 AI 评论结构
- 全站点新增 fbm_delivery_price 字段,采集 FBM 配送运费
- 全站点新增 search_category 字段,采集搜索框当前分类
- US/UK/DE/FR 站点 td_0_text 新增多条 XPath,适配 Amazon 最新页面结构

[check_columns.py]
- 取消 __main__ 注释,支持脚本直接运行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
parent 4c8ae1db
......@@ -12,10 +12,10 @@ import json
class ParseAsinUs(object):
def __init__(self, resp=None, asin=None, week=None, date_info=None, data_type_asin=1, site_name=None, month=None):
def __init__(self, resp=None, asin=None, week=None, date_info=None, data_type_asin=1, site_name=None, month=None, response_s=None):
print(f'请求 {site_name} 站点数据')
self.page_source = resp
self.response_s = etree.HTML(resp)
self.response_s = response_s if response_s is not None else etree.HTML(resp)
self.asin = asin
self.all_img_video_list = []
self.asin_variation_list = []
......@@ -138,6 +138,8 @@ class ParseAsinUs(object):
initial_seen_asins_str = initialSeenAsins_list[0] + ']'
else:
initialSeenAsins_list = re.findall(r'initialSeenAsins\":(.*)],\"set', resp)
if not initialSeenAsins_list:
return None
initial_seen_asins_str = initialSeenAsins_list[0] + ']'
pattern = re.compile(r'B\w+')
initialSeenAsins = pattern.findall(initial_seen_asins_str)
......@@ -636,21 +638,44 @@ class ParseAsinUs(object):
cleaned_dict = {self.clean_string(key): self.clean_string(value) for key, value in productdetail.items()}
prcdt_dict = {key: value for key, value in cleaned_dict.items() if value != ''}
productdetail_json = json.dumps(prcdt_dict, ensure_ascii=False)
# 搜索框分类
for i in ASIN_XPATH['search_category']:
search_category_list = self.response_s.xpath(i)
if search_category_list:
search_category = search_category_list[0].strip()
break
else:
search_category = None
# 分类
els_category_href_list = []
for i in ASIN_XPATH['category_href']:
els_category_href_list = self.response_s.xpath(i)
if els_category_href_list:
break
for i in ASIN_XPATH['category_data']:
els_category = self.response_s.xpath(i)
if els_category:
category_list = []
nodes_id_list = []
for e in els_category:
category_list.append(e.strip())
category = ''.join(category_list)
node_id = re.findall(r'node=(\d+)', els_category_href_list[-1])[0]
if els_category_href_list:
node_id_list = re.findall(r'node=(\d+)', els_category_href_list[-1])
node_id = node_id_list[0] if node_id_list else None
for i in els_category_href_list:
_id_list = re.findall(r'node=(\d+)', i)
if _id_list:
nodes_id_list.append(_id_list[0])
all_nodeid = '›'.join(nodes_id_list) if nodes_id_list else None
else:
node_id = None
all_nodeid = None
break
else:
category = None
node_id = None
all_nodeid = None
# 解析标题
for i in ASIN_XPATH['title']:
title_ = self.response_s.xpath(i)
......@@ -805,8 +830,9 @@ class ParseAsinUs(object):
if Brand_list_:
break
elif self.site_name == 'es':
Brand_list = re.findall(r'de (.*)', Brand_list[0])
if Brand_list:
Brand_list_es = re.findall(r'de (.*)', Brand_list[0])
if Brand_list_es:
Brand_list = Brand_list_es
break
else:
Brand_list_ = re.findall(r'Marca:(.*)', Brand_list[0])
......@@ -867,14 +893,15 @@ class ParseAsinUs(object):
for i in ASIN_XPATH['reviews']:
number_of_reviews = self.response_s.xpath(i)
if number_of_reviews:
number_of_reviews = number_of_reviews[0].strip().replace('\xa0', '')
number_of_reviews = number_of_reviews[0].strip().replace('\xa0', '').replace(')', '').replace('(', '')
print('number_of_reviews::',number_of_reviews)
if self.site_name == 'de':
if "Sternebewertungen" in number_of_reviews:
total_comments = re.findall(r"(.*) Sternebewertungen", number_of_reviews)[0]
elif "Sternebewertung" in number_of_reviews[0]:
elif "Sternebewertung" in number_of_reviews:
total_comments = re.findall(r"(.*) Sternebewertung", number_of_reviews)[0]
else:
total_comments = None
total_comments = number_of_reviews
break
elif self.site_name == 'fr':
if "évaluations" in number_of_reviews:
......@@ -978,11 +1005,15 @@ class ParseAsinUs(object):
for button in button_list:
try:
button_text = button.xpath('./text()')[0]
i = button_list.index(button)
span_text = self.response_s.xpath(f"//div[@id='aspect-bottom-sheet-0-{i}']//span/text()")
p_text = self.response_s.xpath(f"//div[@id='aspect-bottom-sheet-0-{i}']//p/text()")
_text = '&&&&'.join(span_text) + '|-|' + '&&&&'.join(p_text)
print('button_text::',button_text)
span_text = self.response_s.xpath(f"//div[@data-testid='bottomsheet-content-{button_text.lower()}']//span/text()")
span_text_list = []
for data in span_text:
if len(data) > 5:
span_text_list.append(data)
_text = '|-|'.join(span_text_list).replace('"','').replace('Read more','').replace('‘','').replace('’','')
button_dict[button_text] = _text
# 從2026年1月份開始 根據 |-| 來分割。26以前的先根據 |-| 分割 再根據 &&&& 分割
except:
pass
if len(button_dict) < 1:
......@@ -1035,6 +1066,7 @@ class ParseAsinUs(object):
elif len(price.strip()) < 1:
price = self.get_price()
if self.site_name == 'us':
# 判断是否有 Coupon 促销类型
deal_type = []
......@@ -1297,7 +1329,6 @@ class ParseAsinUs(object):
coupon_int = None
deal_type = []
coupon_trne = ''
ASIN_XPATH['coupon'].append(f"//div[@data-csa-c-asin='{self.asin}']//label/text()")
for i in ASIN_XPATH['coupon']:
Voucher_list = self.response_s.xpath(i)
if Voucher_list:
......@@ -2213,10 +2244,10 @@ class ParseAsinUs(object):
data = re.findall(r' \((.*)', data)[0]
chars_to_remove = ['(', ')', '\u200e', ',', '.']
for char in chars_to_remove:
data_time = data_time.replace(char, '')
month_ = re.findall(r'[A-Za-z]', data_time)
data = data.replace(char, '')
month_ = re.findall(r'[A-Za-z]', data)
month_str = ''.join(month_)
_month = data_time.replace(month_str, tiem_dict.get(month_str))
_month = data.replace(month_str, tiem_dict.get(month_str))
amazon_launch_time = _month
try:
year_moth_day = datetime.datetime.strptime(_month, '%m %d %Y')
......@@ -2292,10 +2323,10 @@ class ParseAsinUs(object):
data = re.findall(r' \((.*)', data)[0]
chars_to_remove = ['(', ')', '\u200e', ',', '.']
for char in chars_to_remove:
data_time = data_time.replace(char, '')
month_ = re.findall(r'[A-Za-z]', data_time)
data = data.replace(char, '')
month_ = re.findall(r'[A-Za-z]', data)
month_str = ''.join(month_)
_month = data_time.replace(month_str, tiem_dict.get(month_str))
_month = data.replace(month_str, tiem_dict.get(month_str))
amazon_launch_time = _month
try:
year_moth_day = datetime.datetime.strptime(_month, '%m %d %Y')
......@@ -2589,16 +2620,21 @@ class ParseAsinUs(object):
buy_box_seller_type = None
for i in ASIN_XPATH['td_0_text']:
td_0_text = self.response_s.xpath(i)
td_0_text = [t for t in td_0_text if t.strip()] # 过滤纯空白
if td_0_text:
break
else:
td_0_text = []
for i in ASIN_XPATH['td_1_text']:
td_1_text = self.response_s.xpath(i)
td_1_text = [t for t in td_1_text if t.strip()] # 过滤纯空白
if td_1_text:
break
else:
td_1_text = []
print('td_1_text::',td_1_text, 'td_0_text:56565634:',td_0_text)
if td_1_text and td_0_text:
if self.site_name in ['us', 'uk', 'es', 'fr']:
buy_box_seller_type = self.re_buy_sller(td_1_text, td_0_text)
......@@ -2654,6 +2690,33 @@ class ParseAsinUs(object):
break
else:
buy_box_seller_type = 4
# FBM运费价格筛选 fbm_delivery_price
for i in ASIN_XPATH['fbm_delivery_price']:
delivery_pric_list = self.response_s.xpath(i)
# FREE → 0
try:
if delivery_pric_list:
p = delivery_pric_list[0] if delivery_pric_list[
0] != 'FREE' and buy_box_seller_type == 3 else None
# 去掉货币符号
if p:
for sym in ['£', '€', '$']:
p = p.replace(sym, '')
# 把 , 转成 .
p = p.replace(' ', '').replace(',', '.')
# 判断是否有小数点
if '.' in p:
delivery_pric = round(float(p), 2) # 保留两位小数
else:
delivery_pric = int(p) # 纯整数
else:
delivery_pric = p
else:
delivery_pric = None
except:
print('获取FBM运费价格筛选 错误::', self.asin)
delivery_pric = None
# 变体 # 当前亚马逊的真实asin。和请求asin对比是否一样
current_asin = None
current_Asin_list = re.findall(r'currentAsin(.*?),', self.page_source)
......@@ -2725,13 +2788,6 @@ class ParseAsinUs(object):
except:
pass
# 月销具体数值。如果有值拼接一起
# buy_sales_num_list
# 月销具体数值。如果有值拼接一起
# buy_sales_num_list
for i in ASIN_XPATH['buy_sales_num_list']:
buySales_num_list = self.response_s.xpath(i)
if buySales_num_list:
......@@ -2777,16 +2833,6 @@ class ParseAsinUs(object):
if len(buySales) > 50:
buySales = None
asin_buySales_list = []
if asin_not_Sales and buySales is None:
asin_buy = self.asin
asin_buySales = asin_not_Sales
else:
asin_buy = None
asin_buySales = None
if asin_buy and asin_buySales:
asin_buySales_list.append([asin_buy, asin_buySales, self.date_info])
# 跟卖
for i in ASIN_XPATH['box_follow_list']:
buyBox_num_list = self.response_s.xpath(i)
......@@ -2840,7 +2886,7 @@ class ParseAsinUs(object):
rating = round(float(rating), 2)
except:
rating = 0
print('total_comments::',total_comments)
if total_comments:
try:
total_comments = total_comments.replace(',', '').replace('.', '')
......@@ -2891,23 +2937,17 @@ class ParseAsinUs(object):
if buyBox_url is not None and seller_id is not None and td_1_text:
buyBox_name = td_1_text[0]
if 'Amazon.com' not in td_1_text[0]:
lock = Lock()
lock.acquire()
account_name = buyBox_name.replace("%", "%%")
account_name = account_name.strip()
account_url = f'{self.site_url}/s?me={seller_id}'
self.buyBox_list.append([seller_id, account_name, buyBox_url])
lock.release()
else:
buyBox_name = None
if buyBox_name is not None and seller_id is not None:
lock = Lock()
lock.acquire()
account_name = buyBox_name.replace("%", "%%")
account_name = account_name.strip()
account_url = f'{self.site_url}/s?me={seller_id}'
self.buyBoxname_asin_list.append([account_name, self.asin, seller_id])
lock.release()
if launch_time:
launch_time = launch_time.replace('00:00:00', '').strip()
if td_0_text:
......@@ -2931,7 +2971,7 @@ class ParseAsinUs(object):
}
cleaned_data = {k: (v.strip() if isinstance(v, str) and v.strip() else None) for k, v in seller_dict.items()}
ship_sold_json = self.get_ship_sold_json()
if all(value is None for value in cleaned_data.values()):
seller_json = None
else:
......@@ -2974,7 +3014,8 @@ class ParseAsinUs(object):
'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json,
'video_m3u8': video_m3u8, 'result_list_json': result_list_json,
'bundle_asin_component_json': bundle_asin_component_json,
"review_json_list": review_json, 'asin_buySales_list': asin_buySales_list}
"review_json_list": review_json,'fbm_delivery_price':delivery_pric,'all_nodeid':all_nodeid,'search_category':search_category,
'ship_sold_json':ship_sold_json}
if self.site_name == 'us':
item['three_four_val'] = Join_Prime_int
elif self.site_name in ['uk', 'fr', 'it', 'es']:
......@@ -2986,5 +3027,77 @@ class ParseAsinUs(object):
return item
def get_ship_sold_json(self):
    """Parse the buy-box "ships from / sold by" expander into structured blocks.

    Reads all text under the a-expander partial-collapse container, strips
    script/CSS noise, then splits the flattened text on a fixed set of known
    labels (US/UK/DE). Consecutive label/value pairs are grouped into blocks;
    a new block starts whenever a "starter" label (Ships from, Shipper /
    Seller, Versender / Verkäufer) repeats.

    Returns:
        str | None: JSON array of {label: value} dicts (ensure_ascii=False),
        or None when nothing was extracted.
    """
    KNOWN_LABELS = [
        # US / UK
        'Shipper / Seller', 'Ships from', 'Sold by',
        'Returns', 'Payment', 'Packaging', 'Subscription',
        # DE
        'Versender / Verkäufer', 'Gesetzliche Gewährleistung',
        'Garantie und Reparaturen', 'Rückgaben', 'Zahlung',
    ]
    # Labels that open a new seller block when seen a second time.
    BLOCK_STARTERS = {
        'Shipper / Seller', 'Ships from', 'Versender / Verkäufer',
    }
    # Boilerplate phrases that mark the end of a useful value.
    NOISE_PHRASES = [
        'Your transaction is secure', 'We work hard', 'Learn more', 'Read full', 'Read more',
        'Deine Transaktion ist sicher', 'Mehr erfahren',
    ]
    # Compiled once: matches inline JS/CSS fragments that leak into //text().
    noise_pat = re.compile(
        r'[{};]|function\s*\w*\s*[\w(]|var\s+\w+|window\.|\.po-|word-break|padding:|logTech'
    )

    def is_noise(text):
        # True for script/style residue rather than human-readable content.
        return bool(noise_pat.search(text))

    def extract_first_value(raw):
        """Trim a raw label value: de-duplicate repeated text, cut at noise."""
        raw = re.sub(r'\s+', ' ', raw).strip()
        if not raw:
            return ''
        words = raw.split()
        # Amazon often renders the value twice (visible + a11y copy); if the
        # first l words repeat immediately, keep just one copy.
        for l in range(1, min(len(words) // 2 + 1, 15)):
            candidate = ' '.join(words[:l])
            if raw[len(candidate):].lstrip().startswith(candidate):
                return candidate
        # Otherwise cut at the first boilerplate phrase (if reasonably close).
        for noise in NOISE_PHRASES:
            idx = raw.find(noise)
            if 0 < idx < 500:
                return raw[:idx].strip()
        return raw[:300].strip()  # hard cap against runaway concatenations

    def is_complete_block(block):
        # A usable block has both a shipper-ish and a returns-ish label.
        has_shipper = any(k in block for k in ['Ships from', 'Shipper / Seller', 'Versender / Verkäufer'])
        has_returns = any(k in block for k in ['Returns', 'Rückgaben'])
        return has_shipper and has_returns

    text_list = self.response_s.xpath(
        "//div[@class='a-expander-content a-expander-partial-collapse-content']//text()"
    )
    cleaned = [t.strip() for t in text_list if t.strip() and not is_noise(t.strip())]
    full = re.sub(r'\s+', ' ', ' '.join(cleaned)).strip()
    # Longest labels first so e.g. "Shipper / Seller" wins over "Sold by" prefixes.
    label_re = '|'.join(re.escape(l) for l in sorted(KNOWN_LABELS, key=len, reverse=True))
    # Capturing group keeps the labels in the split result:
    # parts = [prefix, label, value, label, value, ...]
    parts = re.split(f'({label_re})', full)
    blocks, current = [], {}
    i = 1
    while i < len(parts):
        label = parts[i]
        raw_val = parts[i + 1].strip() if i + 1 < len(parts) else ''
        value = extract_first_value(raw_val)
        if label in BLOCK_STARTERS and label in current:
            # Starter repeated -> previous block is finished; keep it only
            # when it looks complete.
            if is_complete_block(current):
                blocks.append(current)
            current = {}
        if value and label not in current:
            current[label] = value
        i += 2
    if current:
        # Final block is kept even if partial (matches original behavior).
        blocks.append(current)
    return json.dumps(blocks, ensure_ascii=False) if blocks else None
# if __name__ == '__main__':
# ParseAsinUs().xpath_html()
......@@ -301,7 +301,7 @@ class spider_check(BaseUtils):
bytes(json.dumps(item), 'utf-8').decode('unicode_escape'))
# if __name__ == '__main__':
# spider_check('us').send_ms()
if __name__ == '__main__':
spider_check('us').send_ms()
# spider_check('de').send_ms()
# spider_check('uk').send_ms()
......@@ -14,78 +14,43 @@ import redis
from utils.secure_db_client import get_remote_engine
class BaseUtils(object):
def __init__(self):
self.site_name = 'us'
def __init__(self,site_name='us'):
self.site_name = site_name
self.engine = self.mysql_connect()
def pg_connect(self):
    """Return a remote engine for the PG14 spider database (outer network).

    Connection details are resolved server-side by ``get_remote_engine``:
    ``db_type`` is a server alias (see ``postgresql_14_outer``), and
    ``site_name`` selects the per-site database (e.g. "selection" for us,
    "selection_de" for de).
    """
    # NOTE: debug print removed — engine creation is routine and the old
    # print leaked on every call. Dead commented-out SQLAlchemy code that
    # predated get_remote_engine has been dropped as well.
    return get_remote_engine(
        site_name=self.site_name,       # selects the per-site database
        db_type="postgresql_14_outer",  # server-side alias for PG14 (outer)
    )
def doris_connect_adv(self):
    """Return a remote engine for the Doris advertising database.

    Targets the ``advertising_manager`` database on the Doris cluster via
    the ``doris_adv`` server-side alias.
    """
    # The original copy-pasted '-> 服务端 alias "mysql"' comments were wrong
    # for this method and have been corrected.
    return get_remote_engine(
        site_name=self.site_name,        # per-site routing
        db_type="doris_adv",             # server-side alias: Doris advertising cluster
        database="advertising_manager",  # target database on that cluster
    )
def doris_adv_direct_connect(self):
    """Open a direct pymysql connection to the Doris advertising database.

    Bypasses the remote-engine proxy; intended for high-throughput reads
    and updates. Imports are kept local so the module loads without pymysql.
    """
    from amazon_params.params import DORIS_ADV_DIRECT_CONN
    import pymysql
    return pymysql.connect(**DORIS_ADV_DIRECT_CONN)
def doris_connect(self):
    """Return a remote engine for the Doris spider database.

    ``db_type="doris"`` is resolved server-side to the Doris cluster DSN.
    """
    # Fixed: the old debug print labelled this engine 'engine_pg::' (wrong
    # name, copy-pasted from pg_connect) — removed along with the dead
    # commented-out create_engine retry loop.
    return get_remote_engine(
        site_name=self.site_name,  # per-site routing
        db_type="doris",           # server-side alias for the Doris cluster
    )
def pg_connect_6(self):
    """Return a remote engine for the PG15 production database (outer network)."""
    # Debug print removed — engine creation is routine; callers that need
    # visibility should log at the call site.
    return get_remote_engine(
        site_name=self.site_name,       # per-site routing
        db_type="postgresql_15_outer",  # server-side alias for PG15 (outer)
    )
def pg_reconnect(self, table_name=None, e=None):
......@@ -153,7 +118,6 @@ class BaseUtils(object):
time.sleep(5)
def kafuka_connect(self, kafka_html_connect=None, bootstrap_servers=None, acks=None, connections_max_idle_ms=60000):
request_timeout_ms = 30000
if kafka_html_connect:
bootstrap_servers = '61.145.136.61:20092'
else:
......@@ -172,17 +136,17 @@ class BaseUtils(object):
sasl_plain_password='R8@xY3pL!qz',
value_serializer=str.encode,
max_request_size=10000120,
request_timeout_ms=request_timeout_ms,
max_block_ms=30000, # 阻塞超时时间设置为60秒
request_timeout_ms=40000,
max_block_ms=40000, # 阻塞超时时间设置为60秒
compression_type='gzip', # 启用消息压缩
acks=1 if acks else 0, # 根据需要设置 acks, # 等待所有副本确认接收
connections_max_idle_ms=connections_max_idle_ms, # 一分钟释放链接
max_in_flight_requests_per_connection=1000,
linger_ms=150, # 增加等待时间
linger_ms=350, # 增加等待时间
batch_size=16384 if acks else 0, # 增加批处理大小
api_version=(2, 4, 1), # 我的kafka版本是2.4.1
retries=10, # 自动重试
retry_backoff_ms=500
retries=5, # 自动重试
retry_backoff_ms=600
)
return producer
except Exception as e:
......
......@@ -18,8 +18,9 @@ US_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button/span','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]'],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button/span','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
......@@ -130,10 +131,12 @@ US_ASIN_XPATH = {
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[@class="offer-display-feature-text a-spacing-none"]/span/text()',
'//a[contains(text(),"Fulfilled by")]/text()',
'//span[contains(text(),"Shipper / Seller")]/..//following-sibling::div/text()',
'//span[contains(text(),"Ships from and sold by")]/../a/span/text()',
'//div[contains(@id,"FromSoldBy_")]//tr[1]//td/span[contains(@class,"-base")]/text()',
'//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()',
'//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span/text()'
'//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span[contains(text()," Sold by: ")]/following-sibling::span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Sold by"]//a/text()', '//div[contains(text(),"Sold by")]//a/text()',
'//div[@tabular-attribute-name="Sold by"]//span[@class="a-size-small"]/text()',
......@@ -166,7 +169,9 @@ US_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[contains(@id,'lob-carousel')]//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"]
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price':['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category' :["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
}
DE_ASIN_XPATH = {
......@@ -187,8 +192,9 @@ DE_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]'],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
......@@ -325,7 +331,9 @@ DE_ASIN_XPATH = {
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[@class="offer-display-feature-text a-spacing-none"]/span/text()',
'//div[contains(@class,"offer-display-feature-text")]/span/text()'],
'//div[contains(@class,"offer-display-feature-text")]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ["//div[@id='merchant-info']//a[2]/span/text()", '//div[@tabular-attribute-name="Sold by"]//a/text()',
'//div[contains(text(),"Verkäufer")]//a/text()',
'//div[@tabular-attribute-name="Verkäufer"]//span[@class="a-size-small"]/text()',
......@@ -351,8 +359,9 @@ DE_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"]
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
}
UK_ASIN_XPATH = {
......@@ -373,8 +382,9 @@ UK_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]'],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
......@@ -488,7 +498,9 @@ UK_ASIN_XPATH = {
'//span[contains(text(),"Ships from and sold by")]/../a/span/text()',
'//div[contains(@id,"FromSoldBy_")]//tr[1]//td/span[contains(@class,"-base")]/text()',
'//div[@data-csa-c-slot-id="odf-feature-text-desktop-fulfiller-info"]//span/text()',
'//span[contains(text(),"Dispatches from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()'
'//span[contains(text(),"Dispatches from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()',
'//span[contains(text(),"Dispatches from")]/parent::div/parent::div/following-sibling::div//span[contains(text()," Sold by: ")]/following-sibling::span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Sold by"]//a/text()', '//div[contains(text(),"Sold by")]//a/text()',
'//div[@tabular-attribute-name="Sold by"]//span[@class="a-size-small"]/text()',
......@@ -520,7 +532,9 @@ UK_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"]
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
}
......@@ -542,8 +556,9 @@ FR_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button'],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
......@@ -656,7 +671,9 @@ FR_ASIN_XPATH = {
'//span[@id="submit.add-to-cart-ubb-announce"]/text()',
'//div[contains(@id,"availability")]/span/text()'],
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'],
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Vendu par"]//a/text()',
'//div[contains(text(),"Vendu par")]//a/text()',
'//div[@tabular-attribute-name="Vendu par"]//span[@class="a-size-small"]/text()',
......@@ -676,7 +693,9 @@ FR_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"]
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
}
......@@ -698,8 +717,9 @@ ES_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button'],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
......@@ -824,7 +844,9 @@ ES_ASIN_XPATH = {
'//div[contains(@id,"availability")]/span/text()'],
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'],
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Vendido por"]//a/text()',
'//div[contains(text(),"Vendido por")]//a/text()',
'//div[@tabular-attribute-name="Vendido por"]//span[@class="a-size-small"]/text()',
......@@ -844,7 +866,9 @@ ES_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[contains(@id,'lob-carousel')]//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"]
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
}
......@@ -866,8 +890,9 @@ IT_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button'],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
......@@ -987,7 +1012,9 @@ IT_ASIN_XPATH = {
'//div[contains(@id,"availability")]/span/text()'],
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'],
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Venditore"]//a/text()',
'//div[contains(text(),"Venditore")]//a/text()',
'//div[@tabular-attribute-name="Venditore"]//span[@class="a-size-small"]/text()',
......@@ -1007,5 +1034,8 @@ IT_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"]
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
}
......@@ -115,25 +115,26 @@ class ParseSearchTermUs(object):
def parse_asin_zr(self):
    """Collect ZR (organic / natural ranking) ASINs from the search page.

    A result counts as organic when its ``<div>`` carries both a
    ``data-index`` and a ``data-asin`` attribute and is NOT an ad slot
    (its ``class`` does not contain ``AdHolder``).

    Side effect: stores the raw, unfiltered ASIN list on ``self.asin_all``
    for later use by ``parse_buy``.

    :return: de-duplicated list of organic ASINs in page order
    """
    # Keep the raw full-page ASIN list for parse_buy.
    raw_asins = self.etree_html.xpath('//div[@data-asin]/@data-asin')
    joined = "-".join(raw_asins).replace('/', '')
    self.asin_all = re.findall(r"(\w+)", joined)
    # ZR: main search results whose container is not an AdHolder (ad slot).
    organic_nodes = self.etree_html.xpath(
        '//div[@data-index and @data-asin and not(contains(@class, "AdHolder"))]')
    zr_asin_list = []
    for node in organic_nodes:
        asin = node.get('data-asin', '').strip()
        # Valid ASINs are at least 9 chars; skip empties and duplicates.
        if asin and len(asin) >= 9 and asin not in zr_asin_list:
            zr_asin_list.append(asin)
    return zr_asin_list
def parse_type_common(self, asin_list=None, cate_type=None):
"""
asin_list: list
"""
asin_list = list(dict.fromkeys(asin_list)) # 去重保序
asin_list.sort(key=lambda a: self.asin_position_map.get(a, 9999)) # 按 data-index 页面位置排序,无 data-index 的排最后
asin_list = list(dict.fromkeys(asin_list)) # 去重保序,保持xpath返回的页面顺序
asin_detail_all_list = []
cate_type_copy = 1
asin_detail_dict = {
......@@ -319,6 +320,21 @@ class ParseSearchTermUs(object):
print(self.search_term,' 页数:',self.page,'广告asin:',asin)
if asin and len(asin) >= 9 and asin not in sp_asin_list:
sp_asin_list.append(asin)
# 标签下的SP广告位(如 Customers frequently viewed、Today's deals)
if self.site_name == 'de':
sp_label = 'Gesponsert'
elif self.site_name == 'us' or self.site_name == 'uk':
sp_label = 'Sponsored'
else:
sp_label = 'Sponsored'
tag_asin_list = self.etree_html.xpath(
f'//span[@class="a-declarative"]/span[contains(text(),"{sp_label}")]/../../../../../../../../div/following-sibling::span[2]//div/@data-asin|//span/a[contains(text(),"{sp_label}")]/../../../../../../../../div/following-sibling::span[2]//div/@data-asin')
if tag_asin_list:
for asin in tag_asin_list:
if asin and len(asin) >= 9 and asin not in sp_asin_list:
sp_asin_list.append(asin)
self.sp_list_all = sp_asin_list.copy() # 供 parse_asin_zr 排除用
if sp_asin_list:
sp_asin_list.sort(key=lambda a: self.asin_position_map.get(a, 9999))
......@@ -632,3 +648,4 @@ if __name__ == '__main__':
print('sb_list:' ,sb_list)
import hashlib
# import requests
import json
import os
import random
......@@ -7,11 +6,11 @@ import re
import sys
import time
import uuid
from urllib.parse import urlparse
from threading import Lock
import urllib3
from lxml import etree
# py -3.9 -m pip pyinstaller 指定pip 安装
# py -3.10 -m pip install -r E:\Git_new\spider\yswg-agent\requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
from utils.db_connect import BaseUtils
......@@ -35,6 +34,8 @@ class Requests_param_val(BaseUtils):
print("站点名称:", self.site_name, '抓取项目', "代理ip:", self.proxy_name)
self.cookies_queue = Queue() # cookie队列
self.kafuka_producer_str = self.kafuka_connect()
self.next_page_lock = Lock()
self.headers_num_int_s = 0
def init_db_names(self):
self.engine_pg = self.pg_connect()
......@@ -67,7 +68,7 @@ class Requests_param_val(BaseUtils):
"""
判断获取文本是否有中文
"""
if check_str != '无':
if check_str and check_str != '无':
for c in check_str:
if '\u4e00' <= c <= '\u9fa5':
print('--是中文,说明该cookie有问题,或者改数据有问题--')
......@@ -117,7 +118,7 @@ class Requests_param_val(BaseUtils):
if num:
sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit {num};'
else:
sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit 300;'
sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit 350;'
print("获取cookie:", sql_read)
df_read = self.engine.read_sql(sql_read)
clientPriceList = list(df_read.cookies + "|-|" + df_read.id.astype("U"))
......@@ -195,31 +196,26 @@ class Requests_param_val(BaseUtils):
# 组装请求头,
def requests_amazon_headers(self, host=None, site_url=None, asin=None, scraper_url=None):
n = random.randint(120, 142)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 6900)}.{random.randint(1, 181)} Safari/537.36'
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
# ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
headers = {
'connection': 'close',
'authority': urlparse(self.site_url).hostname,
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9',
'accept-Encodin': 'gzip, deflate, br, zstd',
'accept-encoding': 'gzip, deflate, br, zstd',
'cache-control': 'no-cache',
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'sec-ch-ua-mobile': '?0',
'user-agent': ua,
"pragma": "no-cache",
}
if asin:
headers['origin'] = f'{site_url}dp/{asin}'
headers['referer'] = f'{site_url}?th=1'
if scraper_url:
headers['origin'] = scraper_url
headers['referer'] = scraper_url
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
k = ""
......@@ -229,11 +225,11 @@ class Requests_param_val(BaseUtils):
return headers
# 第二次发送请求。
def requests_amazon(self, headers=None, scraper_url=None):
def requests_amazon(self, headers=None, scraper_url=None, sess=None):
for i in range(2):
try:
resp = requests.get(scraper_url, headers=headers, impersonate="chrome",
timeout=10, verify=False)
timeout=30, verify=False)
if self.check_amazon_yzm(resp):
print('验证码2222222222222222')
continue
......@@ -262,22 +258,7 @@ class Requests_param_val(BaseUtils):
# Return the number for the current hour (stored into a redis list by callers).
def get_hour(self, new_date_hour):
    """Return the current hour of day as a string (no zero padding).

    :param new_date_hour: legacy "%Y-%m-%d:%H" string; kept for backward
        compatibility with existing callers but no longer used.
    :return: the current hour as ``str``, e.g. "0".."23"
    """
    # The old implementation built a 24-entry dict of today's hours just to
    # look up the current one; datetime.now().hour yields it directly.
    return str(datetime.now().hour)
# 组装cookie
def get_cookie_str(self, cookies_queue):
......@@ -305,11 +286,11 @@ class Requests_param_val(BaseUtils):
break
if self.site_name == 'uk':
cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=GBP;').replace('i18n-prefs=USD;', 'i18n-prefs=GBP;')
cookie_str = cookie_str.replace('i18n-prefs=HKD', 'i18n-prefs=GBP').replace('i18n-prefs=USD', 'i18n-prefs=GBP')
elif self.site_name == 'de':
cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=EUR;').replace('i18n-prefs=USD;', 'i18n-prefs=EUR;')
cookie_str = cookie_str.replace('i18n-prefs=HKD', 'i18n-prefs=EUR').replace('i18n-prefs=USD', 'i18n-prefs=EUR')
elif self.site_name == 'us':
cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=USD;')
cookie_str = cookie_str.replace('i18n-prefs=HKD', 'i18n-prefs=USD')
return cookie_str
# 获取自增id区间。根据传的站点获取对应的月 周 syn表的id
......@@ -353,7 +334,7 @@ class Requests_param_val(BaseUtils):
def hex_md5(self, input_string):
# 创建一个MD5哈希对象
md5_hash = hashlib.md5()
# 使用输入字符串的字节更新哈希对象
# 使用输入字符串的字节更新哈希对象items.pop('div_id_list', None)
md5_hash.update(input_string.encode('utf-8'))
# 获取哈希的十六进制表示
md5_hex_digest = md5_hash.hexdigest()
......@@ -367,24 +348,25 @@ class Requests_param_val(BaseUtils):
def send_kafka(self, items=None, html_data=None, topic=None):
    """Publish parsed items and/or raw HTML to a Kafka topic, with retries.

    :param items: dict of parsed fields; serialized to JSON before sending.
        The transient 'div_id_list' key is stripped first (intermediate
        parsing state, not for downstream consumers).
    :param html_data: raw HTML payload sent as-is.
    :param topic: destination Kafka topic name.
    """
    print('向Kafka发送数据')
    if items:
        # pop with default avoids the KeyError the old `del` could raise
        # when 'div_id_list' was absent.
        items.pop('div_id_list', None)
    for i in range(3):
        try:
            if items:
                future = self.kafuka_producer_str.send(topic, json.dumps(items))
                future.add_callback(self.on_send_success).add_errback(self.on_send_error)
            if html_data:
                future = self.kafuka_producer_str.send(topic, html_data)
                future.add_callback(self.on_send_success).add_errback(self.on_send_error)
            # A flush timeout alone is tolerated: the send callbacks still
            # report individual message outcomes.
            try:
                self.kafuka_producer_str.flush(timeout=30)
            except KafkaTimeoutError as e:
                print("flush 超时,跳过这次等待:", e)
            print('向Kafka发送数据 发送成功')
            break
        except KafkaTimeoutError:
            print(f'Kafka flush超时,第{i+1}次重试')
            if i >= 2:
                # last attempt timed out: rebuild the producer connection
                self.kafuka_producer_str = self.kafuka_connect()
        except Exception as e:
            print(e)
            if i >= 1:
                self.kafuka_producer_str = self.kafuka_connect()  # reconnect kafka
if __name__ == '__main__':
Requests_param_val().get_cookie(num=1)
\ No newline at end of file
import json
#
#
#
# '''旧版'''
#
# import json
#
# import pandas as pd
# import numpy as np
# import orjson, requests, time
# from typing import List
#
# # -------- 映射字典 --------
# site_name_db_dict = {
# "us": "selection",
# "uk": "selection_uk",
# "de": "selection_de",
# "es": "selection_es",
# "fr": "selection_fr",
# "it": "selection_it",
# }
#
# db_type_alias_map = {
# "mysql": "mysql", # 阿里云mysql
# "postgresql_14": "postgresql_14", # pg14爬虫库-内网
# "postgresql_14_outer": "postgresql_14", # pg14爬虫库-外网
# "postgresql_15": "postgresql_15", # pg15正式库-内网
# "postgresql_15_outer": "postgresql_15_outer", # pg15正式库-外网
# "postgresql_cluster": "postgresql_cluster", # pg集群-内网
# "postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
# "doris": "doris", # doris集群-内网
# }
#
# DEFAULT_SERVERS = [
# # "http://192.168.200.210:7777", # 内网
# # "http://192.168.10.217:7777", # 内网-h7
# # "http://113.100.143.162:7777", # 外网
# # "http://113.100.143.162:7779", # 外网
# # "http://61.145.136.61:7777", # 外网
# # "http://61.145.136.61:7779", # 外网
# "http://192.168.200.210:7780"
#
#
# ]
# # ---------------------------
#
# def df_to_json_records(df: pd.DataFrame) -> list:
# """保证 DataFrame 可安全序列化为 JSON records(处理 NaN / ±Inf)"""
# df_clean = df.copy()
#
# # 1️⃣ 替换 ±Inf -> NaN
# num_cols = df_clean.select_dtypes(include=[np.number]).columns
# if len(num_cols):
# df_clean[num_cols] = df_clean[num_cols].replace([np.inf, -np.inf], np.nan)
#
# # 2️⃣ 替换 NaN -> None(注意:有时 astype(object) 不彻底,需用 applymap)
# df_clean = df_clean.applymap(lambda x: None if pd.isna(x) else x)
#
# # 3️⃣ 转为 dict records
# return df_clean.to_dict("records")
#
#
# def clean_json_field_for_orjson(v):
# """清洗单个 JSON 字段的值,使其符合 orjson 要求并避免空字典入库"""
# if v is None or pd.isna(v):
# return None
#
# # 1️⃣ 如果是空字典对象,返回 None
# if isinstance(v, dict) and not v:
# return None
#
# # 2️⃣ 如果是空字符串或仅为 "{}",返回 None
# if isinstance(v, str):
# stripped = v.strip()
# if not stripped or stripped == "{}":
# return None
# try:
# parsed = json.loads(stripped)
# if isinstance(parsed, dict) and not parsed:
# return None
# return json.dumps(parsed, ensure_ascii=False)
# except Exception:
# return v # 非 JSON 字符串则原样保留
#
# return v
#
#
# def fully_clean_for_orjson(df: pd.DataFrame) -> pd.DataFrame:
# # """全面清洗 DataFrame 以符合 orjson 要求"""
# # df = df.replace([np.inf, -np.inf], np.nan)
# # df = df.applymap(lambda x: None if pd.isna(x) else x)
# #
# # # 找出所有可能为 JSON 字符串的字段
# # json_like_cols = [col for col in df.columns if col.endswith('_json')]
# #
# # # 针对每个 JSON-like 字段,应用清洗函数
# # for col in json_like_cols:
# # df[col] = df[col].apply(clean_json_field_for_orjson)
# #
# # return df
# """全面清洗 DataFrame 以符合 orjson 要求"""
# df = df.replace([np.inf, -np.inf], np.nan)
#
# # NaN → None (比 applymap 高效且不出错)
# df = df.where(pd.notna(df), None)
#
# # 找出所有可能为 JSON 字符串的字段
# json_like_cols = [col for col in df.columns if col.endswith('_json')]
#
# # 针对每个 JSON-like 字段,应用清洗函数
# for col in json_like_cols:
# df[col] = df[col].apply(clean_json_field_for_orjson)
#
# return df
#
#
# class RemoteTransaction:
#
# def __init__(self, db: str, database: str,
# session: requests.Session, urls: List[str]):
# self.db = db
# self.database = database
# self.session = session
# self.urls = urls
# self.sql_queue = []
#
# # def execute(self, sql: str):
# # self.sql_queue.append(sql)
# def execute(self, sql: str, params=None):
# """
# params 可取:
# • None → 纯文本 SQL
# • dict → 单条参数化 e.g. {"id":1,"name":"a"}
# • list/tuple → 批量 executemany
# - list[dict] ↔ INSERT .. VALUES (:id,:name)
# - list[tuple] ↔ INSERT .. VALUES (%s,%s)
# """
# self.sql_queue.append({"sql": sql, "params": params})
#
# def __enter__(self): return self
#
# def __exit__(self, exc_type, exc, tb):
# for url in self.urls:
# try:
# self.session.post(
# url + "/transaction",
# json={"db": self.db,
# "sql_list": self.sql_queue,
# "site_name": self.database}, # site_name not needed on server, kept for clarity
# timeout=3000,
# ).raise_for_status()
# return
# except Exception as e:
# print(f"[WARN] 事务失败 {url}: {e}")
# raise RuntimeError("All servers failed for transaction")
#
#
# class RemoteEngine:
# def __init__(self, db: str, database: str,
# server_urls: List[str], retries: int = 2):
# self.db = db
# self.database = database
# self.urls = [u.rstrip("/") for u in server_urls]
# self.session = requests.Session()
# self.retries = retries
#
# def _request(self, endpoint: str, payload):
# for url in self.urls:
# for _ in range(self.retries):
# try:
# json_bytes = orjson.dumps(payload)
# r = self.session.post(f"{url}/{endpoint}",
# data=json_bytes,
# headers={"Content-Type": "application/json"},
# timeout=3000)
#
# # r = self.session.post(f"{url}/{endpoint}",
# # json=payload, timeout=10)
# r.raise_for_status()
# return r.json()
# except Exception as e:
# print(f"[WARN] {endpoint} fail @ {url}: {e}")
# time.sleep(1)
# raise RuntimeError(f"All servers failed for {endpoint}")
# # def _request(self, endpoint: str, payload):
# # # 用 orjson,“allow_nan” 会把 NaN/Inf 写成 null
# # # json_bytes = orjson.dumps(payload,
# # # option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_OMIT_MICROSECOND | orjson.OPT_ALLOW_INF_AND_NAN)
# # json_bytes = orjson.dumps(
# # payload,
# # option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_ALLOW_INF_AND_NAN
# # )
# #
# # headers = {"Content-Type": "application/json"}
# #
# # for url in self.urls:
# # for _ in range(self.retries):
# # try:
# # r = self.session.post(f"{url}/{endpoint}",
# # data=json_bytes, headers=headers,
# # timeout=15)
# # r.raise_for_status()
# # return r.json()
# # except Exception as e:
# # print(f"[WARN] {endpoint} fail @ {url}: {e}")
# # time.sleep(1)
# # raise RuntimeError(f"All servers failed for {endpoint}")
#
# # ---------- 公共 API ----------
# def read_sql(self, sql: str) -> pd.DataFrame:
# data = self._request("query",
# {"db": self.db,
# "sql": sql,
# "site_name": self.database})
# return pd.DataFrame(data["result"])
#
# def to_sql(self, df: pd.DataFrame, table: str, if_exists="append"):
#
# return self._request("insert",
# {"db": self.db,
# "table": table,
# "if_exists": if_exists,
# "data": fully_clean_for_orjson(df=df).to_dict("records"),
# # "data": df_to_json_records(df), # ← 清洗后的 records
# "site_name": self.database})
#
# def read_then_update(
# self,
# select_sql: str,
# update_table: str,
# set_values: dict,
# where_keys: List[str],
# error_if_empty: bool = False,
# ):
# """
# 动态生成 UPDATE:把 select_sql 读到的行,按 where_keys 精准更新 set_values
# 返回 (DataFrame, rows_updated)
# """
# payload = {
# "db": self.db,
# "site_name": self.database,
# "select_sql": select_sql,
# "update_table": update_table,
# "set_values": set_values,
# "where_keys": where_keys,
# "error_if_empty": error_if_empty,
# }
# resp = self._request("read_then_update", payload)
# df = pd.DataFrame(resp["read_result"])
# rows_updated = resp.get("rows_updated", 0)
# return df
#
# def begin(self):
# return RemoteTransaction(self.db, self.database,
# self.session, self.urls)
# # ---------------------------------
#
#
# def get_remote_engine(site_name: str, db_type: str,
# servers: List[str] = None) -> RemoteEngine:
# if site_name not in site_name_db_dict:
# raise ValueError(f"Unknown site_name: {site_name}")
# if db_type not in db_type_alias_map:
# raise ValueError(f"Unknown db_type: {db_type}")
# return RemoteEngine(
# db=db_type_alias_map[db_type],
# database=site_name,
# server_urls=servers or DEFAULT_SERVERS,
# )
import json
import sys,os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import pandas as pd
import numpy as np
import orjson, requests, time
from typing import List
# -------- 映射字典 --------
site_name_db_dict = {
"us": "selection",
......@@ -24,14 +296,15 @@ db_type_alias_map = {
"postgresql_cluster": "postgresql_cluster", # pg集群-内网
"postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
"doris": "doris", # doris集群-内网
"doris_adv": "doris_adv",
}
DEFAULT_SERVERS = [
# "http://192.168.200.210:7777", # 内网
# "http://192.168.10.217:7777", # 内网-h7
"http://61.145.136.61:7777", # 外网
"http://61.145.136.61:7779", # 外网
# "http://61.145.136.61:7777", # 外网
# "http://61.145.136.61:7779", # 外网
"http://61.145.136.61:7780"
]
# ---------------------------
......@@ -44,8 +317,8 @@ def df_to_json_records(df: pd.DataFrame) -> list:
if len(num_cols):
df_clean[num_cols] = df_clean[num_cols].replace([np.inf, -np.inf], np.nan)
# 2️⃣ 替换 NaN -> None(注意:有时 astype(object) 不彻底,需用 applymap
df_clean = df_clean.applymap(lambda x: None if pd.isna(x) else x)
# 2️⃣ 替换 NaN -> None(兼容 pandas 1.x 和 2.x
df_clean = df_clean.where(pd.notna(df_clean), None)
# 3️⃣ 转为 dict records
return df_clean.to_dict("records")
......@@ -79,7 +352,7 @@ def clean_json_field_for_orjson(v):
def fully_clean_for_orjson(df: pd.DataFrame) -> pd.DataFrame:
"""全面清洗 DataFrame 以符合 orjson 要求"""
df = df.replace([np.inf, -np.inf], np.nan)
df = df.applymap(lambda x: None if pd.isna(x) else x)
df = df.where(pd.notna(df), None)
# 找出所有可能为 JSON 字符串的字段
json_like_cols = [col for col in df.columns if col.endswith('_json')]
......@@ -121,9 +394,9 @@ class RemoteTransaction:
try:
self.session.post(
url + "/transaction",
json={"db": self.db,
json={"db_type": self.db,
"sql_list": self.sql_queue,
"site_name": self.database}, # site_name not needed on server, kept for clarity
"database": self.database}, # site_name not needed on server, kept for clarity
timeout=3000,
).raise_for_status()
return
......@@ -134,7 +407,13 @@ class RemoteTransaction:
class RemoteEngine:
def __init__(self, db: str, database: str,
server_urls: List[str], retries: int = 2):
server_urls: List[str], retries: int = 20):
"""
:param db: db_type--数据库类型
:param database: 数据库名称
:param server_urls:
:param retries:
"""
self.db = db
self.database = database
self.urls = [u.rstrip("/") for u in server_urls]
......@@ -149,7 +428,7 @@ class RemoteEngine:
r = self.session.post(f"{url}/{endpoint}",
data=json_bytes,
headers={"Content-Type": "application/json"},
timeout=3000)
timeout=3600)
# r = self.session.post(f"{url}/{endpoint}",
# json=payload, timeout=10)
......@@ -159,47 +438,24 @@ class RemoteEngine:
print(f"[WARN] {endpoint} fail @ {url}: {e}")
time.sleep(1)
raise RuntimeError(f"All servers failed for {endpoint}")
# def _request(self, endpoint: str, payload):
# # 用 orjson,“allow_nan” 会把 NaN/Inf 写成 null
# # json_bytes = orjson.dumps(payload,
# # option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_OMIT_MICROSECOND | orjson.OPT_ALLOW_INF_AND_NAN)
# json_bytes = orjson.dumps(
# payload,
# option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_ALLOW_INF_AND_NAN
# )
#
# headers = {"Content-Type": "application/json"}
#
# for url in self.urls:
# for _ in range(self.retries):
# try:
# r = self.session.post(f"{url}/{endpoint}",
# data=json_bytes, headers=headers,
# timeout=15)
# r.raise_for_status()
# return r.json()
# except Exception as e:
# print(f"[WARN] {endpoint} fail @ {url}: {e}")
# time.sleep(1)
# raise RuntimeError(f"All servers failed for {endpoint}")
# ---------- 公共 API ----------
def read_sql(self, sql: str) -> pd.DataFrame:
data = self._request("query",
{"db": self.db,
{"db_type": self.db,
"sql": sql,
"site_name": self.database})
"database": self.database})
return pd.DataFrame(data["result"])
def to_sql(self, df: pd.DataFrame, table: str, if_exists="append"):
    """Insert a DataFrame into a remote table through the DB service.

    :param df: rows to write; NaN/Inf and empty JSON fields are cleaned
        first so the payload serializes under orjson.
    :param table: destination table name.
    :param if_exists: pandas-style behavior flag, default "append".
    :return: decoded JSON response from the server.
    """
    records = fully_clean_for_orjson(df=df).to_dict("records")
    # New server contract keys: "db_type"/"database".
    return self._request("insert",
                         {"db_type": self.db,
                          "table": table,
                          "if_exists": if_exists,
                          "data": records,
                          "database": self.database})
def read_then_update(
self,
......@@ -214,8 +470,8 @@ class RemoteEngine:
返回 (DataFrame, rows_updated)
"""
payload = {
"db": self.db,
"site_name": self.database,
"db_type": self.db,
"database": self.database,
"select_sql": select_sql,
"update_table": update_table,
"set_values": set_values,
......@@ -232,15 +488,69 @@ class RemoteEngine:
self.session, self.urls)
# ---------------------------------
# db -- 数据库类型
# database -- 站点
def get_remote_engine(site_name: str, db_type: str, database: str = None,
                      servers: List[str] = None) -> RemoteEngine:
    """Build a RemoteEngine for the given site and database type.

    :param site_name: site key ("us", "uk", ...); used to derive the default
        database name when ``database`` is not given.
    :param db_type: logical database type; must exist in db_type_alias_map.
    :param database: explicit database name; when falsy it is resolved from
        site_name via site_name_db_dict.
    :param servers: override server URL list; defaults to DEFAULT_SERVERS.
    :return: configured RemoteEngine connection object.
    :raises ValueError: on unknown site_name or db_type.
    """
    # Validate BEFORE the dict lookup: otherwise an unknown site_name raises
    # an opaque KeyError instead of the intended ValueError.
    if site_name not in site_name_db_dict:
        raise ValueError(f"Unknown site_name: {site_name}")
    if db_type not in db_type_alias_map:
        raise ValueError(f"Unknown db_type: {db_type}")
    if not database:
        database = site_name_db_dict[site_name]
    print(f"db_type: {db_type_alias_map[db_type]}, database: {database}")
    return RemoteEngine(
        db=db_type_alias_map[db_type],
        database=database,
        server_urls=servers or DEFAULT_SERVERS,
    )
def stream_load_df(df: pd.DataFrame, database: str='advertising_manager', table: str='', server_url: str = None):
    """Stream-load a DataFrame into Doris through the remote DB service.

    :param df: rows to load; NaN/Inf and empty JSON fields are cleaned
        first so the payload serializes under orjson.
    :param database: target Doris database (defaults to the advertising
        library).
    :param table: target table name.
    :param server_url: override service URL; falls back to the first
        DEFAULT_SERVERS entry.
    :return: decoded JSON response from the server.
    :raises requests.HTTPError: when the service returns a non-2xx status.
    """
    server = (server_url or DEFAULT_SERVERS[0]).rstrip("/")
    payload = {
        "database": database,
        "table": table,
        "data": fully_clean_for_orjson(df).to_dict("records"),
    }
    # orjson serializes the (potentially large) records payload faster
    # than the stdlib json encoder.
    json_bytes = orjson.dumps(payload)
    r = requests.post(
        f"{server}/stream_load_df",
        data=json_bytes,
        headers={"Content-Type": "application/json"},
        timeout=3000,  # generous: stream loads can carry large batches
    )
    r.raise_for_status()
    return r.json()
if __name__ == '__main__':
    # Ad-hoc smoke test: query the Doris advertising library through the
    # remote DB service and dump the result.
    engine_mysql = get_remote_engine(
        site_name='us',  # site key (would map to database "selection" by default)
        db_type="doris_adv",  # server-side alias for the Doris advertising cluster
        database="advertising_manager",  # explicit DB name overrides the site mapping
    )
    # Alternative queries kept for manual debugging:
    # e = engine_mysql.read_sql('SELECT max(created_time),MIN(created_time) FROM us_sp_search_term_rank WHERE time_batch="2026-03-09-15"')
    # e = engine_mysql.read_sql('SELECT * FROM us_sp_search_term_rank WHERE time_batch="2026-03-09-12" and page_row=68')
    # e = engine_mysql.read_sql('SELECT * FROM us_sp_search_term_rank WHERE id=50564433')
    e = engine_mysql.read_sql('SELECT count(id),time_batch FROM us_sp_search_term_rank GROUP BY time_batch ORDER BY time_batch ASC')
    # e = engine_mysql.read_sql('SELECT count(id),time_batch FROM us_sp_search_term_rank GROUP BY time_batch ORDER BY time_batch ASC')
    # e = engine_mysql.read_sql('SELECT count(id) FROM sp_keyword_position_keyword')
    # e = engine_mysql.read_sql('select * from us_sp_search_term_syn limit 100')
    # sample term for spot checks: "dog bandanas spring" / batch 2026-04-03-11
    # e = engine_mysql.read_sql('select * from us_sp_search_term_syn where search_term = "dog bandanas spring"')
    # e = engine_mysql.read_sql('select * from us_sp_search_term_rank where search_term="dog bandanas spring" and time_batch="2026-04-03-11" and cate_type="sp" and asin="B0GCN7CTCM"')
    # e = engine_mysql.read_sql('select * from us_sp_search_term_rank where time_batch="2026-03-17-16" and cate_type="sp" limit 100')
    # print(e)
    print(e.columns)
    print(e.values.tolist())
# NOTE(review): plaintext credentials committed to source — move these to an
# environment variable or secrets store and rotate the token.
DEFAULT_USER = "fangxingjun"
DEFAULT_USER_TOKEN = "fxj_token_123"
# NOTE(review): the pair below silently overrides the pair above (later
# assignment wins) — looks like a diff leftover; confirm only one pair is
# intended to remain.
DEFAULT_USER = "pengyanbing"
DEFAULT_USER_TOKEN = "8f3b9d2a4c7e58b1"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment