Commit 16efe939 by Peng

update: utils 模块多项优化与功能扩展

[asin_parse.py]
- ParseAsinUs 新增 response_s 参数,支持外部传入 etree 解析树,避免重复解析
- 新增搜索框分类(search_category)字段解析
- 分类路径新增 all_nodeid(全路径节点 ID 拼接)及空值保护
- initialSeenAsins 解析新增空列表保护,防止 IndexError
- AI 评论按钮解析改用 data-testid 定位,过滤无效短文本
- 修复 es 站品牌解析变量名冲突 bug
- 修复评论数去括号逻辑及德站 Sternebewertung 判断错误
- 修复日期解析变量名 data_time 误用 bug
- 移除优惠券动态 XPATH 追加逻辑,防止重复追加

[db_connect.py]
- BaseUtils.__init__ 新增 site_name 参数(默认 us),不再硬编码
- 新增 doris_connect_adv() 连接 Doris 广告库(advertising_manager)
- 新增 doris_adv_direct_connect() pymysql 直连 Doris 广告库
- 清理旧版 SQLAlchemy 注释代码及调试 print 语句
- Kafka 超时 30000→40000ms,linger_ms 150→350,重试 10→5 次,retry_backoff_ms 500→600

[requests_param.py]
- 新增 next_page_lock 线程锁和 headers_num_int_s 属性,支持多线程分页
- 修复中文检测 check_str 为 None 时报错的 bug
- cookie 加载上限 300→350 条
- 修正请求头 accept-Encodin 拼写错误,移除 authority 字段
- 请求超时 10→30 秒,提升慢速页面成功率

[params_asin_xpath.py]
- 全站点 review_ai_list / review_button_list 新增 data-testid XPath,兼容新版 AI 评论结构
- 全站点新增 fbm_delivery_price 字段,采集 FBM 配送运费
- 全站点新增 search_category 字段,采集搜索框当前分类
- US/UK/DE/FR 站点 td_0_text 新增多条 XPath,适配 Amazon 最新页面结构

[check_columns.py]
- 取消 __main__ 注释,支持脚本直接运行

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
parent 4c8ae1db
...@@ -12,10 +12,10 @@ import json ...@@ -12,10 +12,10 @@ import json
class ParseAsinUs(object): class ParseAsinUs(object):
def __init__(self, resp=None, asin=None, week=None, date_info=None, data_type_asin=1, site_name=None, month=None): def __init__(self, resp=None, asin=None, week=None, date_info=None, data_type_asin=1, site_name=None, month=None, response_s=None):
print(f'请求 {site_name} 站点数据') print(f'请求 {site_name} 站点数据')
self.page_source = resp self.page_source = resp
self.response_s = etree.HTML(resp) self.response_s = response_s if response_s is not None else etree.HTML(resp)
self.asin = asin self.asin = asin
self.all_img_video_list = [] self.all_img_video_list = []
self.asin_variation_list = [] self.asin_variation_list = []
...@@ -138,6 +138,8 @@ class ParseAsinUs(object): ...@@ -138,6 +138,8 @@ class ParseAsinUs(object):
initial_seen_asins_str = initialSeenAsins_list[0] + ']' initial_seen_asins_str = initialSeenAsins_list[0] + ']'
else: else:
initialSeenAsins_list = re.findall(r'initialSeenAsins\":(.*)],\"set', resp) initialSeenAsins_list = re.findall(r'initialSeenAsins\":(.*)],\"set', resp)
if not initialSeenAsins_list:
return None
initial_seen_asins_str = initialSeenAsins_list[0] + ']' initial_seen_asins_str = initialSeenAsins_list[0] + ']'
pattern = re.compile(r'B\w+') pattern = re.compile(r'B\w+')
initialSeenAsins = pattern.findall(initial_seen_asins_str) initialSeenAsins = pattern.findall(initial_seen_asins_str)
...@@ -636,21 +638,44 @@ class ParseAsinUs(object): ...@@ -636,21 +638,44 @@ class ParseAsinUs(object):
cleaned_dict = {self.clean_string(key): self.clean_string(value) for key, value in productdetail.items()} cleaned_dict = {self.clean_string(key): self.clean_string(value) for key, value in productdetail.items()}
prcdt_dict = {key: value for key, value in cleaned_dict.items() if value != ''} prcdt_dict = {key: value for key, value in cleaned_dict.items() if value != ''}
productdetail_json = json.dumps(prcdt_dict, ensure_ascii=False) productdetail_json = json.dumps(prcdt_dict, ensure_ascii=False)
# 搜索框分类
for i in ASIN_XPATH['search_category']:
search_category_list = self.response_s.xpath(i)
if search_category_list:
search_category = search_category_list[0].strip()
break
else:
search_category = None
# 分类 # 分类
els_category_href_list = []
for i in ASIN_XPATH['category_href']: for i in ASIN_XPATH['category_href']:
els_category_href_list = self.response_s.xpath(i) els_category_href_list = self.response_s.xpath(i)
if els_category_href_list:
break
for i in ASIN_XPATH['category_data']: for i in ASIN_XPATH['category_data']:
els_category = self.response_s.xpath(i) els_category = self.response_s.xpath(i)
if els_category: if els_category:
category_list = [] category_list = []
nodes_id_list = []
for e in els_category: for e in els_category:
category_list.append(e.strip()) category_list.append(e.strip())
category = ''.join(category_list) category = ''.join(category_list)
node_id = re.findall(r'node=(\d+)', els_category_href_list[-1])[0] if els_category_href_list:
node_id_list = re.findall(r'node=(\d+)', els_category_href_list[-1])
node_id = node_id_list[0] if node_id_list else None
for i in els_category_href_list:
_id_list = re.findall(r'node=(\d+)', i)
if _id_list:
nodes_id_list.append(_id_list[0])
all_nodeid = '›'.join(nodes_id_list) if nodes_id_list else None
else:
node_id = None
all_nodeid = None
break break
else: else:
category = None category = None
node_id = None node_id = None
all_nodeid = None
# 解析标题 # 解析标题
for i in ASIN_XPATH['title']: for i in ASIN_XPATH['title']:
title_ = self.response_s.xpath(i) title_ = self.response_s.xpath(i)
...@@ -805,8 +830,9 @@ class ParseAsinUs(object): ...@@ -805,8 +830,9 @@ class ParseAsinUs(object):
if Brand_list_: if Brand_list_:
break break
elif self.site_name == 'es': elif self.site_name == 'es':
Brand_list = re.findall(r'de (.*)', Brand_list[0]) Brand_list_es = re.findall(r'de (.*)', Brand_list[0])
if Brand_list: if Brand_list_es:
Brand_list = Brand_list_es
break break
else: else:
Brand_list_ = re.findall(r'Marca:(.*)', Brand_list[0]) Brand_list_ = re.findall(r'Marca:(.*)', Brand_list[0])
...@@ -867,14 +893,15 @@ class ParseAsinUs(object): ...@@ -867,14 +893,15 @@ class ParseAsinUs(object):
for i in ASIN_XPATH['reviews']: for i in ASIN_XPATH['reviews']:
number_of_reviews = self.response_s.xpath(i) number_of_reviews = self.response_s.xpath(i)
if number_of_reviews: if number_of_reviews:
number_of_reviews = number_of_reviews[0].strip().replace('\xa0', '') number_of_reviews = number_of_reviews[0].strip().replace('\xa0', '').replace(')', '').replace('(', '')
print('number_of_reviews::',number_of_reviews)
if self.site_name == 'de': if self.site_name == 'de':
if "Sternebewertungen" in number_of_reviews: if "Sternebewertungen" in number_of_reviews:
total_comments = re.findall(r"(.*) Sternebewertungen", number_of_reviews)[0] total_comments = re.findall(r"(.*) Sternebewertungen", number_of_reviews)[0]
elif "Sternebewertung" in number_of_reviews[0]: elif "Sternebewertung" in number_of_reviews:
total_comments = re.findall(r"(.*) Sternebewertung", number_of_reviews)[0] total_comments = re.findall(r"(.*) Sternebewertung", number_of_reviews)[0]
else: else:
total_comments = None total_comments = number_of_reviews
break break
elif self.site_name == 'fr': elif self.site_name == 'fr':
if "évaluations" in number_of_reviews: if "évaluations" in number_of_reviews:
...@@ -978,11 +1005,15 @@ class ParseAsinUs(object): ...@@ -978,11 +1005,15 @@ class ParseAsinUs(object):
for button in button_list: for button in button_list:
try: try:
button_text = button.xpath('./text()')[0] button_text = button.xpath('./text()')[0]
i = button_list.index(button) print('button_text::',button_text)
span_text = self.response_s.xpath(f"//div[@id='aspect-bottom-sheet-0-{i}']//span/text()") span_text = self.response_s.xpath(f"//div[@data-testid='bottomsheet-content-{button_text.lower()}']//span/text()")
p_text = self.response_s.xpath(f"//div[@id='aspect-bottom-sheet-0-{i}']//p/text()") span_text_list = []
_text = '&&&&'.join(span_text) + '|-|' + '&&&&'.join(p_text) for data in span_text:
if len(data) > 5:
span_text_list.append(data)
_text = '|-|'.join(span_text_list).replace('"','').replace('Read more','').replace('‘','').replace('’','')
button_dict[button_text] = _text button_dict[button_text] = _text
# 從2026年1月份開始 根據 |-| 來分割。26以前的先根據 |-| 分割 再根據 &&&& 分割
except: except:
pass pass
if len(button_dict) < 1: if len(button_dict) < 1:
...@@ -1035,6 +1066,7 @@ class ParseAsinUs(object): ...@@ -1035,6 +1066,7 @@ class ParseAsinUs(object):
elif len(price.strip()) < 1: elif len(price.strip()) < 1:
price = self.get_price() price = self.get_price()
if self.site_name == 'us': if self.site_name == 'us':
# 判断是否有 Coupon 促销类型 # 判断是否有 Coupon 促销类型
deal_type = [] deal_type = []
...@@ -1297,7 +1329,6 @@ class ParseAsinUs(object): ...@@ -1297,7 +1329,6 @@ class ParseAsinUs(object):
coupon_int = None coupon_int = None
deal_type = [] deal_type = []
coupon_trne = '' coupon_trne = ''
ASIN_XPATH['coupon'].append(f"//div[@data-csa-c-asin='{self.asin}']//label/text()")
for i in ASIN_XPATH['coupon']: for i in ASIN_XPATH['coupon']:
Voucher_list = self.response_s.xpath(i) Voucher_list = self.response_s.xpath(i)
if Voucher_list: if Voucher_list:
...@@ -2213,10 +2244,10 @@ class ParseAsinUs(object): ...@@ -2213,10 +2244,10 @@ class ParseAsinUs(object):
data = re.findall(r' \((.*)', data)[0] data = re.findall(r' \((.*)', data)[0]
chars_to_remove = ['(', ')', '\u200e', ',', '.'] chars_to_remove = ['(', ')', '\u200e', ',', '.']
for char in chars_to_remove: for char in chars_to_remove:
data_time = data_time.replace(char, '') data = data.replace(char, '')
month_ = re.findall(r'[A-Za-z]', data_time) month_ = re.findall(r'[A-Za-z]', data)
month_str = ''.join(month_) month_str = ''.join(month_)
_month = data_time.replace(month_str, tiem_dict.get(month_str)) _month = data.replace(month_str, tiem_dict.get(month_str))
amazon_launch_time = _month amazon_launch_time = _month
try: try:
year_moth_day = datetime.datetime.strptime(_month, '%m %d %Y') year_moth_day = datetime.datetime.strptime(_month, '%m %d %Y')
...@@ -2292,10 +2323,10 @@ class ParseAsinUs(object): ...@@ -2292,10 +2323,10 @@ class ParseAsinUs(object):
data = re.findall(r' \((.*)', data)[0] data = re.findall(r' \((.*)', data)[0]
chars_to_remove = ['(', ')', '\u200e', ',', '.'] chars_to_remove = ['(', ')', '\u200e', ',', '.']
for char in chars_to_remove: for char in chars_to_remove:
data_time = data_time.replace(char, '') data = data.replace(char, '')
month_ = re.findall(r'[A-Za-z]', data_time) month_ = re.findall(r'[A-Za-z]', data)
month_str = ''.join(month_) month_str = ''.join(month_)
_month = data_time.replace(month_str, tiem_dict.get(month_str)) _month = data.replace(month_str, tiem_dict.get(month_str))
amazon_launch_time = _month amazon_launch_time = _month
try: try:
year_moth_day = datetime.datetime.strptime(_month, '%m %d %Y') year_moth_day = datetime.datetime.strptime(_month, '%m %d %Y')
...@@ -2589,16 +2620,21 @@ class ParseAsinUs(object): ...@@ -2589,16 +2620,21 @@ class ParseAsinUs(object):
buy_box_seller_type = None buy_box_seller_type = None
for i in ASIN_XPATH['td_0_text']: for i in ASIN_XPATH['td_0_text']:
td_0_text = self.response_s.xpath(i) td_0_text = self.response_s.xpath(i)
td_0_text = [t for t in td_0_text if t.strip()] # 过滤纯空白
if td_0_text: if td_0_text:
break break
else: else:
td_0_text = [] td_0_text = []
for i in ASIN_XPATH['td_1_text']: for i in ASIN_XPATH['td_1_text']:
td_1_text = self.response_s.xpath(i) td_1_text = self.response_s.xpath(i)
td_1_text = [t for t in td_1_text if t.strip()] # 过滤纯空白
if td_1_text: if td_1_text:
break break
else: else:
td_1_text = [] td_1_text = []
print('td_1_text::',td_1_text, 'td_0_text:56565634:',td_0_text)
if td_1_text and td_0_text: if td_1_text and td_0_text:
if self.site_name in ['us', 'uk', 'es', 'fr']: if self.site_name in ['us', 'uk', 'es', 'fr']:
buy_box_seller_type = self.re_buy_sller(td_1_text, td_0_text) buy_box_seller_type = self.re_buy_sller(td_1_text, td_0_text)
...@@ -2654,6 +2690,33 @@ class ParseAsinUs(object): ...@@ -2654,6 +2690,33 @@ class ParseAsinUs(object):
break break
else: else:
buy_box_seller_type = 4 buy_box_seller_type = 4
# FBM运费价格筛选 fbm_delivery_price
for i in ASIN_XPATH['fbm_delivery_price']:
delivery_pric_list = self.response_s.xpath(i)
# FREE → 0
try:
if delivery_pric_list:
p = delivery_pric_list[0] if delivery_pric_list[
0] != 'FREE' and buy_box_seller_type == 3 else None
# 去掉货币符号
if p:
for sym in ['£', '€', '$']:
p = p.replace(sym, '')
# 把 , 转成 .
p = p.replace(' ', '').replace(',', '.')
# 判断是否有小数点
if '.' in p:
delivery_pric = round(float(p), 2) # 保留两位小数
else:
delivery_pric = int(p) # 纯整数
else:
delivery_pric = p
else:
delivery_pric = None
except:
print('获取FBM运费价格筛选 错误::', self.asin)
delivery_pric = None
# 变体 # 当前亚马逊的真实asin。和请求asin对比是否一样 # 变体 # 当前亚马逊的真实asin。和请求asin对比是否一样
current_asin = None current_asin = None
current_Asin_list = re.findall(r'currentAsin(.*?),', self.page_source) current_Asin_list = re.findall(r'currentAsin(.*?),', self.page_source)
...@@ -2725,13 +2788,6 @@ class ParseAsinUs(object): ...@@ -2725,13 +2788,6 @@ class ParseAsinUs(object):
except: except:
pass pass
# 月销具体数值。如果有值拼接一起
# buy_sales_num_list
# 月销具体数值。如果有值拼接一起
# buy_sales_num_list
for i in ASIN_XPATH['buy_sales_num_list']: for i in ASIN_XPATH['buy_sales_num_list']:
buySales_num_list = self.response_s.xpath(i) buySales_num_list = self.response_s.xpath(i)
if buySales_num_list: if buySales_num_list:
...@@ -2777,16 +2833,6 @@ class ParseAsinUs(object): ...@@ -2777,16 +2833,6 @@ class ParseAsinUs(object):
if len(buySales) > 50: if len(buySales) > 50:
buySales = None buySales = None
asin_buySales_list = []
if asin_not_Sales and buySales is None:
asin_buy = self.asin
asin_buySales = asin_not_Sales
else:
asin_buy = None
asin_buySales = None
if asin_buy and asin_buySales:
asin_buySales_list.append([asin_buy, asin_buySales, self.date_info])
# 跟卖 # 跟卖
for i in ASIN_XPATH['box_follow_list']: for i in ASIN_XPATH['box_follow_list']:
buyBox_num_list = self.response_s.xpath(i) buyBox_num_list = self.response_s.xpath(i)
...@@ -2840,7 +2886,7 @@ class ParseAsinUs(object): ...@@ -2840,7 +2886,7 @@ class ParseAsinUs(object):
rating = round(float(rating), 2) rating = round(float(rating), 2)
except: except:
rating = 0 rating = 0
print('total_comments::',total_comments)
if total_comments: if total_comments:
try: try:
total_comments = total_comments.replace(',', '').replace('.', '') total_comments = total_comments.replace(',', '').replace('.', '')
...@@ -2891,23 +2937,17 @@ class ParseAsinUs(object): ...@@ -2891,23 +2937,17 @@ class ParseAsinUs(object):
if buyBox_url is not None and seller_id is not None and td_1_text: if buyBox_url is not None and seller_id is not None and td_1_text:
buyBox_name = td_1_text[0] buyBox_name = td_1_text[0]
if 'Amazon.com' not in td_1_text[0]: if 'Amazon.com' not in td_1_text[0]:
lock = Lock()
lock.acquire()
account_name = buyBox_name.replace("%", "%%") account_name = buyBox_name.replace("%", "%%")
account_name = account_name.strip() account_name = account_name.strip()
account_url = f'{self.site_url}/s?me={seller_id}' account_url = f'{self.site_url}/s?me={seller_id}'
self.buyBox_list.append([seller_id, account_name, buyBox_url]) self.buyBox_list.append([seller_id, account_name, buyBox_url])
lock.release()
else: else:
buyBox_name = None buyBox_name = None
if buyBox_name is not None and seller_id is not None: if buyBox_name is not None and seller_id is not None:
lock = Lock()
lock.acquire()
account_name = buyBox_name.replace("%", "%%") account_name = buyBox_name.replace("%", "%%")
account_name = account_name.strip() account_name = account_name.strip()
account_url = f'{self.site_url}/s?me={seller_id}' account_url = f'{self.site_url}/s?me={seller_id}'
self.buyBoxname_asin_list.append([account_name, self.asin, seller_id]) self.buyBoxname_asin_list.append([account_name, self.asin, seller_id])
lock.release()
if launch_time: if launch_time:
launch_time = launch_time.replace('00:00:00', '').strip() launch_time = launch_time.replace('00:00:00', '').strip()
if td_0_text: if td_0_text:
...@@ -2931,7 +2971,7 @@ class ParseAsinUs(object): ...@@ -2931,7 +2971,7 @@ class ParseAsinUs(object):
} }
cleaned_data = {k: (v.strip() if isinstance(v, str) and v.strip() else None) for k, v in seller_dict.items()} cleaned_data = {k: (v.strip() if isinstance(v, str) and v.strip() else None) for k, v in seller_dict.items()}
ship_sold_json = self.get_ship_sold_json()
if all(value is None for value in cleaned_data.values()): if all(value is None for value in cleaned_data.values()):
seller_json = None seller_json = None
else: else:
...@@ -2974,7 +3014,8 @@ class ParseAsinUs(object): ...@@ -2974,7 +3014,8 @@ class ParseAsinUs(object):
'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json, 'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json,
'video_m3u8': video_m3u8, 'result_list_json': result_list_json, 'video_m3u8': video_m3u8, 'result_list_json': result_list_json,
'bundle_asin_component_json': bundle_asin_component_json, 'bundle_asin_component_json': bundle_asin_component_json,
"review_json_list": review_json, 'asin_buySales_list': asin_buySales_list} "review_json_list": review_json,'fbm_delivery_price':delivery_pric,'all_nodeid':all_nodeid,'search_category':search_category,
'ship_sold_json':ship_sold_json}
if self.site_name == 'us': if self.site_name == 'us':
item['three_four_val'] = Join_Prime_int item['three_four_val'] = Join_Prime_int
elif self.site_name in ['uk', 'fr', 'it', 'es']: elif self.site_name in ['uk', 'fr', 'it', 'es']:
...@@ -2986,5 +3027,77 @@ class ParseAsinUs(object): ...@@ -2986,5 +3027,77 @@ class ParseAsinUs(object):
return item return item
def get_ship_sold_json(self):
    """Parse the "Ships from / Sold by" expander panel into JSON.

    Scrapes the a-expander panel text, strips script/CSS noise, splits the
    flattened text on a fixed set of known field labels (US/UK English and
    DE German), and groups consecutive label/value pairs into per-offer
    blocks. Returns a JSON array string of ``{label: value}`` dicts, or
    ``None`` when no labels were found.

    Relies on ``self.response_s`` being an lxml HTML tree (set in __init__).
    """
    # Field labels that may appear in the panel, per site language.
    KNOWN_LABELS = [
        # US / UK
        'Shipper / Seller', 'Ships from', 'Sold by',
        'Returns', 'Payment', 'Packaging', 'Subscription',
        # DE
        'Versender / Verkäufer', 'Gesetzliche Gewährleistung',
        'Garantie und Reparaturen', 'Rückgaben', 'Zahlung',
    ]
    # Labels that mark the beginning of a new offer block; seeing one of
    # these a second time means the previous block is finished.
    BLOCK_STARTERS = {
        'Shipper / Seller', 'Ships from', 'Versender / Verkäufer',
    }
    # Marketing/boilerplate phrases used to truncate an over-long value.
    NOISE_PHRASES = [
        'Your transaction is secure', 'We work hard', 'Learn more', 'Read full', 'Read more',
        'Deine Transaktion ist sicher', 'Mehr erfahren',
    ]
    def is_noise(text):
        # True for fragments that look like inline JS/CSS rather than copy
        # (braces, `function`/`var`, window.*, style props, logTech markers).
        return bool(re.search(
            r'[{};]|function\s*\w*\s*[\w(]|var\s+\w+|window\.|\.po-|word-break|padding:|logTech',
            text
        ))
    def extract_first_value(raw):
        # Reduce the raw text following a label to a single clean value.
        raw = re.sub(r'\s+', ' ', raw).strip()
        if not raw:
            return ''
        words = raw.split()
        # Amazon often renders the value twice back-to-back ("Amazon Amazon");
        # detect a prefix that immediately repeats and keep only one copy
        # (prefix length capped at 15 words).
        for l in range(1, min(len(words) // 2 + 1, 15)):
            candidate = ' '.join(words[:l])
            if raw[len(candidate):].lstrip().startswith(candidate):
                return candidate
        # Otherwise cut the value at the first known boilerplate phrase.
        for noise in NOISE_PHRASES:
            idx = raw.find(noise)
            if 0 < idx < 500:
                return raw[:idx].strip()
        # Fallback: hard cap to avoid storing huge junk strings.
        return raw[:300].strip()
    def is_complete_block(block):
        # A block is kept only if it has both a shipper-type field and a
        # returns-type field (EN or DE variants).
        has_shipper = any(k in block for k in ['Ships from', 'Shipper / Seller', 'Versender / Verkäufer'])
        has_returns = any(k in block for k in ['Returns', 'Rückgaben'])
        return has_shipper and has_returns
    # All text nodes of the collapsible "Ships from / Sold by" panel.
    text_list = self.response_s.xpath(
        "//div[@class='a-expander-content a-expander-partial-collapse-content']//text()"
    )
    cleaned = [t.strip() for t in text_list if t.strip() and not is_noise(t.strip())]
    full = re.sub(r'\s+', ' ', ' '.join(cleaned)).strip()
    # Longest labels first so e.g. 'Shipper / Seller' wins over 'Sold by'
    # substrings; capturing group keeps the labels in the split result.
    label_re = '|'.join(re.escape(l) for l in sorted(KNOWN_LABELS, key=len, reverse=True))
    parts = re.split(f'({label_re})', full)
    blocks, current = [], {}
    # parts alternates [pre-text, label, value, label, value, ...];
    # start at index 1 (first label) and step by 2.
    i = 1
    while i < len(parts):
        label = parts[i]
        raw_val = parts[i + 1].strip() if i + 1 < len(parts) else ''
        value = extract_first_value(raw_val)
        if label in BLOCK_STARTERS and label in current:
            # A repeated starter label begins a new offer block; flush the
            # previous one only if it is complete.
            if is_complete_block(current):
                blocks.append(current)
            current = {}
        # First occurrence of a label within a block wins.
        if value and label not in current:
            current[label] = value
        i += 2
    if current:
        blocks.append(current)
    return json.dumps(blocks, ensure_ascii=False) if blocks else None
# if __name__ == '__main__': # if __name__ == '__main__':
# ParseAsinUs().xpath_html() # ParseAsinUs().xpath_html()
...@@ -301,7 +301,7 @@ class spider_check(BaseUtils): ...@@ -301,7 +301,7 @@ class spider_check(BaseUtils):
bytes(json.dumps(item), 'utf-8').decode('unicode_escape')) bytes(json.dumps(item), 'utf-8').decode('unicode_escape'))
# if __name__ == '__main__': if __name__ == '__main__':
# spider_check('us').send_ms() spider_check('us').send_ms()
# spider_check('de').send_ms() # spider_check('de').send_ms()
# spider_check('uk').send_ms() # spider_check('uk').send_ms()
...@@ -14,78 +14,43 @@ import redis ...@@ -14,78 +14,43 @@ import redis
from utils.secure_db_client import get_remote_engine from utils.secure_db_client import get_remote_engine
class BaseUtils(object): class BaseUtils(object):
def __init__(self): def __init__(self,site_name='us'):
self.site_name = 'us' self.site_name = site_name
self.engine = self.mysql_connect() self.engine = self.mysql_connect()
def pg_connect(self): def pg_connect(self):
# db_type_alias_map = {
# "mysql": "mysql", # 阿里云mysql
# "postgresql_14": "postgresql_14", # pg14爬虫库-内网
# "postgresql_14_outer": "postgresql_14_outer", # pg14爬虫库-外网
# "postgresql_15": "postgresql_15", # pg15正式库-内网
# "postgresql_15_outer": "postgresql_15_outer", # pg15正式库-外网
# "postgresql_cluster": "postgresql_cluster", # pg集群-内网
# "postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
# "doris": "doris", # doris集群-内网
# }
engine_pg = get_remote_engine( engine_pg = get_remote_engine(
site_name=self.site_name, # -> database "selection" site_name=self.site_name, # -> database "selection"
db_type="postgresql_14_outer", # -> 服务端 alias "mysql" db_type="postgresql_14_outer", # -> 服务端 alias "mysql"
) )
print('engine_pg::',engine_pg)
return engine_pg return engine_pg
# while True:
# try:
# if self.site_name == 'us':
# db = 'selection'
# else:
# db = f'selection_{self.site_name}'
# engine_pg = create_engine(
# f"postgresql+psycopg2://{PG_CONN_DICT['pg_user']}:{PG_CONN_DICT['pg_pwd']}@{PG_CONN_DICT['pg_host']}:{PG_CONN_DICT['pg_port']}/{db}",
# encoding='utf-8', connect_args={"connect_timeout": 10, "keepalives": 1,
# "keepalives_idle": 40, # 40s 空闲后开始发 心跳链接
# "keepalives_interval": 20, # 每 20s 发一次
# "keepalives_count": 10}, #在空闲 40 秒后,每 20 秒发一次探测,总共探测 10 次
# pool_recycle=900, # 太老的连接(15mi)强制回收,避免中间网络设备回收后无法用,池中连接存活 15 min 后丢弃
# pool_size=60, # 根据并发量适当设置
# max_overflow=40)
# return engine_pg
# except Exception as e:
# print("pg_connect 14 t11111111111111111111111:", e, f"\n{traceback.format_exc()}")
# time.sleep(3)
# continue
def doris_connect_adv(self):
    """Return a remote engine for the Doris advertising database.

    Uses the server-side alias ``doris_adv`` and explicitly targets the
    ``advertising_manager`` database.
    """
    return get_remote_engine(
        site_name=self.site_name,
        db_type="doris_adv",
        database="advertising_manager",
    )
def doris_adv_direct_connect(self):
    """Open a direct pymysql connection to the Doris advertising DB.

    Bypasses the remote-engine proxy for high-speed reads and updates.
    """
    # Lazy imports: callers that never use the direct path do not need
    # pymysql or the connection-params module loaded.
    import pymysql
    from amazon_params.params import DORIS_ADV_DIRECT_CONN
    return pymysql.connect(**DORIS_ADV_DIRECT_CONN)
def doris_connect(self): def doris_connect(self):
engine_doris = get_remote_engine( engine_doris = get_remote_engine(
site_name=self.site_name, # -> database "selection" site_name=self.site_name, # -> database "selection"
db_type="doris", # -> 服务端 alias "mysql" db_type="doris", # -> 服务端 alias "mysql"
) )
print('engine_pg::', engine_doris)
return engine_doris return engine_doris
# nums = 0
# while True:
# nums += 1
# try:
# db = 'us_spider'
# # 设置连接参数字典,包括连接超时参数
# connect_args = {
# "connect_timeout": 10
# }
# return create_engine(
# f'mysql+pymysql://{DORIS_CONN["mysql_user"]}:' + f'{DORIS_CONN["mysql_pwd"]}@{DORIS_CONN["mysql_host"]}:{DORIS_CONN["mysql_port"]}/{db}?charset=utf8mb4',
# connect_args=connect_args, poolclass=NullPool)
# except Exception as e:
# print("doris_connect22222222222222222222222222:", e, f"\n{traceback.format_exc()}")
# time.sleep(3)
# continue
def pg_connect_6(self): def pg_connect_6(self):
engine_pg15 = get_remote_engine( engine_pg15 = get_remote_engine(
site_name=self.site_name, # -> database "selection" site_name=self.site_name, # -> database "selection"
db_type="postgresql_15_outer", # -> 服务端 alias "mysql" db_type="postgresql_15_outer", # -> 服务端 alias "mysql"
) )
print('engine_pg15::', engine_pg15)
return engine_pg15 return engine_pg15
def pg_reconnect(self, table_name=None, e=None): def pg_reconnect(self, table_name=None, e=None):
...@@ -153,7 +118,6 @@ class BaseUtils(object): ...@@ -153,7 +118,6 @@ class BaseUtils(object):
time.sleep(5) time.sleep(5)
def kafuka_connect(self, kafka_html_connect=None, bootstrap_servers=None, acks=None, connections_max_idle_ms=60000): def kafuka_connect(self, kafka_html_connect=None, bootstrap_servers=None, acks=None, connections_max_idle_ms=60000):
request_timeout_ms = 30000
if kafka_html_connect: if kafka_html_connect:
bootstrap_servers = '61.145.136.61:20092' bootstrap_servers = '61.145.136.61:20092'
else: else:
...@@ -172,17 +136,17 @@ class BaseUtils(object): ...@@ -172,17 +136,17 @@ class BaseUtils(object):
sasl_plain_password='R8@xY3pL!qz', sasl_plain_password='R8@xY3pL!qz',
value_serializer=str.encode, value_serializer=str.encode,
max_request_size=10000120, max_request_size=10000120,
request_timeout_ms=request_timeout_ms, request_timeout_ms=40000,
max_block_ms=30000, # 阻塞超时时间设置为60秒 max_block_ms=40000, # 阻塞超时时间设置为60秒
compression_type='gzip', # 启用消息压缩 compression_type='gzip', # 启用消息压缩
acks=1 if acks else 0, # 根据需要设置 acks, # 等待所有副本确认接收 acks=1 if acks else 0, # 根据需要设置 acks, # 等待所有副本确认接收
connections_max_idle_ms=connections_max_idle_ms, # 一分钟释放链接 connections_max_idle_ms=connections_max_idle_ms, # 一分钟释放链接
max_in_flight_requests_per_connection=1000, max_in_flight_requests_per_connection=1000,
linger_ms=150, # 增加等待时间 linger_ms=350, # 增加等待时间
batch_size=16384 if acks else 0, # 增加批处理大小 batch_size=16384 if acks else 0, # 增加批处理大小
api_version=(2, 4, 1), # 我的kafka版本是2.4.1 api_version=(2, 4, 1), # 我的kafka版本是2.4.1
retries=10, # 自动重试 retries=5, # 自动重试
retry_backoff_ms=500 retry_backoff_ms=600
) )
return producer return producer
except Exception as e: except Exception as e:
......
...@@ -18,8 +18,9 @@ US_ASIN_XPATH = { ...@@ -18,8 +18,9 @@ US_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'], 'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'], 'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"], 'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'], 'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'review_button_list': ['//div[contains(@id,"product-insights")]//button/span','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]'], '//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button/span','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"], "category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"], "category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()', "title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
...@@ -130,10 +131,12 @@ US_ASIN_XPATH = { ...@@ -130,10 +131,12 @@ US_ASIN_XPATH = {
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()', '//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[@class="offer-display-feature-text a-spacing-none"]/span/text()', '//div[@class="offer-display-feature-text a-spacing-none"]/span/text()',
'//a[contains(text(),"Fulfilled by")]/text()', '//a[contains(text(),"Fulfilled by")]/text()',
'//span[contains(text(),"Shipper / Seller")]/..//following-sibling::div/text()',
'//span[contains(text(),"Ships from and sold by")]/../a/span/text()', '//span[contains(text(),"Ships from and sold by")]/../a/span/text()',
'//div[contains(@id,"FromSoldBy_")]//tr[1]//td/span[contains(@class,"-base")]/text()', '//div[contains(@id,"FromSoldBy_")]//tr[1]//td/span[contains(@class,"-base")]/text()',
'//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()', '//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()',
'//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span/text()' '//span[contains(text(),"Ships from")]/parent::div/parent::div/following-sibling::div//span[contains(text()," Sold by: ")]/following-sibling::span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
], ],
"td_1_text": ['//div[@tabular-attribute-name="Sold by"]//a/text()', '//div[contains(text(),"Sold by")]//a/text()', "td_1_text": ['//div[@tabular-attribute-name="Sold by"]//a/text()', '//div[contains(text(),"Sold by")]//a/text()',
'//div[@tabular-attribute-name="Sold by"]//span[@class="a-size-small"]/text()', '//div[@tabular-attribute-name="Sold by"]//span[@class="a-size-small"]/text()',
...@@ -166,7 +169,9 @@ US_ASIN_XPATH = { ...@@ -166,7 +169,9 @@ US_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[contains(@id,'lob-carousel')]//a/@href"], 'pba_lob_asin_list': ["//div[contains(@id,'lob-carousel')]//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'], 'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [ 'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"] "//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price':['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category' :["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
} }
DE_ASIN_XPATH = { DE_ASIN_XPATH = {
...@@ -187,8 +192,9 @@ DE_ASIN_XPATH = { ...@@ -187,8 +192,9 @@ DE_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'], 'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'], 'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"], 'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'], 'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]'], '//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"], "category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"], "category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()', "title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
...@@ -325,7 +331,9 @@ DE_ASIN_XPATH = { ...@@ -325,7 +331,9 @@ DE_ASIN_XPATH = {
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()', "td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()', '//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[@class="offer-display-feature-text a-spacing-none"]/span/text()', '//div[@class="offer-display-feature-text a-spacing-none"]/span/text()',
'//div[contains(@class,"offer-display-feature-text")]/span/text()'], '//div[contains(@class,"offer-display-feature-text")]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ["//div[@id='merchant-info']//a[2]/span/text()", '//div[@tabular-attribute-name="Sold by"]//a/text()', "td_1_text": ["//div[@id='merchant-info']//a[2]/span/text()", '//div[@tabular-attribute-name="Sold by"]//a/text()',
'//div[contains(text(),"Verkäufer")]//a/text()', '//div[contains(text(),"Verkäufer")]//a/text()',
'//div[@tabular-attribute-name="Verkäufer"]//span[@class="a-size-small"]/text()', '//div[@tabular-attribute-name="Verkäufer"]//span[@class="a-size-small"]/text()',
...@@ -351,8 +359,9 @@ DE_ASIN_XPATH = { ...@@ -351,8 +359,9 @@ DE_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"], 'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'], 'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [ 'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"] "//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
} }
UK_ASIN_XPATH = { UK_ASIN_XPATH = {
...@@ -373,8 +382,9 @@ UK_ASIN_XPATH = { ...@@ -373,8 +382,9 @@ UK_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'], 'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'], 'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"], 'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'], 'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]'], '//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//div[contains(@id,"product-insights")]//a[contains(@id,"aspect-button")]','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"], "category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"], "category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()', "title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
...@@ -488,7 +498,9 @@ UK_ASIN_XPATH = { ...@@ -488,7 +498,9 @@ UK_ASIN_XPATH = {
'//span[contains(text(),"Ships from and sold by")]/../a/span/text()', '//span[contains(text(),"Ships from and sold by")]/../a/span/text()',
'//div[contains(@id,"FromSoldBy_")]//tr[1]//td/span[contains(@class,"-base")]/text()', '//div[contains(@id,"FromSoldBy_")]//tr[1]//td/span[contains(@class,"-base")]/text()',
'//div[@data-csa-c-slot-id="odf-feature-text-desktop-fulfiller-info"]//span/text()', '//div[@data-csa-c-slot-id="odf-feature-text-desktop-fulfiller-info"]//span/text()',
'//span[contains(text(),"Dispatches from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()' '//span[contains(text(),"Dispatches from")]/parent::div/parent::div/following-sibling::div//span[@class="a-size-small offer-display-feature-text-message"]/text()',
'//span[contains(text(),"Dispatches from")]/parent::div/parent::div/following-sibling::div//span[contains(text()," Sold by: ")]/following-sibling::span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
], ],
"td_1_text": ['//div[@tabular-attribute-name="Sold by"]//a/text()', '//div[contains(text(),"Sold by")]//a/text()', "td_1_text": ['//div[@tabular-attribute-name="Sold by"]//a/text()', '//div[contains(text(),"Sold by")]//a/text()',
'//div[@tabular-attribute-name="Sold by"]//span[@class="a-size-small"]/text()', '//div[@tabular-attribute-name="Sold by"]//span[@class="a-size-small"]/text()',
...@@ -520,7 +532,9 @@ UK_ASIN_XPATH = { ...@@ -520,7 +532,9 @@ UK_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"], 'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'], 'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [ 'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"] "//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
} }
...@@ -542,8 +556,9 @@ FR_ASIN_XPATH = { ...@@ -542,8 +556,9 @@ FR_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'], 'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'], 'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"], 'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'], 'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'review_button_list': ['//div[contains(@id,"product-insights")]//button'], '//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"], "category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"], "category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()', "title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
...@@ -656,7 +671,9 @@ FR_ASIN_XPATH = { ...@@ -656,7 +671,9 @@ FR_ASIN_XPATH = {
'//span[@id="submit.add-to-cart-ubb-announce"]/text()', '//span[@id="submit.add-to-cart-ubb-announce"]/text()',
'//div[contains(@id,"availability")]/span/text()'], '//div[contains(@id,"availability")]/span/text()'],
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()', "td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'], '//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Vendu par"]//a/text()', "td_1_text": ['//div[@tabular-attribute-name="Vendu par"]//a/text()',
'//div[contains(text(),"Vendu par")]//a/text()', '//div[contains(text(),"Vendu par")]//a/text()',
'//div[@tabular-attribute-name="Vendu par"]//span[@class="a-size-small"]/text()', '//div[@tabular-attribute-name="Vendu par"]//span[@class="a-size-small"]/text()',
...@@ -676,7 +693,9 @@ FR_ASIN_XPATH = { ...@@ -676,7 +693,9 @@ FR_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"], 'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'], 'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [ 'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"] "//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
} }
...@@ -698,8 +717,9 @@ ES_ASIN_XPATH = { ...@@ -698,8 +717,9 @@ ES_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'], 'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'], 'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"], 'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'], 'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'review_button_list': ['//div[contains(@id,"product-insights")]//button'], '//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"], "category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"], "category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()', "title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
...@@ -824,7 +844,9 @@ ES_ASIN_XPATH = { ...@@ -824,7 +844,9 @@ ES_ASIN_XPATH = {
'//div[contains(@id,"availability")]/span/text()'], '//div[contains(@id,"availability")]/span/text()'],
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()', "td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'], '//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Vendido por"]//a/text()', "td_1_text": ['//div[@tabular-attribute-name="Vendido por"]//a/text()',
'//div[contains(text(),"Vendido por")]//a/text()', '//div[contains(text(),"Vendido por")]//a/text()',
'//div[@tabular-attribute-name="Vendido por"]//span[@class="a-size-small"]/text()', '//div[@tabular-attribute-name="Vendido por"]//span[@class="a-size-small"]/text()',
...@@ -844,7 +866,9 @@ ES_ASIN_XPATH = { ...@@ -844,7 +866,9 @@ ES_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[contains(@id,'lob-carousel')]//a/@href"], 'pba_lob_asin_list': ["//div[contains(@id,'lob-carousel')]//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'], 'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [ 'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"] "//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
} }
...@@ -866,8 +890,9 @@ IT_ASIN_XPATH = { ...@@ -866,8 +890,9 @@ IT_ASIN_XPATH = {
'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'], 'product_attribute_1': ['//div[contains(@class,"product-facts-detail")]/div'],
'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'], 'proddetails_list_1': ['//div[contains(@id,"detailBullets")]/ul/li/span/span'],
'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"], 'proddetails_list': ["//div[@id='prodDetails']//th","//div[@id='audibleProductDetails']//th"],
'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()'], 'review_ai_list': ['//div[contains(@id,"product-insights")]//div[@id="product-summary"]//span/text()',
'review_button_list': ['//div[contains(@id,"product-insights")]//button'], '//div[contains(@data-csa-c-item-id,"product-insights")]//div[@data-testid="overall-summary"]//span/text()'],
'review_button_list': ['//div[contains(@id,"product-insights")]//button','//span[@data-testid="aspect-label"]'],
"category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"], "category_href": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//@href"],
"category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"], "category_data": ["//ul[@class='a-unordered-list a-horizontal a-size-small']//li/span//text()"],
"title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()', "title": ['//*[@id="productTitle"]/text()', '//h1[@class="a-size-large"]/text()',
...@@ -987,7 +1012,9 @@ IT_ASIN_XPATH = { ...@@ -987,7 +1012,9 @@ IT_ASIN_XPATH = {
'//div[contains(@id,"availability")]/span/text()'], '//div[contains(@id,"availability")]/span/text()'],
"td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()', "td_0_text": ['//span[@id="tabular-buybox-truncate-1"]/span/span//text()',
'//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()'], '//div[@class="tabular-buybox-text"]/div[@class="tabular-buybox-text a-spacing-none"]/span/text()',
'//div[contains(@data-csa-c-slot-id,"odf-feature-label-desktop-merchant-info")]//span[contains(text()," / ")]/parent::div/parent::div/following-sibling::div//span[contains(@class,"offer-display-feature-text-message")]/text()'
],
"td_1_text": ['//div[@tabular-attribute-name="Venditore"]//a/text()', "td_1_text": ['//div[@tabular-attribute-name="Venditore"]//a/text()',
'//div[contains(text(),"Venditore")]//a/text()', '//div[contains(text(),"Venditore")]//a/text()',
'//div[@tabular-attribute-name="Venditore"]//span[@class="a-size-small"]/text()', '//div[@tabular-attribute-name="Venditore"]//span[@class="a-size-small"]/text()',
...@@ -1007,5 +1034,8 @@ IT_ASIN_XPATH = { ...@@ -1007,5 +1034,8 @@ IT_ASIN_XPATH = {
'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"], 'pba_lob_asin_list': ["//div[@id='pba-lob-carousel-row']//a/@href"],
'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'], 'compare_similar_asin_list': ['//div[contains(@id,"imageContainer-")]/@id'],
'customer_reviews_list': [ 'customer_reviews_list': [
"//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"] "//div[@data-csa-c-content-id='customerReviewsAttribute']//div[@class='a-row a-spacing-none']//span/text()"],
'fbm_delivery_price': ['//span[@data-csa-c-content-id="DEXUnifiedCXPDM"]/@data-csa-c-delivery-price'],
'search_category': ["//select[@id='searchDropdownBox']/option[@current='parent']/text()"]
} }
...@@ -115,25 +115,26 @@ class ParseSearchTermUs(object): ...@@ -115,25 +115,26 @@ class ParseSearchTermUs(object):
def parse_asin_zr(self): def parse_asin_zr(self):
""" """
返回通过data-asin匹配到的所有asin中,排除了sb、sp的对应asin --> 剩余zrasin 通过 data-index + data-asin 且不含 AdHolder 的 div 获取ZR自然排名ASIN
""" """
asin_all = self.etree_html.xpath('//div[@data-asin]/@data-asin') # 保留原始全量列表给 parse_buy 用
asin_all_str = "-".join(asin_all).replace('/', '') asin_all_raw = self.etree_html.xpath('//div[@data-asin]/@data-asin')
asin_all = re.findall("(\w+)", asin_all_str) asin_all_str = "-".join(asin_all_raw).replace('/', '')
self.asin_all = asin_all # 保留原始列表给 parse_buy 用 self.asin_all = re.findall("(\w+)", asin_all_str)
# 用集合排除 SB/SP 的 ASIN,避免 list.remove() 只删第一个导致漏删 # ZR: 主搜索结果中没有 AdHolder 的项
# 先去重保持页面顺序,再排除已识别的 SB 和 SP zr_items = self.etree_html.xpath('//div[@data-index and @data-asin and not(contains(@class, "AdHolder"))]')
exclude_set = set(self.sb_list_all) | set(self.sp_list_all) zr_asin_list = []
asin_unique = list(dict.fromkeys(asin_all)) for item in zr_items:
zr_list = [a for a in asin_unique if a not in exclude_set] asin = item.get('data-asin', '').strip()
return zr_list if asin and len(asin) >= 9 and asin not in zr_asin_list:
zr_asin_list.append(asin)
return zr_asin_list
def parse_type_common(self, asin_list=None, cate_type=None): def parse_type_common(self, asin_list=None, cate_type=None):
""" """
asin_list: list asin_list: list
""" """
asin_list = list(dict.fromkeys(asin_list)) # 去重保序 asin_list = list(dict.fromkeys(asin_list)) # 去重保序,保持xpath返回的页面顺序
asin_list.sort(key=lambda a: self.asin_position_map.get(a, 9999)) # 按 data-index 页面位置排序,无 data-index 的排最后
asin_detail_all_list = [] asin_detail_all_list = []
cate_type_copy = 1 cate_type_copy = 1
asin_detail_dict = { asin_detail_dict = {
...@@ -319,6 +320,21 @@ class ParseSearchTermUs(object): ...@@ -319,6 +320,21 @@ class ParseSearchTermUs(object):
print(self.search_term,' 页数:',self.page,'广告asin:',asin) print(self.search_term,' 页数:',self.page,'广告asin:',asin)
if asin and len(asin) >= 9 and asin not in sp_asin_list: if asin and len(asin) >= 9 and asin not in sp_asin_list:
sp_asin_list.append(asin) sp_asin_list.append(asin)
# 标签下的SP广告位(如 Customers frequently viewed、Today's deals)
if self.site_name == 'de':
sp_label = 'Gesponsert'
elif self.site_name == 'us' or self.site_name == 'uk':
sp_label = 'Sponsored'
else:
sp_label = 'Sponsored'
tag_asin_list = self.etree_html.xpath(
f'//span[@class="a-declarative"]/span[contains(text(),"{sp_label}")]/../../../../../../../../div/following-sibling::span[2]//div/@data-asin|//span/a[contains(text(),"{sp_label}")]/../../../../../../../../div/following-sibling::span[2]//div/@data-asin')
if tag_asin_list:
for asin in tag_asin_list:
if asin and len(asin) >= 9 and asin not in sp_asin_list:
sp_asin_list.append(asin)
self.sp_list_all = sp_asin_list.copy() # 供 parse_asin_zr 排除用 self.sp_list_all = sp_asin_list.copy() # 供 parse_asin_zr 排除用
if sp_asin_list: if sp_asin_list:
sp_asin_list.sort(key=lambda a: self.asin_position_map.get(a, 9999)) sp_asin_list.sort(key=lambda a: self.asin_position_map.get(a, 9999))
...@@ -632,3 +648,4 @@ if __name__ == '__main__': ...@@ -632,3 +648,4 @@ if __name__ == '__main__':
print('sb_list:' ,sb_list) print('sb_list:' ,sb_list)
import hashlib import hashlib
# import requests
import json import json
import os import os
import random import random
...@@ -7,11 +6,11 @@ import re ...@@ -7,11 +6,11 @@ import re
import sys import sys
import time import time
import uuid import uuid
from urllib.parse import urlparse from threading import Lock
import urllib3 import urllib3
from lxml import etree from lxml import etree
# py -3.9 -m pip pyinstaller 指定pip 安装
# py -3.10 -m pip install -r E:\Git_new\spider\yswg-agent\requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录 sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from amazon_params.params import DB_REQUESTS_ASIN_PARAMS from amazon_params.params import DB_REQUESTS_ASIN_PARAMS
from utils.db_connect import BaseUtils from utils.db_connect import BaseUtils
...@@ -35,6 +34,8 @@ class Requests_param_val(BaseUtils): ...@@ -35,6 +34,8 @@ class Requests_param_val(BaseUtils):
print("站点名称:", self.site_name, '抓取项目', "代理ip:", self.proxy_name) print("站点名称:", self.site_name, '抓取项目', "代理ip:", self.proxy_name)
self.cookies_queue = Queue() # cookie队列 self.cookies_queue = Queue() # cookie队列
self.kafuka_producer_str = self.kafuka_connect() self.kafuka_producer_str = self.kafuka_connect()
self.next_page_lock = Lock()
self.headers_num_int_s = 0
def init_db_names(self): def init_db_names(self):
self.engine_pg = self.pg_connect() self.engine_pg = self.pg_connect()
...@@ -67,7 +68,7 @@ class Requests_param_val(BaseUtils): ...@@ -67,7 +68,7 @@ class Requests_param_val(BaseUtils):
""" """
判断获取文本是否有中文 判断获取文本是否有中文
""" """
if check_str != '无': if check_str and check_str != '无':
for c in check_str: for c in check_str:
if '\u4e00' <= c <= '\u9fa5': if '\u4e00' <= c <= '\u9fa5':
print('--是中文,说明该cookie有问题,或者改数据有问题--') print('--是中文,说明该cookie有问题,或者改数据有问题--')
...@@ -117,7 +118,7 @@ class Requests_param_val(BaseUtils): ...@@ -117,7 +118,7 @@ class Requests_param_val(BaseUtils):
if num: if num:
sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit {num};' sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit {num};'
else: else:
sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit 300;' sql_read = f'SELECT cookies,id FROM {self.db_cookies} limit 350;'
print("获取cookie:", sql_read) print("获取cookie:", sql_read)
df_read = self.engine.read_sql(sql_read) df_read = self.engine.read_sql(sql_read)
clientPriceList = list(df_read.cookies + "|-|" + df_read.id.astype("U")) clientPriceList = list(df_read.cookies + "|-|" + df_read.id.astype("U"))
...@@ -195,31 +196,26 @@ class Requests_param_val(BaseUtils): ...@@ -195,31 +196,26 @@ class Requests_param_val(BaseUtils):
# 组装请求头, # 组装请求头,
def requests_amazon_headers(self, host=None, site_url=None, asin=None, scraper_url=None): def requests_amazon_headers(self, host=None, site_url=None, asin=None, scraper_url=None):
n = random.randint(120, 142) n = random.randint(120, 142)
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 6900)}.{random.randint(1, 181)} Safari/537.36'
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36
ua = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{n}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36'
# ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
headers = { headers = {
'connection': 'close', 'connection': 'close',
'authority': urlparse(self.site_url).hostname,
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9', 'accept-language': 'zh-CN,zh;q=0.9',
'accept-Encodin': 'gzip, deflate, br, zstd', 'accept-encoding': 'gzip, deflate, br, zstd',
'cache-control': 'no-cache', 'cache-control': 'no-cache',
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8', 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'sec-ch-ua-mobile': '?0', 'sec-ch-ua-mobile': '?0',
'user-agent': ua, 'user-agent': ua,
"pragma": "no-cache", "pragma": "no-cache",
} }
if asin: if asin:
headers['origin'] = f'{site_url}dp/{asin}' headers['origin'] = f'{site_url}dp/{asin}'
headers['referer'] = f'{site_url}?th=1'
if scraper_url: if scraper_url:
headers['origin'] = scraper_url headers['origin'] = scraper_url
headers['referer'] = scraper_url headers['referer'] = scraper_url
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
k = "" k = ""
...@@ -229,11 +225,11 @@ class Requests_param_val(BaseUtils): ...@@ -229,11 +225,11 @@ class Requests_param_val(BaseUtils):
return headers return headers
# 第二次发送请求。 # 第二次发送请求。
def requests_amazon(self, headers=None, scraper_url=None): def requests_amazon(self, headers=None, scraper_url=None, sess=None):
for i in range(2): for i in range(2):
try: try:
resp = requests.get(scraper_url, headers=headers, impersonate="chrome", resp = requests.get(scraper_url, headers=headers, impersonate="chrome",
timeout=10, verify=False) timeout=30, verify=False)
if self.check_amazon_yzm(resp): if self.check_amazon_yzm(resp):
print('验证码2222222222222222') print('验证码2222222222222222')
continue continue
...@@ -262,22 +258,7 @@ class Requests_param_val(BaseUtils): ...@@ -262,22 +258,7 @@ class Requests_param_val(BaseUtils):
# 获取对应每个小时的数字。存到redis列表中 # 获取对应每个小时的数字。存到redis列表中
def get_hour(self, new_date_hour): def get_hour(self, new_date_hour):
# new_date_hour = datetime.now().strftime("%Y-%m-%d:%H") return str(datetime.now().hour)
# 获取当前日期
current_date = datetime.now()
# 将当前时间的小时、分钟和秒设置为0
current_date = current_date.replace(hour=0, minute=0, second=0, microsecond=0)
# 生成当天的24小时时间
hourly_times = [current_date + timedelta(hours=i) for i in range(24)]
hour_dict = {}
# 打印每个小时的时间
for hour_time in hourly_times:
hour = hour_time.strftime("%Y-%m-%d:%H")
num = re.findall(r':(\d+)', hour)[0]
hour_dict[hour] = num
print(new_date_hour, hour_dict)
n = hour_dict[new_date_hour]
return n
# 组装cookie # 组装cookie
def get_cookie_str(self, cookies_queue): def get_cookie_str(self, cookies_queue):
...@@ -305,11 +286,11 @@ class Requests_param_val(BaseUtils): ...@@ -305,11 +286,11 @@ class Requests_param_val(BaseUtils):
break break
if self.site_name == 'uk': if self.site_name == 'uk':
cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=GBP;').replace('i18n-prefs=USD;', 'i18n-prefs=GBP;') cookie_str = cookie_str.replace('i18n-prefs=HKD', 'i18n-prefs=GBP').replace('i18n-prefs=USD', 'i18n-prefs=GBP')
elif self.site_name == 'de': elif self.site_name == 'de':
cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=EUR;').replace('i18n-prefs=USD;', 'i18n-prefs=EUR;') cookie_str = cookie_str.replace('i18n-prefs=HKD', 'i18n-prefs=EUR').replace('i18n-prefs=USD', 'i18n-prefs=EUR')
elif self.site_name == 'us': elif self.site_name == 'us':
cookie_str = cookie_str.replace('i18n-prefs=HKD;', 'i18n-prefs=USD;') cookie_str = cookie_str.replace('i18n-prefs=HKD', 'i18n-prefs=USD')
return cookie_str return cookie_str
# 获取自增id区间。根据传的站点获取对应的月 周 syn表的id # 获取自增id区间。根据传的站点获取对应的月 周 syn表的id
...@@ -353,7 +334,7 @@ class Requests_param_val(BaseUtils): ...@@ -353,7 +334,7 @@ class Requests_param_val(BaseUtils):
def hex_md5(self, input_string): def hex_md5(self, input_string):
# 创建一个MD5哈希对象 # 创建一个MD5哈希对象
md5_hash = hashlib.md5() md5_hash = hashlib.md5()
# 使用输入字符串的字节更新哈希对象 # 使用输入字符串的字节更新哈希对象items.pop('div_id_list', None)
md5_hash.update(input_string.encode('utf-8')) md5_hash.update(input_string.encode('utf-8'))
# 获取哈希的十六进制表示 # 获取哈希的十六进制表示
md5_hex_digest = md5_hash.hexdigest() md5_hex_digest = md5_hash.hexdigest()
...@@ -367,24 +348,25 @@ class Requests_param_val(BaseUtils): ...@@ -367,24 +348,25 @@ class Requests_param_val(BaseUtils):
def send_kafka(self, items=None, html_data=None, topic=None): def send_kafka(self, items=None, html_data=None, topic=None):
print('向Kafka发送数据') print('向Kafka发送数据')
for i in range(5): if items:
items.pop('div_id_list', None)
for i in range(3):
try: try:
if items: if items:
print('232323232323')
del items['div_id_list']
future = self.kafuka_producer_str.send(topic, json.dumps(items)) future = self.kafuka_producer_str.send(topic, json.dumps(items))
future.add_callback(self.on_send_success).add_errback(self.on_send_error) future.add_callback(self.on_send_success).add_errback(self.on_send_error)
if html_data: if html_data:
future = self.kafuka_producer_str.send(topic, html_data) future = self.kafuka_producer_str.send(topic, html_data)
future.add_callback(self.on_send_success).add_errback(self.on_send_error) future.add_callback(self.on_send_success).add_errback(self.on_send_error)
print('向Kafka发送数据 发送成功') print('向Kafka发送数据 发送成功')
break break
except KafkaTimeoutError:
print(f'Kafka flush超时,第{i+1}次重试')
if i >= 2:
self.kafuka_producer_str = self.kafuka_connect()
except Exception as e: except Exception as e:
print(e) print(e)
if i >= 1: if i >= 2:
self.kafuka_producer_str = self.kafuka_connect() # 调用kafka self.kafuka_producer_str = self.kafuka_connect()
try: if __name__ == '__main__':
self.kafuka_producer_str.flush(timeout=30) Requests_param_val().get_cookie(num=1)
except KafkaTimeoutError as e: \ No newline at end of file
print("flush 超时,跳过这次等待:", e)
import json #
#
#
# '''旧版'''
#
# import json
#
# import pandas as pd
# import numpy as np
# import orjson, requests, time
# from typing import List
#
# # -------- 映射字典 --------
# site_name_db_dict = {
# "us": "selection",
# "uk": "selection_uk",
# "de": "selection_de",
# "es": "selection_es",
# "fr": "selection_fr",
# "it": "selection_it",
# }
#
# db_type_alias_map = {
# "mysql": "mysql", # 阿里云mysql
# "postgresql_14": "postgresql_14", # pg14爬虫库-内网
# "postgresql_14_outer": "postgresql_14", # pg14爬虫库-外网
# "postgresql_15": "postgresql_15", # pg15正式库-内网
# "postgresql_15_outer": "postgresql_15_outer", # pg15正式库-外网
# "postgresql_cluster": "postgresql_cluster", # pg集群-内网
# "postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
# "doris": "doris", # doris集群-内网
# }
#
# DEFAULT_SERVERS = [
# # "http://192.168.200.210:7777", # 内网
# # "http://192.168.10.217:7777", # 内网-h7
# # "http://113.100.143.162:7777", # 外网
# # "http://113.100.143.162:7779", # 外网
# # "http://61.145.136.61:7777", # 外网
# # "http://61.145.136.61:7779", # 外网
# "http://192.168.200.210:7780"
#
#
# ]
# # ---------------------------
#
# def df_to_json_records(df: pd.DataFrame) -> list:
# """保证 DataFrame 可安全序列化为 JSON records(处理 NaN / ±Inf)"""
# df_clean = df.copy()
#
# # 1️⃣ 替换 ±Inf -> NaN
# num_cols = df_clean.select_dtypes(include=[np.number]).columns
# if len(num_cols):
# df_clean[num_cols] = df_clean[num_cols].replace([np.inf, -np.inf], np.nan)
#
# # 2️⃣ 替换 NaN -> None(注意:有时 astype(object) 不彻底,需用 applymap)
# df_clean = df_clean.applymap(lambda x: None if pd.isna(x) else x)
#
# # 3️⃣ 转为 dict records
# return df_clean.to_dict("records")
#
#
# def clean_json_field_for_orjson(v):
# """清洗单个 JSON 字段的值,使其符合 orjson 要求并避免空字典入库"""
# if v is None or pd.isna(v):
# return None
#
# # 1️⃣ 如果是空字典对象,返回 None
# if isinstance(v, dict) and not v:
# return None
#
# # 2️⃣ 如果是空字符串或仅为 "{}",返回 None
# if isinstance(v, str):
# stripped = v.strip()
# if not stripped or stripped == "{}":
# return None
# try:
# parsed = json.loads(stripped)
# if isinstance(parsed, dict) and not parsed:
# return None
# return json.dumps(parsed, ensure_ascii=False)
# except Exception:
# return v # 非 JSON 字符串则原样保留
#
# return v
#
#
# def fully_clean_for_orjson(df: pd.DataFrame) -> pd.DataFrame:
# # """全面清洗 DataFrame 以符合 orjson 要求"""
# # df = df.replace([np.inf, -np.inf], np.nan)
# # df = df.applymap(lambda x: None if pd.isna(x) else x)
# #
# # # 找出所有可能为 JSON 字符串的字段
# # json_like_cols = [col for col in df.columns if col.endswith('_json')]
# #
# # # 针对每个 JSON-like 字段,应用清洗函数
# # for col in json_like_cols:
# # df[col] = df[col].apply(clean_json_field_for_orjson)
# #
# # return df
# """全面清洗 DataFrame 以符合 orjson 要求"""
# df = df.replace([np.inf, -np.inf], np.nan)
#
# # NaN → None (比 applymap 高效且不出错)
# df = df.where(pd.notna(df), None)
#
# # 找出所有可能为 JSON 字符串的字段
# json_like_cols = [col for col in df.columns if col.endswith('_json')]
#
# # 针对每个 JSON-like 字段,应用清洗函数
# for col in json_like_cols:
# df[col] = df[col].apply(clean_json_field_for_orjson)
#
# return df
#
#
# class RemoteTransaction:
#
# def __init__(self, db: str, database: str,
# session: requests.Session, urls: List[str]):
# self.db = db
# self.database = database
# self.session = session
# self.urls = urls
# self.sql_queue = []
#
# # def execute(self, sql: str):
# # self.sql_queue.append(sql)
# def execute(self, sql: str, params=None):
# """
# params 可取:
# • None → 纯文本 SQL
# • dict → 单条参数化 e.g. {"id":1,"name":"a"}
# • list/tuple → 批量 executemany
# - list[dict] ↔ INSERT .. VALUES (:id,:name)
# - list[tuple] ↔ INSERT .. VALUES (%s,%s)
# """
# self.sql_queue.append({"sql": sql, "params": params})
#
# def __enter__(self): return self
#
# def __exit__(self, exc_type, exc, tb):
# for url in self.urls:
# try:
# self.session.post(
# url + "/transaction",
# json={"db": self.db,
# "sql_list": self.sql_queue,
# "site_name": self.database}, # site_name not needed on server, kept for clarity
# timeout=3000,
# ).raise_for_status()
# return
# except Exception as e:
# print(f"[WARN] 事务失败 {url}: {e}")
# raise RuntimeError("All servers failed for transaction")
#
#
# class RemoteEngine:
# def __init__(self, db: str, database: str,
# server_urls: List[str], retries: int = 2):
# self.db = db
# self.database = database
# self.urls = [u.rstrip("/") for u in server_urls]
# self.session = requests.Session()
# self.retries = retries
#
# def _request(self, endpoint: str, payload):
# for url in self.urls:
# for _ in range(self.retries):
# try:
# json_bytes = orjson.dumps(payload)
# r = self.session.post(f"{url}/{endpoint}",
# data=json_bytes,
# headers={"Content-Type": "application/json"},
# timeout=3000)
#
# # r = self.session.post(f"{url}/{endpoint}",
# # json=payload, timeout=10)
# r.raise_for_status()
# return r.json()
# except Exception as e:
# print(f"[WARN] {endpoint} fail @ {url}: {e}")
# time.sleep(1)
# raise RuntimeError(f"All servers failed for {endpoint}")
# # def _request(self, endpoint: str, payload):
# # # 用 orjson,“allow_nan” 会把 NaN/Inf 写成 null
# # # json_bytes = orjson.dumps(payload,
# # # option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_OMIT_MICROSECOND | orjson.OPT_ALLOW_INF_AND_NAN)
# # json_bytes = orjson.dumps(
# # payload,
# # option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_ALLOW_INF_AND_NAN
# # )
# #
# # headers = {"Content-Type": "application/json"}
# #
# # for url in self.urls:
# # for _ in range(self.retries):
# # try:
# # r = self.session.post(f"{url}/{endpoint}",
# # data=json_bytes, headers=headers,
# # timeout=15)
# # r.raise_for_status()
# # return r.json()
# # except Exception as e:
# # print(f"[WARN] {endpoint} fail @ {url}: {e}")
# # time.sleep(1)
# # raise RuntimeError(f"All servers failed for {endpoint}")
#
# # ---------- 公共 API ----------
# def read_sql(self, sql: str) -> pd.DataFrame:
# data = self._request("query",
# {"db": self.db,
# "sql": sql,
# "site_name": self.database})
# return pd.DataFrame(data["result"])
#
# def to_sql(self, df: pd.DataFrame, table: str, if_exists="append"):
#
# return self._request("insert",
# {"db": self.db,
# "table": table,
# "if_exists": if_exists,
# "data": fully_clean_for_orjson(df=df).to_dict("records"),
# # "data": df_to_json_records(df), # ← 清洗后的 records
# "site_name": self.database})
#
# def read_then_update(
# self,
# select_sql: str,
# update_table: str,
# set_values: dict,
# where_keys: List[str],
# error_if_empty: bool = False,
# ):
# """
# 动态生成 UPDATE:把 select_sql 读到的行,按 where_keys 精准更新 set_values
# 返回 (DataFrame, rows_updated)
# """
# payload = {
# "db": self.db,
# "site_name": self.database,
# "select_sql": select_sql,
# "update_table": update_table,
# "set_values": set_values,
# "where_keys": where_keys,
# "error_if_empty": error_if_empty,
# }
# resp = self._request("read_then_update", payload)
# df = pd.DataFrame(resp["read_result"])
# rows_updated = resp.get("rows_updated", 0)
# return df
#
# def begin(self):
# return RemoteTransaction(self.db, self.database,
# self.session, self.urls)
# # ---------------------------------
#
#
# def get_remote_engine(site_name: str, db_type: str,
# servers: List[str] = None) -> RemoteEngine:
# if site_name not in site_name_db_dict:
# raise ValueError(f"Unknown site_name: {site_name}")
# if db_type not in db_type_alias_map:
# raise ValueError(f"Unknown db_type: {db_type}")
# return RemoteEngine(
# db=db_type_alias_map[db_type],
# database=site_name,
# server_urls=servers or DEFAULT_SERVERS,
# )
import json
import sys,os
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import orjson, requests, time import orjson, requests, time
from typing import List from typing import List
# -------- 映射字典 -------- # -------- 映射字典 --------
site_name_db_dict = { site_name_db_dict = {
"us": "selection", "us": "selection",
...@@ -24,14 +296,15 @@ db_type_alias_map = { ...@@ -24,14 +296,15 @@ db_type_alias_map = {
"postgresql_cluster": "postgresql_cluster", # pg集群-内网 "postgresql_cluster": "postgresql_cluster", # pg集群-内网
"postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网 "postgresql_cluster_outer": "postgresql_cluster_outer", # pg集群-外网
"doris": "doris", # doris集群-内网 "doris": "doris", # doris集群-内网
"doris_adv": "doris_adv",
} }
DEFAULT_SERVERS = [ DEFAULT_SERVERS = [
# "http://192.168.200.210:7777", # 内网 # "http://192.168.200.210:7777", # 内网
# "http://192.168.10.217:7777", # 内网-h7 # "http://192.168.10.217:7777", # 内网-h7
"http://61.145.136.61:7777", # 外网 # "http://61.145.136.61:7777", # 外网
"http://61.145.136.61:7779", # 外网 # "http://61.145.136.61:7779", # 外网
"http://61.145.136.61:7780"
] ]
# --------------------------- # ---------------------------
...@@ -44,8 +317,8 @@ def df_to_json_records(df: pd.DataFrame) -> list: ...@@ -44,8 +317,8 @@ def df_to_json_records(df: pd.DataFrame) -> list:
if len(num_cols): if len(num_cols):
df_clean[num_cols] = df_clean[num_cols].replace([np.inf, -np.inf], np.nan) df_clean[num_cols] = df_clean[num_cols].replace([np.inf, -np.inf], np.nan)
# 2️⃣ 替换 NaN -> None(注意:有时 astype(object) 不彻底,需用 applymap # 2️⃣ 替换 NaN -> None(兼容 pandas 1.x 和 2.x
df_clean = df_clean.applymap(lambda x: None if pd.isna(x) else x) df_clean = df_clean.where(pd.notna(df_clean), None)
# 3️⃣ 转为 dict records # 3️⃣ 转为 dict records
return df_clean.to_dict("records") return df_clean.to_dict("records")
...@@ -79,7 +352,7 @@ def clean_json_field_for_orjson(v): ...@@ -79,7 +352,7 @@ def clean_json_field_for_orjson(v):
def fully_clean_for_orjson(df: pd.DataFrame) -> pd.DataFrame: def fully_clean_for_orjson(df: pd.DataFrame) -> pd.DataFrame:
"""全面清洗 DataFrame 以符合 orjson 要求""" """全面清洗 DataFrame 以符合 orjson 要求"""
df = df.replace([np.inf, -np.inf], np.nan) df = df.replace([np.inf, -np.inf], np.nan)
df = df.applymap(lambda x: None if pd.isna(x) else x) df = df.where(pd.notna(df), None)
# 找出所有可能为 JSON 字符串的字段 # 找出所有可能为 JSON 字符串的字段
json_like_cols = [col for col in df.columns if col.endswith('_json')] json_like_cols = [col for col in df.columns if col.endswith('_json')]
...@@ -121,9 +394,9 @@ class RemoteTransaction: ...@@ -121,9 +394,9 @@ class RemoteTransaction:
try: try:
self.session.post( self.session.post(
url + "/transaction", url + "/transaction",
json={"db": self.db, json={"db_type": self.db,
"sql_list": self.sql_queue, "sql_list": self.sql_queue,
"site_name": self.database}, # site_name not needed on server, kept for clarity "database": self.database}, # site_name not needed on server, kept for clarity
timeout=3000, timeout=3000,
).raise_for_status() ).raise_for_status()
return return
...@@ -134,7 +407,13 @@ class RemoteTransaction: ...@@ -134,7 +407,13 @@ class RemoteTransaction:
class RemoteEngine: class RemoteEngine:
def __init__(self, db: str, database: str, def __init__(self, db: str, database: str,
server_urls: List[str], retries: int = 2): server_urls: List[str], retries: int = 20):
"""
:param db: db_type--数据库类型
:param database: 数据库名称
:param server_urls:
:param retries:
"""
self.db = db self.db = db
self.database = database self.database = database
self.urls = [u.rstrip("/") for u in server_urls] self.urls = [u.rstrip("/") for u in server_urls]
...@@ -149,7 +428,7 @@ class RemoteEngine: ...@@ -149,7 +428,7 @@ class RemoteEngine:
r = self.session.post(f"{url}/{endpoint}", r = self.session.post(f"{url}/{endpoint}",
data=json_bytes, data=json_bytes,
headers={"Content-Type": "application/json"}, headers={"Content-Type": "application/json"},
timeout=3000) timeout=3600)
# r = self.session.post(f"{url}/{endpoint}", # r = self.session.post(f"{url}/{endpoint}",
# json=payload, timeout=10) # json=payload, timeout=10)
...@@ -159,47 +438,24 @@ class RemoteEngine: ...@@ -159,47 +438,24 @@ class RemoteEngine:
print(f"[WARN] {endpoint} fail @ {url}: {e}") print(f"[WARN] {endpoint} fail @ {url}: {e}")
time.sleep(1) time.sleep(1)
raise RuntimeError(f"All servers failed for {endpoint}") raise RuntimeError(f"All servers failed for {endpoint}")
# def _request(self, endpoint: str, payload):
# # 用 orjson,“allow_nan” 会把 NaN/Inf 写成 null
# # json_bytes = orjson.dumps(payload,
# # option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_OMIT_MICROSECOND | orjson.OPT_ALLOW_INF_AND_NAN)
# json_bytes = orjson.dumps(
# payload,
# option=orjson.OPT_NON_STR_KEYS | orjson.OPT_NAIVE_UTC | orjson.OPT_ALLOW_INF_AND_NAN
# )
#
# headers = {"Content-Type": "application/json"}
#
# for url in self.urls:
# for _ in range(self.retries):
# try:
# r = self.session.post(f"{url}/{endpoint}",
# data=json_bytes, headers=headers,
# timeout=15)
# r.raise_for_status()
# return r.json()
# except Exception as e:
# print(f"[WARN] {endpoint} fail @ {url}: {e}")
# time.sleep(1)
# raise RuntimeError(f"All servers failed for {endpoint}")
# ---------- 公共 API ---------- # ---------- 公共 API ----------
def read_sql(self, sql: str) -> pd.DataFrame: def read_sql(self, sql: str) -> pd.DataFrame:
data = self._request("query", data = self._request("query",
{"db": self.db, {"db_type": self.db,
"sql": sql, "sql": sql,
"site_name": self.database}) "database": self.database})
return pd.DataFrame(data["result"]) return pd.DataFrame(data["result"])
def to_sql(self, df: pd.DataFrame, table: str, if_exists="append"): def to_sql(self, df: pd.DataFrame, table: str, if_exists="append"):
return self._request("insert", return self._request("insert",
{"db": self.db, {"db_type": self.db,
"table": table, "table": table,
"if_exists": if_exists, "if_exists": if_exists,
"data": fully_clean_for_orjson(df=df).to_dict("records"), "data": fully_clean_for_orjson(df=df).to_dict("records"),
# "data": df_to_json_records(df), # ← 清洗后的 records # "data": df_to_json_records(df), # ← 清洗后的 records
"site_name": self.database}) "database": self.database})
def read_then_update( def read_then_update(
self, self,
...@@ -214,8 +470,8 @@ class RemoteEngine: ...@@ -214,8 +470,8 @@ class RemoteEngine:
返回 (DataFrame, rows_updated) 返回 (DataFrame, rows_updated)
""" """
payload = { payload = {
"db": self.db, "db_type": self.db,
"site_name": self.database, "database": self.database,
"select_sql": select_sql, "select_sql": select_sql,
"update_table": update_table, "update_table": update_table,
"set_values": set_values, "set_values": set_values,
...@@ -232,15 +488,69 @@ class RemoteEngine: ...@@ -232,15 +488,69 @@ class RemoteEngine:
self.session, self.urls) self.session, self.urls)
# --------------------------------- # ---------------------------------
# db -- 数据库类型
# database -- 站点
def get_remote_engine(site_name: str, db_type: str, def get_remote_engine(site_name: str, db_type: str, database: str = None,
servers: List[str] = None) -> RemoteEngine: servers: List[str] = None) -> RemoteEngine:
"""
:param site_name: 站点
:param db_type: 数据库类型
:param database: 数据库名称-默认为None, 否则通过站点来匹配
:param servers: 服务器url地址
:return: 数据库连接对象
"""
if not database:
database = site_name_db_dict[site_name]
if site_name not in site_name_db_dict: if site_name not in site_name_db_dict:
raise ValueError(f"Unknown site_name: {site_name}") raise ValueError(f"Unknown site_name: {site_name}")
if db_type not in db_type_alias_map: if db_type not in db_type_alias_map:
raise ValueError(f"Unknown db_type: {db_type}") raise ValueError(f"Unknown db_type: {db_type}")
print(f"db_type: {db_type_alias_map[db_type]}, database: {database}")
return RemoteEngine( return RemoteEngine(
db=db_type_alias_map[db_type], db=db_type_alias_map[db_type],
database=site_name, database=database,
server_urls=servers or DEFAULT_SERVERS, server_urls=servers or DEFAULT_SERVERS,
) )
def stream_load_df(df: pd.DataFrame, database: str='advertising_manager', table: str='', server_url: str = None):
server = (server_url or DEFAULT_SERVERS[0]).rstrip("/")
payload = {
"database": database,
"table": table,
"data": fully_clean_for_orjson(df).to_dict("records"),
}
json_bytes = orjson.dumps(payload)
r = requests.post(
f"{server}/stream_load_df",
data=json_bytes,
headers={"Content-Type": "application/json"},
timeout=3000,
)
r.raise_for_status()
return r.json()
if __name__ == '__main__':
engine_mysql = get_remote_engine(
site_name='us', # -> database "selection"
db_type="doris_adv", # -> 服务端 alias "mysql"
database="advertising_manager", # -> 服务端 alias "mysql"
)
# e = engine_mysql.read_sql('SELECT max(created_time),MIN(created_time) FROM us_sp_search_term_rank WHERE time_batch="2026-03-09-15"')
# e = engine_mysql.read_sql('SELECT * FROM us_sp_search_term_rank WHERE time_batch="2026-03-09-12" and page_row=68')
# e = engine_mysql.read_sql('SELECT * FROM us_sp_search_term_rank WHERE id=50564433')
e = engine_mysql.read_sql('SELECT count(id),time_batch FROM us_sp_search_term_rank GROUP BY time_batch ORDER BY time_batch ASC')
# e = engine_mysql.read_sql('SELECT count(id),time_batch FROM us_sp_search_term_rank GROUP BY time_batch ORDER BY time_batch ASC')
# e = engine_mysql.read_sql('SELECT count(id) FROM sp_keyword_position_keyword')
# e = engine_mysql.read_sql('select * from us_sp_search_term_syn limit 100')
# dog bandanas spring
# 2026-04-03-11
# e = engine_mysql.read_sql('select * from us_sp_search_term_syn where search_term = "dog bandanas spring"')
# e = engine_mysql.read_sql('select * from us_sp_search_term_rank where search_term="dog bandanas spring" and time_batch="2026-04-03-11" and cate_type="sp" and asin="B0GCN7CTCM"')
# e = engine_mysql.read_sql('select * from us_sp_search_term_rank where time_batch="2026-03-17-16" and cate_type="sp" limit 100')
# print(e)
print(e.columns)
print(e.values.tolist())
DEFAULT_USER = "fangxingjun" DEFAULT_USER = "pengyanbing"
DEFAULT_USER_TOKEN = "fxj_token_123" DEFAULT_USER_TOKEN = "8f3b9d2a4c7e58b1"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment