Commit 000d315d by Peng

no message

parent 8dd9963f
......@@ -350,7 +350,7 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _gcl_au=1.1.420472597.1749119222.719336435.1751886424.1751886424; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751516385,1751886410,1751973053,1752031904; HMACCOUNT=800EBCCFB4C6BBFB; 65722c3d8208b58d42f9=7dc2ebaa5e4a51182da4ade1aacd8dc4; rank-guest-user=6159802571t3e3obe8rwmCywrH0Xq28vOMfd8Q+siSpAi1WiGPGuuMcYrYhXyf/QpgeBCBdgCT; rank-login-user=6159802571t3e3obe8rwmCywrH0Xq28mIqu6gO0eXYPrSqY9RlSIznMsavLuIJkOkjELzcr/d1; rank-login-user-info="eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjYxNTk4MDI1NzF0M2Uzb2JlOHJ3bUN5d3JIMFhxMjhtSXF1NmdPMGVYWVByU3FZOVJsU0l6bk1zYXZMdUlKa09rakVMemNyL2QxIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJJUGFLc3VqMkZsUmpPR1NRQnIxYkJRIiwiaWF0IjoxNzUyMDMxOTE2LCJleHAiOjE3NTIxMTgzMTYsIm5iZiI6MTc1MjAzMTg1Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.mLIjN_qO4K8w18IDVa0GCRY3MODTmJhZlQaPbgBjeYJRPDwteHfkfqFS_GFyLu4svoahzyFRxkdnKhxs1x90QxQ-7QCwjwypbk8On6gMarKl8jopo9sJbZITvk8mrqtoT6N34LZ1ash35iAkIuPZONPMH8_cp5NxiSC70J12fvIT9ZXp-9zvEk6WV8qQ3pRr0yRuGnSsuWjVvDE9WRNpE3ZmYS_EUBroA51yBEPdS8aBThRuuVGt4HuqrPXp9ZwHoiOcRYu1VcQu-wpIAhLfXcnY1vJA3FXm7w_H00DOGZuM9HRcxdg6Fj-2WP5FvCxbE8z5n1-zbQMs_J8JVaVXgQ; ao_lo_to_n="6159802571t3e3obe8rwmCywrH0Xq28osFyhyxlRsfXXDx9AUjMD2qAFgWUPkLF84KewBkZoL5OL21x5jznuxdPNdiJfglPNE7YH03Vk5CofaP+MGH3y8="; _gaf_fp=01fef3c14bfcaf5a01438f74a677e95a; 
_ga_38NCVF2XST=GS2.1.s1752031904$o47$g1$t1752031923$j41$l0$h1543227925; _ga_CN0F80S6GL=GS2.1.s1752031906$o46$g1$t1752031924$j42$l0$h0; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752035308; JSESSIONID=165F9BAA752FE5B22CCD7C5BB7B62F2F',
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1752031904,1752460043,1752653436,1753353401; HMACCOUNT=800EBCCFB4C6BBFB; 894cdd1d9741ce0c9757=827b7d3d13ed7bd6b4b1b24d0246b3dc; 3d854e1bcd61963fdf05=38fcb3b742a48aa345ddfd7136bc60ee; _gaf_fp=f297033bfe53aa9891ffe2842271566b; _gcl_au=1.1.420472597.1749119222.1054917286.1753685435.1753685437; rank-guest-user=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9lihUCQIaVmrHXjbpSRP/Ca0F; rank-login-user=6303473571KK6FnhfedvWg9tSSyk3xj2GRIc/8HSm4vuPYVHI5vKLXnssgei5ccK1dG8fkQSFI; rank-login-user-info=eyJuaWNrbmFtZSI6IuW4heWTpSIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTgzKioqKjczNDciLCJ0b2tlbiI6IjYzMDM0NzM1NzFLSzZGbmhmZWR2V2c5dFNTeWszeGoyR1JJYy84SFNtNHZ1UFlWSEk1dktMWG5zc2dlaTVjY0sxZEc4ZmtRU0ZJIn0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJKc2pZSlZWeFZzTVptVWFvMzgtZ3RRIiwiaWF0IjoxNzUzNjg1NDM2LCJleHAiOjE3NTM3NzE4MzYsIm5iZiI6MTc1MzY4NTM3Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIxNSwicGkiOm51bGwsIm5uIjoi5biF5ZOlIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxODMwNzk2NzM0NyIsImVtIjoiMzE1OTk4MDg5MkBxcS5jb20iLCJtbCI6IkcifQ.EaQ7Md7iVOpjZDogkiS2DlndhFPt3GzL2t33LXnh9Z5Itr3A8scFM_tzrYuzXqF6a-BDIMFe90SdDtU18zs9WTTl6_Phv3AEqcDe6WDfPAhB_KMa15VYAE5-b9d3lgIukKR8ZZyAMpiJzcmIWShmqxrhCNQD0ER3b7idaJpSrJiKnwV-tj6La52WJ6BmVRAk8gst0p5h-SYVnNz9iNaSXLc2Dx-hHZvMVNU27yfbJgKPpzRxgh7TOD7O-cT0WrEoKvTSw9e81gG9bgvKuA_bD-z3ePhgM6prUfceWszD88KH8PcXua9s_8ZM4bgrMyKMHswLtwyLhWePcvtHUp6yyQ; ao_lo_to_n=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9liibP9br/hwQ1Dlb4xDZyVPrTQIst5JCVz4PpnUIlDMGE07YVPYBWOm3Hrx4PaVkgaQ=; _ga_38NCVF2XST=GS2.1.s1753685428$o61$g1$t1753685444$j44$l0$h984121357; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1753685445; 
_ga_CN0F80S6GL=GS2.1.s1753685429$o59$g1$t1753685445$j44$l0$h0; JSESSIONID=F09543D3A3D6F890BAD0F422FCA49942',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
......@@ -368,7 +368,7 @@ def junglescout_spider(db_base):
response = json.loads(response.text)
break
except:
time.sleep(random.uniform(15, 35.75))
time.sleep(random.uniform(15, 30.75))
response_data = response['data']
print('code::', response['code'])
......@@ -387,7 +387,7 @@ def junglescout_spider(db_base):
print('获取数据:', category_name['name'], i, est, year_month)
sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month))
time.sleep(random.uniform(20, 75.75))
time.sleep(random.uniform(20, 65.75))
# break
for i in range(4):
try:
......@@ -408,7 +408,7 @@ def junglescout_spider(db_base):
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20)
print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(120, 240.5))
time.sleep(random.uniform(90, 200.5))
def save_site_category(site_bsr_dict=None):
......
......@@ -85,12 +85,12 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.19240078.1751854600; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751854601; HMACCOUNT=28ABEEABEFA97E4A; _gcl_au=1.1.536675967.1751854601; MEIQIA_TRACK_ID=2zWlEnsYAqnZRdhJqJ5txX7tpXm; MEIQIA_VISIT_ID=2zWlEmUkBQV745rliAtXEdAk0CJ; ecookie=ZyZ05gxOxlDTPkM1_CN; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=e03eac62da4f8988dc796341e1bd822c; current_guest=jsxcNvsgBJO1_250707-100340; rank-login-user=502219157192wVgAJpdturGN5Im+nPDQqTtoVYwVNo1oWP9MD0mtMHFwS3LrhtAUhuCnvMHsCl; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjUwMjIxOTE1NzE5MndWZ0FKcGR0dXJHTjVJbStuUERRcVR0b1ZZd1ZObzFvV1A5TUQwbXRNSEZ3UzNMcmh0QVVodUNudk1Ic0NsIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiIwZ01FdlJuNWJ1dlZhVW5IZ1lKSDFRIiwiaWF0IjoxNzUxODU0NjA1LCJleHAiOjE3NTE5NDEwMDUsIm5iZiI6MTc1MTg1NDU0NSwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Ujr6_K3vHIQRw3x52QAQdTftMy6GbZ_TunmFMgW76onCy3EkBzx7uxEv-42zRRXgKLMUfJz2t0ierqXV6Evh9i-o5F0ZUBREzm48LHpGSw6Iupjx4Udc3VQwVqgiUOmYBvnTAQqmaj6iA5l06zAZcVNHQASZ5xe5QFUCllIOL0m8tf3Xad6T8u5oLHRHTTuyy5nDAqLu6ZxVOqUYYXsIzq9H2qAsPhqIgRy_5Av1zyoAcQErddadCe25H_ILmKO0Az9ANIFg4o1r_is_VFVZpGvbz8nCN0JLuY3uajAjf2JXoEzhHT9YbMP0o2TrZDRPdORV3HVK1N5uvghRaRyJvw; ao_lo_to_n="502219157192wVgAJpdturGN5Im+nPDfbd9htCMUGF/tdMS8/gmBNzv9/utYT5ucwmHHPC71S6i4RnT3fLUZW/nDI61eZx1uqLqr+hBy0X/aeJ6c/sSSc="; rank-guest-user=502219157192wVgAJpdturGN5Im+nPDYsyQgRxjbXtKYdDjju8ax0OkcsNUNGWP3xY6uiwKVVO; JSESSIONID=96FF611DCBDF20B9C6C921EAD2A55205; _ga_38NCVF2XST=GS2.1.s1751854600$o1$g1$t1751854612$j48$l0$h1855838417; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1751854612; 
_ga_CN0F80S6GL=GS2.1.s1751854600$o1$g1$t1751854613$j47$l0$h0',
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; a40ac813159995d028ba=3d9b7c15f5787387e62acd734f598f23; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751973053,1752031904,1752460043,1752653436; HMACCOUNT=800EBCCFB4C6BBFB; rank-guest-user=8301172571YFpPM/DhYDVQzRAgRu7tcQTFTi48nSnOk/TNMkep2gdtR77QXyNfDPmFlYbdSsdL; rank-login-user=8301172571YFpPM/DhYDVQzRAgRu7tcWqD2KCbe1WiKcOarfxTCdls3AJ9lNFy+VA8a+RTm195; rank-login-user-info=eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjgzMDExNzI1NzFZRnBQTS9EaFlEVlF6UkFnUnU3dGNXcUQyS0NiZTFXaUtjT2FyZnhUQ2RsczNBSjlsTkZ5K1ZBOGErUlRtMTk1In0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJLcVRRV2RPbVNNcjlKTU1qYTdXRjFRIiwiaWF0IjoxNzUyNjUzNDM4LCJleHAiOjE3NTI3Mzk4MzgsIm5iZiI6MTc1MjY1MzM3OCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.caY2QxTbtUVg7CQXvNJcmVo1YU0TGy3AD01dIddF76PHjYbbFh5a8zZAdAXnAKM1wNcs39d1MM8Wa-uoXHiitqDlCZsWyF9aXzco9L4wn-yU4xlMYsf7VoquZI6bxaMT2TNeX6vgQBod-NeXHYFpZQWdrH5sfZHQypkpRINb_o1QwaWvZrjufj1UwYdiypryBxTDyCuLfD4djU0PLMRXvifY6Ef86VNjAlsY8gFqDdHiVLixR2GWGdKRtoG74Ak5DX2eMDT6ak-OMrWYOaikthxIXiqdADTq2tvUCmjO4pE0oYnWhSEx9-UABo7jxJ0v_Af8B6AVu7ccC0NUUvWBMA; ao_lo_to_n=8301172571YFpPM/DhYDVQzRAgRu7tca/7vKUOAtDW4w4LhsAzrvlsqk8xCK+opMY27DGtrDKlwUwhqg///+C6QOw12iRKNIq9mCOV5+ORmOA+PwqisF4=; _gaf_fp=0f3f9e0c791b5513d38aa715d0624aab; _gcl_au=1.1.420472597.1749119222.448034571.1752653439.1752653439; JSESSIONID=0F617D64E2FD6DD92F3BB10935E3C846; _ga_38NCVF2XST=GS2.1.s1752653436$o51$g1$t1752653450$j46$l0$h366949276; 
Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752653451; _ga_CN0F80S6GL=GS2.1.s1752653437$o50$g1$t1752653451$j46$l0$h0',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = {
"station": "US",
"station": "UK",
"cid": category_name['c_id'], # 分类id
"bsr": f"{i}" # 排名
}
......@@ -167,7 +167,7 @@ def save_site_category(site_bsr_dict=None):
def run():
# get_cid()
junglescout_spider('us')
junglescout_spider('uk')
if __name__ == '__main__':
......
......@@ -158,7 +158,7 @@ if __name__ == '__main__':
month = 7
engine_db_num = 14
# for site in ['de','uk']:
for site in ['uk']:
for site in ['us']:
time.sleep(0)
count_all_syn_st_id(site_name=site,month=month).get_minid_maxid()
# count_all_syn_st_id(site_name=site,month=month,engine_db_num=engine_db_num).search_term_syn()
......
......@@ -214,7 +214,8 @@ class async_asin_pg():
'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json']
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json']
}
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val']
......@@ -222,8 +223,6 @@ class async_asin_pg():
item['five_six_val'] = None
# 第二次请求
_response_text = None
# if (item['weight'] is None and item['volume'] is None and item['rank'] is None and item[
# 'launch_time'] is None) or (item['variat_num'] > 0 and is_variat == '0'):
if item['variat_num'] > 0 and is_variat == '0':
self.request_total_count_list.append(4)
if item['variat_num'] > 0:
......@@ -478,7 +477,7 @@ class async_asin_pg():
def run(self):
asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B07BXM8RZ3|2025-01|1|1|null|null','B07FM8P1Z1|2025-01|1|1|null|null','B07TWHCK69|2025-01|1|1|null|null']
# asin_list = ['B0BPKK2BMN|2025-01|1|1|null|null']
if asin_list:
for asin in asin_list:
self.queries_asin_queue.put(asin)
......
......@@ -14,7 +14,7 @@ import html
import re
from html import unescape
import urllib.parse
from sqlalchemy import text
class recall_cases():
......@@ -90,6 +90,15 @@ class recall_cases():
if response_detail:
recall_date_list = response_detail.xpath("//div[contains(text(),'Recall Date:')]/parent::div/text()")
product_title_list = response_detail.xpath("//div[contains(text(),'Name of Product:')]/parent::div/text()")
if product_title_list:
matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title_list[-1].strip())
if matches:
brand = ','.join(matches)
else:
brand = None
else:
brand = None
hazard_list = response_detail.xpath("//div[contains(text(),'危险:')]/parent::div//p//text()")
image_url_list = response_detail.xpath("//div[@id='recall-gallery-img']//li/img/@src")
recall_date = recall_date_list[-1].strip() if recall_date_list else None # 召回日期
......@@ -98,7 +107,7 @@ class recall_cases():
image_url = 'https://www.cpsc.gov' + image_url_list[0].strip() if image_url_list else None # 图片
if recall_date:
recall_date = self._parse_date_str(recall_date)
data_list = ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href]
data_list = ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href,brand]
return data_list
else:
return None
......@@ -140,7 +149,7 @@ class recall_cases():
df = pd.DataFrame(data=save_data_list,
columns=['data_type', 'recall_date', 'product_title', 'hazard',
'image_url',
'ext_url', 'recall_title', 'country'])
'ext_url','brand', 'recall_title', 'country'])
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
except:
is_None = False
......@@ -207,6 +216,7 @@ class recall_cases():
dict_item = response.json()
data_lists = dict_item['data']
for data in data_lists:
print(data,'344444444')
data_list = []
try:
# 逐项解码
......@@ -234,18 +244,24 @@ class recall_cases():
response2 = self._request(headers=headers, url=url)
response_detail = etree.HTML(response2.text)
src_list = response_detail.xpath("//div[@id='recall-photos']//img/@src")
Brand_list = response_detail.xpath("//div[contains(text(),'Brand Name')]/following-sibling::div//text()")
if Brand_list:
brand = ''.join(Brand_list).strip()
else:
brand = None
print(brand,'Brand_list::',Brand_list)
if src_list:
image_url = 'https://www.fda.gov' + src_list[0]
else:
image_url = None
print('image_url:', image_url)
data_list.append(['us_fba_recalls', date, link_text, hazard, image_url, url, recall_title, 'us',
product_category])
product_category,brand])
try:
df = pd.DataFrame(data=data_list,
columns=['data_type', 'recall_date', 'product_title', 'hazard',
'image_url',
'ext_url', 'recall_title', 'country', 'product_category'])
'ext_url', 'recall_title', 'country', 'product_category','brand'])
df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
except:
......@@ -260,7 +276,7 @@ class recall_cases():
def ec_europa_eu(self):
'欧盟召回'
for i in range(0, 32):
for i in range(1, 33):
url = 'https://ec.europa.eu/safety-gate-alerts/public/api/notification/carousel/?'
data = {"language": "en", "page": f"{i}"}
headers = {
......@@ -284,6 +300,7 @@ class recall_cases():
print(data, '请求列表页url:', url)
is_None = True
response = requests.post(url, headers=headers, impersonate="chrome120", timeout=120, json=data)
print(response.url)
if response:
content = response.json()['content']
for ids in content:
......@@ -314,11 +331,17 @@ class recall_cases():
recall_title = items['product']['versions'][0]['description']
print(product_title)
print(recall_title)
brands = items['product']['brands']
if brands:
brand = brands[0].get('brand')
else:
brand = None
print('brand::1',brand)
hazard = items['risk']['versions'][0]['riskDescription']
print(hazard)
ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(items['id']) + '?lang=en'
print('ext_url::',ext_url)
ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(
items['id']) + '?lang=en'
print('ext_url::', ext_url)
if items['product']['photos']:
image_id = items['product']['photos'][0]['id']
image_url = f'https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{image_id}'
......@@ -326,30 +349,36 @@ class recall_cases():
image_url = None
print(image_url)
data_list.append(
[date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu', ext_url,
data_json])
# try:
# df = pd.DataFrame(data=data_list,
# columns=['recall_date', 'product_category', 'product_title', 'recall_title',
# 'hazard', 'data_type', 'image_url',
# 'country', 'ext_url', 'data_json'])
# df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
# df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
[date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu',
ext_url,data_json,brand])
keys = [
"recall_date", "product_category", "product_title", "recall_title",
"hazard", "data_type", "image_url", "country", "ext_url", "data_json", "brand"
]
# 把 list of list 转成 list of dict
dict_list = [dict(zip(keys, row)) for row in data_list]
with self.mysql_db.begin() as conn:
conn.execute(
f"insert into recall_cases_data (recall_date, product_category, product_title,recall_title,hazard,"
f"data_type,image_url,country,ext_url,data_json) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE recall_date = values(recall_date),product_title=values (product_title),ext_url=values (ext_url)",
data_list)
# except:
# is_None = False
# break
text("""
INSERT INTO recall_cases_data
(recall_date, product_category, product_title, recall_title, hazard, data_type, image_url, country, ext_url, data_json, brand)
VALUES (:recall_date, :product_category, :product_title, :recall_title, :hazard, :data_type, :image_url, :country, :ext_url, :data_json, :brand)
ON DUPLICATE KEY UPDATE
recall_date = VALUES(recall_date),
product_title = VALUES(product_title),
ext_url = VALUES(ext_url)
"""),
dict_list
)
if is_None == False:
break
else:
break
time.sleep(random.uniform(2, 8))
def ec_europa_uk(self):
'https://www.gov.uk/product-safety-alerts-reports-recalls?page=2'
url = 'https://www.gov.uk/product-safety-alerts-reports-recalls'
......@@ -392,6 +421,8 @@ class recall_cases():
print('产品标题:', product_title)
hazard_list = resp_html.xpath("//p[contains(text(),'Hazard:')]/text()")
print('风险:', hazard_list)
Brand_list = resp_html.xpath("//td[contains(text(),'Brand')]/following-sibling::td/text()")
brand = Brand_list[0].strip() if Brand_list else None
image_url_list = resp_html.xpath("//span[@class='attachment-inline']/a/@href")
product_category = product_category[0].strip() if product_category else None
product_title = product_title[0].strip().replace('Product:', '') if product_title else None
......@@ -399,13 +430,13 @@ class recall_cases():
image_url_list = image_url_list[0].strip() if image_url_list else None
data_list.append(
[recall_title, detail_url, recall_date, product_category, product_title,
hazard_list, image_url_list, 'uk_recall', 'uk'])
hazard_list, image_url_list, 'uk_recall', 'uk',brand])
if data_list:
try:
df = pd.DataFrame(data=data_list,
columns=['recall_title', 'ext_url', 'recall_date', 'product_category',
'product_title',
'hazard', 'image_url', 'data_type', 'country'])
'hazard', 'image_url', 'data_type', 'country','brand'])
df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
......@@ -554,9 +585,12 @@ class recall_cases():
def get_globalrecalls(self):
# sql = 'SELECT data_json FROM global_recalls_data'
# df_data = pd.read_sql(sql, con=self.mysql_db)
list_url = 'https://globalrecalls.oecd.org/ws/search.xqy?end=20&lang=en&order=desc&q=&sort=date&start=0&uiLang=en'
# list_url = f'https://globalrecalls.oecd.org/ws/search.xqy?end={i}&lang=en&order=desc&q=&sort=date&start={i - 20}&uiLang=en'
print('请求url', list_url)
# 'https://globalrecalls.oecd.org/ws/search.xqy?end=200&lang=en&order=desc&q=&sort=date&start=180&uiLang=en'
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
......@@ -574,7 +608,8 @@ class recall_cases():
for result in result_list:
countryId = result['countryId']
imageUri = result['imageUri']
if countryId.lower() in ['us', 'ca', 'mx', 'nl', 'sa', 'se', 'pl', 'tr', 'be', 'uk', 'de', 'es', 'fr', 'it',
if countryId.lower() in ['us', 'ca', 'mx', 'nl', 'sa', 'se', 'pl', 'tr', 'be', 'uk', 'de', 'es', 'fr',
'it',
'jp']:
date_time = result['date']
extUrl = result['extUrl']
......@@ -600,7 +635,11 @@ class recall_cases():
if 'ENTITY_NOT_FOUN' in resp.text:
continue
items_data = resp.json()
brands = items_data['product']['brands']
if brands:
brand = brands[0].get('brand')
else:
brand = None
time.sleep(random.uniform(1, 3))
items['country'] = countryId
items['reacll_time'] = date_time
......@@ -616,17 +655,19 @@ class recall_cases():
items['data_type'] = 'global_recalls'
items['product_title'] = re.findall(r'^(.*?)\s*;', title_name + ';')[0]
items['ext_url'] = extUrl
items['brand'] = brand
data_json = json.dumps(items_data)
data_list.append([items['data_type'], items['product_title'], items['productCategory'],
items['reacll_time'], items['riskDescription'], items['country'],
items['image_url'],
items['recall_title'], items['ext_url'], data_json])
print(items)
items['recall_title'], items['ext_url'], data_json,items['brand']])
print('itemsitems::',items)
try:
df = pd.DataFrame(data=data_list,
columns=['data_type', 'product_title', 'product_category', 'recall_date',
'hazard',
'country', 'image_url', 'recall_title', 'ext_url', 'data_json'])
'country', 'image_url', 'recall_title', 'ext_url', 'data_json','brand'])
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
except:
print('数据重复=====')
......@@ -637,10 +678,11 @@ class recall_cases():
url = result['uri']
items['country'] = countryId # 站点
encoded_url = urllib.parse.quote(url)
_url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri='+encoded_url
_url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri=' + encoded_url
print('_url::',_url)
resp = requests.get(_url, headers=headers, timeout=60)
result = resp.json()
print("result::",result)
print("result::", result)
time.sleep(random.uniform(1, 3))
extUrl = result['recall']['extUrl'] # 详情url
imageUri = result['recall']['images'][0]['imageUri']
......@@ -653,7 +695,7 @@ class recall_cases():
if recall_title is None:
recall_title = result['recall']['images'][0]['alt.text']
if recall_title:
recall_title.replace('Image of ','')
recall_title.replace('Image of ', '')
hazard = result['recall']['hazard']
items['recall_title'] = recall_title
items['productCategory'] = result['recall']['product.type']
......@@ -667,7 +709,7 @@ class recall_cases():
items['reacll_time'], items['riskDescription'], items['country'],
items['image_url'],
items['recall_title'], items['ext_url'], data_json])
print('没有解析到id的数据:',items)
print('没有解析到id的数据:', items)
try:
df = pd.DataFrame(data=data_list,
columns=['data_type', 'product_title', 'product_category', 'recall_date',
......@@ -677,6 +719,7 @@ class recall_cases():
except:
print('没有解析到id 存储 数据重复=====')
continue
def run(self):
# self.global_recalls()
self.get_globalrecalls()
......@@ -685,25 +728,41 @@ class recall_cases():
self.ec_europa_eu()
self.ec_europa_uk()
self.gov_uk()
# with self.mysql_db.begin() as conn:
# sql = "SELECT data_json FROM recall_cases_data WHERE data_type='eu_recall'"
# df_data = pd.read_sql(sql, con=self.mysql_db)
# data_json_list = list(df_data.data_json)
# for data_json in data_json_list:
# data_dict = json.loads(data_json)
# print(data_dict['product']['photos'][0]['id'])
# imgurl = f"https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{data_dict['product']['photos'][0]['id']}"
# up_sql = f"update recall_cases_data set image_url ='{imgurl}' WHERE data_type='eu_recall' and image_url like '%%/image/{data_dict['product']['photos'][0]['id']}'"
# print(up_sql)
# conn.execute(up_sql)
# """
# 数据类型,属于那个国的
# eu_recall
# global_recalls
# uk_drug_device 1
# uk_recall 2
# us_fba_recalls 3
# us_recalls_product
# """
# with self.mysql_db.begin() as conn:
# sql = "SELECT id,product_title FROM recall_cases_data WHERE data_type='us_recalls_product'"
# df_data = pd.read_sql(sql, con=self.mysql_db)
# df_data['id'] = df_data['id'].fillna('').astype(str)
# df_data['product_title'] = df_data['product_title'].fillna('').astype(str)
# data_json_list = list(df_data.id+ "|=||+||" + df_data.product_title)
# for data_json_id in data_json_list:
# if data_json_id:
# data_json_id_list = data_json_id.split('|=||+||')
# id = data_json_id_list[0]
# product_title = data_json_id_list[1]
# print(product_title)
# if bool(re.search(r'[\u4e00-\u9fff]', product_title)):
# # print(product_title,'23333333')
# matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title.strip())
# if matches:
# brand = ','.join(matches)
# else:
# brand = None
# print(id, brand,'23444444444')
# if brand:
# brand = brand.replace('"','').replace("'","")
# up_sql = f"""update recall_cases_data set brand ="{brand}" WHERE id={id}"""
# print(up_sql)
# conn.execute(up_sql)
if __name__ == '__main__':
......
......@@ -35,7 +35,7 @@ class Parse_asin_html():
print('没有该 asin html')
def search_term_html(self, site_name='us', month='04'):
sql = f"SELECT search_term,page,html FROM search_term_html_2025_{month} WHERE search_term='lace white tops for women' and site_name = '{site_name}'"
sql = f"SELECT search_term,page,html FROM search_term_html_2025_{month} WHERE search_term='resin kit' and site_name = '{site_name}'"
print(sql)
df = pd.read_sql(sql, con=engine_strrocks)
print(df.values)
......@@ -52,8 +52,8 @@ class Parse_asin_html():
print('没有该 搜索词 html')
def run(self):
self.asin_html()
# self.search_term_html(site_name='us',month='04')
# self.asin_html()
self.search_term_html(site_name='uk',month='07')
if __name__ == '__main__':
......
......@@ -402,7 +402,7 @@ class ParseAsinUs(object):
break
if min_match_asin_data_list:
min_match_asin_json = json.dumps(min_match_asin_data_list, ensure_ascii=False)
# bundles_this_asins ,Bundles with this item B0BPV8R4K8
# bundles_this_asins ,Bundles with this item B0BPV8R4K8 变体下方位置。和五点描述挨着
bundles_this_asins_data_list = []
bundles_this_asins_data_json = None
for i in ASIN_XPATH['bundles_this_asins']:
......@@ -436,7 +436,48 @@ class ParseAsinUs(object):
break
if bundles_this_asins_data_list:
bundles_this_asins_data_json = json.dumps(bundles_this_asins_data_list, ensure_ascii=False)
# 捆绑销售 B0DD8W2DZD This bundle contains 2 items
href_asin_list = self.response_s.xpath(
"//div[@class='bundle-title']/following-sibling::div//div[@class='bundle-components']//div[contains(@id,'bundle-component-details-component-title')]/a/@href")
bundle_asin_component_list = []
if href_asin_list:
bundle_component_asin_list = []
for href_asin in href_asin_list:
i_asin_list = re.findall(r'(?:[A-Z0-9]{10}|[0-9]{10})', href_asin)
bundle_component_asin_list.append(i_asin_list[0])
if bundle_component_asin_list:
bundle_component_asin_list = list(set(bundle_component_asin_list))
for bundle_component_asin in bundle_component_asin_list:
print('bundle_component_asin:', bundle_component_asin)
bundle_title_list = self.response_s.xpath(
f"//a[contains(@href,'{bundle_component_asin}')]/parent::div[contains(@id,'component-details-component-title')]/a/text()")
bundle_asin_title = bundle_title_list[0] if bundle_title_list else None
bundle_img_list = self.response_s.xpath(f"//a[contains(@href,'{bundle_component_asin}')]/img/@src")
bundle_asin_img = bundle_img_list[0] if bundle_img_list else None
bundle_review_list = self.response_s.xpath(
rf"//a[contains(@href,'{bundle_component_asin}')]/i[contains(@class,'component-details-component-review')]//following-sibling::span/text()")
bundle_asin_review = bundle_review_list[0] if bundle_review_list else None
bundle_starslist = self.response_s.xpath(
rf"//a[contains(@href,'{bundle_component_asin}')]/i[contains(@class,'component-details-component-review-stars')]/@class")
bundle_stars = bundle_starslist[0] if bundle_starslist else None
bundle_stars_list = re.findall(r'a-star-(.*?) ', bundle_stars)
bundle_asin_star = bundle_stars_list[0].replace('-', '.') if bundle_stars_list else None
bundle_asin_price_list = self.response_s.xpath(
f"//a[contains(@href,'{bundle_component_asin}')]/parent::div/following-sibling::div[contains(@class,'component-details-component-prices')]/span/text()")
bundle_asin_price = bundle_asin_price_list[0] if bundle_asin_price_list else None
bundle_asin_point_list = self.response_s.xpath(
f"//a[contains(@href,'{bundle_component_asin}')]/parent::div/following-sibling::ul/li[contains(@id,'component-details-component-bullet-point')]/span/text()")
bundle_asin_point = '|-|'.join(bundle_asin_point_list) if bundle_asin_point_list else None
bundle_component_asin_item = {"bundle_component_asin":bundle_component_asin,"bundle_asin_title":bundle_asin_title,
'bundle_asin_img':bundle_asin_img,"bundle_asin_review":bundle_asin_review,
"bundle_asin_star":bundle_asin_star,"bundle_asin_price":bundle_asin_price,
"bundle_asin_point":bundle_asin_point}
bundle_asin_component_list.append(bundle_component_asin_item)
if bundle_asin_component_list:
bundle_asin_component_json = json.dumps(bundle_asin_component_list)
else:
bundle_asin_component_json = None
# 五点描述
for i in ASIN_XPATH['five_data']:
five_text_list = self.response_s.xpath(i)
......@@ -2815,7 +2856,7 @@ class ParseAsinUs(object):
'customer_reviews_json': customer_reviews_json, 'together_asin_json': together_asin_json,
'min_match_asin_json': min_match_asin_json, 'seller_json': seller_json, 'current_asin': current_asin,
'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json,
'video_m3u8': video_m3u8, 'result_list_json': result_list_json}
'video_m3u8': video_m3u8, 'result_list_json': result_list_json,'bundle_asin_component_json':bundle_asin_component_json}
if self.site_name == 'us':
item['three_four_val'] = Join_Prime_int
elif self.site_name in ['uk', 'fr', 'it', 'es']:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment