Commit 000d315d by Peng

no message

parent 8dd9963f
...@@ -350,7 +350,7 @@ def junglescout_spider(db_base): ...@@ -350,7 +350,7 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8", "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _gcl_au=1.1.420472597.1749119222.719336435.1751886424.1751886424; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751516385,1751886410,1751973053,1752031904; HMACCOUNT=800EBCCFB4C6BBFB; 65722c3d8208b58d42f9=7dc2ebaa5e4a51182da4ade1aacd8dc4; rank-guest-user=6159802571t3e3obe8rwmCywrH0Xq28vOMfd8Q+siSpAi1WiGPGuuMcYrYhXyf/QpgeBCBdgCT; rank-login-user=6159802571t3e3obe8rwmCywrH0Xq28mIqu6gO0eXYPrSqY9RlSIznMsavLuIJkOkjELzcr/d1; rank-login-user-info="eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjYxNTk4MDI1NzF0M2Uzb2JlOHJ3bUN5d3JIMFhxMjhtSXF1NmdPMGVYWVByU3FZOVJsU0l6bk1zYXZMdUlKa09rakVMemNyL2QxIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJJUGFLc3VqMkZsUmpPR1NRQnIxYkJRIiwiaWF0IjoxNzUyMDMxOTE2LCJleHAiOjE3NTIxMTgzMTYsIm5iZiI6MTc1MjAzMTg1Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.mLIjN_qO4K8w18IDVa0GCRY3MODTmJhZlQaPbgBjeYJRPDwteHfkfqFS_GFyLu4svoahzyFRxkdnKhxs1x90QxQ-7QCwjwypbk8On6gMarKl8jopo9sJbZITvk8mrqtoT6N34LZ1ash35iAkIuPZONPMH8_cp5NxiSC70J12fvIT9ZXp-9zvEk6WV8qQ3pRr0yRuGnSsuWjVvDE9WRNpE3ZmYS_EUBroA51yBEPdS8aBThRuuVGt4HuqrPXp9ZwHoiOcRYu1VcQu-wpIAhLfXcnY1vJA3FXm7w_H00DOGZuM9HRcxdg6Fj-2WP5FvCxbE8z5n1-zbQMs_J8JVaVXgQ; ao_lo_to_n="6159802571t3e3obe8rwmCywrH0Xq28osFyhyxlRsfXXDx9AUjMD2qAFgWUPkLF84KewBkZoL5OL21x5jznuxdPNdiJfglPNE7YH03Vk5CofaP+MGH3y8="; _gaf_fp=01fef3c14bfcaf5a01438f74a677e95a; _ga_38NCVF2XST=GS2.1.s1752031904$o47$g1$t1752031923$j41$l0$h1543227925; _ga_CN0F80S6GL=GS2.1.s1752031906$o46$g1$t1752031924$j42$l0$h0; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752035308; JSESSIONID=165F9BAA752FE5B22CCD7C5BB7B62F2F', 'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1752031904,1752460043,1752653436,1753353401; HMACCOUNT=800EBCCFB4C6BBFB; 894cdd1d9741ce0c9757=827b7d3d13ed7bd6b4b1b24d0246b3dc; 3d854e1bcd61963fdf05=38fcb3b742a48aa345ddfd7136bc60ee; _gaf_fp=f297033bfe53aa9891ffe2842271566b; _gcl_au=1.1.420472597.1749119222.1054917286.1753685435.1753685437; rank-guest-user=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9lihUCQIaVmrHXjbpSRP/Ca0F; rank-login-user=6303473571KK6FnhfedvWg9tSSyk3xj2GRIc/8HSm4vuPYVHI5vKLXnssgei5ccK1dG8fkQSFI; rank-login-user-info=eyJuaWNrbmFtZSI6IuW4heWTpSIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTgzKioqKjczNDciLCJ0b2tlbiI6IjYzMDM0NzM1NzFLSzZGbmhmZWR2V2c5dFNTeWszeGoyR1JJYy84SFNtNHZ1UFlWSEk1dktMWG5zc2dlaTVjY0sxZEc4ZmtRU0ZJIn0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJKc2pZSlZWeFZzTVptVWFvMzgtZ3RRIiwiaWF0IjoxNzUzNjg1NDM2LCJleHAiOjE3NTM3NzE4MzYsIm5iZiI6MTc1MzY4NTM3Niwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIxNSwicGkiOm51bGwsIm5uIjoi5biF5ZOlIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxODMwNzk2NzM0NyIsImVtIjoiMzE1OTk4MDg5MkBxcS5jb20iLCJtbCI6IkcifQ.EaQ7Md7iVOpjZDogkiS2DlndhFPt3GzL2t33LXnh9Z5Itr3A8scFM_tzrYuzXqF6a-BDIMFe90SdDtU18zs9WTTl6_Phv3AEqcDe6WDfPAhB_KMa15VYAE5-b9d3lgIukKR8ZZyAMpiJzcmIWShmqxrhCNQD0ER3b7idaJpSrJiKnwV-tj6La52WJ6BmVRAk8gst0p5h-SYVnNz9iNaSXLc2Dx-hHZvMVNU27yfbJgKPpzRxgh7TOD7O-cT0WrEoKvTSw9e81gG9bgvKuA_bD-z3ePhgM6prUfceWszD88KH8PcXua9s_8ZM4bgrMyKMHswLtwyLhWePcvtHUp6yyQ; ao_lo_to_n=6303473571KK6FnhfedvWg9tSSyk3xj0WOO7cLm/YtvwwmR8H9liibP9br/hwQ1Dlb4xDZyVPrTQIst5JCVz4PpnUIlDMGE07YVPYBWOm3Hrx4PaVkgaQ=; _ga_38NCVF2XST=GS2.1.s1753685428$o61$g1$t1753685444$j44$l0$h984121357; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1753685445; _ga_CN0F80S6GL=GS2.1.s1753685429$o59$g1$t1753685445$j44$l0$h0; JSESSIONID=F09543D3A3D6F890BAD0F422FCA49942',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
} }
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json" url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
...@@ -368,7 +368,7 @@ def junglescout_spider(db_base): ...@@ -368,7 +368,7 @@ def junglescout_spider(db_base):
response = json.loads(response.text) response = json.loads(response.text)
break break
except: except:
time.sleep(random.uniform(15, 35.75)) time.sleep(random.uniform(15, 30.75))
response_data = response['data'] response_data = response['data']
print('code::', response['code']) print('code::', response['code'])
...@@ -387,7 +387,7 @@ def junglescout_spider(db_base): ...@@ -387,7 +387,7 @@ def junglescout_spider(db_base):
print('获取数据:', category_name['name'], i, est, year_month) print('获取数据:', category_name['name'], i, est, year_month)
sales = int(est) sales = int(est)
name_rnak_list.append((category_name['name'], i, sales, year_month)) name_rnak_list.append((category_name['name'], i, sales, year_month))
time.sleep(random.uniform(20, 75.75)) time.sleep(random.uniform(20, 65.75))
# break # break
for i in range(4): for i in range(4):
try: try:
...@@ -408,7 +408,7 @@ def junglescout_spider(db_base): ...@@ -408,7 +408,7 @@ def junglescout_spider(db_base):
cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql cursor_us_mysql_db, db_us = db_class_us.us_mysql_db() # us 站点 mysql
time.sleep(20) time.sleep(20)
print('当前完成。获取下一个分类销量') print('当前完成。获取下一个分类销量')
time.sleep(random.uniform(120, 240.5)) time.sleep(random.uniform(90, 200.5))
def save_site_category(site_bsr_dict=None): def save_site_category(site_bsr_dict=None):
......
...@@ -85,12 +85,12 @@ def junglescout_spider(db_base): ...@@ -85,12 +85,12 @@ def junglescout_spider(db_base):
"Accept-Encoding": "gzip, deflate, br, zstd", "Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8", "Accept-Language": "zh-CN,zh-TW;q=0.9,zh;q=0.8",
"Cache-Control": "no-cache", "Cache-Control": "no-cache",
'Cookie': '_ga=GA1.1.19240078.1751854600; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751854601; HMACCOUNT=28ABEEABEFA97E4A; _gcl_au=1.1.536675967.1751854601; MEIQIA_TRACK_ID=2zWlEnsYAqnZRdhJqJ5txX7tpXm; MEIQIA_VISIT_ID=2zWlEmUkBQV745rliAtXEdAk0CJ; ecookie=ZyZ05gxOxlDTPkM1_CN; 8f00639f9c446a2d0213=54fb71d3f2c9e8acb7878e0f73abbf33; _fp=65dbbe41a37f8f9fbe702eba96328267; _gaf_fp=e03eac62da4f8988dc796341e1bd822c; current_guest=jsxcNvsgBJO1_250707-100340; rank-login-user=502219157192wVgAJpdturGN5Im+nPDQqTtoVYwVNo1oWP9MD0mtMHFwS3LrhtAUhuCnvMHsCl; rank-login-user-info="eyJuaWNrbmFtZSI6IuWViuWTiOWTiOWTiCIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTUzKioqKjEyNzAiLCJ0b2tlbiI6IjUwMjIxOTE1NzE5MndWZ0FKcGR0dXJHTjVJbStuUERRcVR0b1ZZd1ZObzFvV1A5TUQwbXRNSEZ3UzNMcmh0QVVodUNudk1Ic0NsIn0="; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiIwZ01FdlJuNWJ1dlZhVW5IZ1lKSDFRIiwiaWF0IjoxNzUxODU0NjA1LCJleHAiOjE3NTE5NDEwMDUsIm5iZiI6MTc1MTg1NDU0NSwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTQ2NjIwMSwicGkiOm51bGwsIm5uIjoi5ZWK5ZOI5ZOI5ZOIIiwic3lzIjoiU1NfQ04iLCJlZCI6Ik4iLCJwaG4iOiIxNTM2ODA1MTI3MCIsImVtIjoibWVpeW91bGFAbWVpeW91bGEuY29tIiwibWwiOiJHIn0.Ujr6_K3vHIQRw3x52QAQdTftMy6GbZ_TunmFMgW76onCy3EkBzx7uxEv-42zRRXgKLMUfJz2t0ierqXV6Evh9i-o5F0ZUBREzm48LHpGSw6Iupjx4Udc3VQwVqgiUOmYBvnTAQqmaj6iA5l06zAZcVNHQASZ5xe5QFUCllIOL0m8tf3Xad6T8u5oLHRHTTuyy5nDAqLu6ZxVOqUYYXsIzq9H2qAsPhqIgRy_5Av1zyoAcQErddadCe25H_ILmKO0Az9ANIFg4o1r_is_VFVZpGvbz8nCN0JLuY3uajAjf2JXoEzhHT9YbMP0o2TrZDRPdORV3HVK1N5uvghRaRyJvw; ao_lo_to_n="502219157192wVgAJpdturGN5Im+nPDfbd9htCMUGF/tdMS8/gmBNzv9/utYT5ucwmHHPC71S6i4RnT3fLUZW/nDI61eZx1uqLqr+hBy0X/aeJ6c/sSSc="; rank-guest-user=502219157192wVgAJpdturGN5Im+nPDYsyQgRxjbXtKYdDjju8ax0OkcsNUNGWP3xY6uiwKVVO; JSESSIONID=96FF611DCBDF20B9C6C921EAD2A55205; _ga_38NCVF2XST=GS2.1.s1751854600$o1$g1$t1751854612$j48$l0$h1855838417; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1751854612; _ga_CN0F80S6GL=GS2.1.s1751854600$o1$g1$t1751854613$j47$l0$h0', 'Cookie': '_ga=GA1.1.522737765.1749119222; _fp=65dbbe41a37f8f9fbe702eba96328267; MEIQIA_TRACK_ID=2y5KvHOzkFTlJAhOLENKAKWsOeb; MEIQIA_VISIT_ID=2y5KvGrMsL4O61rUcCdsLjChlRa; current_guest=r0hgXGqjbSw0_250605-186810; ecookie=xOHgcnYmcZIZKG0z_CN; x-hng=lang=zh-CN&domain=www.sellersprite.com; a40ac813159995d028ba=3d9b7c15f5787387e62acd734f598f23; Hm_lvt_e0dfc78949a2d7c553713cb5c573a486=1751973053,1752031904,1752460043,1752653436; HMACCOUNT=800EBCCFB4C6BBFB; rank-guest-user=8301172571YFpPM/DhYDVQzRAgRu7tcQTFTi48nSnOk/TNMkep2gdtR77QXyNfDPmFlYbdSsdL; rank-login-user=8301172571YFpPM/DhYDVQzRAgRu7tcWqD2KCbe1WiKcOarfxTCdls3AJ9lNFy+VA8a+RTm195; rank-login-user-info=eyJuaWNrbmFtZSI6Iuilv+mXqOWQuembqiIsImlzQWRtaW4iOmZhbHNlLCJhY2NvdW50IjoiMTMzKioqKjU0MDciLCJ0b2tlbiI6IjgzMDExNzI1NzFZRnBQTS9EaFlEVlF6UkFnUnU3dGNXcUQyS0NiZTFXaUtjT2FyZnhUQ2RsczNBSjlsTkZ5K1ZBOGErUlRtMTk1In0=; Sprite-X-Token=eyJhbGciOiJSUzI1NiIsImtpZCI6IjE2Nzk5NjI2YmZlMDQzZTBiYzI5NTEwMTE4ODA3YWExIn0.eyJqdGkiOiJLcVRRV2RPbVNNcjlKTU1qYTdXRjFRIiwiaWF0IjoxNzUyNjUzNDM4LCJleHAiOjE3NTI3Mzk4MzgsIm5iZiI6MTc1MjY1MzM3OCwic3ViIjoieXVueWEiLCJpc3MiOiJyYW5rIiwiYXVkIjoic2VsbGVyU3BhY2UiLCJpZCI6MTMzNDkzLCJwaSI6bnVsbCwibm4iOiLopb_pl6jlkLnpm6oiLCJzeXMiOiJTU19DTiIsImVkIjoiTiIsInBobiI6IjEzMzkyNDE1NDA3IiwiZW0iOiJxcTE2NTMxMjE4NjUzQDE2My5jb20iLCJtbCI6IkcifQ.caY2QxTbtUVg7CQXvNJcmVo1YU0TGy3AD01dIddF76PHjYbbFh5a8zZAdAXnAKM1wNcs39d1MM8Wa-uoXHiitqDlCZsWyF9aXzco9L4wn-yU4xlMYsf7VoquZI6bxaMT2TNeX6vgQBod-NeXHYFpZQWdrH5sfZHQypkpRINb_o1QwaWvZrjufj1UwYdiypryBxTDyCuLfD4djU0PLMRXvifY6Ef86VNjAlsY8gFqDdHiVLixR2GWGdKRtoG74Ak5DX2eMDT6ak-OMrWYOaikthxIXiqdADTq2tvUCmjO4pE0oYnWhSEx9-UABo7jxJ0v_Af8B6AVu7ccC0NUUvWBMA; ao_lo_to_n=8301172571YFpPM/DhYDVQzRAgRu7tca/7vKUOAtDW4w4LhsAzrvlsqk8xCK+opMY27DGtrDKlwUwhqg///+C6QOw12iRKNIq9mCOV5+ORmOA+PwqisF4=; _gaf_fp=0f3f9e0c791b5513d38aa715d0624aab; _gcl_au=1.1.420472597.1749119222.448034571.1752653439.1752653439; JSESSIONID=0F617D64E2FD6DD92F3BB10935E3C846; _ga_38NCVF2XST=GS2.1.s1752653436$o51$g1$t1752653450$j46$l0$h366949276; Hm_lpvt_e0dfc78949a2d7c553713cb5c573a486=1752653451; _ga_CN0F80S6GL=GS2.1.s1752653437$o50$g1$t1752653451$j46$l0$h0',
"User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
} }
url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json" url = "https://www.sellersprite.com/v2/tools/sales-estimator/bsr.json"
data = { data = {
"station": "US", "station": "UK",
"cid": category_name['c_id'], # 分类id "cid": category_name['c_id'], # 分类id
"bsr": f"{i}" # 排名 "bsr": f"{i}" # 排名
} }
...@@ -167,7 +167,7 @@ def save_site_category(site_bsr_dict=None): ...@@ -167,7 +167,7 @@ def save_site_category(site_bsr_dict=None):
def run(): def run():
# get_cid() # get_cid()
junglescout_spider('us') junglescout_spider('uk')
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -158,7 +158,7 @@ if __name__ == '__main__': ...@@ -158,7 +158,7 @@ if __name__ == '__main__':
month = 7 month = 7
engine_db_num = 14 engine_db_num = 14
# for site in ['de','uk']: # for site in ['de','uk']:
for site in ['uk']: for site in ['us']:
time.sleep(0) time.sleep(0)
count_all_syn_st_id(site_name=site,month=month).get_minid_maxid() count_all_syn_st_id(site_name=site,month=month).get_minid_maxid()
# count_all_syn_st_id(site_name=site,month=month,engine_db_num=engine_db_num).search_term_syn() # count_all_syn_st_id(site_name=site,month=month,engine_db_num=engine_db_num).search_term_syn()
......
...@@ -214,7 +214,8 @@ class async_asin_pg(): ...@@ -214,7 +214,8 @@ class async_asin_pg():
'created_time': new_date, 'current_asin': items['current_asin'], 'created_time': new_date, 'current_asin': items['current_asin'],
'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'], 'parent_asin': items["parentAsin"], 'div_id_list': items['div_id_list'],
'bundles_this_asins_json': items['bundles_this_asins_data_json'], 'bundles_this_asins_json': items['bundles_this_asins_data_json'],
'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'] 'video_m3u8_url': items["video_m3u8"], 'result_list_json': items['result_list_json'],
'bundle_asin_component_json':items['bundle_asin_component_json']
} }
if self.site_name in ['uk', 'de', 'fr', 'es', 'it']: if self.site_name in ['uk', 'de', 'fr', 'es', 'it']:
item['five_six_val'] = items['five_six_val'] item['five_six_val'] = items['five_six_val']
...@@ -222,8 +223,6 @@ class async_asin_pg(): ...@@ -222,8 +223,6 @@ class async_asin_pg():
item['five_six_val'] = None item['five_six_val'] = None
# 第二次请求 # 第二次请求
_response_text = None _response_text = None
# if (item['weight'] is None and item['volume'] is None and item['rank'] is None and item[
# 'launch_time'] is None) or (item['variat_num'] > 0 and is_variat == '0'):
if item['variat_num'] > 0 and is_variat == '0': if item['variat_num'] > 0 and is_variat == '0':
self.request_total_count_list.append(4) self.request_total_count_list.append(4)
if item['variat_num'] > 0: if item['variat_num'] > 0:
...@@ -478,7 +477,7 @@ class async_asin_pg(): ...@@ -478,7 +477,7 @@ class async_asin_pg():
def run(self): def run(self):
asin_list = self.save_asin_detail.read_db_data() asin_list = self.save_asin_detail.read_db_data()
# asin_list = ['B07BXM8RZ3|2025-01|1|1|null|null','B07FM8P1Z1|2025-01|1|1|null|null','B07TWHCK69|2025-01|1|1|null|null'] # asin_list = ['B0BPKK2BMN|2025-01|1|1|null|null']
if asin_list: if asin_list:
for asin in asin_list: for asin in asin_list:
self.queries_asin_queue.put(asin) self.queries_asin_queue.put(asin)
......
...@@ -14,7 +14,7 @@ import html ...@@ -14,7 +14,7 @@ import html
import re import re
from html import unescape from html import unescape
import urllib.parse import urllib.parse
from sqlalchemy import text
class recall_cases(): class recall_cases():
...@@ -90,6 +90,15 @@ class recall_cases(): ...@@ -90,6 +90,15 @@ class recall_cases():
if response_detail: if response_detail:
recall_date_list = response_detail.xpath("//div[contains(text(),'Recall Date:')]/parent::div/text()") recall_date_list = response_detail.xpath("//div[contains(text(),'Recall Date:')]/parent::div/text()")
product_title_list = response_detail.xpath("//div[contains(text(),'Name of Product:')]/parent::div/text()") product_title_list = response_detail.xpath("//div[contains(text(),'Name of Product:')]/parent::div/text()")
if product_title_list:
matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title_list[-1].strip())
if matches:
brand = ','.join(matches)
else:
brand = None
else:
brand = None
hazard_list = response_detail.xpath("//div[contains(text(),'危险:')]/parent::div//p//text()") hazard_list = response_detail.xpath("//div[contains(text(),'危险:')]/parent::div//p//text()")
image_url_list = response_detail.xpath("//div[@id='recall-gallery-img']//li/img/@src") image_url_list = response_detail.xpath("//div[@id='recall-gallery-img']//li/img/@src")
recall_date = recall_date_list[-1].strip() if recall_date_list else None # 召回日期 recall_date = recall_date_list[-1].strip() if recall_date_list else None # 召回日期
...@@ -98,7 +107,7 @@ class recall_cases(): ...@@ -98,7 +107,7 @@ class recall_cases():
image_url = 'https://www.cpsc.gov' + image_url_list[0].strip() if image_url_list else None # 图片 image_url = 'https://www.cpsc.gov' + image_url_list[0].strip() if image_url_list else None # 图片
if recall_date: if recall_date:
recall_date = self._parse_date_str(recall_date) recall_date = self._parse_date_str(recall_date)
data_list = ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href] data_list = ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href,brand]
return data_list return data_list
else: else:
return None return None
...@@ -140,7 +149,7 @@ class recall_cases(): ...@@ -140,7 +149,7 @@ class recall_cases():
df = pd.DataFrame(data=save_data_list, df = pd.DataFrame(data=save_data_list,
columns=['data_type', 'recall_date', 'product_title', 'hazard', columns=['data_type', 'recall_date', 'product_title', 'hazard',
'image_url', 'image_url',
'ext_url', 'recall_title', 'country']) 'ext_url','brand', 'recall_title', 'country'])
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False) df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
except: except:
is_None = False is_None = False
...@@ -207,6 +216,7 @@ class recall_cases(): ...@@ -207,6 +216,7 @@ class recall_cases():
dict_item = response.json() dict_item = response.json()
data_lists = dict_item['data'] data_lists = dict_item['data']
for data in data_lists: for data in data_lists:
print(data,'344444444')
data_list = [] data_list = []
try: try:
# 逐项解码 # 逐项解码
...@@ -234,18 +244,24 @@ class recall_cases(): ...@@ -234,18 +244,24 @@ class recall_cases():
response2 = self._request(headers=headers, url=url) response2 = self._request(headers=headers, url=url)
response_detail = etree.HTML(response2.text) response_detail = etree.HTML(response2.text)
src_list = response_detail.xpath("//div[@id='recall-photos']//img/@src") src_list = response_detail.xpath("//div[@id='recall-photos']//img/@src")
Brand_list = response_detail.xpath("//div[contains(text(),'Brand Name')]/following-sibling::div//text()")
if Brand_list:
brand = ''.join(Brand_list).strip()
else:
brand = None
print(brand,'Brand_list::',Brand_list)
if src_list: if src_list:
image_url = 'https://www.fda.gov' + src_list[0] image_url = 'https://www.fda.gov' + src_list[0]
else: else:
image_url = None image_url = None
print('image_url:', image_url) print('image_url:', image_url)
data_list.append(['us_fba_recalls', date, link_text, hazard, image_url, url, recall_title, 'us', data_list.append(['us_fba_recalls', date, link_text, hazard, image_url, url, recall_title, 'us',
product_category]) product_category,brand])
try: try:
df = pd.DataFrame(data=data_list, df = pd.DataFrame(data=data_list,
columns=['data_type', 'recall_date', 'product_title', 'hazard', columns=['data_type', 'recall_date', 'product_title', 'hazard',
'image_url', 'image_url',
'ext_url', 'recall_title', 'country', 'product_category']) 'ext_url', 'recall_title', 'country', 'product_category','brand'])
df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True) df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False) df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
except: except:
...@@ -260,7 +276,7 @@ class recall_cases(): ...@@ -260,7 +276,7 @@ class recall_cases():
def ec_europa_eu(self): def ec_europa_eu(self):
'欧盟召回' '欧盟召回'
for i in range(0, 32): for i in range(1, 33):
url = 'https://ec.europa.eu/safety-gate-alerts/public/api/notification/carousel/?' url = 'https://ec.europa.eu/safety-gate-alerts/public/api/notification/carousel/?'
data = {"language": "en", "page": f"{i}"} data = {"language": "en", "page": f"{i}"}
headers = { headers = {
...@@ -284,6 +300,7 @@ class recall_cases(): ...@@ -284,6 +300,7 @@ class recall_cases():
print(data, '请求列表页url:', url) print(data, '请求列表页url:', url)
is_None = True is_None = True
response = requests.post(url, headers=headers, impersonate="chrome120", timeout=120, json=data) response = requests.post(url, headers=headers, impersonate="chrome120", timeout=120, json=data)
print(response.url)
if response: if response:
content = response.json()['content'] content = response.json()['content']
for ids in content: for ids in content:
...@@ -314,11 +331,17 @@ class recall_cases(): ...@@ -314,11 +331,17 @@ class recall_cases():
recall_title = items['product']['versions'][0]['description'] recall_title = items['product']['versions'][0]['description']
print(product_title) print(product_title)
print(recall_title) print(recall_title)
brands = items['product']['brands']
if brands:
brand = brands[0].get('brand')
else:
brand = None
print('brand::1',brand)
hazard = items['risk']['versions'][0]['riskDescription'] hazard = items['risk']['versions'][0]['riskDescription']
print(hazard) print(hazard)
ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(items['id']) + '?lang=en' ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(
print('ext_url::',ext_url) items['id']) + '?lang=en'
print('ext_url::', ext_url)
if items['product']['photos']: if items['product']['photos']:
image_id = items['product']['photos'][0]['id'] image_id = items['product']['photos'][0]['id']
image_url = f'https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{image_id}' image_url = f'https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{image_id}'
...@@ -326,30 +349,36 @@ class recall_cases(): ...@@ -326,30 +349,36 @@ class recall_cases():
image_url = None image_url = None
print(image_url) print(image_url)
data_list.append( data_list.append(
[date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu', ext_url, [date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu',
data_json]) ext_url,data_json,brand])
# try:
# df = pd.DataFrame(data=data_list, keys = [
# columns=['recall_date', 'product_category', 'product_title', 'recall_title', "recall_date", "product_category", "product_title", "recall_title",
# 'hazard', 'data_type', 'image_url', "hazard", "data_type", "image_url", "country", "ext_url", "data_json", "brand"
# 'country', 'ext_url', 'data_json']) ]
# df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
# df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False) # 把 list of list 转成 list of dict
dict_list = [dict(zip(keys, row)) for row in data_list]
with self.mysql_db.begin() as conn: with self.mysql_db.begin() as conn:
conn.execute( conn.execute(
f"insert into recall_cases_data (recall_date, product_category, product_title,recall_title,hazard," text("""
f"data_type,image_url,country,ext_url,data_json) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE recall_date = values(recall_date),product_title=values (product_title),ext_url=values (ext_url)", INSERT INTO recall_cases_data
data_list) (recall_date, product_category, product_title, recall_title, hazard, data_type, image_url, country, ext_url, data_json, brand)
# except: VALUES (:recall_date, :product_category, :product_title, :recall_title, :hazard, :data_type, :image_url, :country, :ext_url, :data_json, :brand)
# is_None = False ON DUPLICATE KEY UPDATE
# break recall_date = VALUES(recall_date),
product_title = VALUES(product_title),
ext_url = VALUES(ext_url)
"""),
dict_list
)
if is_None == False: if is_None == False:
break break
else: else:
break break
time.sleep(random.uniform(2, 8)) time.sleep(random.uniform(2, 8))
def ec_europa_uk(self): def ec_europa_uk(self):
'https://www.gov.uk/product-safety-alerts-reports-recalls?page=2' 'https://www.gov.uk/product-safety-alerts-reports-recalls?page=2'
url = 'https://www.gov.uk/product-safety-alerts-reports-recalls' url = 'https://www.gov.uk/product-safety-alerts-reports-recalls'
...@@ -392,6 +421,8 @@ class recall_cases(): ...@@ -392,6 +421,8 @@ class recall_cases():
print('产品标题:', product_title) print('产品标题:', product_title)
hazard_list = resp_html.xpath("//p[contains(text(),'Hazard:')]/text()") hazard_list = resp_html.xpath("//p[contains(text(),'Hazard:')]/text()")
print('风险:', hazard_list) print('风险:', hazard_list)
Brand_list = resp_html.xpath("//td[contains(text(),'Brand')]/following-sibling::td/text()")
brand = Brand_list[0].strip() if Brand_list else None
image_url_list = resp_html.xpath("//span[@class='attachment-inline']/a/@href") image_url_list = resp_html.xpath("//span[@class='attachment-inline']/a/@href")
product_category = product_category[0].strip() if product_category else None product_category = product_category[0].strip() if product_category else None
product_title = product_title[0].strip().replace('Product:', '') if product_title else None product_title = product_title[0].strip().replace('Product:', '') if product_title else None
...@@ -399,13 +430,13 @@ class recall_cases(): ...@@ -399,13 +430,13 @@ class recall_cases():
image_url_list = image_url_list[0].strip() if image_url_list else None image_url_list = image_url_list[0].strip() if image_url_list else None
data_list.append( data_list.append(
[recall_title, detail_url, recall_date, product_category, product_title, [recall_title, detail_url, recall_date, product_category, product_title,
hazard_list, image_url_list, 'uk_recall', 'uk']) hazard_list, image_url_list, 'uk_recall', 'uk',brand])
if data_list: if data_list:
try: try:
df = pd.DataFrame(data=data_list, df = pd.DataFrame(data=data_list,
columns=['recall_title', 'ext_url', 'recall_date', 'product_category', columns=['recall_title', 'ext_url', 'recall_date', 'product_category',
'product_title', 'product_title',
'hazard', 'image_url', 'data_type', 'country']) 'hazard', 'image_url', 'data_type', 'country','brand'])
df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True) df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False) df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
...@@ -554,9 +585,12 @@ class recall_cases(): ...@@ -554,9 +585,12 @@ class recall_cases():
def get_globalrecalls(self): def get_globalrecalls(self):
# sql = 'SELECT data_json FROM global_recalls_data' # sql = 'SELECT data_json FROM global_recalls_data'
# df_data = pd.read_sql(sql, con=self.mysql_db) # df_data = pd.read_sql(sql, con=self.mysql_db)
list_url = 'https://globalrecalls.oecd.org/ws/search.xqy?end=20&lang=en&order=desc&q=&sort=date&start=0&uiLang=en' list_url = 'https://globalrecalls.oecd.org/ws/search.xqy?end=20&lang=en&order=desc&q=&sort=date&start=0&uiLang=en'
# list_url = f'https://globalrecalls.oecd.org/ws/search.xqy?end={i}&lang=en&order=desc&q=&sort=date&start={i - 20}&uiLang=en'
print('请求url', list_url)
# 'https://globalrecalls.oecd.org/ws/search.xqy?end=200&lang=en&order=desc&q=&sort=date&start=180&uiLang=en'
headers = { headers = {
'Accept': '*/*', 'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br, zstd', 'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8', 'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
...@@ -574,7 +608,8 @@ class recall_cases(): ...@@ -574,7 +608,8 @@ class recall_cases():
for result in result_list: for result in result_list:
countryId = result['countryId'] countryId = result['countryId']
imageUri = result['imageUri'] imageUri = result['imageUri']
if countryId.lower() in ['us', 'ca', 'mx', 'nl', 'sa', 'se', 'pl', 'tr', 'be', 'uk', 'de', 'es', 'fr', 'it', if countryId.lower() in ['us', 'ca', 'mx', 'nl', 'sa', 'se', 'pl', 'tr', 'be', 'uk', 'de', 'es', 'fr',
'it',
'jp']: 'jp']:
date_time = result['date'] date_time = result['date']
extUrl = result['extUrl'] extUrl = result['extUrl']
...@@ -600,7 +635,11 @@ class recall_cases(): ...@@ -600,7 +635,11 @@ class recall_cases():
if 'ENTITY_NOT_FOUN' in resp.text: if 'ENTITY_NOT_FOUN' in resp.text:
continue continue
items_data = resp.json() items_data = resp.json()
brands = items_data['product']['brands']
if brands:
brand = brands[0].get('brand')
else:
brand = None
time.sleep(random.uniform(1, 3)) time.sleep(random.uniform(1, 3))
items['country'] = countryId items['country'] = countryId
items['reacll_time'] = date_time items['reacll_time'] = date_time
...@@ -616,17 +655,19 @@ class recall_cases(): ...@@ -616,17 +655,19 @@ class recall_cases():
items['data_type'] = 'global_recalls' items['data_type'] = 'global_recalls'
items['product_title'] = re.findall(r'^(.*?)\s*;', title_name + ';')[0] items['product_title'] = re.findall(r'^(.*?)\s*;', title_name + ';')[0]
items['ext_url'] = extUrl items['ext_url'] = extUrl
items['brand'] = brand
data_json = json.dumps(items_data) data_json = json.dumps(items_data)
data_list.append([items['data_type'], items['product_title'], items['productCategory'], data_list.append([items['data_type'], items['product_title'], items['productCategory'],
items['reacll_time'], items['riskDescription'], items['country'], items['reacll_time'], items['riskDescription'], items['country'],
items['image_url'], items['image_url'],
items['recall_title'], items['ext_url'], data_json]) items['recall_title'], items['ext_url'], data_json,items['brand']])
print(items) print('itemsitems::',items)
try: try:
df = pd.DataFrame(data=data_list, df = pd.DataFrame(data=data_list,
columns=['data_type', 'product_title', 'product_category', 'recall_date', columns=['data_type', 'product_title', 'product_category', 'recall_date',
'hazard', 'hazard',
'country', 'image_url', 'recall_title', 'ext_url', 'data_json']) 'country', 'image_url', 'recall_title', 'ext_url', 'data_json','brand'])
df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False) df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
except: except:
print('数据重复=====') print('数据重复=====')
...@@ -635,17 +676,18 @@ class recall_cases(): ...@@ -635,17 +676,18 @@ class recall_cases():
print('没有解析到id') print('没有解析到id')
items = {} items = {}
url = result['uri'] url = result['uri']
items['country'] = countryId # 站点 items['country'] = countryId # 站点
encoded_url = urllib.parse.quote(url) encoded_url = urllib.parse.quote(url)
_url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri='+encoded_url _url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri=' + encoded_url
print('_url::',_url)
resp = requests.get(_url, headers=headers, timeout=60) resp = requests.get(_url, headers=headers, timeout=60)
result = resp.json() result = resp.json()
print("result::",result) print("result::", result)
time.sleep(random.uniform(1, 3)) time.sleep(random.uniform(1, 3))
extUrl = result['recall']['extUrl'] # 详情url extUrl = result['recall']['extUrl'] # 详情url
imageUri = result['recall']['images'][0]['imageUri'] imageUri = result['recall']['images'][0]['imageUri']
encode_imageUri = urllib.parse.quote(imageUri) encode_imageUri = urllib.parse.quote(imageUri)
imaurl = f"https://globalrecalls.oecd.org/ws/getdocument.xqy?uri={encode_imageUri}" # 图片 imaurl = f"https://globalrecalls.oecd.org/ws/getdocument.xqy?uri={encode_imageUri}" # 图片
date_time = result['recall']['date'] date_time = result['recall']['date']
items['reacll_time'] = date_time items['reacll_time'] = date_time
title_name = result['recall']['product.name'] title_name = result['recall']['product.name']
...@@ -653,7 +695,7 @@ class recall_cases(): ...@@ -653,7 +695,7 @@ class recall_cases():
if recall_title is None: if recall_title is None:
recall_title = result['recall']['images'][0]['alt.text'] recall_title = result['recall']['images'][0]['alt.text']
if recall_title: if recall_title:
recall_title.replace('Image of ','') recall_title.replace('Image of ', '')
hazard = result['recall']['hazard'] hazard = result['recall']['hazard']
items['recall_title'] = recall_title items['recall_title'] = recall_title
items['productCategory'] = result['recall']['product.type'] items['productCategory'] = result['recall']['product.type']
...@@ -667,7 +709,7 @@ class recall_cases(): ...@@ -667,7 +709,7 @@ class recall_cases():
items['reacll_time'], items['riskDescription'], items['country'], items['reacll_time'], items['riskDescription'], items['country'],
items['image_url'], items['image_url'],
items['recall_title'], items['ext_url'], data_json]) items['recall_title'], items['ext_url'], data_json])
print('没有解析到id的数据:',items) print('没有解析到id的数据:', items)
try: try:
df = pd.DataFrame(data=data_list, df = pd.DataFrame(data=data_list,
columns=['data_type', 'product_title', 'product_category', 'recall_date', columns=['data_type', 'product_title', 'product_category', 'recall_date',
...@@ -677,6 +719,7 @@ class recall_cases(): ...@@ -677,6 +719,7 @@ class recall_cases():
except: except:
print('没有解析到id 存储 数据重复=====') print('没有解析到id 存储 数据重复=====')
continue continue
def run(self): def run(self):
# self.global_recalls() # self.global_recalls()
self.get_globalrecalls() self.get_globalrecalls()
...@@ -685,25 +728,41 @@ class recall_cases(): ...@@ -685,25 +728,41 @@ class recall_cases():
self.ec_europa_eu() self.ec_europa_eu()
self.ec_europa_uk() self.ec_europa_uk()
self.gov_uk() self.gov_uk()
# """
# 数据类型,属于那个国的
# eu_recall
# global_recalls
# uk_drug_device 1
# uk_recall 2
# us_fba_recalls 3
# with self.mysql_db.begin() as conn: # us_recalls_product
# sql = "SELECT data_json FROM recall_cases_data WHERE data_type='eu_recall'" # """
# df_data = pd.read_sql(sql, con=self.mysql_db) # with self.mysql_db.begin() as conn:
# data_json_list = list(df_data.data_json) # sql = "SELECT id,product_title FROM recall_cases_data WHERE data_type='us_recalls_product'"
# for data_json in data_json_list: # df_data = pd.read_sql(sql, con=self.mysql_db)
# data_dict = json.loads(data_json) # df_data['id'] = df_data['id'].fillna('').astype(str)
# print(data_dict['product']['photos'][0]['id']) # df_data['product_title'] = df_data['product_title'].fillna('').astype(str)
# imgurl = f"https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{data_dict['product']['photos'][0]['id']}" # data_json_list = list(df_data.id+ "|=||+||" + df_data.product_title)
# up_sql = f"update recall_cases_data set image_url ='{imgurl}' WHERE data_type='eu_recall' and image_url like '%%/image/{data_dict['product']['photos'][0]['id']}'" # for data_json_id in data_json_list:
# print(up_sql) # if data_json_id:
# conn.execute(up_sql) # data_json_id_list = data_json_id.split('|=||+||')
# id = data_json_id_list[0]
# product_title = data_json_id_list[1]
# print(product_title)
# if bool(re.search(r'[\u4e00-\u9fff]', product_title)):
# # print(product_title,'23333333')
# matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title.strip())
# if matches:
# brand = ','.join(matches)
# else:
# brand = None
# print(id, brand,'23444444444')
# if brand:
# brand = brand.replace('"','').replace("'","")
# up_sql = f"""update recall_cases_data set brand ="{brand}" WHERE id={id}"""
# print(up_sql)
# conn.execute(up_sql)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -35,7 +35,7 @@ class Parse_asin_html(): ...@@ -35,7 +35,7 @@ class Parse_asin_html():
print('没有该 asin html') print('没有该 asin html')
def search_term_html(self, site_name='us', month='04'): def search_term_html(self, site_name='us', month='04'):
sql = f"SELECT search_term,page,html FROM search_term_html_2025_{month} WHERE search_term='lace white tops for women' and site_name = '{site_name}'" sql = f"SELECT search_term,page,html FROM search_term_html_2025_{month} WHERE search_term='resin kit' and site_name = '{site_name}'"
print(sql) print(sql)
df = pd.read_sql(sql, con=engine_strrocks) df = pd.read_sql(sql, con=engine_strrocks)
print(df.values) print(df.values)
...@@ -52,8 +52,8 @@ class Parse_asin_html(): ...@@ -52,8 +52,8 @@ class Parse_asin_html():
print('没有该 搜索词 html') print('没有该 搜索词 html')
def run(self): def run(self):
self.asin_html() # self.asin_html()
# self.search_term_html(site_name='us',month='04') self.search_term_html(site_name='uk',month='07')
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -402,7 +402,7 @@ class ParseAsinUs(object): ...@@ -402,7 +402,7 @@ class ParseAsinUs(object):
break break
if min_match_asin_data_list: if min_match_asin_data_list:
min_match_asin_json = json.dumps(min_match_asin_data_list, ensure_ascii=False) min_match_asin_json = json.dumps(min_match_asin_data_list, ensure_ascii=False)
# bundles_this_asins ,Bundles with this item B0BPV8R4K8 # bundles_this_asins ,Bundles with this item B0BPV8R4K8 变体下方位置。和五点描述挨着
bundles_this_asins_data_list = [] bundles_this_asins_data_list = []
bundles_this_asins_data_json = None bundles_this_asins_data_json = None
for i in ASIN_XPATH['bundles_this_asins']: for i in ASIN_XPATH['bundles_this_asins']:
...@@ -436,7 +436,48 @@ class ParseAsinUs(object): ...@@ -436,7 +436,48 @@ class ParseAsinUs(object):
break break
if bundles_this_asins_data_list: if bundles_this_asins_data_list:
bundles_this_asins_data_json = json.dumps(bundles_this_asins_data_list, ensure_ascii=False) bundles_this_asins_data_json = json.dumps(bundles_this_asins_data_list, ensure_ascii=False)
# 捆绑销售 B0DD8W2DZD This bundle contains 2 items
href_asin_list = self.response_s.xpath(
"//div[@class='bundle-title']/following-sibling::div//div[@class='bundle-components']//div[contains(@id,'bundle-component-details-component-title')]/a/@href")
bundle_asin_component_list = []
if href_asin_list:
bundle_component_asin_list = []
for href_asin in href_asin_list:
i_asin_list = re.findall(r'(?:[A-Z0-9]{10}|[0-9]{10})', href_asin)
bundle_component_asin_list.append(i_asin_list[0])
if bundle_component_asin_list:
bundle_component_asin_list = list(set(bundle_component_asin_list))
for bundle_component_asin in bundle_component_asin_list:
print('bundle_component_asin:', bundle_component_asin)
bundle_title_list = self.response_s.xpath(
f"//a[contains(@href,'{bundle_component_asin}')]/parent::div[contains(@id,'component-details-component-title')]/a/text()")
bundle_asin_title = bundle_title_list[0] if bundle_title_list else None
bundle_img_list = self.response_s.xpath(f"//a[contains(@href,'{bundle_component_asin}')]/img/@src")
bundle_asin_img = bundle_img_list[0] if bundle_img_list else None
bundle_review_list = self.response_s.xpath(
rf"//a[contains(@href,'{bundle_component_asin}')]/i[contains(@class,'component-details-component-review')]//following-sibling::span/text()")
bundle_asin_review = bundle_review_list[0] if bundle_review_list else None
bundle_starslist = self.response_s.xpath(
rf"//a[contains(@href,'{bundle_component_asin}')]/i[contains(@class,'component-details-component-review-stars')]/@class")
bundle_stars = bundle_starslist[0] if bundle_starslist else None
bundle_stars_list = re.findall(r'a-star-(.*?) ', bundle_stars)
bundle_asin_star = bundle_stars_list[0].replace('-', '.') if bundle_stars_list else None
bundle_asin_price_list = self.response_s.xpath(
f"//a[contains(@href,'{bundle_component_asin}')]/parent::div/following-sibling::div[contains(@class,'component-details-component-prices')]/span/text()")
bundle_asin_price = bundle_asin_price_list[0] if bundle_asin_price_list else None
bundle_asin_point_list = self.response_s.xpath(
f"//a[contains(@href,'{bundle_component_asin}')]/parent::div/following-sibling::ul/li[contains(@id,'component-details-component-bullet-point')]/span/text()")
bundle_asin_point = '|-|'.join(bundle_asin_point_list) if bundle_asin_point_list else None
bundle_component_asin_item = {"bundle_component_asin":bundle_component_asin,"bundle_asin_title":bundle_asin_title,
'bundle_asin_img':bundle_asin_img,"bundle_asin_review":bundle_asin_review,
"bundle_asin_star":bundle_asin_star,"bundle_asin_price":bundle_asin_price,
"bundle_asin_point":bundle_asin_point}
bundle_asin_component_list.append(bundle_component_asin_item)
if bundle_asin_component_list:
bundle_asin_component_json = json.dumps(bundle_asin_component_list)
else:
bundle_asin_component_json = None
# 五点描述 # 五点描述
for i in ASIN_XPATH['five_data']: for i in ASIN_XPATH['five_data']:
five_text_list = self.response_s.xpath(i) five_text_list = self.response_s.xpath(i)
...@@ -2815,7 +2856,7 @@ class ParseAsinUs(object): ...@@ -2815,7 +2856,7 @@ class ParseAsinUs(object):
'customer_reviews_json': customer_reviews_json, 'together_asin_json': together_asin_json, 'customer_reviews_json': customer_reviews_json, 'together_asin_json': together_asin_json,
'min_match_asin_json': min_match_asin_json, 'seller_json': seller_json, 'current_asin': current_asin, 'min_match_asin_json': min_match_asin_json, 'seller_json': seller_json, 'current_asin': current_asin,
'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json, 'div_id_list': div_id_list, 'bundles_this_asins_data_json': bundles_this_asins_data_json,
'video_m3u8': video_m3u8, 'result_list_json': result_list_json} 'video_m3u8': video_m3u8, 'result_list_json': result_list_json,'bundle_asin_component_json':bundle_asin_component_json}
if self.site_name == 'us': if self.site_name == 'us':
item['three_four_val'] = Join_Prime_int item['three_four_val'] = Join_Prime_int
elif self.site_name in ['uk', 'fr', 'it', 'es']: elif self.site_name in ['uk', 'fr', 'it', 'es']:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment