"""Download Temu search-result pages for product titles read from MySQL.

For every (asin, title) row returned by the query in ``TemuSpider.read_data``
the spider requests the Temu search page for that title through a randomly
chosen proxy and stores the raw HTML as ``./html/<asin>.html``.

NOTE: an earlier one-off prototype (a single hard-coded request saved to
``./html/output2.html``) used to live here as commented-out code; it has been
superseded by ``TemuSpider`` below.
"""
import os
import random
import sys
import traceback
from urllib.parse import quote

import numpy as np
import pandas as pd
from curl_cffi import requests
from fake_useragent import UserAgent

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory

from pyspark_job.utils.templates_mysql import TemplatesMysql

# Module-level UserAgent object, kept for backward compatibility with any
# code that imports ``ua`` from this module.
ua = UserAgent()


class TemuSpider(object):
    """Fetch Temu search pages for ASIN titles and save them as HTML files."""

    # Directory the downloaded pages are written to.
    html_dir = "./html"

    def __init__(self):
        # SQLAlchemy-style engine exposed by the project helper; passed to
        # ``pd.read_sql`` in ``read_data``.
        self.engine = TemplatesMysql().engine
        self.df_read = pd.DataFrame()
        # (asin, title) pairs filled by ``read_data``.
        self.asin_title_list = []
        # "host:port" proxy strings filled by ``get_ip``.
        self.ip_list = []
        # Proxy-provider endpoint; not used by the visible code (``get_ip``
        # currently relies on a static list).
        self.ip_url = "https://tq.lunaproxy.com/getflowip?neek=1050636&num=50&type=1&sep=1&regions=ca&ip_si=2&level=1&sb="
        self.url_start = "https://www.temu.com/search_result.html?search_key="
        self.ua = UserAgent()

    def read_data(self):
        """Load up to 100 rows from MySQL and build the (asin, title) list."""
        sql = """
        SELECT * from us_self_asin_top
        WHERE asin in (
            SELECT DISTINCT(asin) from us_self_asin_compet_amazon WHERE state=3
        )
        limit 100;
        """
        self.df_read = pd.read_sql(sql, con=self.engine)
        print("self.df_read.shape:", self.df_read.shape)
        # Materialise the pairs so the list can be iterated more than once.
        self.asin_title_list = list(
            zip(self.df_read.asin, self.df_read.new_title)
        )

    def get_ip(self):
        """Populate ``self.ip_list`` with the static "host:port" proxy list."""
        # ``split()`` (any whitespace) tolerates indentation or spaces
        # between entries, which ``split("\n")`` would leave inside the
        # proxy strings.
        self.ip_list = """
            43.157.119.224:28668
            43.157.119.224:28634
            43.157.119.224:28778
            43.157.119.224:28637
            43.157.119.224:28598
            43.157.119.224:28865
            43.157.119.224:28845
            43.157.119.224:28825
        """.split()

    def _build_search_url(self, title):
        """Return the Temu search URL for *title*, fully percent-encoded."""
        # ``safe=''`` also encodes "/" so the whole title stays inside the
        # ``search_key`` query value; spaces become %20 and non-ASCII
        # characters are encoded as UTF-8.
        return f"{self.url_start}{quote(title, safe='')}"

    def _fetch_and_save(self, asin, url):
        """Request *url* through a random proxy and write the HTML to disk."""
        headers = {
            'user-agent': self.ua.random,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        # Pick a fresh proxy for every attempt so a dead proxy does not
        # block the whole run.
        ip = random.choice(self.ip_list)
        proxies = {
            'http': f'http://{ip}',
            'https': f'http://{ip}',
        }
        print("ip:", ip)
        print("url:", url)
        # NOTE(review): ``verify=False`` disables TLS certificate checks;
        # kept as in the original — confirm it is still required.
        r = requests.get(url, headers=headers, proxies=proxies, timeout=60,
                         verify=False, impersonate="chrome110")
        with open(f'{self.html_dir}/{asin}.html', 'w', encoding='utf-8') as f:
            f.write(r.text)

    def run(self, max_retries=5):
        """Download and save the Temu search page for every loaded ASIN.

        Args:
            max_retries: maximum number of attempts per ASIN before the ASIN
                is skipped. Prevents an endless loop when a failure is
                persistent (for example an unusable title or no working
                proxy).
        """
        self.read_data()
        self.get_ip()
        # Make sure the output directory exists; otherwise every write
        # would fail.
        os.makedirs(self.html_dir, exist_ok=True)

        for asin, title in self.asin_title_list:
            # NULL titles come back from pandas as None/NaN and cannot be
            # turned into a search URL.
            if not isinstance(title, str) or not title.strip():
                print("skip asin without usable title:", asin)
                continue

            url = self._build_search_url(title)
            print("asin, title:", asin, title)
            for attempt in range(1, max_retries + 1):
                try:
                    self._fetch_and_save(asin, url)
                    break
                except Exception as e:
                    # Best-effort crawl: log the failure and retry with
                    # another proxy.
                    print(f"attempt {attempt}/{max_retries} failed:",
                          e, traceback.format_exc())
            else:
                print("giving up on asin after retries:", asin)


if __name__ == '__main__':
    spider = TemuSpider()
    spider.run()