# temu.py 4.9 KB
import random
import traceback

from curl_cffi import requests

from fake_useragent import UserAgent

# Module-level fake_useragent generator; `ua.random` yields a random
# User-Agent string.
# NOTE(review): in the code visible here it is only referenced by the
# commented-out scratch snippet below -- TemuSpider creates its own
# instance in __init__. Confirm nothing else uses it before removing.
ua = UserAgent()
# """
# 170.106.158.134:21527
# 101.32.166.145:24770
# 101.32.166.145:30253
# 43.157.122.238:30062
# 101.32.166.145:41984
#
# """
# IP = "170.106.158.134:30535"
# proxies = {
#         'http': f'http://{IP}',
#         'https': f'http://{IP}'
#     }
#     # proxiec
# cookies = {
#     # 'AccessToken': "MY2VLHWMWBQIAOJ6UCJBSZ33NKX224OSXVF26WFMFYMLD2Y6PNQ01108dba2955"
# }
#
# url_start = "https://www.temu.com/search_result.html?search_key="
#
# headers = {
#     # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
#     'user-agent': ua.random,
#     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
#     'accept-language': 'zh-CN,zh;q=0.9',
# }
# key_words = "mini astronautas paracaidistas con paracaídas juguete volador favor de fiesta espacial regalos voladores requeridos ni montaje sin batería rellenos para bolsas de regalos"
# url = url_start + key_words
# url = "https://www.temu.com/search_result.html?search_key=mini%20astronautas%20paracaidistas%20con%20paraca%C3%ADdas%20juguete%20volador%20favor%20de%20fiesta%20espacial%20regalos%20voladores%20requeridos%20ni%20montaje%20sin%20bater%C3%ADa%20rellenos%20para%20bolsas%20de%20regalos"
# r = requests.get(url, headers=headers, proxies=proxies, timeout=600, verify=False, impersonate="chrome110")
# print(r.text)
#
# # Save the response content as a HTML file
# with open('./html/output2.html', 'w', encoding='utf-8') as f:
#     f.write(r.text)


import os
import sys
import numpy as np
import pandas as pd
# Must be set before the pyspark_job import below runs.
# NOTE(review): presumably silences PyArrow's timezone warning -- confirm.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# sys.path[0] is the directory of the running script; adding its parent lets
# the `pyspark_job` package imported on the next line be resolved.
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark_job.utils.templates_mysql import TemplatesMysql


class TemuSpider(object):
    """Download Temu search-result pages for a batch of ASIN titles.

    Workflow (see ``run``): read ``(asin, new_title)`` pairs from MySQL,
    load the proxy pool, then fetch one search page per ASIN through a
    proxy and save it as ``./html/<asin>.html``.
    """

    # Directory the fetched pages are written to (relative to the CWD).
    html_dir = './html'

    def __init__(self):
        # Database handle passed to pandas.read_sql as ``con``.
        self.engine = TemplatesMysql().engine
        self.df_read = pd.DataFrame()
        # Both attributes are filled with plain lists later, so start them
        # as lists instead of DataFrames.
        self.asin_title_list = []
        self.ip_list = []
        # NOTE(review): presumably the proxy provider's "get IPs" endpoint;
        # nothing in this class requests it yet -- get_ip() uses a fixed pool.
        self.ip_url = "https://tq.lunaproxy.com/getflowip?neek=1050636&num=50&type=1&sep=1&regions=ca&ip_si=2&level=1&sb="

        self.url_start = "https://www.temu.com/search_result.html?search_key="
        self.ua = UserAgent()

    def read_data(self):
        """Load the ASIN rows to crawl and cache them as (asin, title) pairs.

        Populates ``self.df_read`` with the query result and
        ``self.asin_title_list`` with a list of ``(asin, new_title)`` tuples.
        """
        sql = """
                SELECT * from us_self_asin_top WHERE asin in (
        SELECT DISTINCT(asin) from us_self_asin_compet_amazon WHERE state=3
        )  limit 100;
        """
        self.df_read = pd.read_sql(sql, con=self.engine)
        print("self.df_read.shape:", self.df_read.shape)
        # Materialise the pairs: a bare zip() is exhausted after one pass,
        # which would make the attribute silently empty on any second use.
        self.asin_title_list = list(zip(self.df_read.asin, self.df_read.new_title))

    def get_ip(self):
        """Populate ``self.ip_list`` with the ``host:port`` proxy pool."""
        self.ip_list = [
            "43.157.119.224:28668",
            "43.157.119.224:28634",
            "43.157.119.224:28778",
            "43.157.119.224:28637",
            "43.157.119.224:28598",
            "43.157.119.224:28865",
            "43.157.119.224:28845",
            "43.157.119.224:28825",
        ]

    def _build_url(self, title):
        """Return the search URL for *title* with the query percent-encoded."""
        # Local import keeps this class self-contained (no new file-level import).
        from urllib.parse import quote

        # safe="" also escapes '/', '&', '#', '?' ... which would otherwise
        # truncate or split the search_key parameter; spaces become %20.
        return f'{self.url_start}{quote(title, safe="")}'

    @staticmethod
    def _proxies_for(ip):
        """Return the proxies mapping that routes http and https via *ip*."""
        return {
            'http': f'http://{ip}',
            'https': f'http://{ip}'
        }

    def _fetch(self, asin, url, proxy=None):
        """Fetch *url* through a proxy and write the body to ``<asin>.html``.

        A random entry of ``self.ip_list`` is used unless *proxy* is given.
        Any network or file error propagates to the caller.
        """
        headers = {
            'user-agent': self.ua.random,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        ip = proxy or random.choice(self.ip_list)
        print("ip:", ip)
        print("url:", url)
        # NOTE: verify=False disables TLS certificate verification (kept
        # from the original code; the traffic goes through the proxy).
        r = requests.get(url, headers=headers, proxies=self._proxies_for(ip), timeout=60, verify=False, impersonate="chrome110")
        with open(f'{self.html_dir}/{asin}.html', 'w', encoding='utf-8') as f:
            f.write(r.text)

    def run(self, max_retries=None, proxy=None):
        """Fetch and save the search page of every ASIN returned by read_data.

        Args:
            max_retries: maximum attempts per ASIN; ``None`` (default) keeps
                retrying until the fetch succeeds.
            proxy: optional ``host:port`` to use for every request instead of
                a random pool entry (handy for debugging a single proxy).
        """
        self.read_data()
        self.get_ip()
        # Without the directory every write fails and the retry loop would
        # keep re-downloading the same page forever.
        os.makedirs(self.html_dir, exist_ok=True)
        for asin, title in self.asin_title_list:
            # NULL/NaN titles can never succeed, so retrying them is pointless.
            if not isinstance(title, str) or not title.strip():
                print("skip, no usable title:", asin, title)
                continue
            print("asin, title:", asin, title)
            url = self._build_url(title)
            attempt = 0
            while max_retries is None or attempt < max_retries:
                attempt += 1
                try:
                    self._fetch(asin, url, proxy)
                    break
                except Exception as e:
                    # Best-effort crawl: log the failure and retry, picking
                    # a new proxy on the next attempt.
                    print(e, traceback.format_exc())
            else:
                # Only reached when the attempts ran out without a success.
                print("giving up after", attempt, "attempts:", asin)


if __name__ == '__main__':
    # Script entry point: build the spider and crawl the queued ASINs.
    TemuSpider().run()