# temu.py
# Last committed by: chenyuanjie
import random
import traceback
from urllib.parse import quote

from curl_cffi import requests
from fake_useragent import UserAgent

# Module-level UserAgent instance. Within this file it is referenced only by
# the commented-out prototype below; TemuSpider creates its own in __init__.
ua = UserAgent()
# """
# 170.106.158.134:21527
# 101.32.166.145:24770
# 101.32.166.145:30253
# 43.157.122.238:30062
# 101.32.166.145:41984
#
# """
# IP = "170.106.158.134:30535"
# proxies = {
#         'http': f'http://{IP}',
#         'https': f'http://{IP}'
#     }
#     # proxies
# cookies = {
#     # 'AccessToken': "MY2VLHWMWBQIAOJ6UCJBSZ33NKX224OSXVF26WFMFYMLD2Y6PNQ01108dba2955"
# }
#
# url_start = "https://www.temu.com/search_result.html?search_key="
#
# headers = {
#     # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
#     'user-agent': ua.random,
#     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
#     'accept-language': 'zh-CN,zh;q=0.9',
# }
# key_words = "mini astronautas paracaidistas con paracaídas juguete volador favor de fiesta espacial regalos voladores requeridos ni montaje sin batería rellenos para bolsas de regalos"
# url = url_start + key_words
# url = "https://www.temu.com/search_result.html?search_key=mini%20astronautas%20paracaidistas%20con%20paraca%C3%ADdas%20juguete%20volador%20favor%20de%20fiesta%20espacial%20regalos%20voladores%20requeridos%20ni%20montaje%20sin%20bater%C3%ADa%20rellenos%20para%20bolsas%20de%20regalos"
# r = requests.get(url, headers=headers, proxies=proxies, timeout=600, verify=False, impersonate="chrome110")
# print(r.text)
#
# # Save the response content as a HTML file
# with open('./html/output2.html', 'w', encoding='utf-8') as f:
#     f.write(r.text)


import os
import sys
import numpy as np
import pandas as pd
# Set PYARROW_IGNORE_TIMEZONE for this process before the project import below.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# Append the parent of sys.path[0] to the import path -- presumably so that the
# `pyspark_job` package imported on the next line can be resolved.
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark_job.utils.templates_mysql import TemplatesMysql


class TemuSpider(object):
    """Download Temu search-result pages for a set of ASIN titles.

    ``run`` reads ``(asin, new_title)`` rows from MySQL, loads the proxy
    pool, requests the Temu search page for every title through a randomly
    chosen proxy and writes each response body to ``./html/<asin>.html``.
    """

    # Directory the downloaded pages are written to (relative to the CWD).
    HTML_DIR = './html'

    def __init__(self):
        self.engine = TemplatesMysql().engine
        self.df_read = pd.DataFrame()
        # (asin, title) pairs filled in by read_data().
        self.asin_title_list = []
        # "host:port" proxy strings filled in by get_ip().
        self.ip_list = []
        # Proxy-provider endpoint; not requested anywhere in this class
        # (get_ip() currently uses a static pool).
        self.ip_url = "https://tq.lunaproxy.com/getflowip?neek=1050636&num=50&type=1&sep=1&regions=ca&ip_si=2&level=1&sb="

        self.url_start = "https://www.temu.com/search_result.html?search_key="
        self.ua = UserAgent()

    def read_data(self):
        """Load the ASIN rows to crawl into ``df_read`` / ``asin_title_list``."""
        sql = """
                SELECT * from us_self_asin_top WHERE asin in (
        SELECT DISTINCT(asin) from us_self_asin_compet_amazon WHERE state=3
        )  limit 100;
        """
        self.df_read = pd.read_sql(sql, con=self.engine)
        print("self.df_read.shape:", self.df_read.shape)
        # Materialise as a list: a bare zip() is exhausted after one pass.
        self.asin_title_list = list(zip(self.df_read.asin, self.df_read.new_title))

    def get_ip(self):
        """Populate ``ip_list`` with the static pool of ``host:port`` proxies."""
        self.ip_list = """43.157.119.224:28668
43.157.119.224:28634
43.157.119.224:28778
43.157.119.224:28637
43.157.119.224:28598
43.157.119.224:28865
43.157.119.224:28845
43.157.119.224:28825""".split("\n")

    def _build_url(self, title):
        """Return the search URL for *title* with the keyword percent-encoded.

        ``quote(..., safe="")`` encodes spaces as ``%20`` (as before) and also
        escapes characters such as ``&``, ``#`` and ``+`` that would otherwise
        truncate or alter the ``search_key`` query value.
        """
        return self.url_start + quote(title, safe="")

    def _fetch_and_save(self, asin, title):
        """Request the search page for *title* once and save it for *asin*.

        Any exception raised by the request or the file write propagates to
        the caller, which decides whether to retry.
        """
        headers = {
            'user-agent': self.ua.random,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        # Pick a fresh proxy per attempt so a retry does not reuse a bad one.
        ip = random.choice(self.ip_list)
        proxies = {
            'http': f'http://{ip}',
            'https': f'http://{ip}'
        }
        url = self._build_url(title)
        print("ip:", ip)
        print("url:", url)
        print("asin, title:", asin, title)
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # from the original code -- confirm it is required by the proxies.
        r = requests.get(url, headers=headers, proxies=proxies, timeout=60, verify=False, impersonate="chrome110")
        with open(os.path.join(self.HTML_DIR, f'{asin}.html'), 'w', encoding='utf-8') as f:
            f.write(r.text)

    def run(self, max_retries=10):
        """Crawl every ``(asin, title)`` pair and save the result pages.

        :param max_retries: maximum number of attempts per ASIN before it is
            skipped; prevents an endless loop on permanent failures.
        """
        self.read_data()
        self.get_ip()
        # Without the directory every write fails and would be retried in vain.
        os.makedirs(self.HTML_DIR, exist_ok=True)
        for asin, title in self.asin_title_list:
            # NULL / non-text titles cannot be turned into a search URL.
            if not isinstance(title, str) or not title.strip():
                print("skip asin without usable title:", asin)
                continue
            for _attempt in range(max_retries):
                try:
                    self._fetch_and_save(asin, title)
                    break
                except Exception as e:
                    print(e, traceback.format_exc())
            else:
                print(f"giving up on asin {asin} after {max_retries} attempts")


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    TemuSpider().run()