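"""Temu search-result spider.

Reads ASIN/title pairs from MySQL, builds Temu search URLs from each title,
fetches the result pages through rotating HTTP proxies with randomized
User-Agent headers (impersonating Chrome via curl_cffi), and saves the raw
HTML to ./html/<asin>.html.
"""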
import os
import random
import sys
import time
import traceback
import urllib.parse

import pandas as pd
from curl_cffi import requests
from fake_useragent import UserAgent

os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # make the parent directory importable
from pyspark_job.utils.templates_mysql import TemplatesMysql

# Shared UserAgent pool (also used by the commented-out scratch request below)
ua = UserAgent()

# --- Scratch: manual one-off request, kept commented out for reference ---
# """
# 170.106.158.134:21527
# 101.32.166.145:24770
# 101.32.166.145:30253
# 43.157.122.238:30062
# 101.32.166.145:41984
#
# """
# IP = "170.106.158.134:30535"
# proxies = {
# 'http': f'http://{IP}',
# 'https': f'http://{IP}'
# }
# cookies = {
# # 'AccessToken': "MY2VLHWMWBQIAOJ6UCJBSZ33NKX224OSXVF26WFMFYMLD2Y6PNQ01108dba2955"
# }
#
# url_start = "https://www.temu.com/search_result.html?search_key="
#
# headers = {
# # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
# 'user-agent': ua.random,
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'accept-language': 'zh-CN,zh;q=0.9',
# }
# key_words = "mini astronautas paracaidistas con paracaídas juguete volador favor de fiesta espacial regalos voladores requeridos ni montaje sin batería rellenos para bolsas de regalos"
# url = url_start + key_words
# url = "https://www.temu.com/search_result.html?search_key=mini%20astronautas%20paracaidistas%20con%20paraca%C3%ADdas%20juguete%20volador%20favor%20de%20fiesta%20espacial%20regalos%20voladores%20requeridos%20ni%20montaje%20sin%20bater%C3%ADa%20rellenos%20para%20bolsas%20de%20regalos"
# r = requests.get(url, headers=headers, proxies=proxies, timeout=600, verify=False, impersonate="chrome110")
# print(r.text)
#
# # Save the response content as an HTML file
# with open('./html/output2.html', 'w', encoding='utf-8') as f:
# f.write(r.text)
class TemuSpider(object):
    def __init__(self):
        self.engine = TemplatesMysql().engine
        self.df_read = pd.DataFrame()
        self.asin_title_list = []  # (asin, title) pairs to crawl
        self.ip_list = []          # proxy pool as "host:port" strings
        self.ip_url = "https://tq.lunaproxy.com/getflowip?neek=1050636&num=50&type=1&sep=1&regions=ca&ip_si=2&level=1&sb="
        self.url_start = "https://www.temu.com/search_result.html?search_key="
        self.ua = UserAgent()
    def read_data(self):
        # Pull up to 100 ASIN/title pairs whose competitor state is 3
        sql = """
        SELECT * FROM us_self_asin_top WHERE asin IN (
            SELECT DISTINCT asin FROM us_self_asin_compet_amazon WHERE state=3
        ) LIMIT 100;
        """
        self.df_read = pd.read_sql(sql, con=self.engine)
        print("self.df_read.shape:", self.df_read.shape)
        self.asin_title_list = list(zip(self.df_read.asin, self.df_read.new_title))
    def get_ip(self):
        # Static proxy pool; fetch_ip_pool() below sketches pulling a fresh
        # pool from self.ip_url instead.
        self.ip_list = """43.157.119.224:28668
43.157.119.224:28634
43.157.119.224:28778
43.157.119.224:28637
43.157.119.224:28598
43.157.119.224:28865
43.157.119.224:28845
43.157.119.224:28825""".split("\n")
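    # A minimal sketch of refreshing the proxy pool dynamically. It assumes the
    # lunaproxy endpoint in self.ip_url returns newline-separated "host:port"
    # pairs (sep=1 in the query string suggests a plain-text list); that response
    # format is an assumption, not something this codebase confirms.
    def fetch_ip_pool(self):
        try:
            resp = requests.get(self.ip_url, timeout=30)
            pool = [line.strip() for line in resp.text.splitlines() if line.strip()]
            if pool:
                self.ip_list = pool
        except Exception as e:
            # Keep the existing pool if the refresh fails
            print("fetch_ip_pool failed:", e)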
    def run(self):
        self.read_data()
        self.get_ip()
        os.makedirs('./html', exist_ok=True)
        for asin, title in self.asin_title_list:
            retries = 0
            while retries < 5:  # bounded retries instead of looping forever
                try:
                    headers = {
                        'user-agent': self.ua.random,
                        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'accept-language': 'zh-CN,zh;q=0.9',
                    }
                    ip = random.choice(self.ip_list)
                    proxies = {
                        'http': f'http://{ip}',
                        'https': f'http://{ip}'
                    }
                    # Percent-encode the whole title so spaces and accented
                    # characters survive in the query string
                    url = self.url_start + urllib.parse.quote(title)
                    print("ip:", ip)
                    print("url:", url)
                    print("asin, title:", asin, title)
                    r = requests.get(url, headers=headers, proxies=proxies, timeout=60,
                                     verify=False, impersonate="chrome110")
                    with open(f'./html/{asin}.html', 'w', encoding='utf-8') as f:
                        f.write(r.text)
                    break
                except Exception:
                    print(traceback.format_exc())
                    retries += 1
                    time.sleep(random.uniform(1, 3))  # brief backoff before retrying
if __name__ == '__main__':
spider = TemuSpider()
spider.run()