# temu_search.py
import os
import sys
import numpy as np
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0])) # 上级目录
from utils.templates_mysql import TemplatesMysql
class TemuSearch():
    """Score stored image-feature vectors against one product's vector.

    For a given ``search_value`` (an asin) the class loads that asin's feature
    vector from ``{site_name}_pictures_features_self``, loads the feature
    vectors of every asin listed for it in the ``asin_compet`` column of
    ``us_self_asin_compet_amazon`` / ``us_self_asin_compet_temu``, and returns
    the cosine similarity (as a percentage) of each of them to the query.
    """

    def __init__(self, site_name='us', search_key='asin', search_value='B0BBNQCXZL', top_k=100):
        """Store the search settings and open the MySQL engine.

        Args:
            site_name: Site code; used as the table-name prefix in
                ``get_asin_features``.
            search_key: Stored on the instance; not read by any method here.
            search_value: Stored on the instance; the methods take the value
                to search for as an explicit argument instead.
            top_k: Stored on the instance; not read by any method here.
        """
        self.site_name = site_name
        self.search_key = search_key
        self.search_value = search_value
        self.top_k = top_k
        self.engine = TemplatesMysql().engine
        self.df_features = pd.DataFrame()
        # Empty placeholder until search_api() stores the real query vector
        # (the original assigned the ``np.array`` function object itself).
        self.query_vector = np.array([])
        # self.server_ip = "192.168.200.210"
        self.server_ip = "113.100.143.162"

    @staticmethod
    def _parse_features(raw_features):
        """Turn the stored ``features`` text into a Python object.

        NOTE(review): this evaluates database content with ``eval``. If the
        column only ever holds a plain list literal, ``ast.literal_eval``
        would be the safe replacement -- confirm the stored format first.
        """
        return eval(raw_features)

    def read_data(self, search_value, site_name):
        """Load feature rows for every asin listed against ``search_value``.

        Args:
            search_value: asin whose ``asin_compet`` entries are looked up.
            site_name: Value matched against the ``site`` column.

        Returns:
            DataFrame with columns ``asin``, ``features``, ``platform`` and
            ``local_path``. ``local_path`` comes from a left join and may be
            missing; an asin may appear on more than one row.
        """
        print("search_value, site_name:", search_value, site_name)
        # Values are bound by the driver instead of being spliced into the
        # statement. ``%(name)s`` is the pyformat placeholder style accepted
        # by the MySQL DB-API drivers.
        sql = """
            SELECT main.asin, main.features, main.platform, a.local_path from (
                SELECT asin, features, platform from us_pictures_features_self WHERE asin in
                (
                    SELECT asin_compet from us_self_asin_compet_amazon WHERE asin=%(asin)s and site=%(site)s
                )
                UNION all
                SELECT asin, features, platform from us_pictures_features_self WHERE asin in
                (
                    SELECT asin_compet from us_self_asin_compet_temu WHERE asin=%(asin)s and site=%(site)s
                )
            )
            as main
            left join
            (select asin, local_path from us_pictures_local_path_self) a
            on main.asin = a.asin
        """
        print("sql:", sql)
        df_features = pd.read_sql(
            sql,
            con=self.engine,
            params={"asin": search_value, "site": site_name},
        )
        print(df_features.shape)
        print(df_features.head())
        return df_features

    def search_api(self, site_name, search_value):
        """Return the similarity of every related asin to ``search_value``.

        Args:
            site_name: Value matched against the ``site`` column.
            search_value: asin to search for.

        Returns:
            The dict produced by ``calculate_similarity``.
        """
        print("111 search_value, site_name:", search_value, site_name)
        query_vector = self.get_asin_features(search_value)
        # Keep a (1, d) copy on the instance; the flat vector is what gets scored.
        self.query_vector = np.array(query_vector).reshape(1, -1)
        # Compute the similarities.
        similarities_dict = self.calculate_similarity(
            site_name=site_name,
            search_value=search_value,
            query_vector=query_vector,
        )
        print("similarities_dict:", similarities_dict)
        return similarities_dict

    def get_asin_features(self, search_value):
        """Fetch and parse the stored feature vector of ``search_value``.

        Raises:
            IndexError: If no row exists for ``search_value``.
        """
        # A table name cannot be a bound parameter, so the prefix is still
        # interpolated; it comes from the constructor's ``site_name``.
        sql = f"select * from {self.site_name}_pictures_features_self where asin=%(asin)s"
        print("sql:", sql)
        df = pd.read_sql(sql, con=self.engine, params={"asin": search_value})
        if df.empty:
            # Same exception type as the original's list indexing, with a
            # message that says what is actually missing.
            raise IndexError(
                f"no features stored for asin {search_value!r} "
                f"in {self.site_name}_pictures_features_self"
            )
        return self._parse_features(df.features.iloc[0])

    def calculate_similarity(self, site_name, search_value, query_vector):
        """Score every row returned by ``read_data`` against ``query_vector``.

        Args:
            site_name: Forwarded to ``read_data``.
            search_value: Forwarded to ``read_data``.
            query_vector: Flat feature vector of the searched asin.

        Returns:
            Dict mapping asin to ``(platform, similarity_percent, image_url)``.
            ``image_url`` is ``None`` when the row has no ``local_path``. When
            an asin occurs on several rows the last row wins. Empty dict when
            ``read_data`` returns no rows.
        """
        df = self.read_data(search_value, site_name)
        if df.empty:
            return {}

        # One vector per ROW. Going through a dict keyed by asin would drop
        # duplicate asins and shift every later score onto the wrong row.
        row_vectors = [self._parse_features(features) for features in df.features]
        similarities = self.cosine_similarity_matrix(query_vec=query_vector, all_vecs=row_vectors)

        similarities_dict = {}
        for asin, platform, similarity, local_path in zip(df.asin, df.platform, similarities, df.local_path):
            if isinstance(local_path, str):
                relative_path = local_path.replace('/mnt/data/img_data/', '')
                image_url = f"http://{self.server_ip}:8000/images/{relative_path}"
            else:
                # The left join found no local_path for this asin.
                image_url = None
            similarities_dict[asin] = (platform, similarity, image_url)
        return similarities_dict

    def cosine_similarity_matrix(self, query_vec, all_vecs):
        """Return cosine similarity (in percent) of each vector to the query.

        Args:
            query_vec: Query vector; flattened to 1-D before use.
            all_vecs: Sequence of vectors with the same length as the query.

        Returns:
            1-D float array, one value per vector in ``all_vecs``, rounded to
            two decimals. Vectors with zero norm (or a zero-norm query) score
            0.0. An empty ``all_vecs`` gives an empty array.
        """
        query = np.asarray(query_vec, dtype=float).ravel()
        candidates = np.asarray(all_vecs, dtype=float)
        if candidates.size == 0:
            return np.empty(0, dtype=float)
        candidates = np.atleast_2d(candidates)

        # Compute the similarities.
        query_vec_norm = np.linalg.norm(query)
        all_vecs_norm = np.linalg.norm(candidates, axis=1)
        print(query_vec_norm.shape)
        print(all_vecs_norm.shape)
        dot_products = np.dot(candidates, query)
        denominators = query_vec_norm * all_vecs_norm
        # Divide only where the denominator is non-zero; the rest stays 0.0.
        similarities = np.zeros_like(dot_products)
        np.divide(dot_products, denominators, out=similarities, where=denominators != 0)
        # Convert to a percentage and keep two decimal places.
        return np.round(similarities * 100, 2)
if __name__ == '__main__':
    # Manual run: build a searcher for one hard-coded asin and print the
    # similarity of every asin related to it.
    demo_site = 'us'
    demo_asin = 'B07GL4C9R9'
    searcher = TemuSearch(site_name=demo_site, search_value=demo_asin)
    searcher.search_api(site_name=demo_site, search_value=demo_asin)