temu_search.py 4.86 KB
Newer Older
chenyuanjie committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
import os
import sys
import numpy as np
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.templates_mysql import TemplatesMysql


class TemuSearch:
    """Score the stored competitor images of an ASIN against its own image.

    The feature vector of the searched ASIN is loaded from
    ``{site_name}_pictures_features_self``; the feature vectors of its
    competitors (Amazon and Temu competitor tables) are loaded by
    :meth:`read_data`; every competitor row is then scored with cosine
    similarity, expressed as a percentage.
    """

    def __init__(self, site_name='us', search_key='asin', search_value='B0BBNQCXZL', top_k=100,
                 server_ip='113.100.143.162'):
        """Store the search settings and open the MySQL engine.

        Args:
            site_name: Site prefix used to pick the feature table in
                :meth:`get_asin_features`.
            search_key: Stored on the instance; not read by any method of
                this class.
            search_value: Default ASIN; the methods take the ASIN to search
                as an explicit argument.
            top_k: Stored on the instance; not read by any method of this
                class.
            server_ip: Host used to build the image URLs returned by
                :meth:`calculate_similarity` (an internal address,
                192.168.200.210, was used previously).
        """
        self.site_name = site_name
        self.search_key = search_key
        self.search_value = search_value
        self.top_k = top_k
        self.engine = TemplatesMysql().engine
        self.df_features = pd.DataFrame()
        # Empty placeholder; search_api() replaces it with the (1, dim) query vector.
        self.query_vector = np.empty((1, 0))
        self.server_ip = server_ip

    def read_data(self, search_value, site_name):
        """Load feature rows of all competitors of ``search_value``.

        Competitors come from ``us_self_asin_compet_amazon`` and
        ``us_self_asin_compet_temu`` (combined with UNION ALL) and are left
        joined with ``us_pictures_local_path_self``. The same asin can
        therefore appear in several rows, and ``local_path`` can be NULL.
        The table names are fixed to the ``us_`` prefix regardless of
        ``site_name``, which is only used as a filter value.

        Returns:
            DataFrame with columns ``asin``, ``features``, ``platform`` and
            ``local_path``.
        """
        print("search_value, site_name:", search_value, site_name)
        # NOTE(review): the values are interpolated straight into the SQL text.
        # If they can come from an external caller this is an injection risk;
        # switch to bound parameters once the driver's paramstyle is confirmed.
        sql = f"""
        SELECT main.asin, main.features, main.platform, a.local_path from (
        SELECT asin, features, platform from us_pictures_features_self WHERE asin in 
            (
            SELECT asin_compet from us_self_asin_compet_amazon WHERE asin='{search_value}' and site='{site_name}'
            )
        UNION all
        SELECT asin, features, platform from us_pictures_features_self WHERE asin in 
            (
            SELECT asin_compet from us_self_asin_compet_temu WHERE asin='{search_value}' and site='{site_name}'
            )
        )
        as main 
        left join 
        (select asin, local_path from us_pictures_local_path_self) a
        on main.asin = a.asin
        """

        print("sql:", sql)
        df_features = pd.read_sql(sql, con=self.engine)
        print(df_features.shape)
        print(df_features.head())
        return df_features

    def search_api(self, site_name, search_value):
        """Return the similarity of every competitor image to ``search_value``.

        Also stores the query vector on ``self.query_vector`` with shape
        ``(1, dim)``.

        Returns:
            dict mapping competitor asin to
            ``(platform, similarity_percent, image_url)``.

        Raises:
            IndexError: if ``search_value`` has no stored feature row.
        """
        print("111 search_value, site_name:", search_value, site_name)
        query_vector = self.get_asin_features(search_value)
        self.query_vector = np.array(query_vector).reshape(1, -1)
        # Compute the similarities.
        similarities_dict = self.calculate_similarity(site_name=site_name, search_value=search_value, query_vector=query_vector)
        print("similarities_dict:", similarities_dict)
        return similarities_dict

    def get_asin_features(self, search_value):
        """Load and parse the stored feature vector of ``search_value``.

        Raises:
            IndexError: if the feature table has no row for the asin.
        """
        # NOTE(review): string-built SQL, see read_data().
        sql = f"select * from {self.site_name}_pictures_features_self where asin='{search_value}'"
        print("sql:", sql)
        df = pd.read_sql(sql, con=self.engine)
        if df.empty:
            # Same exception type the bare [0] lookup produced, with a usable message.
            raise IndexError(f"no feature row found for asin {search_value!r}")
        # NOTE(review): eval() executes whatever text is stored in the column.
        # If the column always holds a plain list literal, ast.literal_eval or
        # json.loads would be a safe replacement -- confirm the stored format.
        query_vector = eval(df.features.iloc[0])
        return query_vector

    def calculate_similarity(self, site_name, search_value, query_vector):
        """Score every row returned by :meth:`read_data` against ``query_vector``.

        Returns:
            dict mapping asin to ``(platform, similarity_percent, image_url)``.
            ``image_url`` is ``None`` when the row has no local path. When an
            asin occurs in several rows, the last row wins.
        """
        df = self.read_data(search_value, site_name)
        # Keep exactly one vector per result row. Collecting them in a dict
        # keyed by asin would drop duplicated asins and shift every following
        # similarity onto the wrong row.
        # NOTE(review): eval() on database text, see get_asin_features().
        all_vecs = [eval(features) for features in df.features]

        similarities = self.cosine_similarity_matrix(query_vec=query_vector, all_vecs=all_vecs)
        similarities_dict = {
            asin: (platform, similarity, self._image_url(local_path))
            for asin, platform, similarity, local_path
            in zip(df.asin, df.platform, similarities, df.local_path)
        }
        return similarities_dict

    def _image_url(self, local_path):
        """Build the public image URL for ``local_path``; ``None`` if it is missing."""
        # The left join yields NULL (None/NaN) for asins without a stored image.
        if not isinstance(local_path, str):
            return None
        return f"http://{self.server_ip}:8000/images/{local_path.replace('/mnt/data/img_data/', '')}"

    def cosine_similarity_matrix(self, query_vec, all_vecs):
        """Return the cosine similarity of ``query_vec`` to each row of ``all_vecs``.

        Args:
            query_vec: Query vector; flattened, so ``(dim,)`` and ``(1, dim)``
                are both accepted.
            all_vecs: Sequence of candidate vectors, one per row.

        Returns:
            1-D array with one percentage per candidate, rounded to two
            decimals. Empty when there are no candidates. A pair involving a
            zero-length vector scores 0.0 instead of NaN.
        """
        query_vec = np.asarray(query_vec, dtype=float).ravel()
        all_vecs = np.asarray(all_vecs, dtype=float)
        if all_vecs.size == 0:
            # No candidates: norm(..., axis=1) would fail on the 1-D empty array.
            return np.empty(0)
        all_vecs = np.atleast_2d(all_vecs)

        # Compute the similarities.
        query_vec_norm = np.linalg.norm(query_vec)
        all_vecs_norm = np.linalg.norm(all_vecs, axis=1)

        print(query_vec_norm.shape)
        print(all_vecs_norm.shape)

        dot_products = np.dot(all_vecs, query_vec)
        denominators = query_vec_norm * all_vecs_norm
        # Divide only where the denominator is non-zero; the rest stays 0.0.
        similarities = np.divide(
            dot_products,
            denominators,
            out=np.zeros_like(dot_products),
            where=denominators != 0,
        )
        # Convert to a percentage and keep two decimals.
        similarities_percentage = np.round(similarities * 100, 2)
        return similarities_percentage


if __name__ == '__main__':
    # Manual run: score the competitors of one sample ASIN on the US site.
    demo_site = 'us'
    demo_asin = 'B07GL4C9R9'
    searcher = TemuSearch(site_name=demo_site, search_value=demo_asin)
    searcher.search_api(site_name=demo_site, search_value=demo_asin)