import re
import nltk
import pandas as pd
import numpy as np
import time

from db.mysql_db import sql_insert_many, sql_insert, sql_connect, get_con


def get_amazon_temu_date():
    """Fetch the distinct (asin, salesChannel) pairs for a fixed set of ASINs.

    Queries ``product_publish`` through the module-level ``keepa_engine``,
    keeping only rows with ``sales_30 >= 200`` whose ASIN is one of the five
    hard-coded values in the statement.

    Returns:
        The result of ``pd.read_sql`` for that query (columns ``asin`` and
        ``salesChannel``).
    """
    query = "SELECT DISTINCT(asin) AS asin,salesChannel from product_publish where  sales_30 >= 200 and asin in ('B0B7H41N5B', 'B0C3TX3726', 'B0C3HRVPWD', 'B0C4YQ11DL', 'B0C623H8RG') GROUP BY asin,salesChannel;"
    return pd.read_sql(query, con=keepa_engine)


def get_self_asin_detail(site):
    """Load ASIN details for ``site`` from the matching ``*_self_asin_detail`` table.

    Sites us/de/fr/es/uk/it each have their own ``{site}_self_asin_detail``
    table; any other site is read from ``us_self_asin_detail``. In both cases
    the rows are filtered on ``site`` and ordered by ``date_info`` descending.
    The statement is printed before it is executed through the module-level
    ``site_engine``.

    Args:
        site: Site code used both to pick the table and as the ``site`` filter.

    Returns:
        The result of ``pd.read_sql`` with columns asin, title, brand,
        date_info and img_url.
    """
    table_prefix = site if site in ("us", "de", "fr", "es", "uk", "it") else "us"
    query = f'SELECT asin, title, brand, date_info, img_url from {table_prefix}_self_asin_detail where site="{site}" order by date_info desc;'
    print(query)
    return pd.read_sql(query, con=site_engine)


def get_amazon_temu_not_likes():
    """Read the ``text`` column of ``amazon_temu_not_likes`` via the module-level ``engine``.

    Returns:
        The result of ``pd.read_sql`` for that query (a single ``text`` column).
    """
    query = "SELECT text from amazon_temu_not_likes;"
    return pd.read_sql(query, con=engine)


def site_transition(x):
    """Normalise a sales-channel value to its site code.

    A value of length two is returned unchanged. Any other value is looked up
    among the values of the module-level ``sites`` mapping and the matching
    key is returned.

    Args:
        x: Either a two-character site code or a value of ``sites``.

    Returns:
        The site code.

    Raises:
        ValueError: If ``x`` is not of length two and is not a value of ``sites``.
    """
    if len(x) == 2:
        return x
    codes = list(sites.keys())
    domains = list(sites.values())
    return codes[domains.index(x)]


def filter_key_word(x):
    """Strip quantity phrases and parenthesised text from a product title.

    The title is lower-cased, tokenised and POS-tagged with NLTK. Every token
    tagged as a cardinal number (``CD``) is removed, together with the token
    right after it when that one is tagged as a plural noun (``NNS``).
    Finally every "(...)" span is dropped from the result.

    Args:
        x: Raw title. Anything that is not a non-empty string (``None``,
            ``""``, a NaN float from a missing database value, ...) yields "".

    Returns:
        The space-joined tokens of the title with the quantity phrases and
        parenthesised parts removed, or "" for missing input.
    """
    # Missing titles may arrive as None or as NaN (a truthy float), so check
    # the type explicitly instead of relying on truthiness alone.
    if not isinstance(x, str) or not x:
        return ""

    word_tags = nltk.pos_tag(nltk.word_tokenize(x.lower()))
    word = " ".join(token for token, _ in word_tags)

    del_num = []
    for idx, (token, tag) in enumerate(word_tags):
        if tag != "CD":
            continue
        # Look at the following token only when there is one; an explicit
        # bounds check replaces the former catch-all ``except``.
        following = word_tags[idx + 1] if idx + 1 < len(word_tags) else None
        if following is not None and following[1] == "NNS":
            del_num.append(f"{token} {following[0]}")
        else:
            del_num.append(token)

    # NOTE: str.replace removes every occurrence of the phrase, including
    # occurrences inside longer tokens; this matches the previous behaviour.
    for phrase in del_num:
        word = word.replace(phrase, "")
    return re.sub(r"(\(.*?\))", "", word)


# POS tags whose tokens are kept by filter_fuhao; tokens carrying any other
# tag are dropped. Built once as a frozenset for constant-time lookups.
_FUHAO_KEPT_TAGS = frozenset({
    "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
    "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP",
    "WP$",
    "WRB",
})


def filter_fuhao(x):
    """Keep only the tokens of ``x`` whose POS tag is in the allowed tag set.

    The text is lower-cased, tokenised and POS-tagged with NLTK; tokens whose
    tag is not one of the listed word-class tags are discarded and the
    remaining tokens are joined with single spaces.

    Args:
        x: Text to filter. Anything that is not a non-empty string yields "",
            mirroring ``filter_key_word``.

    Returns:
        The filtered, space-joined, lower-cased text.
    """
    if not isinstance(x, str) or not x:
        return ""

    word_tags = nltk.pos_tag(nltk.word_tokenize(x.lower()))
    return " ".join(token for token, tag in word_tags if tag in _FUHAO_KEPT_TAGS)


def inset_syn(asin_list):
    """Upsert the given ASINs into the ``{site}_self_all_syn`` table.

    Each input row contributes its first two values as ``asin`` and ``site``;
    the remaining columns are filled with the constants data_type=12, state=1,
    is_variation=2, priority=4 and date_info=None. The target table is chosen
    from the site of the FIRST row only (falling back to "us" when that site
    is not one of us/de/fr/es/uk/it). Nothing happens for an empty input.

    Args:
        asin_list: Object exposing ``.values`` as a sequence of rows whose
            first two items are the ASIN and the site (presumably a pandas
            DataFrame; verify against callers).
    """
    rows = [[row[0], row[1], 12, 1, 2, 4, None] for row in asin_list.values]
    if not rows:
        return

    first_site = rows[0][1]
    self_all_syn_site = first_site if first_site in ("us", "de", "fr", "es", "uk", "it") else "us"
    sql_connect(self_all_syn_site)
    # An earlier, test-only variant of this statement targeted
    # `{self.site}_self_all_syn` with a different column order.
    inset_sql = f"insert into `{self_all_syn_site}_self_all_syn` (`asin`, `site`, `data_type`, `state`, `is_variation`, `priority`, `date_info`) values (%s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE `asin` = values(`asin`), `site` = values(`site`), `state` = values(`state`);"
    if len(rows) != 1:
        sql_insert_many(inset_sql, rows)
    else:
        sql_insert(inset_sql, rows[0])
    print(f"需要插入数据库数量为{len(rows)}----{rows}")


# Site code -> Amazon marketplace domain name.
# site_transition() searches the values of this mapping to turn a
# non-two-character sales-channel value back into its site code.
sites = {
    "us": "Amazon.com",
    "uk": "Amazon.co.uk",
    "de": "Amazon.de",
    "es": "Amazon.es",
    "it": "Amazon.it",
    "fr": "Amazon.fr",
    "mx": "Amazon.com.mx",
    "ca": "Amazon.ca",
}

# Handle read by get_amazon_temu_date(); it must exist before that call.
# (presumably a database engine/connection for the "keepa" database - confirm in db.mysql_db)
keepa_engine = get_con("keepa")
amazon_temu = get_amazon_temu_date()
# Normalise every salesChannel value to a site code via site_transition().
amazon_temu['salesChannel'] = amazon_temu.salesChannel.apply(site_transition)

# Handle read by get_amazon_temu_not_likes().
engine = get_con("us")

# Clean Amazon titles and convert them into search terms, one site at a time.

# The stop-word list does not depend on the site, so it is loaded once up
# front instead of being re-queried on every iteration; the set gives
# constant-time membership tests while `stop` keeps its original list form.
stop = [row[0] for row in get_amazon_temu_not_likes().values]
stop += ["silver"]
stop_words = set(stop)

# Group by the column name rather than a one-element list: with a list key,
# pandas 2.x yields 1-tuples such as ("us",) as group keys, which would never
# match the site codes checked below.
for name, group in amazon_temu.groupby('salesChannel'):
    print(f"{name}站点数据存储")
    if name not in ["us", "de", "fr", "uk", "es", "it", "mx", "ca"]:
        continue
    # Connect only once the site is known to be supported;
    # get_self_asin_detail() reads this module-level handle.
    site_engine = get_con(name)
    self_asin_detail = get_self_asin_detail(name)

    # ASINs of this group that already have detail rows (first row per ASIN,
    # i.e. the most recent date_info given the query's ordering) ...
    up_a = self_asin_detail[self_asin_detail['asin'].isin(group["asin"])].drop_duplicates(['asin']).copy()
    # ... and ASINs of this group that have no detail row yet.
    # Explicit copies avoid mutating (and warning about) filtered slices.
    not_in_asin = group[~group['asin'].isin(self_asin_detail["asin"])].drop_duplicates(['asin']).copy()
    print(not_in_asin)
    print(up_a)
    # Disabled step: inset_syn(not_in_asin)

    # fillna covers both NaN and None brands so .lower() below is always safe.
    up_a['brand'] = up_a['brand'].fillna('')
    up_a["title_news"] = up_a["title"].apply(filter_key_word).apply(filter_fuhao)
    # Remove the (lower-cased) brand from the cleaned title. A plain zip is
    # used instead of DataFrame.apply(axis=1), which returns a DataFrame
    # rather than a Series when the frame is empty.
    up_a['title_news'] = [
        title.replace(brand.lower(), "")
        for title, brand in zip(up_a['title_news'], up_a['brand'])
    ]
    # Remove stop words.
    up_a['title_news'] = up_a['title_news'].apply(
        lambda title: " ".join(token for token in title.split() if token not in stop_words)
    )
    up_a.rename(columns={'title_news': 'new_title', 'title': 'old_title'}, inplace=True)
    up_a["state"] = 1
    up_a["site"] = name
    up_a.drop(columns=["brand", "date_info"], inplace=True)
    # Disabled step: up_a.to_sql(name=f"us_self_asin_top", con=engine, if_exists='append', index=False)

