import time
import logging
import os, sys
import pandas as pd
from queue import Queue
# useful for handling different item types with a single interface
from func_timeout import func_set_timeout
from func_timeout.exceptions import FunctionTimedOut
from sqlalchemy.exc import OperationalError, DataError
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.utils.common import is_internet_available
from amazon_spider.db.mysql_db import df_to_sql, get_country_engine
from amazon_spider.db.pg_db import get_pg_country_engine, get_14pg_country_engine


class AmazonKeepaSpiderPipeline:
    def __init__(self, site):
        self.site = site
        self.q_dict = {
            "inner_item_queue": Queue(),
            "variat_item_queue": Queue(),
            "asin_img_queue": Queue(),
            "error_queue": Queue(),
            "self_variat_queue": Queue(),
        }
        self.num = 80
        self.save_num = 80

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            site=crawler.spider.site
        )

    @func_set_timeout(300)
    def up_del_dis(self, sql, data=None, site="us", db="mysql"):
        """Execute one UPDATE/DELETE statement against the chosen backend.

        The ``func_set_timeout(300)`` decorator aborts the call with
        ``FunctionTimedOut`` if it runs longer than 300 seconds.

        Args:
            sql: Statement text to execute.
            data: Bind parameters. ``None`` executes ``sql`` without
                parameters; an empty container executes nothing (and still
                counts as success); anything else is passed to ``execute``.
            site: Site code handed to the engine factory.
            db: Backend selector: ``"mysql"``, ``"pg"`` or ``"pg14"``.

        Returns:
            True when the statement ran (or there was nothing to run),
            False when the network is down or the database raised an
            ``OperationalError``.

        Raises:
            ValueError: if ``db`` is not one of the supported backends.
        """
        if db == "mysql":
            engine = get_country_engine(site)
        elif db == "pg":
            engine = get_pg_country_engine(site)
        elif db == "pg14":
            engine = get_14pg_country_engine(site)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on the engine variable.
            raise ValueError(f"unsupported db backend: {db!r}")
        try:
            if not is_internet_available():
                return False
            with engine.connect() as conn:
                if data is None:
                    conn.execute(sql)
                elif data:
                    conn.execute(sql, data)
            return True
        except OperationalError as exc:
            logging.info(f"error sql is {sql}")
            logging.info(f"operational error detail: {exc}")
            return False
        finally:
            # Release pooled connections on every exit path, not only on
            # success, and only after the connection has been returned.
            engine.dispose()

    def up_del_db(self, sql, data=None, site="us", db="mysql"):
        """Run ``sql`` through ``up_del_dis`` and retry until it succeeds.

        Retries indefinitely, sleeping 3 seconds after each failed attempt
        (network unavailable, ``up_del_dis`` returned a falsy value, or the
        call timed out). Returns ``None`` once the statement went through.

        Args:
            sql: Statement text; used to label log lines as "delete" when
                it contains that word (case-insensitive), else "update".
            data: Bind parameters forwarded to ``up_del_dis``.
            site: Site code forwarded to ``up_del_dis``.
            db: Backend selector forwarded to ``up_del_dis``.
        """
        action = "delete" if "delete" in sql.lower() else "update"
        prefix = f"{action} {db} asin state 3"

        def summary():
            # Row count (or the SQL itself when there is no data) followed
            # by a preview of the first five parameter rows.
            size_or_sql = len(data or []) or sql
            preview = [] if data is None else data[0:5]
            return f"{size_or_sql}---------{preview}"

        while True:
            try:
                if not is_internet_available():
                    time.sleep(3)
                    logging.info(f"{prefix} network error T_T --> {summary()}")
                    continue
                if self.up_del_dis(sql, data=data, site=site, db=db):
                    logging.info(f"{prefix} ok ^_^ -----{summary()}")
                    return
                time.sleep(3)
                logging.info(f"{prefix} error T_T --> {summary()}")
            except FunctionTimedOut as timeout_exc:
                time.sleep(3)
                logging.info(f"{prefix} time out T_T --> {timeout_exc}----{summary()}")

    def save_db(self, table, df, site, db):
        """Write ``df`` to ``table`` via ``df_to_sql``, retrying on failure.

        Retries indefinitely while ``df_to_sql`` returns a falsy value,
        raises ``OperationalError`` or times out, pausing 3 seconds between
        attempts (the same delay ``up_del_db`` uses) so a failing database
        is not hammered in a tight loop.

        Args:
            table: Destination table name.
            df: DataFrame holding the rows to store.
            site: Site code forwarded to ``df_to_sql``.
            db: Backend selector forwarded to ``df_to_sql``.

        Raises:
            DataError: re-raised unchanged when the database rejects the
                data itself; retrying cannot fix that.
        """
        while True:
            try:
                if df_to_sql(table, df, site=site, db=db):
                    logging.info(
                        f"更新 {db} 数据库 {table} -----{df.shape}---------{df.head()}")
                    return
                logging.info(f"更新 {db} 数据库 {table} -----失败")
            except OperationalError as e:
                logging.info(f"更新 {db} 数据库 {table} 失败  连接错误{e}")
            except FunctionTimedOut as e:
                logging.info(
                    f"更新 {db} 数据库 {table} -超时-{e}---{df.shape}---------{df.head()}")
            except DataError as e:
                logging.info(f"{e}{list(df.values)}")
                # Bare raise keeps the original exception and traceback.
                # ``raise DataError`` would try to build a new DataError
                # with no arguments and fail with TypeError instead.
                raise
            time.sleep(3)

    def asin_state_to_list(self, df):
        df_9 = df.loc[df.volume.isna() & df.weight.isna() & df[
            "rank"].isna() & df.launch_time.isna() & df.price.isna() & df.rating.isna() & df.total_comments.isna()]
        df_ = df.loc[~(df.volume.isna() & df.weight.isna() & df[
            "rank"].isna() & df.launch_time.isna() & df.price.isna() & df.rating.isna() & df.total_comments.isna())]
        df_7 = df_.loc[df.volume.isna() & df.weight.isna() & df["rank"].isna() & df.launch_time.isna()]
        df_3 = df_.loc[~(df.volume.isna() & df.weight.isna() & df["rank"].isna() & df.launch_time.isna())]
        df_9["state"] = 9
        df_7["state"] = 7
        df_3["state"] = 3
        df_9 = df_9.loc[:, ["state", "asin", "site"]]
        df_7 = df_7.loc[:, ["state", "asin", "site"]]
        df_3 = df_3.loc[:, ["state", "asin", "site"]]
        asin_list = []
        asin_list += [list(i) for i in df_9.values]
        asin_list += [list(i) for i in df_7.values]
        asin_list += [list(i) for i in df_3.values]
        return asin_list

    @staticmethod
    def asin_to_number(asin):
        """
        Convert a 10-character ASIN string to a unique number.
        This function assumes that ASIN consists of uppercase letters and digits.
        """

        def char_to_number(char):
            if char.isdigit():
                return int(char)
            else:
                return ord(char) - 55  # 'A' -> 10, 'B' -> 11, ..., 'Z' -> 35

        if len(asin) != 10:
            raise ValueError("ASIN must be 10 characters long")

        base = 36
        asin_number = 0
        for i, char in enumerate(reversed(asin)):
            asin_number += char_to_number(char) * (base ** i)

        # The final number is taken modulo 1 billion to fit the range 1-10 billion
        return asin_number % 1000000000

    def queue_consumer(self, q_size):
        for k, v in self.q_dict.items():
            if q_size == "max":
                if v.qsize():
                    dates = [v.get() for i in range(0, v.qsize())]
                else:
                    dates = []
            else:
                if v.qsize() >= self.num:
                    dates = [v.get() for i in range(0, self.num)]
                else:
                    dates = []
            if dates:
                df = pd.DataFrame(dates)
                if k == "inner_item_queue":
                    if dates:
                        pg_dates = df[df['asin_type'].astype(str).str.contains('4')]
                        # mysql_dates = df[~df['asin_type'].astype(str).str.contains('4')]
                        mysql_dates = df[~df['asin_type'].isin(['4', '4,', ',4'])]
                        if pg_dates.shape[0] and self.site == "us":
                            self.save_db(f"{self.site}_self_asin_detail_{time.gmtime().tm_year}", pg_dates, self.site, "pg")
                        if mysql_dates.shape[0]:
                            self.save_db(f"{self.site}_self_asin_detail", mysql_dates, self.site, "mysql")
                        sql_up = f"UPDATE `{self.site}_self_all_syn` set `state`=(%s)  where asin=(%s) and site=(%s);"
                        asin_list = self.asin_state_to_list(df)
                        if len(asin_list) == 1:
                            d = asin_list[0]
                        else:
                            d = asin_list
                        logging.info(f"{self.site}_self_asin_detail   {df.shape}")
                        self.up_del_db(sql_up, d, self.site, db="mysql")
                        site_up_asin = [(i[1], i[0]) for i in asin_list if i[2] == "us"]
                        if site_up_asin:
                            site_up_sql = f"insert into {self.site}_all_syn_st_asin (asin,state) values %s on conflict(asin) do update set state=excluded.state;"
                            # while True:
                            #     try:
                            #         if is_internet_available():
                            #             if updatas_14pg_asin(site_up_sql, data=site_up_asin, site=self.site):
                            #                 logging.info(f"修改pg asin状态3-----{len(site_up_asin)}---------{site_up_asin}")
                            #                 break
                            #             else:
                            #                 time.sleep(3)
                            #                 logging.info(
                            #                     f"修改pg asin状态3-失败----{len(site_up_asin)}---------{site_up_asin}")
                            #                 continue
                            #         else:
                            #             time.sleep(3)
                            #             logging.info(
                            #                 f"修改pg asin状态3-网络链接失败----{len(site_up_asin)}---------{site_up_asin}")
                            #             continue
                            #     except FunctionTimedOut as e:
                            #         time.sleep(3)
                            #         logging.info(f"修改pg asin状态3-超时{e}----{len(site_up_asin)}---------{site_up_asin}")
                            #         continue
                elif k == "error_queue":
                    if dates:
                        # 表名需要改
                        sql_up = f"UPDATE `{self.site}_self_all_syn` set `state`=(%s)  where asin=(%s) and site=(%s);"
                        up_datas = [list(i) for i in df.values]
                        if len(up_datas) == 1:
                            d = up_datas[0]
                        else:
                            d = up_datas
                        self.up_del_db(sql_up, d, self.site, db="mysql")
                        df_st_asin = df.loc[:, ["asin", "state"]]
                        site_up_asin = [list(i) for i in df_st_asin.values]
                        logging.info(f"{self.site}_self_all_syn   {df.shape}")
                        if site_up_asin:
                            site_up_sql = f"insert into {self.site}_all_syn_st_asin (asin,state) values %s on conflict(asin) do update set state=excluded.state;"

                            # while True:
                            #     try:
                            #         if is_internet_available():
                            #             if updatas_14pg_asin(site_up_sql, data=site_up_asin, site=self.site):
                            #                 logging.info(
                            #                     f"修改pg asin状态3-----{len(site_up_asin)}---------{site_up_asin}")
                            #                 break
                            #             else:
                            #                 time.sleep(3)
                            #                 logging.info(
                            #                     f"修改pg asin状态3-失败----{len(site_up_asin)}---------{site_up_asin}")
                            #                 continue
                            #         else:
                            #             time.sleep(3)
                            #             logging.info(
                            #                 f"修改pg asin状态3-网络链接失败----{len(site_up_asin)}---------{site_up_asin}")
                            #             continue
                            #     except FunctionTimedOut as e:
                            #         time.sleep(3)
                            #         logging.info(
                            #             f"修改pg asin状态3-超时{e}----{len(site_up_asin)}---------{site_up_asin}")
                            #         continue
                elif k == "asin_img_queue":
                    if dates:
                        for name, group in df.groupby(['site']):
                            logging.info(f"name: {name}")
                            img_exploded_list = group['asin_img'].explode()
                            # 展开后转换为一个大列表
                            img_list = [i for i in img_exploded_list.tolist() if not isinstance(i, float)]
                            if img_list:
                                logging.info(f"img处理{img_list[0:5]}")
                                df_img = pd.DataFrame(img_list)
                                if name in ["us", "uk", "fr", "de", "it", "es"]:
                                    df_img['mapped_asin'] = df_img['asin'].apply(self.asin_to_number)
                                df_img.drop_duplicates(subset=["asin", "img_order_by", "data_type"], inplace=True)
                                dele_asin = list(set(df_img.asin))

                                if len(dele_asin) == 1:
                                    sql_del = f"delete from {name}_asin_image where asin in ('{tuple(dele_asin)[0]}');"
                                else:
                                    sql_del = f"delete from {name}_asin_image where asin in {tuple(dele_asin)};"
                                self.up_del_db(sql_del, site=self.site, db="pg14")
                                logging.info(f"img delete {dele_asin[0:10]}")
                                # _asin_image库
                                self.save_db(f"{name}_asin_image", df_img, self.site, db="pg14")
                                logging.info(f"img save {df_img.head()}")
                elif k == "variat_item_queue":
                    if dates:
                        for name, group in df.groupby(['site']):
                            logging.info(f"variat name: {name}")
                            variat_exploded_list = group['variat_item'].explode()
                            # 展开后转换为一个大列表
                            variat_list = [i for i in variat_exploded_list.tolist() if not isinstance(i, float)]
                            if variat_list:
                                logging.info(f"variat 处理{variat_list[0:5]}")
                                df_variat = pd.DataFrame(variat_list)
                                dele_asin = list(set(df_variat["parent_asin"]))
                                if dele_asin:
                                    if len(dele_asin) == 1:
                                        sql_del = f"delete from `{self.site}_variat` where parent_asin in ('{tuple(dele_asin)[0]}');"
                                    else:
                                        sql_del = f"delete from `{self.site}_variat` where parent_asin in {tuple(dele_asin)};"
                                    # sql_delete(sql_del, site=self.site)
                                    self.up_del_db(sql_del, site=self.site, db="mysql")
                                    logging.info(f"variat delete {dele_asin[0:10]}")
                                df_variat.drop_duplicates(subset=["asin"], inplace=True)
                                self.save_db(f"{self.site}_variat", df_variat, self.site, "mysql")
                                logging.info(f"variat save {df_variat.head()}")
                elif k == "self_variat_queue":
                    if dates:
                        for name, group in df.groupby(['site']):
                            logging.info(f"self_variat_item name: {name}")
                            variat_exploded_list = group['self_variat_item'].explode()
                            # 展开后转换为一个大列表
                            variat_list = [i for i in variat_exploded_list.tolist() if not isinstance(i, float)]
                            if variat_list:
                                logging.info(f"variat 处理{variat_list[0:5]}")
                                df_variat = pd.DataFrame(variat_list)
                                dele_asin = list(set(df_variat["parent_asin"]))
                                if dele_asin:
                                    if len(dele_asin) == 1:
                                        sql_del = f"delete from `{self.site}_self_variat` where parent_asin in ('{tuple(dele_asin)[0]}');"
                                    else:
                                        sql_del = f"delete from `{self.site}_self_variat` where parent_asin in {tuple(dele_asin)};"
                                    # sql_delete(sql_del, site=self.site)
                                    self.up_del_db(sql_del, site=self.site, db="mysql")
                                    logging.info(f"_self_variat delete {dele_asin[0:10]}")
                                self.save_db(f"{self.site}_self_variat", df_variat, self.site, "mysql")
                                logging.info(f"_self_variat save {df_variat.head()}")


    def process_item(self, item, spider):
        if item.get("finish_spider"):
            print('等待时 将队列数据存储', {k: v.qsize() for k, v in self.q_dict.items()})
            self.queue_consumer(q_size="max")
        if item.get("inner_item"):
            self.q_dict.get("inner_item_queue").put(item.get('inner_item'))
        elif item.get("error_asin"):
            self.q_dict.get("error_queue").put(item.get("asin"))
        elif item.get("variat_item"):
            self.q_dict.get("variat_item_queue").put(item)
        elif item.get("asin_img"):
            self.q_dict.get("asin_img_queue").put(item)
        self.queue_consumer("min")

    def close_spider(self, spider):
        print('爬虫结束，存储最后 数据', {k: v.qsize() for k, v in self.q_dict.items()})
        self.queue_consumer("max")


# conn = get_con("us")
# s = sql_connect("us")
# dates = [('B09GM8Y8BN', 'amazon.com', '16707', '4.5', '167', 'Visit the AROEVE Store', 'Home & Kitchen', 'Heating, Cooling & Air Quality', 'Air Purifiers', 'HEPA Air Purifiers', '', '', '', '510192', '{"6474046": 804920, "6474270": 816090, "6476596": 576284, "6476866": 623361, "6478072": 712591}', '{"7141123011": 4860126}', '')]
#
# print(dates)
# # dates = list(set([tuple(v.get()) for i in range(0, self.num)]))
# # ['asin', 'salesChannel', 'reviews', 'rating', 'current_rank', 'brand_name', 'root_category', 'second_category', 'three_category', 'four_category', 'five_category', 'six_category', 'seven_category', 'cat_id', 'history_rank', 'history_category', 'current_rank_avg']
# # inset_sql = f"insert into `product_publish_keepa_text` (`asin`, `salesChannel`, `reviews`, `rating`, `current_rank`, `brand_name`, `root_category`, `second_category`, `three_category`, `four_category`, `five_category`, `six_category`, `seven_category`, `cat_id`, `history_rank`, `history_category`, `current_rank_avg`) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE `history_rank` = values(`history_rank`)"
# inset_sql = f"REPLACE into `product_publish_keepa_text` (`asin`, `salesChannel`, `reviews`, `rating`, `current_rank`, `brand_name`, `root_category`, `second_category`, `three_category`, `four_category`, `five_category`, `six_category`, `seven_category`, `cat_id`, `history_rank`, `history_category`, `current_rank_avg`) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
# if len(dates) == 1:
#     sql_insert(inset_sql, dates[0])
# else:
#     sql_insert_many(inset_sql, dates)
# logging.info(f"更新product_publish_keepa_text-----{len(dates)}---------{dates}")