import time
import logging
import pandas as pd
from queue import Queue
from func_timeout import func_set_timeout
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
# project-local configuration, connectivity check and database engine helpers
from amazon_spider.conf.db import selection_table_name
from amazon_spider.utils.common import is_internet_available
from amazon_spider.db.mysql_db import df_to_sql, get_country_engine
from amazon_spider.db.pg_db import get_pg_country_engine, get_14pg_country_engine


class AmazonCommentSpiderPipeline:
    def __init__(self, site="us"):
        """Set up table names and the in-memory buffers for one marketplace.

        Args:
            site: Marketplace code; it prefixes the table names and is passed
                on to the database helpers. Defaults to ``"us"``.
        """
        self.site = site
        # Batch size used by ``queue_consumer`` when it is not asked to drain
        # everything.
        self.num = 50
        self.cols_comment_num_list = ['parent_asin', 'comment_num']
        # One buffer per destination; the key order is the flush order.
        self.q_dict = {
            queue_name: Queue()
            for queue_name in ("comment_count_queue", "error_queue", "comment_queue")
        }
        # Comments go to the hard-coded ``<site>_asin_comment_copy`` table; the
        # other two table names are looked up in ``selection_table_name``.
        self.comment_table_name = f"{site}_asin_comment_copy"
        self.asin_table_name = selection_table_name.get(f"{site}_asin_variat")
        self.comment_count_table = selection_table_name.get(f"{site}_comment_num_table")

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline for the site of the spider attached to *crawler*."""
        spider_site = crawler.spider.site
        return cls(site=spider_site)

    @func_set_timeout(300)
    def up_del_dis(self, sql, data=None, site="us", db="mysql"):
        """Run one write statement against the selected database.

        The ``func_set_timeout`` decorator aborts the call with
        ``FunctionTimedOut`` after 300 seconds.

        Args:
            sql: Statement text handed to ``Connection.execute``.
            data: Bind parameters. ``None`` executes ``sql`` without
                parameters; an empty container executes nothing and still
                counts as success.
            site: Marketplace code used to obtain the engine.
            db: Backend selector: ``"mysql"``, ``"pg"`` or ``"pg14"``.

        Returns:
            ``True`` when the statement ran (or there was nothing to run),
            ``False`` when the network is unavailable or an
            ``OperationalError`` was raised.

        Raises:
            ValueError: If ``db`` is not one of the supported selectors.
        """
        engine_factories = {
            "mysql": get_country_engine,
            "pg": get_pg_country_engine,
            "pg14": get_14pg_country_engine,
        }
        if db not in engine_factories:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(
                f"unsupported db {db!r}; expected one of 'mysql', 'pg', 'pg14'")
        # Check connectivity first so no engine is created just to be dropped.
        if not is_internet_available():
            return False
        engine = engine_factories[db](site)
        try:
            with engine.connect() as conn:
                if data is None:
                    conn.execute(sql)
                elif data:
                    conn.execute(sql, data)
            return True
        except OperationalError as exc:
            logging.info(f"error sql is {sql} {exc}")
            return False
        finally:
            # Dispose on every path, after the connection has been released,
            # so failed attempts do not leave a pool behind.
            engine.dispose()

    def up_del_db(self, sql, data=None, site="us", db="mysql"):
        """Execute *sql* through ``up_del_dis``, retrying until it succeeds.

        A failed or timed-out attempt is followed by a 3 second pause and
        another attempt; there is no upper bound on the number of attempts.

        Args:
            sql: Statement text; it is labelled "delete" in the logs when it
                contains that word (case-insensitive), otherwise "update".
            data: Bind parameters forwarded unchanged.
            site: Marketplace code forwarded unchanged.
            db: Backend selector forwarded unchanged.
        """
        action = "delete" if 'delete' in sql.lower() else "update"

        def _summary():
            # Evaluated at log time: row count (or the SQL text when there
            # are no rows) followed by a preview of the first five entries.
            preview = [] if data is None else data[0:5]
            return f"{len(data or []) or sql}---------{preview}"

        succeeded = False
        while not succeeded:
            try:
                succeeded = self.up_del_dis(sql, data=data, site=site, db=db)
                if succeeded:
                    logging.info(f"{action} {db} ok ^_^ -----{_summary()}")
                else:
                    time.sleep(3)
                    logging.info(f"{action} {db} asin T_T --> {_summary()}")
            except FunctionTimedOut as exc:
                time.sleep(3)
                logging.info(f"{action} {db} time out T_T --> {exc}----{_summary()}")

    def save_db(self, table, df, site, db):
        """Write *df* to *table* via ``df_to_sql``, retrying until it succeeds.

        Every failed attempt (falsy return, ``OperationalError`` or
        ``FunctionTimedOut``) is logged and followed by a short pause before
        the next attempt. There is no upper bound on the number of attempts.

        Args:
            table: Destination table name.
            df: Frame to store; its shape and head are included in the logs.
            site: Marketplace code forwarded to ``df_to_sql``.
            db: Backend selector forwarded to ``df_to_sql``.
        """
        retry_delay = 3  # seconds, the same pause ``up_del_db`` uses
        # Retry when the insert reports an error.
        while True:
            try:
                if df_to_sql(table, df, site=site, db=db):
                    logging.info(
                        f"更新 {db} 数据库 {table} -----{df.shape}---------{df.head()}")
                    break
                logging.info(f"更新 {db} 数据库 {table} -----失败")
            except OperationalError as exc:
                logging.info(f"更新 {db} 数据库 {table} 失败  连接错误{exc}")
            except FunctionTimedOut as exc:
                logging.info(
                    f"更新 {db} 数据库 {table} -超时-{exc}---{df.shape}---------{df.head()}")
            # Without a pause a persistent failure turns this loop into a busy
            # spin that floods the log and hammers the database.
            time.sleep(retry_delay)

    def queue_consumer(self, q_size):
        """Flush buffered rows from every queue in ``self.q_dict`` to MySQL.

        Args:
            q_size: ``"max"`` drains whatever each queue currently holds. Any
                other value flushes a queue only once it holds at least
                ``self.num`` entries, and then takes exactly ``self.num``.
        """
        for name, pending in self.q_dict.items():
            dates = self._drain_queue(name, pending, q_size)
            if not dates:
                continue
            # A single row is handed over bare, several rows as a list.
            d = dates[0] if len(dates) == 1 else dates
            if name == 'comment_count_queue':
                inset_sql = self._upsert_sql(
                    self.comment_count_table, ['parent_asin', 'comment_num', 'star'])
                self.up_del_db(inset_sql, data=d, site=self.site, db="mysql")
                logging.info(f"asin_comment_num {d}")
            elif name == 'error_queue':
                sql_up = f"UPDATE `{self.asin_table_name}` set `state`=(%s)  where `parent_asin`=(%s);"
                self.up_del_db(sql_up, data=d, site=self.site, db="mysql")
                logging.info(f"{self.asin_table_name} {d}")
            elif name == 'comment_queue':
                cols_list = ['asin', 'parent_asin', 'title', 'content', 'is_vp', 'model', 'rating', 'agree_num',
                             'img_num',
                             'img_url', 'is_video', 'video_url', 'comment_url', 'user_name', 'user_img', 'country',
                             'user_page', 'is_earns_commissions', 'comment_time', 'page', 'md5_unique', 'star']
                df = pd.DataFrame(dates, columns=cols_list)
                logging.info(f"去重前长度  {df.shape} ")
                df.drop_duplicates(
                    subset=["asin", "parent_asin", "title", "rating", "user_name", "comment_time"],
                    inplace=True)
                logging.info(f"去重后长度 {df.shape} ")
                # NOTE(review): the de-duplicated frame only feeds the log
                # lines; the rows sent to the database are the raw ``dates``.
                # Left as is because the de-dup subset may not match the
                # table's unique key - confirm before inserting ``df`` instead.
                inset_sql = self._upsert_sql(self.comment_table_name, cols_list)
                self.up_del_db(inset_sql, data=d, site=self.site, db="mysql")
                asins = list(set(df.asin))
                parent_asins = list(set(df.parent_asin))
                logging.info(f"入库成功--> 长度： {len(asins)} asin: {asins[:5]} parent_asin: {parent_asins[:5]}")

    def _drain_queue(self, name, pending, q_size):
        """Pop the batch that is due from *pending* and return it as a list.

        Entries of ``comment_queue`` are themselves lists of rows and are
        flattened into one list; entries of the other queues are returned
        as they were stored.
        """
        available = pending.qsize()
        if q_size == "max":
            take = available
        else:
            take = self.num if available >= self.num else 0
        entries = [pending.get() for _ in range(take)]
        if name != 'comment_queue':
            return entries
        rows = []
        for batch in entries:
            rows.extend(batch)
        return rows

    @staticmethod
    def _upsert_sql(table, columns):
        """Build an ``insert ... ON DUPLICATE KEY UPDATE`` statement for *columns*."""
        quoted = [f"`{column}`" for column in columns]
        placeholders = ", ".join(["%s"] * len(quoted))
        updates = ", ".join(f"{q} = values({q})" for q in quoted)
        return (f"insert into `{table}` ({', '.join(quoted)}) values ({placeholders}) "
                f"ON DUPLICATE KEY UPDATE {updates};")

    def process_item(self, item, spider):
        """Buffer the rows carried by *item* and flush any batch that is due.

        Keys read from *item*:

        * ``finish_spider`` - when truthy, everything buffered so far is
          flushed before the item itself is handled.
        * ``queues_`` - a queue whose entries are concatenated into one list
          (presumably lists of comment rows; confirm against the spider) and
          buffered on ``comment_queue``. ``(3, asin)`` is buffered on
          ``error_queue`` and ``(asin, comment_count, star)`` on
          ``comment_count_queue``.
        * ``count_max`` without ``error_asin`` - buffers the count row and
          the item's ``sql_data``.
        * ``error_asin`` - buffers only the item's ``sql_data``.

        Args:
            item: Dict-like item produced by the spider.
            spider: The running spider (unused).

        Returns:
            The unmodified *item*. Scrapy requires ``process_item`` to return
            the item (or raise ``DropItem``) so that later pipeline stages
            receive it instead of ``None``.
        """
        queues = self.q_dict
        if item.get("finish_spider"):
            sizes = {name: q.qsize() for name, q in queues.items()}
            logging.info(f'sleep to queue data save {sizes}')
            self.queue_consumer(q_size="max")

        if q := item.get('queues_'):
            data = []
            for _ in range(q.qsize()):
                data += q.get()
            queues["comment_queue"].put(data)
            queues["error_queue"].put((3, item.get("asin")))
            queues["comment_count_queue"].put(
                (item.get("asin"), item.get("comment_count"), item.get("star")))
        elif item.get("count_max") and (not item.get("error_asin")):
            # Add the data to the queues.
            queues["comment_count_queue"].put(
                (item.get("asin"), item.get("comment_count"), item.get("star")))
            queues["error_queue"].put(item.get("sql_data"))
        elif item.get("error_asin"):
            queues["error_queue"].put(item.get("sql_data"))

        # Flush any queue that has reached the batch size.
        self.queue_consumer("min")
        return item

    def close_spider(self, spider):
        """Print the remaining queue sizes and flush everything still buffered."""
        remaining = {}
        for name, pending in self.q_dict.items():
            remaining[name] = pending.qsize()
        print(f'{self.site}  爬虫结束，存储最后 数据', remaining)
        self.queue_consumer("max")

