import logging
import time
import pandas as pd
from queue import Queue
# useful for handling different item types with a single interface
from amazon_spider.conf.db import selection_table_name
from amazon_spider.utils.common import is_internet_available
from amazon_spider.db.mysql_db import sql_update, sql_update_many, sql_connect, sql_insert_many, sql_insert


class AmazonCommentNewsSpiderPipeline:
    """Scrapy item pipeline that buffers scraped rows in queues and flushes them to SQL.

    Three in-memory queues are kept in ``self.q_dict``:

    * ``comment_count_queue`` -- ``(parent_asin, comment_num, star)`` tuples,
      upserted into ``self.comment_count_table``.
    * ``error_queue`` -- ``(state, comment_new_time, parent_asin)`` tuples used
      to UPDATE ``self.asin_table_name``.
    * ``comment_queue`` -- *lists* of 24-field comment rows, upserted into
      ``self.comment_table_name``.

    A queue is written out once it holds at least ``self.num`` entries, or
    unconditionally when a full flush is requested.
    """

    def __init__(self, site="us"):
        """Resolve table names for ``site``, create the queues and open the DB connection.

        Args:
            site: Site code used as the prefix of the table names / config keys.
        """
        self.site = site
        self.comment_table_name = f"{self.site}_asin_comment"
        # self.comment_table_name = selection_table_name.get(f"{self.site}_comment_table")
        self.asin_table_name = selection_table_name.get(f"{self.site}_asin_variat")
        self.comment_count_table = selection_table_name.get(f"{self.site}_comment_num_table")
        self.q_dict = {
            "comment_count_queue": Queue(),
            "error_queue": Queue(),
            "comment_queue": Queue(),
        }
        # Minimum number of queued entries before a batched ("min") flush happens.
        self.num = 50
        self.cols_comment_num_list = [
            'parent_asin', 'comment_num'
        ]
        sql_connect(self.site)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from a crawler, taking ``site`` from its spider."""
        return cls(
            site=crawler.spider.site
        )

    def queue_consumer(self, q_size):
        """Drain the queues and persist whatever was drained.

        Args:
            q_size: ``"max"`` empties every queue completely. Any other value
                takes exactly ``self.num`` entries from each queue that holds
                at least that many and leaves smaller queues untouched.
        """
        writers = {
            "comment_count_queue": self._save_comment_counts,
            "error_queue": self._save_asin_states,
            "comment_queue": self._save_comments,
        }
        for name, pending in self.q_dict.items():
            rows = self._drain(name, pending, q_size)
            if not rows:
                continue
            writer = writers.get(name)
            if writer is not None:
                writer(rows)

    def _drain(self, name, pending, q_size):
        """Pop the entries selected by ``q_size`` from ``pending`` and return them as a list."""
        if q_size == "max":
            take = pending.qsize()
        elif pending.qsize() >= self.num:
            take = self.num
        else:
            take = 0

        if name == "comment_queue":
            # Each entry of the comment queue is itself a list of rows; flatten them.
            rows = []
            for _ in range(take):
                rows += pending.get()
            return rows
        return [pending.get() for _ in range(take)]

    @staticmethod
    def _write_when_online(sql, rows, write_one, write_many):
        """Block until the connectivity check passes, then run ``sql`` for ``rows``.

        A single row goes through ``write_one``; anything else goes through
        ``write_many``. While the check fails the call sleeps 3 seconds between
        attempts and retries indefinitely.
        """
        while not is_internet_available():
            time.sleep(3)
            logging.info("requests baidu error --> T_T")
        if len(rows) == 1:
            write_one(sql, rows[0])
        else:
            write_many(sql, rows)

    def _save_comment_counts(self, rows):
        """Upsert ``(parent_asin, comment_num, star)`` rows into the comment-count table."""
        inset_sql = f"insert into `{self.comment_count_table}` (`parent_asin`, `comment_num`, `star`) values (%s, %s, %s) ON DUPLICATE KEY UPDATE `parent_asin` = values(`parent_asin`), `comment_num` = values(`comment_num`), `star` = values(`star`);"
        self._write_when_online(inset_sql, rows, sql_insert, sql_insert_many)
        logged = rows[0] if len(rows) == 1 else rows
        logging.info(f"asin_comment_num {logged}")

    def _save_asin_states(self, rows):
        """Apply ``(state, comment_new_time, parent_asin)`` updates to the asin table."""
        sql_up = f"UPDATE `{self.asin_table_name}` set `state`=(%s), `comment_new_time`=(%s)  where `parent_asin`=(%s);"
        self._write_when_online(sql_up, rows, sql_update, sql_update_many)
        logged = rows[0] if len(rows) == 1 else rows
        logging.info(f"{self.asin_table_name} {logged}")

    def _save_comments(self, rows):
        """Upsert flattened comment rows into the comment table and log a summary."""
        cols_list = ['asin', 'parent_asin', 'title', 'content', 'is_vp', 'model', 'rating', 'agree_num',
                     'img_num',
                     'img_url', 'is_video', 'video_url', 'comment_url', 'user_name', 'user_img', 'country',
                     'user_page', 'is_earns_commissions', 'comment_time', 'page', 'star', 'vine_review_flag',
                     'comment_id', 'page_state']

        # NOTE(review): the DataFrame is only used for the log lines below; the
        # rows written to the database are the original, non-deduplicated
        # ``rows``. Confirm whether the deduplicated frame was meant to be
        # inserted instead.
        df = pd.DataFrame(rows, columns=cols_list)
        logging.info(f"去重前长度  {df.shape} ")
        df.drop_duplicates(subset=["comment_id"], inplace=True)
        logging.info(f"去重后长度 {df.shape} ")

        inset_sql = f"insert into `{self.comment_table_name}` (`asin`, `parent_asin`, `title`, `content`, `is_vp`, `model`, `rating`,`agree_num`, `img_num`, `img_url`, `is_video`, `video_url`,`comment_url`, `user_name`, `user_img`, `country`, `user_page`,`is_earns_commissions`, `comment_time`, `page`, `star`, `vine_review_flag`, `comment_id`,`page_state`) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE `asin` = values(`asin`), `parent_asin` = values(`parent_asin`), `title` = values(`title`), `content` = values(`content`), `is_vp` = values(`is_vp`), `model` = values(`model`), `rating` = values(`rating`), `agree_num` = values(`agree_num`), `img_num` = values(`img_num`), `img_url` = values(`img_url`), `is_video` = values(`is_video`), `video_url` = values(`video_url`), `comment_url` = values(`comment_url`), `user_name` = values(`user_name`), `user_img` = values(`user_img`), `country` = values(`country`), `user_page` = values(`user_page`), `is_earns_commissions` = values(`is_earns_commissions`), `comment_time` = values(`comment_time`), `page` = values(`page`), `star` = values(`star`), `vine_review_flag` = values(`vine_review_flag`), `comment_id` = values(`comment_id`), `page_state` = values(`page_state`);"
        self._write_when_online(inset_sql, rows, sql_insert, sql_insert_many)

        asins = list(set(df.asin))
        parent_asins = list(set(df.parent_asin))
        logging.info(
            f"入库成功--> 长度： {len(asins)} asin: {asins[:5]} parent_asin: {parent_asins[:5]}")

    def process_item(self, item, spider):
        """Route one scraped item into the queues, then run a batched flush.

        Recognised item keys:

        * ``finish_spider`` -- flush every queue completely before anything else.
        * ``queues_`` -- a queue whose entries are lists of comment rows; index
          18 of every row is the comment time.
        * ``count_max`` / ``error_asin`` / ``sql_data`` -- enqueue the count
          and/or the prepared state-update tuple.

        Args:
            item: The scraped item (dict-like).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines keep receiving it.
        """
        if item.get("finish_spider"):
            backlog = {k: v.qsize() for k, v in self.q_dict.items()}
            logging.info(f'sleep to queue data save {backlog}')
            self.queue_consumer(q_size="max")

        if q := item.get('queues_'):
            data = []
            for _ in range(q.qsize()):
                data += q.get()
            if data:
                max_comment_time = max(d[18] for d in data)
                self.q_dict.get("comment_queue").put(data)
                # The literal 3 is the `state` value written to the asin table
                # (meaning not visible in this file -- presumably "comments fetched").
                self.q_dict.get("error_queue").put((3, max_comment_time, item.get("asin")))
                self.q_dict.get("comment_count_queue").put(
                    (item.get("asin"), item.get("comment_count"), item.get("star")))
            else:
                # A Queue object is always truthy, so an empty one reaches this
                # branch; max() over no rows would raise ValueError. Nothing is
                # enqueued in that case.
                logging.warning(f"empty comment queue for asin {item.get('asin')}, nothing enqueued")
        elif item.get("count_max") and (not item.get("error_asin")):
            # Add the data to the queues.
            self.q_dict.get("comment_count_queue").put(
                (item.get("asin"), item.get("comment_count"), item.get("star")))
            self.q_dict.get("error_queue").put(item.get("sql_data"))
        elif item.get("error_asin"):
            self.q_dict.get("error_queue").put(item.get("sql_data"))

        self.queue_consumer("min")
        # Scrapy requires process_item to return the item (or raise DropItem).
        return item

    def close_spider(self, spider):
        """Flush everything still queued when the spider shuts down."""
        print(f'{self.site}  爬虫结束，存储最后 数据', {k: v.qsize() for k, v in self.q_dict.items()})
        self.queue_consumer("max")