import re
import queue
import scrapy
import random
import os, sys
import logging
import pandas as pd
from lxml import etree
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# Amazon historical review crawler
from amazon_spider.spiders.yswg_spider import SourceSpider


class CommentVineSpider(SourceSpider):
    """Count total, valid and Vine reviews per ASIN via the review AJAX endpoint.

    Seed rows are loaded from an Excel workbook. Every row of the active site
    is marked ``state == 1`` at start-up; a row is switched to ``state == 3``
    once its review pages were aggregated (or the page reported no reviews).
    Whenever the engine goes idle the rows still in state 1 are scheduled
    again, and the workbook is written back when the spider closes.
    """

    name = 'comment_vine_api'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # Proxy settings for curl_cffi (PROXY_HOST / PROXY_PORT / PROXY_USER /
        # PROXY_PASS) can be added here when a proxy is required.
        'CONCURRENT_REQUESTS': 3,
        'DOWNLOAD_TIMEOUT': 10,
        # When enabled, Scrapy waits a random 0.5x-1.5x multiple of
        # DOWNLOAD_DELAY between requests to the same site. DOWNLOAD_DELAY is
        # not set here.
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): Scrapy reads ``allowed_domains`` from a spider
        # attribute, not from settings, so this entry likely has no effect --
        # confirm before relying on it.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # Downloader middlewares for this spider.
        'DOWNLOADER_MIDDLEWARES': {
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in descending order
            'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {},
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
    }

    # Fixed cookies attached to every review request.
    _EXTRA_COOKIES = {
        'csm-sid': '916-1904410-6680838',
        'x-amz-captcha-1': '1706092664884599',
        'x-amz-captcha-2': '5OU/Q7DyDmtg9QGCbivNXg==',
    }

    def __init__(self, site='us'):
        """Load the seed workbook and mark every row of the active site as pending.

        Args:
            site: Key into ``self.site_url`` (provided by the base class).

        Raises:
            ValueError: If ``site`` has no entry in ``self.site_url``.
        """
        super(CommentVineSpider, self).__init__()
        self.site = site
        self.url_ = self.site_url.get(self.site)
        if not self.url_:
            raise ValueError(f"{type(self).__name__} site error")
        self.update_cookies()
        self.comment_headers = None
        # Pool of optional client-hint headers; one random entry is added to
        # each request.
        # NOTE(review): the '^\\^' sequences look like cmd.exe escaping left
        # over from a copied curl command -- confirm they are intended.
        self.h_dict = {
            'rtt': '100',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '1',
            'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '^\\^Windows^\\^',
            'sec-ch-viewport-width': '1587',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'viewport-width': '1587',
            'device-memory': '8',
            'dpr': '1',
            'ect': '4g',
        }
        self.utils_requests = [{"use_aiohttp": True}, {"curlcffi": True}, {"use_httpx": True}]

        # Results are written back into the same workbook the seeds come from.
        self.seeds_file = './comment_seed.xlsx'
        self.save_file = './comment_seed.xlsx'
        self.seeds = pd.read_excel(self.seeds_file, dtype={'asin': str, 'sku': str})
        # NOTE(review): the workbook site label is hard-coded and independent
        # of the ``site`` argument -- confirm both are kept in sync by callers.
        self.sssite = 'Amazon.it'
        self.seeds.loc[self.seeds['站点'] == self.sssite, 'state'] = 1
        self.seeds_new = self._pending_seeds()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook its close / idle handlers to the crawler signals."""
        spider = super(CommentVineSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def _pending_seeds(self):
        """Return the seed rows of the active site that are still in state 1."""
        return self.seeds[(self.seeds['state'] == 1) & (self.seeds['站点'] == self.sssite)]

    def _record_result(self, asin, total, valid, vine):
        """Write the review counters for ``asin`` into the seeds and set state 3."""
        rows = (self.seeds['asin'] == asin) & (self.seeds['站点'] == self.sssite)
        self.seeds.loc[rows, '总评论数(修订)'] = total
        self.seeds.loc[rows, '有效评论数'] = valid
        self.seeds.loc[rows, 'vine评论数'] = vine
        self.seeds.loc[rows, 'state'] = 3

    def _save_seeds(self):
        """Write the seed table to ``self.save_file``."""
        # ``encoding`` is intentionally not passed: pandas 2.0 removed that
        # argument from ``DataFrame.to_excel``.
        self.seeds.to_excel(self.save_file, index=False)

    @staticmethod
    def _unescape_body(text):
        """Strip literal ``\\n`` sequences and unescape quotes in the AJAX payload."""
        return text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace(
            '\\"', '"')

    @staticmethod
    def _review_form(asin, star, page_number):
        """Build the form body for one page of the review list."""
        return {
            'sortBy': 'recent',
            'reviewerType': 'all_reviews',
            'pageNumber': str(page_number),
            'shouldAppend': 'undefined',
            'deviceType': 'desktop',
            'canShowIntHeader': 'undefined',
            'reftag': 'cm_cr_arp_d_viewopt_srt',
            'pageSize': '10',
            'asin': asin,
            'scope': 'reviewsAjax',
            'filterByStar': star,
        }

    def _review_request(self, asin, star, page_number, meta, callback):
        """Build the ``FormRequest`` for one review page with fresh headers and cookies.

        The URL fragment only carries star / ASIN / page for identification;
        the actual parameters travel in the form body.
        """
        cookies = self.update_site_cookie(self.json_cookies(self.site))
        self.comment_headers = self.get_comment_headers(self.url_, asin)
        h_key = random.choice(list(self.h_dict))
        self.comment_headers[h_key] = self.h_dict.get(h_key)
        self.comment_headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
        cookies.update(self._EXTRA_COOKIES)
        self.update_comment_headers()
        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        return scrapy.FormRequest(
            url=url + f'#star={star}#asin={asin}#pageNumber={page_number}',
            headers=self.comment_headers, cookies=cookies,
            formdata=self._review_form(asin, star, page_number),
            callback=callback, errback=self.err_parse, meta=meta, dont_filter=True)

    def spider_idle(self, spider):
        """Schedule page 1 for every pending seed, or save the results when none remain.

        Connected to the ``spider_idle`` signal. Returning normally while
        requests were scheduled keeps the crawl going; returning with nothing
        scheduled lets the engine close the spider.
        """
        logging.debug(f'IDLE------{self.site}------------')
        self.update_comment_headers()

        self.seeds_new = self._pending_seeds()
        logging.info(f"-----长度{self.seeds_new.shape}")
        if self.seeds_new.shape[0] == 0:
            logging.info("爬取完成")
            logging.info("spider finish")
            self._save_seeds()
            return

        star_list = ["all_stars"]
        for num, asin in enumerate(self.seeds_new['asin']):
            logging.info(f"num {num}  长度{self.seeds_new.shape}")
            for priority, star in enumerate(star_list):
                queue_name = asin + star
                # A fresh queue is created for every attempt, so the page
                # counter has to restart too; otherwise leftovers from a
                # partially failed attempt trigger aggregation too early.
                self.crawler.stats.set_value(queue_name, 0)
                meta = {
                    'priority': priority,
                    "queue_name": queue_name,
                    "star": star,
                    "asin": asin,
                    "pageNumber": '1',
                    "ck": True,
                    queue_name: queue.Queue(),
                    "history_comment_count": 0,
                    'choice_header': True,
                }
                meta = self.random_r(meta)
                self.crawler.engine.crawl(
                    self._review_request(asin, star, '1', meta, self.parse), self)

    def parse(self, response, **kwargs):
        """Handle page 1: read the review count, emit page 1 and request pages 2..10.

        When the count cannot be found, or is zero, the seed row is recorded
        with zero counters and marked done.
        """
        comment_num = re.findall(
            "(rating|ratings|Gesamtbewertungen|totales|total|totali|totale|总评分), (.*?) (带评论|with|mit|con|avec)",
            response.text)
        if not comment_num:
            # Count marker missing: treated as an anti-bot / empty page.
            logging.info(f'页面变狗：{response.meta.get("star")} {response.meta.get("asin")}')
            self._record_result(response.meta['asin'], 0, 0, 0)
            return

        # repr() exposes non-printable separators as \uXXXX so they can be
        # stripped together with thousands separators and the repr quotes.
        comment_count = re.sub(r"\\u.{4}", '', repr(comment_num[0][1])).replace(",", "").replace(
            ".", "").replace("'", "")
        total_num = re.findall(r"(\d+) total", response.text)
        logging.info(
            f'{response.meta.get("asin")}历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_count}')

        if comment_count == "0":
            logging.info(f'评论数 为0 页面变狗：{response.meta.get("asin")}')
            self._record_result(response.meta['asin'], 0, 0, 0)
            return

        body = self._unescape_body(response.text)
        page_count = 1 if response.meta.get('error_page') else self.get_page_num(int(comment_count), 10)
        # At most ten pages are fetched per ASIN.
        last_page = 10 if page_count >= 10 else page_count
        response.meta["page_count"] = last_page
        response.meta["comment_num"] = comment_count
        response.meta['total_num'] = total_num[0] if total_num else None

        item = self.xpath_res(body, response)
        item["save_comment_num"] = True
        yield item

        for page in range(2, last_page + 1):
            meta = self.random_r(response.meta)
            meta["pageNumber"] = str(page)
            meta["priority"] = page
            meta['total_num'] = total_num[0] if total_num else None
            yield self._review_request(
                response.meta.get("asin"), response.meta["star"], str(page), meta, self.parse_data)

    def parse_data(self, response):
        """Handle pages 2+: extract the reviews unless the page carries the block marker."""
        body = self._unescape_body(response.text)
        if "@@@@@@@@@@@@@@@@@@@@___" in body:
            msg = self.r_utils(response)
            logging.info(
                f"页面 @@@@@@@@@@@@@ asin为: {response.meta.get('asin')} {response.meta.get('star')} 爬取页数 {response.meta.get('pageNumber')} 请求： {msg}")
            return None
        return self.xpath_res(body, response)

    def comment_time(self, time_msg):
        """Extract the date part from a review-date line for the current site.

        Args:
            time_msg: Sequence whose first element is the review-date text.

        Returns:
            The site-specific date substring; for texts containing ``年`` the
            date is reformatted as ``YYYY-MM-DD``.
        """
        raw = time_msg[0]
        if self.site == "de":
            parts = raw.split(".")
            return (parts[0][-2:] + "." + parts[1]).strip()
        if self.site == "es":
            return raw.split("el ")[-1].strip()
        if self.site == "fr":
            return raw.split("le ")[1].strip()
        if self.site == "it":
            return raw.split("il ")[1].strip()
        if "年" in raw:
            from datetime import datetime
            return datetime.strptime(raw.split(" ")[0], "%Y年%m月%d日").strftime("%Y-%m-%d")
        return raw.split("on")[1].strip()

    def _helpful_votes(self, helpful):
        """Convert the helpful-vote text nodes of one review into an integer (0 if unparsable)."""
        if not helpful:
            return 0
        # On the Spanish site the number is the second word of the sentence.
        position = 1 if self.site == "es" else 0
        token = helpful[0].split(" ")[position].strip()
        if token == "One":
            return 1
        try:
            return int(token)
        except ValueError:
            return 0

    def xpath_res(self, r, response):
        """Extract the reviews of one page and aggregate once all pages have arrived.

        Each page's ``[vine, helpful_votes]`` rows are put on the per-ASIN
        queue carried in ``response.meta`` and a per-ASIN stats counter is
        incremented. When the counter equals ``page_count`` the queue is
        drained, the Vine-labelled reviews are counted and the seed row is
        updated.

        Args:
            r: Unescaped response body.
            response: The response whose ``meta`` carries queue and counters.

        Returns:
            ``{"asin": <asin>}``.
        """
        stats = self.crawler.stats
        meta = response.meta
        queue_name = meta.get("queue_name")

        tree = etree.HTML(r)
        # One node per review (up to ten per page).
        items = []
        for node in tree.xpath("//div[@class='a-section review aok-relative']"):
            review = self._helpful_votes(
                node.xpath(".//span[@data-hook='helpful-vote-statement']//text()"))
            vine = node.xpath(".//span[@class=\"a-color-success a-text-bold\"]//text()")
            items.append([vine[0] if vine else None, review])

        msg = self.r_utils(response)
        logging.info(f"成功爬取数据为: {msg} {len(items)} {items}")
        stats.inc_value(queue_name)
        meta.get(queue_name).put(items)

        if int(stats.get_value(queue_name)) == int(meta.get('page_count')):
            stats.set_value(queue_name, 0)
            page_queue = meta.get(queue_name)
            logging.info(
                f"页数爬取成功asin为: {response.meta.get('asin')} {msg} {response.meta.get('star')} 爬取页数 {page_queue.qsize()} 数据长度： {response.meta.get('comment_num')}")
            rows = []
            for _ in range(page_queue.qsize()):
                rows += page_queue.get()
            df = pd.DataFrame(rows, columns=['vine', 'review'])
            # count() ignores None, i.e. it counts reviews carrying the label.
            vine_count = df['vine'].count()
            logging.info(f"vine count: {vine_count}")
            self._record_result(meta['asin'], meta['total_num'], meta['comment_num'], vine_count)

        return {"asin": meta.get("asin")}

    def err_parse(self, failure, **kwargs):
        """Log a failed request; the seed stays pending and is retried on the next idle."""
        logging.info(f"error______ {failure.getErrorMessage()}")

    def close(self, spider, reason):
        """Persist the seed table when the spider is closed."""
        logging.info("spider finish")
        self._save_seeds()


if __name__ == '__main__':
    # Launch this spider through Scrapy's command line with site=it.
    cmdline.execute(['scrapy', 'crawl', 'comment_vine_api', '-a', 'site=it'])

# source activate pyspark
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl amazon_comment_history -a site=us  > amazon.log 2>&1 &
# nohup scrapy crawl amazon_comment_all -a site=us > amazon_history_comment_us1.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl amazon_comment_all" |awk '{print $2}' `; do kill -9 $i ; done;
# nohup /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
# nohup scrapy crawl amazon_comment -a site=de > amazon_de1.log 2>&1 &



