import re
import json
import time
import queue
import redis
import scrapy
import random
import logging
import pandas as pd
import os, sys, platform
from scrapy import cmdline, signals, Selector
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.utils.utils import time_ch
from amazon_spider.spiders.yswg_spider import SourceSpider
from amazon_spider.db.redis_db import srandmembers, xadd_db0, lpop, lpush
# 亚马逊历史评论获取

# time.tzset() is only available on Unix, so it is skipped on Windows.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonCommentRealSpider(SourceSpider):
    name = 'amazon_comment_real_spider'  # 评论数据更新
    # Per-spider Scrapy settings.
    # Setting names are matched exactly by Scrapy: the former keys
    # 'CONCURRENT_REQUESTS ' and 'AUTOTHROTTLE_ENABLED ' ended with a space and
    # were therefore ignored (global concurrency stayed at 30 and AutoThrottle
    # never ran). They are spelled correctly below and the shadowed
    # 'CONCURRENT_REQUESTS': 30 entry is dropped in favour of the documented 8.
    custom_settings = {
        # Proxy settings for curl_cffi (currently disabled).
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'DOWNLOAD_TIMEOUT': 10,
        'DOWNLOAD_DELAY': 0.5,  # base delay between requests: 0.5 seconds
        'CONCURRENT_REQUESTS': 8,  # global concurrent request limit
        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,  # concurrent requests per domain
        'CONCURRENT_REQUESTS_PER_IP': 4,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,  # initial delay: 1 second
        'AUTOTHROTTLE_MAX_DELAY': 10,  # maximum delay: 10 seconds
        'LOG_LEVEL': 'INFO',
        # When enabled, Scrapy waits a random time between 0.5x and 1.5x
        # DOWNLOAD_DELAY between requests to the same site.
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): allowed_domains is normally a spider attribute, not a
        # setting; as a settings key it is probably inert - confirm intent.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 10,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # invoked in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # invoked in descending order
            'amazon_spider.middlewares.SelfAllUpdateCookiesUrl': 530,
            # 'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_self_comment_update_pipe.AmazonCommentNewsSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Set up per-site state, the optional-header pool and the login-cookie table.

        Args:
            site: Marketplace code; it must map to a truthy entry in
                ``self.site_url``.

        Raises:
            ValueError: If ``self.site_url`` has no usable entry for ``site``.
        """
        super().__init__()
        self.site = site
        # Field names of a review record, in positional order.
        self.cols_list = [
            'asin', 'parent_asin', 'title', 'content', 'is_vp', 'model',
            'rating', 'agree_num', 'img_num', 'img_url', 'is_video',
            'video_url', 'comment_url', 'user_name', 'user_img', 'country',
            'user_page', 'is_earns_commissions', 'comment_time', 'page',
            'md5_unique', 'star',
        ]
        base_url = self.site_url.get(self.site)
        if not base_url:
            raise ValueError(f"{type(self).__name__} site error")
        self.url_ = base_url
        self.update_cookies()
        self.comment_headers = None
        # Optional client-hint style headers; request builders add one of
        # them, picked at random, to every request.
        self.h_dict = dict([
            ('rtt', '100'),
            ('sec-ch-device-memory', '8'),
            ('sec-ch-dpr', '1'),
            ('sec-ch-ua', '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google'),
            ('sec-ch-ua-mobile', '?0'),
            ('sec-ch-ua-platform', '^\\^Windows^\\^'),
            # ('sec-ch-ua-platform-version', '^\\^10.0.0^\\^'),
            ('sec-ch-viewport-width', '1587'),
            ('sec-fetch-dest', 'empty'),
            ('sec-fetch-mode', 'cors'),
            ('sec-fetch-site', 'same-origin'),
            ('viewport-width', '1587'),
            ('device-memory', '8'),
            # ('downlink', '10'),
            ('dpr', '1'),
            ('ect', '4g'),
        ])
        # self.utils_requests = [{"use_aiohttp": True}, {"curlcffi": True}, {"use_httpx": True}, {}]
        # Sample task payload (get_seeds reads from Redis instead).
        self.seeds = [
            '{"siteName": "us", "taskId": "commentSpider_admin_1736132784", '
            '"asin": [{"asin": "B0CBV1V8Y7"}], "star": "all_stars", '
            '"error_count": 54, "totalReviews": "0", "totalRatings": "0"}'
        ]
        self.utils_requests = [{}]
        # Login-cookie values; each one gets a bookkeeping row below.
        cookie_values = (
            '6IxhED1H6FMQTTmG7nT?LAez1MVVqPrelGSwprpEnnPP@sV8RLqnFVdp?l9IvyNj',
            "VmWEZcN1aeXE6Ki4vhW2OzY62prS7lQHK@7qGtFaeGxK@9u34S1ki58e4FLwmUkh",
            "R5bEU6WFaJnTMdBRGTDFIVoVDGnSQFnf78wW3yV5QMO6bOasSvAfwc@3ZQmV4tCx",
            # 'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
            # 'fMhny8K1Vx?jc1p2x?HKEWkUhbDqRtwZvELLNc9wpQ1CcYqf6bazr4NFmTuYMHt1',
            'VvTAIBiLWOtpU8eTIXfbBxWET4KRaZONjUSWcH6eSgnIVDutpoSpt@1qDTd@xPe0',
            # 'lJpfQtU3gFL0MdP0QB2npl07xPQplgjYMXuzH2KPM4F8z@f0saUFtSKNo1PHkweC',
        )
        # 'main' must stay the first key: other methods read the cookie value
        # positionally from the first DataFrame column.
        self.log_cookie = [
            {
                'main': value,
                'status': 1,
                'ban_time': int(time.time()),
                'error_404_count': 0,
                'sleep_time': int(time.time()),
            }
            for value in cookie_values
        ]
        self.log_cookie_df = pd.DataFrame(self.log_cookie)
        # The login cookie is named differently on every marketplace:
        # 'x-main' on us, 'x-acb<site>' elsewhere.
        acb_sites = (
            "de", "uk", "it", "es", "fr", "mx", "ca",
            "ae", "au", "tr", "be", "jp", "nl", "pl", "se",
        )
        self.log_cookie_name = {"us": 'x-main'}
        self.log_cookie_name.update((code, f'x-acb{code}') for code in acb_sites)
        self.time_statr = time.time()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook it up to the closed/idle signals.

        Args:
            crawler: The Scrapy crawler the spider is bound to.
            *args: Forwarded to the parent ``from_crawler``.
            **kwargs: Forwarded to the parent ``from_crawler``.

        Returns:
            The spider instance, with ``close`` connected to
            ``spider_closed`` and ``spider_idle`` connected to ``spider_idle``.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        subscriptions = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def get_seeds(self):
        """Pop the next task from the ``AsinCommentSpiderList`` Redis list.

        Retries until ``lpop`` succeeds: Redis connection/timeout errors wait
        5 seconds before the next attempt, a ``FunctionTimedOut`` retries
        immediately.

        Returns:
            Whatever ``lpop`` returned (callers treat a falsy value as
            "no task available").
        """
        redis_errors = (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError)
        while True:
            try:
                # task = self.seeds.pop()
                task = lpop('AsinCommentSpiderList')
            except redis_errors as exc:
                logging.info(f"获取任务成功失败{exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"获取任务成功超时 {exc}")
            else:
                logging.info("获取任务成功")
                return task

    def save_redis(self, key, data):
        """Write ``data`` to ``key`` through ``xadd_db0``, retrying until it succeeds.

        Redis connection/timeout errors wait 5 seconds before the next
        attempt, a ``FunctionTimedOut`` retries immediately.

        Args:
            key: Target key passed to ``xadd_db0``.
            data: Payload passed to ``xadd_db0``.
        """
        redis_errors = (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError)
        saved = False
        while not saved:
            try:
                xadd_db0(key, data)
            except redis_errors as exc:
                logging.info(f"save 失败{exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"save 超时 {exc}")
            else:
                logging.info("save 成功")
                saved = True

    def cookie_unblock(self):
        logging.info(f"异常cookie数 状态2{self.log_cookie_df[(self.log_cookie_df['status'] == 2)].shape}\n{self.log_cookie_df[(self.log_cookie_df['status'] == 4)].shape}")
        self.log_cookie_df.loc[((self.log_cookie_df['error_404_count'] > 8)), ['status', 'ban_time', 'error_404_count']] = [4, int(time.time()), 0]
        # 初始化异常次数
        self.log_cookie_df['error_404_count'] = 0
        self.log_cookie_df.loc[(self.log_cookie_df['status'] == 4) & (int(time.time()) >= (self.log_cookie_df['ban_time'] + 3600)), ['status', 'ban_time']] = [1, int(time.time())]
        self.log_cookie_df.loc[(self.log_cookie_df['status'] == 2) & (int(time.time()) >= (self.log_cookie_df['sleep_time'] + 1000)), ['status', 'sleep_time']] = [1, int(time.time())]
        logging.info(f"生效cookie数： {self.log_cookie_df[(self.log_cookie_df['status'] == 1)].shape}")

    def spider_idle(self, spider):
        """Feed the crawl with new work whenever the engine runs idle.

        Connected to the ``spider_idle`` signal. Each call:

        1. Puts the cookie in use to rest (status 2) once the
           ``up_requests_num`` stat reaches 200, then resets that stat.
        2. Calls ``cookie_unblock`` to lift expired bans/rests.
        3. With no usable cookie left: schedules a keep-alive request, blocks
           until the earliest resting/banned cookie is due and keeps the
           spider open.
        4. Otherwise pops one task and schedules one review request per ASIN
           in it; with no task it schedules a keep-alive request, pauses 3 s
           and keeps the spider open.

        Args:
            spider: Spider passed by the signal (unused, ``self`` is used).

        Raises:
            DontCloseSpider: While waiting for a cookie or for a new task.
        """
        stats = self.crawler.stats
        logging.info(f'IDLE------{self.site}------------{stats.get_value("up_requests_num")}')
        if (stats.get_value('up_requests_num') or 0) >= 200:
            active = self.log_cookie_df[self.log_cookie_df['status'] == 1]
            # Without a usable cookie there is nothing to rotate; the
            # unguarded ``iloc[0]`` used to raise IndexError here.
            if not active.empty:
                x_main = active.iloc[0].values[0]
                self.log_cookie_df.loc[(self.log_cookie_df['main'] == x_main), ['status', 'sleep_time']] = [2, int(time.time())]
            stats.set_value('up_requests_num', 0)
        # Reactivate cookies whose ban or rest period has elapsed.
        self.cookie_unblock()
        # No usable cookie: wait until the earliest one becomes usable again.
        if not self.log_cookie_df[(self.log_cookie_df['status'] == 1)].shape[0]:
            sleep_time = self.log_cookie_df[(self.log_cookie_df['status'] == 2)]['sleep_time'].min()
            ban_time = self.log_cookie_df[(self.log_cookie_df['status'] == 4)]['ban_time'].min()
            logging.info(f"sleep_time {sleep_time} ban_time {ban_time}")
            now = int(time.time())
            # The rest period (1000 s) takes precedence over the ban period
            # (3600 s) when cookies of both kinds exist.
            if pd.isna(sleep_time) and pd.isna(ban_time):
                main_sleep_time = 30
            elif pd.isna(sleep_time):
                main_sleep_time = (ban_time + 3600) - now
            else:
                main_sleep_time = (sleep_time + 1000) - now
            # time.sleep() raises ValueError on negative values.
            main_sleep_time = max(0, main_sleep_time)
            self._schedule_keepalive_request()
            logging.info(f'cookie ban sleep {main_sleep_time}s')
            # NOTE(review): time.sleep blocks the whole process while waiting.
            time.sleep(main_sleep_time)
            raise DontCloseSpider()

        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        self.update_comment_headers()
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # Refresh the cookies once a day so long crawls do not run on stale ones.
        if "07:59:00" <= str_time <= "08:02:00":
            self.update_cookies()
        seeds = self.get_seeds()
        if not seeds:
            self._schedule_keepalive_request()
            # The message used to claim 30 s while the code sleeps 3 s.
            logging.info('no task sleep 3s')
            time.sleep(3)
            raise DontCloseSpider()

        data = {
            'sortBy': 'recent',
            'reviewerType': 'all_reviews',
            'pageNumber': '1',
            'shouldAppend': 'undefined',
            'deviceType': 'desktop',
            'canShowIntHeader': 'undefined',
            'reftag': 'cm_cr_arp_d_viewopt_srt',
            'pageSize': '10',
            # 'asin': job['asin'],
            'scope': 'reviewsAjax',
            'formatType': 'current_format',
            'filterByStar': 'all_stars',
        }
        job = json.loads(seeds)
        star = job.get('star') or 'all_stars'
        for i in job['asin']:
            data["asin"] = i['asin']
            data["pageNumber"] = job.get('error_page', '1')
            meta = {
                "star": star,
                "asin": i['asin'],
                "pageNumber": data.get("pageNumber"),
                "ck": True,
                "comment_new_time": job.get('comment_new_time', ''),
                "history_comment_count": job.get('history_comment_count') or 0,
                # 'choice_header': True,
                'error_page': job.get('error_page'),
                'request_data': {
                    'sortBy': 'recent',
                    # 'formatType': 'current_format',
                    "filterByStar": star
                },
                'job': job,
                'asin_list': i,
                'site': job['siteName'],
                'taskId': job['taskId'],
                'SelfAllUpdateCookiesUrl': True,
                # 'proxy': self.get_proxies(),
            }
            meta = self.random_r(meta)
            cookies = self.update_site_cookie(self.json_cookies(meta.get('site')))
            headers = self.get_comment_headers(self.url_, i['asin'])
            h_key = random.choice(list(self.h_dict))
            headers[h_key] = self.h_dict.get(h_key)
            headers['X-Forwarded-For'] = '2.2.2.2,2.2.2.2'
            x_main = self.log_cookie_df[self.log_cookie_df['status'] == 1].iloc[0].values[0]
            print('-----', x_main)
            # Fall back to the US cookie name; the former default 'us' is a
            # site code and produced a cookie literally named "us".
            cookies.update({self.log_cookie_name.get(self.site, 'x-main'): x_main})
            # self.update_comment_headers()
            # Two kinds of task: 'all_stars' first needs the review totals
            # (parse); a single-star task is a retry of a failed star crawl
            # and goes straight to parse_two.
            if star == 'all_stars':
                self.crawler.engine.crawl(
                    scrapy.FormRequest(url=url + f'#star={star}#asin={i["asin"]}#pageNumber={data.get("pageNumber")}',
                                       headers=headers, formdata=data, callback=self.parse, meta=meta,
                                       cookies=cookies, dont_filter=True, errback=self.err_parse), self)
            else:
                data['filterByStar'] = star
                dq = queue.Queue()
                queue_name = meta['asin'] + star
                meta['request_data'] = {
                    'sortBy': 'recent',
                    'formatType': 'current_format',
                    "filterByStar": star
                }
                meta["totalReviews"] = job.get('totalReviews', '0')
                meta["totalRatings"] = job.get('totalRatings', '0')
                meta['queue_name'] = queue_name
                stats.set_value(queue_name, 0)
                meta[queue_name] = dq
                self.crawler.engine.crawl(
                    scrapy.FormRequest(
                        url=url + f'#star={star}#asin={meta["asin"]}#pageNumber={data.get("pageNumber")}',
                        headers=headers, formdata=data, callback=self.parse_two, meta=meta,
                        cookies=cookies, dont_filter=True, errback=self.err_parse), self)

    def _schedule_keepalive_request(self):
        """Schedule the placeholder request issued before the spider pauses."""
        meta = {
            'handle_httpstatus_all': True,
            'SelfAllUpdateCookiesUrl': False,
        }
        request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_,
                                 dont_filter=True, meta=meta)
        self.crawler.engine.crawl(request, spider=self)

    def get_proxies(self):
        """Return a proxy URL built from a random member of the ``ip_lists`` set.

        Returns:
            ``http://<ip>:3389`` for the member returned by ``srandmembers``,
            or an empty string when nothing is returned.
        """
        members = srandmembers('ip_lists', 1)
        if not members:
            return ''
        # Members are bytes and are decoded as UTF-8.
        proxies = f'http://{str(members[0], "utf-8")}:3389'
        print(f"ip --> {proxies}")
        return proxies

    def filter_comment(self, items, response):
        """Drop items older than ``comment_new_time`` from the response meta.

        Args:
            items: Sequences whose element at index 18 is compared with the
                threshold (index 18 is ``comment_time`` in ``cols_list``).
            response: Object whose ``meta`` dict carries ``comment_new_time``.

        Returns:
            A new list with the items whose index-18 value is greater than or
            equal to the threshold.
        """
        request_meta = response.meta
        return [row for row in items if row[18] >= request_meta.get('comment_new_time')]

    def parse(self, response, **kwargs):
        """Handle the first ("all stars") review response of an ASIN.

        Reads the summary line that carries the rating count and the review
        count. When the line is missing or does not contain both numbers
        (treated as an anti-bot page), or when the review count is zero, an
        empty result is written to ``AsinCommentResultStream``. Otherwise one
        request per star filter is yielded, handled by ``parse_two``.

        Args:
            response: Response of the review AJAX endpoint; its ``meta``
                carries the task context (asin, site, taskId, ...).
            **kwargs: Unused.

        Yields:
            scrapy.FormRequest: One request for each of the five star filters.
        """
        stats = self.crawler.stats
        meta = response.meta
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace('\\"', '"')
        sel = Selector(text=r, type="html")
        # Summary line with the rating count and the review count.
        comment_num_data = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()
        logging.info(f"asin {response.meta.get('asin')} 总评论数: {comment_num_data} {response.meta.get('request_data')}")

        def push_empty_result():
            # Report "no reviews" for this task to the result stream.
            api_data = {
                "taskId": meta.get('taskId'),
                "asin": meta.get('asin'),
                "site": meta.get('site'),
                "search": json.dumps(meta.get('request_data')),
                "header": json.dumps({
                    "totalReviews": '0',
                    "totalRatings": '0'
                }),
                "values": json.dumps([])
            }
            self.save_redis("AsinCommentResultStream", api_data)
            # xadd_db5("AsinCommentResultStream", api_data)

        numbers = []
        if comment_num_data:
            # Strip escaped unicode characters and thousands separators
            # before extracting the numbers.
            cleaned = re.sub(r"\\u.{4}", '', comment_num_data.__repr__()).replace(",", "").replace(".", "").replace("'", "")
            numbers = re.findall(r'(\d+)', cleaned)
        if len(numbers) < 2:
            # Anti-bot page, or a summary without both counts; indexing
            # numbers[1] used to raise IndexError in the latter case.
            logging.info(f'页面变狗：{response.meta.get("star")} {response.meta.get("asin")}')
            push_empty_result()
            return
        comment_score_num, comment_num = numbers[0], numbers[1]
        if comment_num == "0":
            logging.info("评论总数为0")
            push_empty_result()
            return

        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        # logging.info(f'{response.meta.get("asin")}历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_num}')
        data = {
            'sortBy': 'recent',
            'reviewerType': 'all_reviews',
            'pageNumber': '1',
            'shouldAppend': 'undefined',
            'deviceType': 'desktop',
            'canShowIntHeader': 'undefined',
            'reftag': 'cm_cr_arp_d_viewopt_srt',
            'pageSize': '10',
            # 'asin': job['asin'],
            'scope': 'reviewsAjax',
            'formatType': 'current_format',
        }
        for star in ("one_star", "two_star", "three_star", "four_star", "five_star"):
            dq = queue.Queue()
            queue_name = meta['asin'] + star

            data["asin"] = meta['asin']
            data["filterByStar"] = star
            meta['queue_name'] = queue_name
            stats.set_value(queue_name, 0)
            meta['star'] = star
            meta[queue_name] = dq
            meta['request_data'] = {
                'sortBy': 'recent',
                'formatType': 'current_format',
                "filterByStar": star
            }
            meta["totalReviews"] = comment_num
            meta["totalRatings"] = comment_score_num
            meta = self.random_r(meta)
            cookies = self.update_site_cookie(self.json_cookies(meta.get('site')))
            headers = self.get_comment_headers(self.url_, meta['asin'])
            h_key = random.choice(list(self.h_dict))
            headers[h_key] = self.h_dict.get(h_key)
            headers['X-Forwarded-For'] = '2.2.2.2,2.2.2.2'
            x_main = self.log_cookie_df[self.log_cookie_df['status'] == 1].iloc[0].values[0]
            print(x_main)
            # Fall back to the US cookie name; the former default 'us' is a
            # site code and produced a cookie literally named "us".
            cookies.update({self.log_cookie_name.get(self.site, 'x-main'): x_main})
            # self.update_comment_headers()
            yield scrapy.FormRequest(url=url + f'#star={star}#asin={meta["asin"]}#pageNumber={data.get("pageNumber")}',
                                     headers=headers, formdata=data, callback=self.parse_two, meta=meta,
                                     cookies=cookies, dont_filter=True, errback=self.err_parse)

    def parse_two(self, response):
        """Handle the first page of one star filter and fan out to pages 2-10.

        When the summary line is missing or does not contain both counts
        (treated as an anti-bot page), or the review count for this star is
        zero, an empty result is written to ``AsinCommentResultStream``.
        Otherwise the first page is passed to ``xpath_res`` and one request
        per remaining page (at most up to page 10) is yielded, handled by
        ``parse_data``.

        Args:
            response: Response of the review AJAX endpoint for one star
                filter; its ``meta`` carries the task context.

        Yields:
            scrapy.FormRequest: One request per follow-up page.
        """
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace('\\"', '"')
        sel = Selector(text=r, type="html")
        # Summary line with the rating count and the review count.
        comment_num_data = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()

        def push_empty_result():
            # Report "no reviews for this star" to the result stream.
            api_data = {
                "taskId": response.meta.get('taskId'),
                "asin": response.meta.get('asin'),
                "site": response.meta.get('site'),
                "search": json.dumps(response.meta.get('request_data')),
                "header": json.dumps({
                    "reviews": '0',
                    "ratings": '0',
                    "totalReviews": response.meta.get('totalReviews'),
                    "totalRatings": response.meta.get('totalRatings'),
                }),
                "values": json.dumps([])
            }
            self.save_redis("AsinCommentResultStream", api_data)
            # xadd_db5("AsinCommentResultStream", api_data)

        numbers = []
        if comment_num_data:
            # Strip escaped unicode characters and thousands separators
            # before extracting the numbers.
            cleaned = re.sub(r"\\u.{4}", '', comment_num_data.__repr__()).replace(",", "").replace(".", "").replace("'", "")
            numbers = re.findall(r'(\d+)', cleaned)
        if len(numbers) < 2:
            # Anti-bot page, or a summary without both counts; indexing
            # numbers[1] used to raise IndexError in the latter case.
            logging.info(f'页面变狗：{response.meta.get("star")} {response.meta.get("asin")}')
            push_empty_result()
            return
        comment_num = numbers[1]
        if comment_num == "0":
            push_empty_result()
            return

        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        # logging.info(f'{response.meta.get("asin")}历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_num}')
        # A retried page ('error_page') is fetched alone, without fan-out.
        page_count = 1 if response.meta.get('error_page') else self.get_page_num(int(comment_num), 10)
        last_page = min(page_count, 10)
        # Makes sure the key exists (None when it was absent).
        response.meta["pageNumber"] = response.meta.get('pageNumber')
        response.meta["page_count"] = last_page
        response.meta["comment_num"] = comment_num
        # The first page is handed to xpath_res; the returned item is not
        # yielded here. NOTE(review): presumably xpath_res stores the result
        # itself - confirm.
        item = self.xpath_res(r, response)
        item["save_comment_num"] = True
        # yield item
        for page_no in range(2, last_page + 1):
            cookies = self.json_cookies(response.meta.get('site'))
            cookies = self.update_site_cookie(cookies)
            data = {
                'sortBy': 'recent',
                'reviewerType': 'all_reviews',
                'pageNumber': str(page_no),
                'shouldAppend': 'undefined',
                'deviceType': 'desktop',
                'canShowIntHeader': 'undefined',
                'reftag': 'cm_cr_arp_d_viewopt_srt',
                'pageSize': '10',
                'asin': response.meta.get("asin"),
                'scope': 'reviewsAjax',
                'filterByStar': response.meta["star"],
                'formatType': 'current_format',
            }
            meta = self.random_r(response.meta)
            meta["pageNumber"] = str(page_no)
            meta["priority"] = page_no
            self.comment_headers = self.get_comment_headers(self.url_, data['asin'])
            h_key = random.choice(list(self.h_dict))
            self.comment_headers[h_key] = self.h_dict.get(h_key)
            x_main = self.log_cookie_df[self.log_cookie_df['status'] == 1].iloc[0].values[0]
            print(x_main)
            # Fall back to the US cookie name; the former default 'us' is a
            # site code and produced a cookie literally named "us".
            cookies.update({self.log_cookie_name.get(self.site, 'x-main'): x_main})
            self.update_comment_headers()
            yield scrapy.FormRequest(url=url+f'#star={response.meta.get("star")}#asin={data.get("asin")}#pageNumber={str(page_no)}', headers=self.comment_headers, cookies=cookies, formdata=data, callback=self.parse_data, errback=self.err_parse_page, meta=meta, dont_filter=True)

    def err_parse_page(self, failure, **kwargs):
        """Errback for follow-up review pages: retry, or report the failure.

        While ``retry_times`` in the request meta is below 8 the request is
        rebuilt with fresh cookies/headers and yielded again. Afterwards an
        empty result carrying an error message is written to
        ``AsinCommentResultStream``.

        NOTE(review): ``retry_times`` is also the meta key maintained by
        Scrapy's RetryMiddleware (RETRY_TIMES is 10 for this spider), so a
        request that exhausted the middleware retries arrives here with a
        count already above the limit - confirm this is intended.

        Args:
            failure: Twisted failure; ``failure.request`` is the failed request.
            **kwargs: Unused.

        Yields:
            scrapy.FormRequest: The rebuilt request when a retry is allowed.
        """
        from urllib.parse import parse_qs
        request = failure.request
        logging.info(f"error______ {failure.getErrorMessage()}, {failure.request.meta.get('asin')}")
        retry_times = request.meta.get('retry_times', 0)
        logging.info(f"重试{retry_times}次")
        max_retry_times = 8
        if retry_times < max_retry_times:
            # Bump the retry counter and schedule the retry.
            request.meta['retry_times'] = retry_times + 1
            cookies = self.json_cookies(request.meta.get('site'))
            cookies = self.update_site_cookie(cookies, request.meta.get('site'))
            x_main = self.log_cookie_df[self.log_cookie_df['status'] == 1].iloc[0].values[0]
            print(x_main)
            # Use the site-specific login-cookie name like the other request
            # builders; it used to be hard-coded to the US name 'x-main'.
            cookies.update({self.log_cookie_name.get(self.site, 'x-main'): x_main})
            comment_headers = self.get_comment_headers(self.url_, request.meta['asin'])
            h_key = random.choice(list(self.h_dict))
            comment_headers[h_key] = self.h_dict.get(h_key)
            # request.cookies = cookies
            # The form fields are recovered from the url-encoded request body.
            yield scrapy.FormRequest(url=request.url, headers=comment_headers, cookies=cookies, formdata=parse_qs(request.body), callback=self.parse_data,
                errback=self.err_parse_page, meta=request.meta, dont_filter=True)
        else:
            # Retry limit reached: give up and report the failure.
            self.logger.error(f'Failed to retry {failure.request.url}')
            api_data = {
                "taskId": request.meta.get('taskId'),
                "asin": request.meta.get('asin'),
                "site": request.meta.get('site'),
                "search": json.dumps(request.meta.get('request_data')),
                "header": json.dumps({
                    "reviews": '0',
                    "ratings": '0',
                    "totalReviews": request.meta.get('totalReviews'),
                    "totalRatings": request.meta.get('totalRatings'),
                }),
                "values": json.dumps([]),
                "errMsg": "请求异常次数过多 请稍后重试"
            }
            # xadd_db5("AsinCommentResultStream", api_data)
            self.save_redis("AsinCommentResultStream", api_data)
    def parse_data(self, response):
        """Parse one follow-up review page.

        Args:
            response: Response of the review AJAX endpoint.

        Returns:
            The result of ``xpath_res`` for the normalised body, or ``None``
            when the body contains the ``@@@...___`` marker (the page is only
            logged in that case).
        """
        # Undo the escaping of the AJAX payload before parsing it as HTML.
        body = response.text.replace('\\n', '')
        body = body.replace('data-hook=\\"review\\"', 'data-hook="review"')
        body = body.replace('\\"', '"')
        if "@@@@@@@@@@@@@@@@@@@@___" not in body:
            return self.xpath_res(body, response)
        msg = self.r_utils(response)
        logging.info(f"页面 @@@@@@@@@@@@@ asin为: {response.meta.get('asin')} {response.meta.get('star')} 爬取页数 {response.meta.get('pageNumber')} 请求： {msg}")
        return None

    def comment_time(self, time_msg, site):
        """Extract the date part from a review-date line.

        Examples of input lines:
            ``Reviewed in Brazil on December 31, 2022``
            ``Rezension aus Deutschland vom 13. Februar 2023``

        Args:
            time_msg: Text of the review-date element.
            site: Marketplace code; ``de``, ``es``, ``fr`` and ``it`` use their
                own separators, every other site is handled as English text
                or as a ``YYYY年MM月DD日`` date.

        Returns:
            The date text; a ``年`` date is normalised to ``YYYY-MM-DD``.
            For the default branch the stripped input is returned when no
            standalone "on" is present.
        """
        if site == "de":
            time_msg = (time_msg.split(".")[0][-2:] + "." + time_msg.split(".")[1]).strip()
        elif site == "es":
            time_msg = time_msg.split("el ")[-1].strip()
        elif site == "fr":
            time_msg = time_msg.split("le ")[1].strip()
        elif site == "it":
            time_msg = time_msg.split("il ")[1].strip()
        else:
            if "年" in time_msg:
                from datetime import datetime
                date_object = datetime.strptime(time_msg.split(" ")[0], "%Y年%m月%d日")
                time_msg = date_object.strftime("%Y-%m-%d")
            else:
                # Split on the standalone word "on" only. A plain
                # split("on") also cut inside country names such as
                # "Indonesia" or "Hong Kong" and raised IndexError when the
                # text contained no "on" at all.
                time_msg = re.split(r"\bon\b", time_msg)[-1].strip()
        return time_msg

    def xpath_res(self, r, response):
        stats = self.crawler.stats
        sel = Selector(text=r, type="html")
        comment_data = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get('')
        if comment_data:
            comment_count = re.sub(r"\\u.{4}", '', comment_data.__repr__()).replace(",", "").replace(".", "").replace("'", "") if comment_data else ''
            comment_count = re.findall(r'(\d+)', comment_count)
        else:
            comment_count = ['0', '0']
        # 10条评论
        datas = sel.xpath("//li[@data-hook='review']")
        items = []
        for i in datas:
            # if not i.xpath(".//span[@data-hook='review-body']/span/text()"):
            #     print("-------", etree.tostring(i))
            title = i.xpath(".//h5/a/span//text()").get("").strip()
            # 用户名称
            user_name = i.xpath(".//span[@class='a-profile-name']//text()").get('').strip()
            # 用户图片链接
            user_img = i.xpath(".//div[@class='a-profile-avatar']/img/@data-src").get('')
            content = i.xpath(".//span[@data-hook='review-body']/span/text()").getall()
            if content:
                content = [i for i in content]
            # 赞同数
            helpful = i.xpath(".//span[@data-hook='helpful-vote-statement']//text()").get()
            if helpful:
                if response.meta.get('site') == "es":
                    review = 1 if helpful.split(" ")[1].strip() == "One" else helpful.split(" ")[1].strip()
                else:
                    review = 1 if helpful.split(" ")[0].strip() == "One" else helpful.split(" ")[0].strip()
            else:
                review = 0
            try:
                review = int(review)
            except:
                review = 0
            # 是否购买
            size_mini = i.xpath(".//span[@class='a-size-mini a-color-state a-text-bold']//text()").get()
            mini = 1 if size_mini else 2
            # 时间 国家
            time_msg = i.xpath(".//span[@data-hook='review-date']//text()").get('')
            # Reviewed in Brazil 🇧🇷 on December 31, 2022
            # Rezension aus Deutschland vom 13. Februar 2023
            time_msg = self.comment_time(time_msg, response.meta.get("site"))
            # 评论内图片
            comment_img = i.xpath(".//div[@class='a-section a-spacing-top-mini cr-lightbox-image-thumbnails']/img/@src").getall()
            # 视频地址
            video_url_list = i.xpath('.//input[contains(@class,"video-url")]/@value').getall()
            if video_url_list:
                video_url = ';'.join(video_url_list)
            else:
                video_url = ""
            video_len_num = len(video_url_list)
            # 评论人主页连接
            user_page = i.xpath(".//div[@data-hook='genome-widget']/a/@href").get('')

            rat = i.xpath(".//span[@class='a-icon-alt']//text()").get()

            rating = rat.split(".")[0].replace("stars", "").strip() if len(rat.split(".")[0].replace("stars", "").strip()) == 1 else rat.split(",")[0].replace("stars", "").strip()
            # 评论链接
            comment_url = i.xpath(".//*[@data-hook='review-title']/@href").get("")
            # if (not comment_url) and (not i.xpath('./@id')):
            #     logging.info("not get comment_url filter ------")
            #     continue
            variat_asin = i.xpath(".//a[@data-hook='format-strip']/@href").get()
            variat_asin = re.findall(r"product-reviews/(.*?)(?:\/|\?)", variat_asin)[0] if variat_asin else ""
            star_list = {
                '1': "one_star",
                '2': "two_star",
                '3': "three_star",
                '4': "four_star",
                '5': "five_star",
            }
            vine = i.xpath(".//span[@class=\"a-color-success a-text-bold\"]//text()").get('')
            item = {
                "asin": variat_asin or response.meta.get("asin"),
                # "parent_asin": response.meta.get("asin"),
                "title": title,
                "content": "".join(content).strip() if content else "",
                # 是否确认购买
                "is_vp": str(mini),
                # 型号如color、size、style
                "model": "|-|".join(i.xpath(".//a[@data-hook='format-strip']//text()").getall()).strip().replace("What's this?", "") if i.xpath(".//a[@data-hook='format-strip']//text()") else "",
                # 星级评分
                "rating": rating,
                # 赞同数
                "agree_num": int(review),
                # 评论图片数量
                "img_num": len(comment_img),
                # 图片URL
                "img_url": ",".join(comment_img),
                # 是否有视频（1是2否）
                "is_video": str(1 if video_url else 2),
                # 视频地址
                "video_url": video_url,
                # 评论链接
                "comment_url": comment_url,
                # 评论人名称
                "user_name": user_name,
                # 评论人头像图片链接
                "user_img": user_img,
                # 所属国家
                "country": response.meta.get('site'),
                # # 所属国家
                # "country": time_msg[0].split("on")[0].split(" ")[-2].strip(),
                # 评论人主页URL
                "user_page": user_page,
                # '是否是红人计划链接(1是2否含有“Earns Commissions”标签的评论人主页)',
                "is_earns_commissions": "",
                # '评论时间',
                "comment_time": time_msg,
                # '评论时间',
                "comment_time_format": time_ch(response.meta.get('site'), time_msg),
                "page": int(response.meta.get("pageNumber")),
                # "star": star_list[rating],
                'vine_review_flag': '1' if vine else '2',
            }
            item = {k: v or None for k, v in item.items()}
            item["comment_id"] = i.xpath("./@id").get()
            item["page_state"] = 1 if item["comment_url"] else 2
            # items.append(list(item.values()))
            items.append(item)
        msg = self.r_utils(response)
        logging.info(f"成功爬取数据为: {msg} {len(items)} {items}")
        stats.inc_value(response.meta.get("queue_name"))
        # 有部分页面 显示11条评论  实际只有10条  当获取第2页时无数据
        response.meta.get(response.meta.get("queue_name")).put(items)
        msgs = {}
        if int(stats.get_value(response.meta.get("queue_name"))) == int(response.meta.get('page_count')):
            stats.set_value(response.meta.get("queue_name"), 0)
            logging.info(f"页数爬取成功asin为: {response.meta.get('asin')} {msg} {response.meta.get('star')} 爬取页数 {response.meta.get(response.meta.get('queue_name')).qsize()} 数据长度： {response.meta.get('comment_num')}")
            # msgs["queues_"] = response.meta.get(response.meta.get("queue_name"))
            # msgs["comment_count"] = response.meta.get("comment_num")
            # msgs["star"] = response.meta.get("star")
            comment_all = [response.meta.get(response.meta.get("queue_name")).get() for i in range(response.meta.get(response.meta.get("queue_name")).qsize())]
            flattened_list = [item for sublist in comment_all if sublist for item in sublist]
            if flattened_list:
                df = pd.DataFrame(flattened_list)
                df.drop_duplicates(['asin', 'comment_id'], inplace=True)
                df['agree_num'] = df['agree_num'].fillna("0")
                df['img_num'] = df['img_num'].fillna("0")
                api_item = df.to_dict(orient='records')
                api_data = {
                    "taskId": response.meta.get('taskId'),
                    "asin": response.meta.get('asin'),
                    "site": response.meta.get('site'),
                    "search": json.dumps(response.meta.get('request_data')),
                    "header": json.dumps({
                        "reviews": comment_count[1],
                        "ratings": comment_count[0],
                        "totalReviews": response.meta.get('totalReviews'),
                        "totalRatings": response.meta.get('totalRatings'),
                    }),
                    "values_len": len(api_item),
                    "values": json.dumps(api_item)
                }
            else:
                api_data = {
                    "taskId": response.meta.get('taskId'),
                    "asin": response.meta.get('asin'),
                    "site": response.meta.get('site'),
                    "search": json.dumps(response.meta.get('request_data')),
                    "header": json.dumps({
                        "reviews": comment_count[1],
                        "ratings": comment_count[0],
                        "totalReviews": response.meta.get('totalReviews'),
                        "totalRatings": response.meta.get('totalRatings'),
                    }),
                    "values": json.dumps([])
                }
            logging.info("xadd ---")
            # xadd_db5("AsinCommentResultStream", api_data)
            self.save_redis("AsinCommentResultStream", api_data)
        msgs["asin"] = response.meta.get("asin")
        msgs["items"] = items
        return msgs

    def err_parse(self, failure, **kwargs):
        """Handle a failed request by pushing its job back onto ``AsinCommentSpiderList``.

        The job dict stored in the failed request's meta is updated in place
        with the request's ``asin_list`` (wrapped in a one-element list),
        ``star``, ``totalReviews`` and ``totalRatings`` (both defaulting to
        ``'0'``), its ``error_count`` is incremented (starting at 1), and the
        JSON-serialised result is pushed with ``lpush``. The push is retried
        without limit until it succeeds.

        :param failure: failure object exposing ``request`` and
            ``getErrorMessage()`` (presumably a Twisted ``Failure`` handed to
            a Scrapy errback -- confirm against the request construction).
        :param kwargs: accepted for signature compatibility; unused here.
        """
        meta = failure.request.meta
        logging.info(f"error______ {failure.getErrorMessage()}, {failure.request.meta.get('asin')}")

        # Rebuild the seed from the request's own meta; assignment order is
        # kept so the serialised key order does not change.
        job = meta.get("job")
        job['asin'] = [meta.get("asin_list")]
        job['star'] = meta.get("star")
        job['totalReviews'] = meta.get('totalReviews', '0')
        job['totalRatings'] = meta.get('totalRatings', '0')

        # A missing or falsy counter restarts at 1.
        job["error_count"] = (job.get('error_count') or 0) + 1
        payload = json.dumps(job)

        pushed = False
        while not pushed:
            try:
                lpush("AsinCommentSpiderList", payload)
            except redis.exceptions.ConnectionError as exc:
                # Connection problems wait 5 seconds before the next attempt.
                logging.info(f"push AsinCommentSpiderList ConnectionError，afresh push --> T_T {exc}")
                time.sleep(5)
            except (FunctionTimedOut, redis.exceptions.TimeoutError) as exc:
                # Timeouts are retried immediately.
                logging.info(f"push AsinCommentSpiderList time out --> T_T {exc}")
            else:
                logging.info("push AsinCommentSpiderList succeed")
                pushed = True


if __name__ == '__main__':
    # Run this spider for the US site through Scrapy's command-line entry
    # point when the file is executed directly.
    cmdline.execute(['scrapy', 'crawl', 'amazon_comment_real_spider', '-a', 'site=us'])

# Deployment notes (shell commands):
# Activate the conda environment:
#   source activate pyspark
# Start the spider in the background from the project directory:
#   cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_comment_real_spider -a site=us > amazon_real_api1.log 2>&1 &
#   nohup scrapy crawl amazon_comment_real_spider -a site=us > amazon_real_api1.log 2>&1 &
# Kill every running instance of the spider:
#   for i in `ps -ef|grep "scrapy crawl amazon_comment_real_spider" |awk '{print $2}' `; do kill -9 $i ; done;
# Start the spider using the environment's scrapy binary by absolute path:
#   cd /mnt/hezhe/amazon_spider/amazon_spider && nohup /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment_real_spider -a site=us > amazon_real_api1.log 2>&1 &
#   nohup scrapy crawl amazon_comment_real_spider -a site=us > amazon_real_api1.log 2>&1 &



