import re
import json
import queue
import time
import redis
import scrapy
import random
import logging
import pandas as pd
import os, sys, platform
from scrapy import Selector
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import lpop, lpush, srandmembers
# Fetch Amazon historical reviews
from amazon_spider.spiders.yswg_spider import SourceSpider

# time.tzset() is only called off Windows; on Windows the script just prints a marker.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonCommentCountSpider(SourceSpider):
    name = 'amazon_comment_count_spider'  # refreshes review/rating counts
    # Per-spider Scrapy settings.
    # NOTE(review): two keys used to be written with a trailing space
    # ('CONCURRENT_REQUESTS ' and 'AUTOTHROTTLE_ENABLED '), so Scrapy never read them:
    # the global concurrency stayed at an earlier 'CONCURRENT_REQUESTS': 30 entry and
    # AutoThrottle was never switched on even though its delays were configured.
    # The keys are spelled correctly below and the duplicate concurrency entry is gone.
    custom_settings = {
        # curl_cffi proxy settings (currently disabled)
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'DOWNLOAD_TIMEOUT': 10,
        'DOWNLOAD_DELAY': 0.5,  # base delay between requests, in seconds
        'CONCURRENT_REQUESTS': 8,  # global cap on concurrent requests
        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,  # per-domain cap on concurrent requests
        'CONCURRENT_REQUESTS_PER_IP': 4,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,  # initial delay: 1 second
        'AUTOTHROTTLE_MAX_DELAY': 10,  # maximum delay: 10 seconds
        'LOG_LEVEL': 'INFO',
        # When enabled, Scrapy waits a random interval of 0.5x-1.5x DOWNLOAD_DELAY
        # between requests to the same site.
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): 'allowed_domains' is a spider attribute, not a Scrapy setting,
        # so this entry presumably has no effect here; kept to avoid changing behaviour.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            # 'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            # 'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in descending order
            'amazon_spider.middlewares.SelfAllUpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
        },
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_self_comment_update_pipe.AmazonCommentNewsSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
    }

    def __init__(self, site='us'):
        """Set up per-site state, request header fragments and the login-cookie pool.

        Args:
            site: marketplace code looked up in ``self.site_url`` (which this class
                does not define itself — presumably inherited from ``SourceSpider``).

        Raises:
            ValueError: if ``self.site_url`` has no truthy entry for ``site``.
        """
        super(AmazonCommentCountSpider, self).__init__()
        self.site = site
        # Column names of a review record.
        self.cols_list = [
            'asin', 'parent_asin', 'title', 'content', 'is_vp', 'model', 'rating',
            'agree_num', 'img_num', 'img_url', 'is_video', 'video_url', 'comment_url',
            'user_name', 'user_img', 'country', 'user_page', 'is_earns_commissions',
            'comment_time', 'page', 'md5_unique', 'star',
        ]
        base_url = self.site_url.get(self.site)
        if not base_url:
            raise ValueError(f"{type(self).__name__} site error")
        self.url_ = base_url
        self.update_cookies()
        self.comment_headers = None
        # Optional client-hint style headers; spider_idle adds one random entry per request.
        self.h_dict = dict([
            ('rtt', '100'),
            ('sec-ch-device-memory', '8'),
            ('sec-ch-dpr', '1'),
            ('sec-ch-ua', '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google'),
            ('sec-ch-ua-mobile', '?0'),
            ('sec-ch-ua-platform', '^\\^Windows^\\^'),
            # ('sec-ch-ua-platform-version', '^\\^10.0.0^\\^'),
            ('sec-ch-viewport-width', '1587'),
            ('sec-fetch-dest', 'empty'),
            ('sec-fetch-mode', 'cors'),
            ('sec-fetch-site', 'same-origin'),
            ('viewport-width', '1587'),
            ('device-memory', '8'),
            # ('downlink', '10'),
            ('dpr', '1'),
            ('ect', '4g'),
        ])
        # self.utils_requests = [{"use_aiohttp": True}, {"curlcffi": True}, {"use_httpx": True}, {}]
        # Sample task payload; get_seeds() reads tasks from Redis instead of this list.
        self.seeds = ['{"siteName": "us", "taskId": "commentStat_luqing@yswg.com.cn_1741511646", "asin": ["B0BPYWH93D"]}']
        self.utils_requests = [{}]

        def _cookie_record(token):
            # 'main' must stay the first key: spider_idle reads the cookie value
            # positionally (first column of the DataFrame row).
            return {
                'main': token,
                'status': 1,
                'ban_time': int(time.time()),
                'error_404_count': 0,
                'sleep_time': int(time.time()),
            }

        # Pool of login cookie values; every entry starts with status 1 and zeroed counters.
        self.log_cookie = [
            _cookie_record(token)
            for token in (
                '6IxhED1H6FMQTTmG7nT?LAez1MVVqPrelGSwprpEnnPP@sV8RLqnFVdp?l9IvyNj',
                "VmWEZcN1aeXE6Ki4vhW2OzY62prS7lQHK@7qGtFaeGxK@9u34S1ki58e4FLwmUkh",
                "R5bEU6WFaJnTMdBRGTDFIVoVDGnSQFnf78wW3yV5QMO6bOasSvAfwc@3ZQmV4tCx",
                # disabled: 'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
                # disabled: 'fMhny8K1Vx?jc1p2x?HKEWkUhbDqRtwZvELLNc9wpQ1CcYqf6bazr4NFmTuYMHt1',
                'VvTAIBiLWOtpU8eTIXfbBxWET4KRaZONjUSWcH6eSgnIVDutpoSpt@1qDTd@xPe0',
                # disabled: 'lJpfQtU3gFL0MdP0QB2npl07xPQplgjYMXuzH2KPM4F8z@f0saUFtSKNo1PHkweC',
            )
        ]
        self.log_cookie_df = pd.DataFrame(self.log_cookie)
        # Login-cookie name per marketplace: 'x-main' for us, 'x-acb<code>' for the rest.
        self.log_cookie_name = {"us": 'x-main'}
        for code in ("de", "uk", "it", "es", "fr", "mx", "ca",
                     "ae", "au", "tr", "be", "jp", "nl", "pl", "se"):
            self.log_cookie_name[code] = f'x-acb{code}'
        self.error_list = []
        # Timestamp of the previous task fetch; spider_idle logs the elapsed time against it.
        self.time_statr = time.time()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the parent factory and hook up its signal handlers.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal; the configured spider instance is returned.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in handlers:
            crawler.signals.connect(handler, signal)
        return instance

    def get_seeds(self):
        """Pop the next task payload from the 'AsinCommentStatList' queue.

        Retries indefinitely: after a Redis connection error it waits 5 seconds,
        after a ``FunctionTimedOut`` it retries immediately. Returns whatever
        ``lpop`` returned.
        """
        queue_key = 'AsinCommentStatList'
        while True:
            try:
                task = lpop(queue_key)
                # task = self.seeds.pop()
                # task = spop(f'{self.site}_day_comment_seed', 1)
            except redis.exceptions.ConnectionError as exc:
                logging.info(f"获取任务成功失败{exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"获取任务成功超时 {exc}")
            else:
                logging.info("获取任务成功")
                return task

    def get_proxies(self):
        """Return a proxy URL built from one random member of 'ip_lists'.

        The member is decoded as UTF-8 and formatted as ``http://<ip>:3389``.
        An empty string is returned when ``srandmembers`` yields nothing.
        """
        picked = srandmembers('ip_lists', 1)
        if not picked:
            return ''
        proxy_url = f'http://{str(picked[0], "utf-8")}:3389'
        print(f"ip --> {proxy_url}")
        return proxy_url

    def cookie_unblock(self):
        logging.info(f"异常cookie数 状态2{self.log_cookie_df[(self.log_cookie_df['status'] == 2)].shape}\n{self.log_cookie_df[(self.log_cookie_df['status'] == 4)].shape}")
        self.log_cookie_df.loc[((self.log_cookie_df['error_404_count'] > 8)), ['status', 'ban_time', 'error_404_count']] = [4, int(time.time()), 0]
        # 初始化异常次数
        self.log_cookie_df['error_404_count'] = 0
        self.log_cookie_df.loc[(self.log_cookie_df['status'] == 4) & (int(time.time()) >= (self.log_cookie_df['ban_time'] + 3600)), ['status', 'ban_time']] = [1, int(time.time())]
        self.log_cookie_df.loc[(self.log_cookie_df['status'] == 2) & (int(time.time()) >= (self.log_cookie_df['sleep_time'] + 1000)), ['status', 'sleep_time']] = [1, int(time.time())]
        logging.info(f"生效cookie数： {self.log_cookie_df[(self.log_cookie_df['status'] == 1)].shape}")

    def spider_idle(self, spider):
        """Handle Scrapy's ``spider_idle`` signal: rotate cookies, then schedule the next task.

        Flow:
        1. If at least 200 requests were counted in the ``up_requests_num`` stat, the
           first active cookie (status 1) is put to rest (status 2) and the counter reset.
        2. ``cookie_unblock`` re-activates cookies whose ban/rest period has elapsed.
        3. With no active cookie left, the handler sleeps until the earliest cookie
           becomes usable and keeps the spider open.
        4. Otherwise one task is popped from the queue and, for every ASIN in it, two
           review-AJAX requests are scheduled (all formats / current format only).
        5. With no task (or an unusable one) the handler sleeps 2 s and keeps the spider open.

        Args:
            spider: the spider instance passed along with the signal (unused).

        Raises:
            DontCloseSpider: whenever no request was scheduled for a real task, so
                the engine does not shut the spider down.
        """
        stats = self.crawler.stats
        logging.info(f'IDLE------{self.site}------------{stats.get_value("up_requests_num")}')
        if (stats.get_value('up_requests_num') or 0) >= 200:
            active = self.log_cookie_df[self.log_cookie_df['status'] == 1]
            # Only rotate when a cookie is actually active; indexing an empty frame
            # with .iloc[0] would raise IndexError.
            if not active.empty:
                x_main = active.iloc[0]['main']
                self.log_cookie_df.loc[(self.log_cookie_df['main'] == x_main), ['status', 'sleep_time']] = [2, int(time.time())]
            stats.set_value('up_requests_num', 0)
        # Return cookies whose ban/rest period is over to status 1.
        self.cookie_unblock()
        # No usable cookie: wait until the earliest one becomes available again.
        if not self.log_cookie_df[(self.log_cookie_df['status'] == 1)].shape[0]:
            main_sleep_time = self._idle_seconds_until_cookie_ready()
            self._idle_keepalive(main_sleep_time, f'cookie ban sleep {main_sleep_time}s')
            raise DontCloseSpider()

        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        self.update_comment_headers()
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # Refresh cookies inside the 07:59-08:02 local-time window.
        if "07:59:00" <= str_time <= "08:02:00":
            self.update_cookies()
        seeds = self.get_seeds()
        logging.info(f"请求耗时---{time.time()-self.time_statr}")
        self.time_statr = time.time()
        if not seeds:
            self._idle_keepalive(2, 'no task sleep 2s')
            raise DontCloseSpider()

        # Validate the payload up front. An exception escaping this handler (or a task
        # that schedules nothing) would leave the engine idle and let it close the spider.
        try:
            job = json.loads(seeds)
            asin_list = job['asin']
            site_name = job['siteName']
            task_id = job['taskId']
        except (ValueError, KeyError, TypeError) as exc:
            logging.error(f"discard malformed task {seeds!r}: {exc!r}")
            self._idle_keepalive(2, 'bad task sleep 2s')
            raise DontCloseSpider()
        if not isinstance(asin_list, list) or not asin_list:
            logging.error(f"discard task without asin list {seeds!r}")
            self._idle_keepalive(2, 'bad task sleep 2s')
            raise DontCloseSpider()

        # FormRequest needs string form values; a retried task may carry a numeric page.
        page_number = str(job.get('error_page') or '1')
        star = 'all_stars'
        # Shared counter queue: every handled response adds one entry, so the callbacks
        # can tell when all 2 * len(asin_list) responses of the task have arrived.
        q = queue.Queue()
        for asin in asin_list:
            # Per-ASIN hand-off queue pairing the two responses of that ASIN.
            q_data = queue.Queue()
            for h in [{}, {'formatType': 'current_format'}]:
                data = {
                    'sortBy': 'recent',
                    'reviewerType': 'all_reviews',
                    'pageNumber': page_number,
                    'shouldAppend': 'undefined',
                    'deviceType': 'desktop',
                    'canShowIntHeader': 'undefined',
                    'reftag': 'cm_cr_arp_d_viewopt_srt',
                    'pageSize': '10',
                    'scope': 'reviewsAjax',
                    'filterByStar': star,
                    'asin': asin,
                }
                data.update(h)
                meta = {
                    "site": site_name,
                    "taskId": task_id,
                    "asin": asin,
                    "pageNumber": data.get("pageNumber"),
                    "ck": True,
                    'error_page': job.get('error_page'),
                    "asin_list": asin_list,
                    'queue': q,
                    "data_queue": q_data,
                    'proxy': self.get_proxies(),
                    'SelfAllUpdateCookiesUrl': True,
                }
                meta.update(h)
                meta = self.random_r(meta)
                cookies = self.update_site_cookie(self.json_cookies(meta.get('site')), meta['site'])
                headers = self.get_comment_headers(self.url_, asin)
                # Add one randomly chosen optional header to vary the request fingerprint.
                h_key = random.choice(list(self.h_dict))
                headers[h_key] = self.h_dict.get(h_key)
                headers['X-Forwarded-For'] = '2.2.2.2,2.2.2.2'
                # Look the cookie value up by column name rather than by position.
                x_main = self.log_cookie_df[self.log_cookie_df['status'] == 1].iloc[0]['main']
                print(x_main)
                # NOTE(review): the cookie name is always 'x-main' although
                # self.log_cookie_name holds per-site names — confirm this is intended.
                cookies.update({'x-main': x_main})
                # self.update_comment_headers()
                self.crawler.engine.crawl(
                    scrapy.FormRequest(url=url + f'#star={star}#asin={asin}#pageNumber={data.get("pageNumber")}',
                                       headers=headers, formdata=data, callback=self.parse, meta=meta,
                                       cookies=cookies, dont_filter=True, errback=self.err_parse), self)

    def _idle_seconds_until_cookie_ready(self):
        """Return how many seconds remain until the first resting/banned cookie is usable.

        Resting cookies (status 2) become usable 1000 s after ``sleep_time``, banned
        ones (status 4) 3600 s after ``ban_time``. The earliest of both is used and
        the result is clamped at 0, because ``time.sleep`` rejects negative values.
        Falls back to 30 s when neither kind of cookie exists.
        """
        sleep_time = self.log_cookie_df[(self.log_cookie_df['status'] == 2)]['sleep_time'].min()
        ban_time = self.log_cookie_df[(self.log_cookie_df['status'] == 4)]['ban_time'].min()
        logging.info(f"sleep_time {sleep_time} ban_time {ban_time}")
        wake_times = []
        if not pd.isna(sleep_time):
            wake_times.append(sleep_time + 1000)
        if not pd.isna(ban_time):
            wake_times.append(ban_time + 3600)
        if not wake_times:
            return 30
        return max(0, int(min(wake_times)) - int(time.time()))

    def _idle_keepalive(self, seconds, log_message):
        """Schedule a placeholder request, log ``log_message`` and sleep ``seconds``.

        NOTE(review): this is a blocking ``time.sleep`` inside the signal handler,
        kept from the original design. ``parse_news_page`` and ``err_`` are not
        defined in this class — presumably inherited from ``SourceSpider``.
        """
        meta = {
            'handle_httpstatus_all': True,
            'SelfAllUpdateCookiesUrl': False,
        }
        request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_,
                                 dont_filter=True, meta=meta)
        self.crawler.engine.crawl(request, spider=self)
        logging.info(log_message)
        time.sleep(seconds)

    def filter_comment(self, items, response):
        items = [item for item in items if response.meta.get('comment_new_time') <= item[18]]
        return items

    def comment_count_api(self, data):
        """POST one comment-statistics record to the selection service.

        The call is retried every 3 seconds until a request completes without
        raising. Each failure is now logged with its attempt number, so a
        permanently failing endpoint or payload is visible in the logs instead
        of spinning silently.

        Args:
            data: JSON-serialisable payload sent as the request body.
        """
        import requests
        # http://119.23.61.82/soundasia_selection
        url = "http://selection.yswg.com.cn/soundasia_selection/asinComments/analysis/updateCommentStat"

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
        }
        attempt = 0
        while True:
            attempt += 1
            try:
                response = requests.post(url, headers=headers, json=data, verify=False, timeout=3)
                logging.info(f"spider comment count succeed：{response.text}")
                break
            except Exception as e:
                # Best-effort delivery: keep retrying, but leave a trace of every failure.
                logging.warning(f"spider comment count failed (attempt {attempt}), retry in 3s: {e!r}")
                time.sleep(3)
                continue

    def parse(self, response, **kwargs):
        """Extract rating/review counts from one review-AJAX response and report them.

        spider_idle schedules two requests per ASIN (with and without
        ``formatType``). The first response to arrive parks its counts in the
        per-ASIN ``data_queue``; the second merges both and posts the record
        through ``comment_count_api``. The record gets ``status = "done"`` once
        the shared counter queue holds ``2 * len(asin_list)`` entries.

        Missing numbers in the summary text now fall back to '0' instead of
        raising IndexError, which used to abort the callback and leave the
        paired result stranded in ``data_queue``.
        """
        meta = response.meta
        # Count this response toward the task-wide completion tally.
        meta.get('queue').put(meta.get('asin'))
        # Strip the escaping from the raw response text so the markup parses as HTML.
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace('\\"', '"')
        sel = Selector(text=r, type="html")
        # First text node of the ratings/reviews summary row.
        comment_data = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()
        comment_num = '0'
        comment_score_num = '0'
        if comment_data:
            # Drop \uXXXX escapes plus thousands separators and quotes, then read the
            # first two numbers: rating count first, review count second.
            comment_count = re.sub(r"\\u.{4}", '', comment_data.__repr__()).replace(",", "").replace(".", "").replace("'", "")
            datas = re.findall(r'(\d+)', comment_count)
            if datas:
                comment_score_num = datas[0]
            if len(datas) > 1:
                comment_num = datas[1]
            else:
                logging.warning(f"comment summary incomplete for {meta.get('asin')}: {comment_data!r}")

        if meta.get('formatType'):
            item_dict = {'ratingNum': comment_score_num, 'reviewNum': comment_num}
        else:
            item_dict = {'ratings': comment_score_num, 'reviews': comment_num}
        if meta.get('data_queue').qsize():
            # The sibling response already arrived: merge both halves and report.
            data = meta.get('data_queue').get()
            items = {
                "siteName": meta.get('site'),
                "asin": meta.get('asin'),
                "taskId": meta.get('taskId'),
            }
            items.update(data)
            items.update(item_dict)
            if (len(meta.get("asin_list"))*2) == meta.get('queue').qsize():
                items["status"] = "done"
            print(items)
            self.comment_count_api(items)
        else:
            # First of the pair: park the counts until the sibling response arrives.
            meta.get('data_queue').put(item_dict)

    def parse_two(self, response, **kwargs):
        """Report the counts of one ASIN, combining ``meta`` totals with this response.

        ``ratings`` / ``reviews`` are taken from ``response.meta``. ``ratingNum`` /
        ``reviewNum`` are parsed from the response only when the
        "cr-filtered-by-text" header is present; otherwise they are 0. The record
        gets ``status = "done"`` once the shared counter queue holds
        ``2 * len(asin_list)`` entries.

        A number missing from the summary text now falls back to 0; previously an
        absent summary row led straight to IndexError on ``datas[1]``.

        Raises:
            KeyError: if ``response.meta`` lacks 'ratings' or 'reviews'.
        """
        meta = response.meta
        # Strip the escaping from the raw response text so the markup parses as HTML.
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace('\\"', '"')
        sel = Selector(text=r, type="html")
        # First text node of the ratings/reviews summary row.
        comment_data = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()
        comment_score_num = 0
        comment_num = 0
        if sel.xpath('.//h3[@data-hook="cr-filtered-by-text"]').get():
            comment_count = ''
            if comment_data:
                # Drop \uXXXX escapes plus thousands separators and quotes.
                comment_count = re.sub(r"\\u.{4}", '', comment_data.__repr__()).replace(",", "").replace(".", "").replace("'", "")
            datas = re.findall(r'(\d+)', comment_count)
            if datas:
                comment_score_num = datas[0]
            if len(datas) > 1:
                comment_num = datas[1]
            else:
                logging.warning(f"comment summary incomplete for {meta.get('asin')}: {comment_data!r}")
        # Count this response toward the task-wide completion tally.
        meta.get('queue').put(meta.get('asin'))
        items = {
            "siteName": meta.get('site'),
            "asin": meta.get('asin'),
            "taskId": meta.get('taskId'),
            "ratings": meta['ratings'],
            "reviews": meta['reviews'],
            "ratingNum": comment_score_num,
            "reviewNum": comment_num
        }
        if (len(meta.get("asin_list"))*2) == meta.get('queue').qsize():
            items["status"] = "done"
        print(items)
        self.comment_count_api(items)

    def err_parse(self, failure, **kwargs):
        """Errback: put the failed ASIN back on 'AsinCommentStatList' as a one-ASIN task.

        The retry payload is also appended to ``self.error_list``. Pushing is
        retried indefinitely: after a Redis connection error it waits 5 seconds,
        after a ``FunctionTimedOut`` it retries immediately.
        """
        failed_meta = failure.request.meta
        logging.info(f"error______ {failure.getErrorMessage()}, {failed_meta.get('asin_list')}")
        retry_task = {
            "siteName": failed_meta['site'],
            "taskId": failed_meta['taskId'],
            "asin": [failed_meta['asin']],
        }
        self.error_list.append(retry_task)
        print(self.error_list)
        payload = json.dumps(retry_task)
        while True:
            try:
                # sadd(f"{self.site}_real_seed", payload, use_md5=False)
                lpush("AsinCommentStatList", payload)
            except redis.exceptions.ConnectionError as exc:
                logging.info(f"push AsinCommentStatList ConnectionError，afresh push --> T_T {exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"push AsinCommentStatList time out --> T_T {exc}")
            else:
                return


if __name__ == '__main__':
    # Same as running `scrapy crawl amazon_comment_count_spider -a site=us` from a shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_comment_count_spider', '-a', 'site=us'])


# Deployment notes (shell commands kept for reference):
# source activate pyspark
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_comment_count_spider -a site=us > amazon.log 2>&1 &
# nohup scrapy crawl amazon_comment_count_spider -a site=us > amazon_real_comment_count1.log 2>&1 &
# WARNING: the next line kill -9's every process whose `ps -ef` entry contains "py", not just this spider.
# for i in `ps -ef|grep "py" |awk '{print $2}' `; do kill -9 $i ; done;
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
#