# coding:utf-8
import os
import re
import gc  # 用于手动触发垃圾回收
import sys
import json
import time
import scrapy
import logging
import random
import platform
import redis
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import spop, sadd, zadd, zpop
# 亚马逊详情页数据获取
from amazon_spider.utils.utils import mem_rate
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor
from amazon_spider.spiders.yswg_spider import SourceSpider

# time.tzset() is only invoked on non-Windows platforms; on Windows the
# script just prints a marker line instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonSelfRedisNewDetail(SourceSpider):
    """Scrape Amazon product detail pages for ASIN jobs queued in Redis.

    On every ``spider_idle`` signal the spider fetches JSON jobs from the
    ``{site}_self_asin_temporary_seed`` key, requests each product's
    ``/dp/{asin}`` page, and pushes the extracted fields (as JSON) to the
    ``{site}_temporary_data`` key.  Requests that fail are re-queued by
    ``err_parse``.

    NOTE(review): ``site_url``, ``h_dict``, ``random_r``, ``json_cookies``,
    ``update_cookies``, ``page_state``, ``parse_news_page`` and ``err_`` are
    not defined in this class; presumably they come from ``SourceSpider`` —
    confirm.
    """

    name = 'self_asin_redis_detail'
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
    custom_settings = {
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): `allowed_domains` is normally a spider attribute, not
        # a setting — confirm whether this entry has any effect.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Store the request queue in Redis
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',  # never keep real credentials in source
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # invoked in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.real_keepa_pipe.AmazonRealKeepaSpiderPipeline': 230,
            # 'amazon_spider.pipeline.news_real_pipe.AmazonRealKeepaSpiderPipeline': 200,
            # 'amazon_spider.pipeline.cs_count.FidleMonitorPipeline': 200,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Initialise the spider for one marketplace.

        Args:
            site: Marketplace code (e.g. ``"us"``); used to build the Redis
                key names this spider reads from and writes to.
        """
        super(AmazonSelfRedisNewDetail, self).__init__()
        self.site = site
        # Cookie cache (update_cookies is not defined in this class).
        self.update_cookies()
        self.sleep_count = 0
        # Per-site label text for the seller ("sold by") field.
        self.seller_work = {
            "us": "Sold by",
            "uk": "Sold by",
            "fr": "Vendu par",
            "de": "Verkäufer",
            "es": "Vendedor",
            "it": "Venditore",
            "mx": "Vendedor",
            "ca": "Sold by"
        }

        # Per-site label text for the shipper ("ships from") field.
        self.seller_type = {
            "us": "Ships from",
            "uk": "Dispatches from",
            "fr": "Expéditeur",
            "de": "Versand",
            "es": "Envía por",
            "it": "Spedito da",
            "mx": "Envío por",
            "ca": "Ships from"
        }
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        # Local test seed; only referenced by commented-out debug code.
        # B0C8SX1JYQ  B08B65RZ44
        self.seeds = [{"asin": "B0F13S7GBN", "site": "us"}]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook its handlers to the crawler signals."""
        spider = super(AmazonSelfRedisNewDetail, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def _push_with_retry(self, push, key):
        """Call ``push()`` until it succeeds, retrying on Redis errors.

        Args:
            push: Zero-argument callable performing the Redis write.
            key: Redis key name, used only in the log messages.
        """
        while True:
            try:
                push()
                logging.info(f"push {key} succeed")
                return
            except redis.exceptions.ConnectionError as e:
                logging.info(f"push {key} ConnectionError，afresh push --> T_T {e}")
                # Back off before retrying a broken connection.
                time.sleep(5)
            except FunctionTimedOut as e:
                # Timeouts are retried immediately, as before.
                logging.info(f"push {key} time out --> T_T {e}")

    def get_seeds(self):
        """Fetch the next seeds from the temporary seed key.

        Retries forever on Redis connection errors (after a 5 s pause) and on
        ``FunctionTimedOut`` (immediately).

        Returns:
            Whatever ``zpop`` returns for ``{site}_self_asin_temporary_seed``
            with argument 25 — presumably up to 25 JSON-encoded jobs; confirm
            against ``amazon_spider.db.redis_db.zpop``.
        """
        while True:
            try:
                # Temporary crawl queue
                seeds = zpop(f'{self.site}_self_asin_temporary_seed', 25)
                # seeds = [self.seeds.pop()]
                logging.info("获取任务成功")
                return seeds
            except redis.exceptions.ConnectionError as e:
                logging.info(f"获取任务成功失败{e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"获取任务成功超时 {e}")

    def _build_request(self, job):
        """Build the detail-page request for one decoded seed.

        Args:
            job: Decoded seed with ``site`` and ``asin`` keys and an optional
                ``account_id`` key.

        Returns:
            A ``scrapy.Request``, or ``None`` when the job's site has no entry
            in ``self.site_url``.
        """
        base_url = self.site_url.get(job['site'])
        if not base_url:
            logging.info(f"过滤 未知站点 {job['site']}")
            return None

        url = f"{base_url}/dp/{job['asin']}" + "?th=1&psc=1"
        # These headers are derived from the default URL, even when the
        # request URL is switched to the seller-specific form below.
        headers = {
            'Connection': 'close',
            'authority': urlparse(url).hostname,
            'accept': 'text/html,*/*',
            'accept-language':  'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': url,
            'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }

        try:
            raw_account = job["account_id"]
        except KeyError:
            # Seeds without an account id keep the historical "" marker.
            account_id = ""
        else:
            account_id = str(raw_account) if raw_account and raw_account != "nan" else None
        if account_id:
            # Seller-specific listing: the `m` query parameter replaces th/psc.
            url = f"{base_url}/dp/{job['asin']}?m={account_id}"

        meta = {
            # "use_aiohttp": True,
            "asin": job["asin"],
            # "amazon_proxy": True,
            "choice_header": True,
            "cookiejar": int(time.time()),
            "site": job["site"] or self.site,
            "account_id": account_id,
            # 'proxy': "http://127.0.0.1:7890",
        }
        meta = self.random_r(meta)

        # Add one randomly chosen extra header from self.h_dict.
        h_key = random.choice(list(self.h_dict))
        headers[h_key] = self.h_dict.get(h_key)
        logging.info(f'随机添加headers：{h_key}')
        headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'

        cookies = self.json_cookies(meta["site"])
        return scrapy.Request(url=url, cookies=cookies, headers=headers,
                              callback=self.parse, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def spider_idle(self, spider):
        """Handle ``spider_idle``: schedule new jobs or keep the spider alive.

        If system memory usage exceeds 95 % the handler returns without
        scheduling anything and without raising ``DontCloseSpider``, which
        lets the engine close the idle spider.

        Raises:
            DontCloseSpider: When no seeds are available, after scheduling a
                placeholder request and sleeping 30 s.
        """
        # read database again and send new requests
        logging.debug(f'IDLE------------------{self.site} {time.time()}')
        stats = self.crawler.stats
        stats.clear_stats()
        memory_rate = mem_rate()
        logging.info(f'当前系统内存占比: {memory_rate:.2%}')
        if memory_rate > 0.95:
            logging.info("内存占比超过 0.95 退出程序")
            # The original bare `raise` had no active exception and surfaced
            # as an unrelated RuntimeError; returning makes the shutdown explicit.
            return

        seeds = self.get_seeds()
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # Refresh cookies inside the daily 07:59-08:02 window.
        if "07:59:00" <= str_time <= "08:02:00":
            self.update_cookies()
            # Manually trigger garbage collection once a day.
            gc.collect()

        if seeds:
            for raw_seed in seeds:
                request = self._build_request(json.loads(raw_seed))
                if request is not None:
                    self.crawler.engine.crawl(request, self)
        else:
            meta = {
                'handle_httpstatus_all': True
            }
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_, dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, spider=self)
            logging.info('no task sleep 30s')
            # NOTE(review): time.sleep blocks the whole process for 30 s,
            # including any in-flight downloads — confirm this is intended.
            time.sleep(30)
            raise DontCloseSpider()

    def parse(self, response, **kwargs):
        """Extract detail data from a product page and push it to Redis.

        When ``page_state`` returns a truthy item, that item is yielded and
        nothing is pushed.  Otherwise the extractor output is enriched with
        ``view_in_3D``, ``video_url`` and ``img_url`` and pushed as JSON to
        ``{site}_temporary_data``.

        Raises:
            ValueError: If a "View in 3D" button exists without a matching
                iframe URL, or the image data script is missing/unparseable.
        """
        meta = response.meta
        asin = meta.get("asin")
        print("-" * 20)
        # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
        #     f.write(response.text)
        if item := self.page_state(response):
            # Abnormal page: there is no extracted `items` to push, so stop
            # here instead of falling through to the Redis write.
            yield item
            return

        amazon_detail_extractor = AmazonDetailExtractor(self.site)
        started = time.time()
        items = amazon_detail_extractor.run(response, url_all=True)
        logging.info(f'解析总耗时为：{time.time()-started}')

        # "View in 3D" iframe URL, if the page has one.
        iframe_src = response.xpath('''//*[@id="v3d-desktop-modal-iframe"]/@data-src''')
        items['view_in_3D'] = ('https://www.amazon.com/' + iframe_src[0].get()) if iframe_src else ''

        # A visible button without an iframe URL means the selector is stale.
        if response.xpath('''//span[text()=' View in 3D ']''') and not items['view_in_3D']:
            raise ValueError(f'存在 View in 3D 按钮，无法匹配url asin为{asin}')

        video_list = re.findall(r'<div data-reference-url="" data-asin=".*?data-video-url="(.*?)" data-vdp-url',
                                response.text)
        items['video_url'] = video_list[0] if video_list else ''

        script_text = response.xpath('''//*[@id="imageBlock_feature_div"]/script[2]/text()''').get()
        if not script_text:
            raise ValueError(f'image data script not found for asin {asin}')
        match = re.search(r"var data = (\{.*?\});", script_text, re.DOTALL)
        if match is None:
            raise ValueError(f'"var data" object not found in image script for asin {asin}')
        # Rewrite the JS object literal so that it parses as a Python dict.
        dic_str = (match.group(1)
                   .replace('null', 'None')
                   .replace('true', 'True')
                   .replace('false', 'False')
                   .replace('Date.now()', "'time.time()'"))
        # SECURITY: eval() runs text scraped from a remote page; a hostile or
        # tampered response could execute arbitrary code.  Consider
        # ast.literal_eval or a real JS-object parser.
        dic = eval(dic_str)
        # Prefer the hi-res image and fall back to the large one.
        hiRes_img_list = [
            img_infos.get('hiRes') or img_infos['large']
            for img_infos in dic.get('colorImages', {}).get('initial', {})
        ]
        items['img_url'] = '|-|'.join(hiRes_img_list)

        pprint(hiRes_img_list)

        data_key = f"{self.site}_temporary_data"
        payload = json.dumps(items)
        self._push_with_retry(lambda: sadd(data_key, payload, use_md5=False), data_key)

    def err_parse(self, failure, **kwargs):
        """Errback: log the failure and re-queue the job in Redis.

        The job is pushed back to ``{site}_self_asin_temporary_seed`` with
        score 1, retrying until the write succeeds.
        """
        # The original request that failed.
        request = failure.request
        logging.info(f"error______ {failure.getErrorMessage()}, {request.meta.get('asin')}")
        # NOTE(review): account_id is not carried over, so a re-queued job
        # loses its seller-specific URL — confirm this is intended.
        job = json.dumps({
            "asin": request.meta.get("asin"),
            "site": request.meta.get("site"),
        })
        seed_key = f"{self.site}_self_asin_temporary_seed"
        self._push_with_retry(lambda: zadd(seed_key, {job: 1}), seed_key)


if __name__ == '__main__':
    # Launch this spider for the US site when the file is run directly.
    cmdline.execute(['scrapy', 'crawl', 'self_asin_redis_detail', '-a', 'site=us'])


# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl real_new_detail > real_redis_amazon1.log 2>&1 &
# nohup scrapy crawl self_asin_redis_detail -a site=us  > self_asin_redis_detail.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "self_asin_redis_detail" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl real_redis_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl real_redis_detail -a site=uk > real_redis_amazon1.log 2>&1 &

