# coding:utf-8
import os
import sys
import json
import time
import redis
import random
import scrapy
import logging
import platform
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import spop, sadd, zadd, zpop
# Fetch Amazon product detail-page data
from amazon_spider.spiders.yswg_spider import SourceSpider
from amazon_spider.utils.common import field_length_dispose
from amazon_spider.items import variat_item, detail_inner_item
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor
from amazon_spider.monitor.bsr_spider_status import AsinStateFind

# time.tzset() is only available on Unix, so it is skipped on Windows, where
# the script just announces the platform instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonGetKeepaSpider(SourceSpider):
    """Scrapy spider ``amazon_keepa_detail`` for Amazon product detail pages.

    Whenever the engine goes idle the spider pops a batch of JSON seeds from
    the ``<site>_day_seed`` key (through the project's ``zpop`` helper), turns
    each one into a ``/dp/<asin>`` request, and yields the extracted detail and
    variation data as plain dicts. Failed requests are written back to the
    same key by ``err_parse``.
    """
    name = 'amazon_keepa_detail'
    # Per-spider overrides of the project settings.
    custom_settings = {
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): `allowed_domains` is normally a spider class attribute,
        # not a setting; placed here it probably has no effect - confirm.
        'allowed_domains': ['amazon.com'],
        # # whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # enable the Redis-backed scheduler that stores the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # make every spider de-duplicate requests through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # seed-queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',  # NOTE(review): a real credential was committed here; rotate it and load it from configuration
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): Scrapy's built-in RetryMiddleware is disabled below
        # (set to None), so these RETRY_* values presumably only matter if a
        # project middleware or download handler reads them - TODO confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # set to the number of retries wanted
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # None switches the built-in retry middleware off.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in ascending order
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,

            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # NOTE(review): the three middlewares below share priority 490, so
            # their relative order is not pinned by these numbers - confirm
            # that this is intended.
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # invoked in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.keepa_pipe.AmazonKeepaSpiderPipeline': 230,
            # 'amazon_spider.pipeline.fidle_monitor_count.FidleMonitorPipeline': 200,
            # 'amazon_spider.pipeline.new_keepa_pipe.AmazonKeepaSpiderPipeline': 230,
        },
        # Both URL schemes are served by the project's own download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Set up the spider for one marketplace.

        :param site: marketplace code; stored as ``self.site`` and used as the
            prefix of the per-site key names built by this spider.
        """
        super().__init__()
        self.site = site
        # update_cookies() is not defined in this class (presumably inherited);
        # it is called only after self.site has been assigned.
        self.update_cookies()
        # Number of idle polls that found no seeds (see spider_idle).
        self.sleep_count = 0
        self.self_asin_detail = "{}_self_asin_detail".format(self.site)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through the parent factory and attach its signal handlers.

        :param crawler: the crawler the spider is bound to.
        :param args: positional arguments forwarded to the parent factory.
        :param kwargs: keyword arguments forwarded to the parent factory.
        :return: the spider, with ``close`` connected to ``spider_closed`` and
            ``spider_idle`` connected to the ``spider_idle`` signal.
        """
        spider = super().from_crawler(crawler, *args, **kwargs)
        wiring = (
            (spider.close, signals.spider_closed),
            (spider.spider_idle, signals.spider_idle),
        )
        for receiver, sig in wiring:
            crawler.signals.connect(receiver, sig)
        return spider

    def get_seeds(self):
        """Pop up to 25 seeds from ``<site>_day_seed``, retrying until the call succeeds.

        A Redis connection error is retried after a 5 second pause; a
        ``FunctionTimedOut`` is retried immediately. Any other exception
        propagates to the caller.

        :return: whatever the project's ``zpop`` helper returns for the batch.
        """
        seed_key = f'{self.site}_day_seed'
        while True:
            try:
                batch = zpop(seed_key, 25)
                logging.info("get seeds ok o-_-o")
                return batch
            except redis.exceptions.ConnectionError as conn_err:
                logging.info(f"get seeds error T_T --> {conn_err}")
                time.sleep(5)
            except FunctionTimedOut as timeout_err:
                logging.info(f"get seeds timeout T_T -->{timeout_err}")

    def spider_idle(self, spider):
        """Handle Scrapy's ``spider_idle`` signal by feeding the engine more work.

        A batch of seeds is fetched with ``get_seeds``; each usable seed becomes
        a product-page request that is handed directly to the engine. When the
        batch is empty the handler blocks for 30 seconds and schedules a
        throw-away keep-alive request, until ``sleep_count`` reaches a random
        threshold between 6 and 15; it then returns without scheduling
        anything, which lets Scrapy close the idle spider.

        :param spider: the spider delivered with the signal (unused; ``self``
            is used instead).
        """
        logging.debug(f'IDLE------------------{self.site} {time.time()}')
        seeds = self.get_seeds()
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # Refresh cookies when an idle poll falls inside the 09:09-09:10 window.
        if "09:09:00" <= str_time <= "09:10:00":
            self.update_cookies()

        if not seeds:
            # NOTE(review): sleep_count is never reset when seeds show up again,
            # so it counts empty polls over the whole run - confirm intent.
            self.sleep_count += 1
            if self.sleep_count >= random.randint(6, 15):
                # Previously a bare `raise` with no active exception, which
                # surfaced as "RuntimeError: No active exception to reraise"
                # in the signal handler. Returning without scheduling a
                # request (and without raising DontCloseSpider) lets the
                # engine close the spider cleanly.
                logging.info('no task, stop polling and let the spider close')
                return
            logging.info('no task sleep 30s')
            # Blocking sleep: nothing else is processed while it runs.
            time.sleep(30)
            meta = {
                'handle_httpstatus_all': True
            }
            # Keep-alive request so the engine has work and stays open.
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_, dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, spider=self)
            return

        required_keys = ("site", "asin", "is_variation", "asin_type", "date_info", "priority")
        for raw_seed in seeds:
            # The batch has already been removed from Redis, so one bad seed
            # must not abort the remaining ones: log it and move on.
            try:
                job = json.loads(raw_seed)
            except (ValueError, TypeError) as e:
                logging.warning(f"skip undecodable seed {raw_seed!r} --> {e!r}")
                continue
            if not isinstance(job, dict) or any(key not in job for key in required_keys):
                logging.warning(f"skip malformed seed {raw_seed!r}")
                continue
            request = self._request_from_seed(job)
            if request is not None:
                self.crawler.engine.crawl(request, self)

    def _request_from_seed(self, job):
        """Build the product-page request for one decoded seed; None if its site is unknown."""
        base_url = self.site_url.get(job['site'])
        if not base_url:
            logging.info(f"filter unknown site {job['site']}")
            return None
        url = f"{base_url}/dp/{job['asin']}"
        # Every seed except is_variation == 2 gets the ?th=1&psc=1 query string.
        if job["is_variation"] != 2:
            url += "?th=1&psc=1"
        host = urlparse(url).hostname
        headers = {
            'Connection': 'close',
            'host': host,
            'authority': host,
            'accept': 'text/html,*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': url,
            'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        meta = {
            "asin": job['asin'],
            "cookiejar": int(time.time()),
            "asin_type": job['asin_type'],
            "date_info": job['date_info'],
            "site": job['site'] or self.site,
            "is_variation": job["is_variation"],
            "priority": job["priority"],
        }
        # Add one randomly chosen extra header from self.h_dict.
        h_key = random.choice(list(self.h_dict))
        headers[h_key] = self.h_dict.get(h_key)
        logging.info(f'random add headers：{h_key}')
        cookies = self.json_cookies(meta["site"])
        meta = self.random_r(meta)
        return scrapy.Request(url=url, cookies=cookies, headers=headers,
                              callback=self.parse, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def parse(self, response, **kwargs):
        """Turn a product detail response into pipeline items (plain dicts).

        If ``page_state`` returns a truthy item for the response, that item is
        the only output. Otherwise the method yields ``{'inner_item': ...}``
        with the extracted detail fields and, for the sites listed below, a
        ``self_variat_item`` payload.

        :param response: the downloaded page; ``response.meta`` carries the
            seed fields stored by ``spider_idle``.
        """
        print("-" * 20)
        state_item = self.page_state(response)
        if state_item:
            yield state_item
            return

        # NOTE(review): the extractor is built from self.site, not from the
        # site stored in response.meta - confirm that this is intended.
        extractor = AmazonDetailExtractor(self.site)
        detail = detail_inner_item(extractor.run(response))
        # Apply the project's field-length handling.
        detail = field_length_dispose(detail)
        # Every falsy value becomes None ...
        detail = {key: (value or None) for key, value in detail.items()}
        # ... except the coupon flag, which falls back to the string "0".
        if not detail["is_coupon"]:
            detail["is_coupon"] = "0"

        variations = []
        for variation in extractor.variat_msgs(response):
            logging.info(f'variat datas --->：{variation}')
            variations.append(list(variat_item(variation).values()))
        detail['variat_list'] = json.dumps(variations) if variations else None
        pprint(detail)
        yield {'inner_item': detail}

        meta = response.meta
        if meta.get("site") in ("us", "es", "fr", "it", "uk", "de"):
            if not variations:
                # No variations found: emit one placeholder row in which the
                # ASIN is its own parent.
                placeholder = {
                    "asin": meta['asin'],
                    "color": "",
                    "size": "",
                    "style": "",
                    "column_2": "",
                    "parent_asin": meta['asin'],
                    "state": "1",
                }
                yield {"self_variat_item": [placeholder], "site": meta["site"]}
            else:
                # Variations are forwarded only when asin_type contains
                # "1" or "3".
                asin_type = meta.get("asin_type")
                if any(flag in asin_type for flag in ("1", "3")):
                    yield {"self_variat_item": variations, "site": meta["site"]}

        # The image list is still extracted, but persisting it and forwarding
        # the record to Kafka are currently disabled, so the result is unused.
        extractor.asin_image(response)


    def err_parse(self, failure, **kwargs):
        """Errback: log the failure and put the request's seed back into ``<site>_day_seed``.

        The seed is rebuilt from the failed request's meta and re-added with
        its ``priority`` as the score. A Redis connection error is retried
        after a 5 second pause and a ``FunctionTimedOut`` immediately, until
        the write succeeds.

        :param failure: the failure object passed to the errback; its
            ``request`` attribute is the request that failed.
        """
        failed_meta = failure.request.meta
        # Key order is kept fixed because the JSON text is the stored member.
        seed_fields = ("asin", "asin_type", "is_variation", "date_info", "site", "priority")
        seed = {field: failed_meta.get(field) for field in seed_fields}
        logging.info(f"{failure.getTraceback()}")
        member = json.dumps(seed)
        seed_key = f"{self.site}_day_seed"
        while True:
            try:
                zadd(seed_key, {member: seed.get('priority')})
                logging.info("request push redis o-_-o --> ")
                return
            except redis.exceptions.ConnectionError as conn_err:
                logging.info(f"request push redis error T_T --> {conn_err}")
                time.sleep(5)
            except FunctionTimedOut as timeout_err:
                logging.info(f"request push redis time out T_T {timeout_err}")

    def close(self, spider, reason):
        """Handle ``spider_closed``: for the ``us`` site, run the end-of-crawl state check.

        ``AsinStateFind(self.site).if_bsr_spider_state()`` is attempted up to
        four times and the loop stops at the first success. Per the original
        author's note, that call checks whether any records are still in
        state 1 or 2 and, if none are, sends the "crawl finished" message.

        :param spider: the spider delivered with the signal (unused).
        :param reason: the close reason delivered with the signal (unused).
        """
        if self.site != "us":
            return
        for _ in range(4):
            try:
                AsinStateFind(self.site).if_bsr_spider_state()
                break
            except (Exception, FunctionTimedOut):
                # A bare `except:` here also swallowed KeyboardInterrupt and
                # SystemExit. FunctionTimedOut derives from BaseException, so
                # it is listed explicitly to keep timeouts retryable as before.
                # The log text means "failed to send the crawl-finished
                # message"; exc_info adds the traceback that was previously
                # dropped.
                logging.info("发送爬取完成信息失败", exc_info=True)


if __name__ == '__main__':
    # Same as running `scrapy crawl amazon_keepa_detail -a site=us` in a shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_keepa_detail', '-a', 'site=us'])


# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_keepa_detail > keepa_amazon1.log 2>&1 &
# nohup scrapy crawl amazon_keepa_detail -a site=us  > keepa_us_amazon1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl amazon_keepa_detail" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail -a site=uk > amazon_get_asin_detail1.log 2>&1 &


# cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/bin/
