# coding:utf-8
import os
import sys
import json
import time
import redis
import scrapy
import random
import logging
import platform
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import zpop, zadd
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
from amazon_spider.spiders.yswg_spider import SourceSpider
# amazon 变体信息获取
from amazon_spider.utils.common import field_length_dispose
from amazon_spider.items import variat_item, detail_inner_item
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor


# time.tzset() only exists on Unix, so the call is skipped on Windows.
if platform.system() != "Windows":
    # Re-apply the time zone rules from the TZ environment variable.
    time.tzset()
else:
    print("windows")


class AmazonGetVariatSpider(SourceSpider):
    name = 'amazon_get_variat'
    # Per-spider Scrapy settings for this crawler.
    custom_settings = {
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): `allowed_domains` is normally a spider attribute rather
        # than a settings key — confirm something actually reads this entry.
        'allowed_domains': ['amazon.com'],
        # NOTE(review): the disabled Redis scheduler block below contains a
        # plaintext password; consider rotating it and removing it from the file.
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Enable the Redis scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make sure all spiders de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': 'HCL1zcUgQesaaXNLbL37O5KhpSAy0c',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): Scrapy's built-in RetryMiddleware is disabled in
        # DOWNLOADER_MIDDLEWARES below, so presumably one of the custom
        # middlewares consumes these RETRY_* values — confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries to allow
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # A value of None disables the built-in retry middleware.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # Invoked in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # NOTE(review): the next three middlewares share order value 490 —
            # confirm their relative ordering does not matter.
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Invoked in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # Item pipeline that receives the dicts yielded by parse().
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.amazon_variat.AmazonVariatSpiderPipeline': 230,
        },
        # Both schemes are routed through the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Initialise the spider for one marketplace.

        :param site: marketplace code (default ``'us'``). It is stored on the
            instance and later used for the Redis queue key and request URLs.
        """
        super().__init__()
        self.site = site
        # update_cookies() is not defined in this class (presumably inherited);
        # it runs after self.site is set and before logging is configured.
        self.update_cookies()
        log_format = '%(asctime)s %(name)s %(levelname)s %(message)s'
        logging.basicConfig(format=log_format, level=logging.INFO)
        # Column names kept on the instance; they are not read anywhere in this
        # file, so presumably another component consumes them — verify.
        self.col = [
            'asin', 'img_url', 'title', 'title_len', 'price',
            'rating', 'total_comments', 'buy_box_seller_type',
            'page_inventory', 'category', 'volume', 'weight', 'rank',
            'launch_time', 'video_url', 'add_url',
            'material', 'img_num', 'img_type', 'qa_num', 'brand',
            'ac_name', 'node_id', 'sp_num', 'mpn',
            'online_time', 'describe',
            'one_star', 'two_star', 'three_star', 'four_star', 'five_star',
            'low_star', 'asin_type', 'is_coupon', 'search_category',
            'weight_str', 'date_info', 'site',
            'account_name', 'other_seller_name', 'bsr_date_info',
            'account_id', 'package_quantity',
            'pattern_name', 'together_asin', 'activity_type',
            'one_two_val', 'three_four_val', 'five_six_val',
            'eight_val', 'product_description',
        ]
        # self.seeds = [{"asin": "B093PJLFG9"}, {"asin": "B0C4GPH8HF"}]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider via the parent factory and connect its signal handlers.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal.

        :param crawler: the crawler the spider is being bound to.
        :return: the constructed spider instance.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for receiver, signal in handlers:
            crawler.signals.connect(receiver, signal)
        return instance

    def get_seeds(self):
        """Fetch the next batch of seeds from the site's Redis queue.

        Calls ``zpop`` on ``<site>_variat_seed`` with 25 as its second argument
        and retries until the call succeeds: a Redis connection error waits five
        seconds before retrying, a ``FunctionTimedOut`` retries at once.

        :return: whatever ``zpop`` returned for the successful call.
        """
        queue_key = f'{self.site}_variat_seed'
        while True:
            try:
                batch = zpop(queue_key, 25)
            except redis.exceptions.ConnectionError as exc:
                logging.info(f"获取任务成功失败{exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"获取任务成功超时 {exc}")
            else:
                logging.info("获取任务成功")
                return batch

    def spider_idle(self, spider):
        """Handle the ``spider_idle`` signal by pulling more work from Redis.

        Stats are cleared, a batch of seeds is fetched with ``get_seeds`` and one
        detail-page request is scheduled per valid seed. When no seeds are
        returned, a keep-alive request is scheduled, the handler sleeps for 30
        seconds and ``DontCloseSpider`` is raised.

        A seed that is not valid JSON, or that has no ``asin`` key, is logged and
        skipped. Previously such a seed raised inside the loop, which dropped the
        rest of the already-popped batch and, when it was the first seed, left
        nothing scheduled.

        :param spider: spider passed by the signal (unused; ``self`` is used).
        :raises DontCloseSpider: when no detail request was scheduled.
        """
        logging.debug(f'IDLE------------------{self.site}')
        self.crawler.stats.clear_stats()
        seeds = self.get_seeds()
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # Refresh cookies when the handler fires between 07:59:00 and 08:02:00
        # local time (zero-padded HH:MM:SS strings compare chronologically).
        if "07:59:00" <= str_time <= "08:02:00":
            self.update_cookies()

        if not seeds:
            logging.info("----------")
            meta = {
                'handle_httpstatus_all': True
            }
            # parse_news_page / err_ are not defined in this class; presumably
            # they are inherited from SourceSpider.
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page,
                                     errback=self.err_, dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, spider=self)
            logging.info('no task sleep 30s')
            # NOTE(review): time.sleep blocks the calling thread; if this handler
            # runs on the reactor thread, nothing else (including the request
            # scheduled above) can progress until the sleep returns.
            time.sleep(30)
            raise DontCloseSpider

        scheduled = 0
        for raw_seed in seeds:
            try:
                asin = json.loads(raw_seed)['asin']
            except (ValueError, KeyError, TypeError) as exc:
                # One bad seed must not discard the rest of the popped batch.
                logging.warning(f"skipping malformed seed {raw_seed!r}: {exc!r}")
                continue
            self.crawler.engine.crawl(self._variat_seed_request(asin), self)
            scheduled += 1

        if not scheduled:
            # Every seed in the batch was unusable: keep the spider open so the
            # queue is polled again on the next idle signal.
            raise DontCloseSpider

    def _variat_seed_request(self, asin):
        """Build the product detail-page request for one ASIN."""
        url = f"{self.site_url.get(self.site)}/dp/{asin}"
        headers = {
            'Connection': 'close',
            'authority': urlparse(url).hostname,
            'accept': 'text/html,*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': url,
            'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        meta = {
            "use_aiohttp": True,
            "asin": asin,
            "amazon_proxy": True,
            # Second-resolution timestamp: requests built within the same
            # second share one cookiejar key.
            "cookiejar": int(time.time()),
            "asin_type": "",
            "site": self.site,
        }
        meta = self.random_r(meta)
        # Add one randomly chosen extra header from self.h_dict.
        h_key = random.choice(list(self.h_dict))
        headers[h_key] = self.h_dict.get(h_key)
        logging.info(f'随机添加headers：{h_key}')
        headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
        cookies = self.json_cookies(meta["site"])
        # Optional extra cookies, currently disabled:
        # cookies.update({
        #     'csm-sid': '916-1904410-6680838',
        #     'x-amz-captcha-1': '1706092664884599',
        #     'x-amz-captcha-2': '5OU/Q7DyDmtg9QGCbivNXg=='
        # })
        return scrapy.Request(url=url, cookies=cookies, headers=headers,
                              callback=self.parse, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def parse(self, response, **kwargs):
        """Turn a product detail response into pipeline items.

        If ``page_state`` returns a truthy item, only that item is yielded.
        Otherwise the method yields, in order: the detail item under
        ``'inner_item'``; a single self-referencing variant under
        ``'self_variat_item'`` when no variants were extracted; and the image
        data under ``'asin_img'`` when the extractor returns any.

        :param response: downloaded detail page; ``meta`` must carry ``asin``
            and ``site``.
        """
        state_item = self.page_state(response)
        if state_item:
            yield state_item
            return

        extractor = AmazonDetailExtractor(self.site)
        detail = detail_inner_item(extractor.run(response))
        # Apply field length limits.
        detail = field_length_dispose(detail)
        # Every falsy value becomes None; is_coupon then defaults to "0".
        detail = {key: value or None for key, value in detail.items()}
        detail["is_coupon"] = detail["is_coupon"] or "0"
        # print(list(detail.keys()))

        variants = []
        for variant in extractor.variat_msgs(response):
            logging.info(f'变体数据为：{variant}')
            variants.append(list(variat_item(variant).values()))
        detail['variat_list'] = json.dumps(variants) if variants else None
        pprint(detail)
        yield {'inner_item': detail}

        if not variants:
            # No variants found: emit the ASIN as its own parent.
            asin = response.meta['asin']
            own_variant = {
                "asin": asin,
                "color": "",
                "size": "",
                "style": "",
                "column_2": "",
                "parent_asin": asin,
                "state": "1",
            }
            yield {"self_variat_item": [own_variant], "site": response.meta["site"]}
        else:
            logging.info("变体数据------>")
            # yield {"variat_item": variants, "site": response.meta["site"]}

        images = extractor.asin_image(response)
        if images:
            # Persist to storage.
            yield {"asin_img": images, "site": response.meta["site"]}

    def err_parse(self, failure, **kwargs):
        """Errback: push the failed request's ASIN back onto the Redis seed queue.

        The seed is written with ``zadd`` to ``<site>_variat_seed`` with score 1.
        The write is retried until it succeeds: a Redis connection error waits
        five seconds before retrying, a ``FunctionTimedOut`` retries at once.

        :param failure: failure object whose ``request.meta`` holds the ASIN.
        """
        # Disabled alternative: emit an error item instead of re-queueing.
        # item = {
        #     "error_asin": True,
        #     "asin": (1, failure.request.meta.get("asin")),
        # }
        # logging.info(f"crawl failed asin:{item.get('asin')}")
        # yield item
        payload = json.dumps({"asin": failure.request.meta.get("asin")})
        queue_key = f"{self.site}_variat_seed"
        while True:
            try:
                zadd(queue_key, {payload: 1})
                # sadd(queue_key, payload, use_md5=False)
            except redis.exceptions.ConnectionError as exc:
                logging.info(f"爬取失败请求重新，推送到爬取队列失败{exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"爬取失败请求重新，推送到爬取队列超时 {exc}")
            else:
                logging.info(f"爬取失败请求重新，推送到爬取队列:{failure.getErrorMessage()}")
                break
        # Disabled alternative: re-yield the request for specific error messages.
        # logging.info(f"crawl failed asin:{failure.request.meta.get('asin')}")
        # # failure.getTraceback()
        # # failure.getErrorMessage()
        # # failure.getBriefTraceback()
        # if "Feedback.process_response must return Response or Request" in failure.getErrorMessage():
        #     request = failure.request
        #     self.logger.error('retry limit reached, retrying again %s', request.url)
        #     yield request
        # elif "User timeout caused connection" in failure.getErrorMessage():
        #     request = failure.request
        #     self.logger.error('TimeoutError, retrying again %s', request.url)
        #     yield request
        # elif "CONNECT tunnel with proxy" in failure.getErrorMessage():
        #     request = failure.request
        #     self.logger.error('proxy error, retrying again %s', request.url)
        #     yield request


if __name__ == '__main__':
    # Same as running `scrapy crawl amazon_get_variat -a site=us` from a shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_get_variat', '-a', 'site=us'])





# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_get_variat -a site=us  > variat1.log 2>&1 &
# nohup scrapy crawl amazon_get_variat -a site=us  > variat_us1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl amazon_get_variat" |awk '{print $2}' `; do kill -9 $i ; done;

# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# ps -ef|grep "scrapy crawl amazon_get_variat -a site=fr"
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail  > amazon_get_asin_detail1.log 2>&1 &
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_variat -a site=us > variat3.log 2>&1 &
