# coding:utf-8
import os
import sys
import json
import time
import redis
import random
import scrapy
import logging
import platform
from urllib.parse import urlparse
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# 亚马逊详情页数据获取
from amazon_spider.db.redis_db import spop, sadd
from func_timeout.exceptions import FunctionTimedOut
from amazon_spider.spiders.yswg_spider import SourceSpider


# Re-read the timezone rules from the environment on platforms that provide
# time.tzset() (Unix only); on Windows just announce the platform.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonSearchTextSpider(SourceSpider):
    """Check whether an Amazon keyword search for an ASIN returns any results.

    For every seed the spider requests ``<site_url>/s?k=<asin>`` and yields an
    item whose ``results`` field is ``"n"`` when the first ``div.a-row`` of the
    page contains "No results for", otherwise ``"y"``. Requests are scheduled
    from the ``spider_idle`` signal handler; requests that fail are pushed back
    to the ``<site>_search_seed`` Redis set by ``err_parse``.

    NOTE(review): ``site_url``, ``h_dict``, ``update_cookies``, ``random_r``
    and ``json_cookies`` are used here but not defined in this class --
    presumably they come from ``SourceSpider``; confirm.
    """

    name = 'amazon_get_search_text'
    custom_settings = {
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): Scrapy documents ``allowed_domains`` as a spider
        # attribute, not a setting, so this entry probably has no effect here.
        # Left untouched to avoid changing behaviour -- confirm intent.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis-backed scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # NOTE(review): a credential is embedded in the commented-out block
        # below; it should live in configuration/secrets, not in source.
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': 'HCL1zcUgQesaaXNLbL37O5KhpSAy0c',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 5,  # number of retries per request
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in increasing order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            'amazon_spider.middlewares.SearchCookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # invoked in decreasing order
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.search_results.AmazonRealResultsSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Create the spider for one marketplace.

        Args:
            site: Marketplace code; used to build the ``<site>_search_seed``
                Redis key (default ``'us'``).
        """
        super(AmazonSearchTextSpider, self).__init__()
        self.site = site
        self.update_cookies()
        # Only referenced by the disabled in-memory branch of get_seeds().
        self.seeds = [
            {"asin": "B00MIQUJQ4", 'username': "ZH", "site": "us", "sku": "ZH0038"}
        ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook it to the crawler's close and idle signals."""
        spider = super(AmazonSearchTextSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def get_seeds(self):
        """Pop up to 25 seeds from the ``<site>_search_seed`` Redis set.

        Retries forever: after a Redis connection error it sleeps 5 seconds,
        after a ``FunctionTimedOut`` it retries immediately.

        Returns:
            Whatever ``spop`` returns for the requested batch.
        """
        # Disabled alternative: serve seeds from the in-memory list.
        # if self.seeds:
        #     seed = self.seeds.pop()
        #     return seed
        # else:
        #     return False
        while True:
            try:
                # seeds = zpop(f'{self.site}_search_seed', 25)
                seeds = spop(f'{self.site}_search_seed', 25)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"获取任务成功失败{e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"获取任务成功超时 {e}")
            else:
                logging.info("获取任务成功")
                return seeds

    def spider_idle(self, spider):
        """Schedule one search request per seed when the engine goes idle.

        Args:
            spider: Spider instance delivered with the ``spider_idle`` signal
                (unused; ``self`` is used instead).
        """
        logging.debug(f'IDLE------------------{self.site}')
        # NOTE(review): the seed below is hard-coded and the Redis-backed
        # get_seeds() call is disabled; this looks like a debugging setup.
        # seeds = self.get_seeds()
        seeds = [{'asin': 'B0DG2P9ZQR', 'site': 'us', 'username': 'YD', 'sku': '12'}]
        if not seeds:
            logging.info("爬取完成--> ")
            # Nothing left to schedule: return normally so the engine can close
            # the spider. (A bare ``raise`` here has no active exception and
            # would only produce a RuntimeError.)
            return

        for seed in seeds:
            # seed = json.loads(seed)  # needed when seeds come from Redis as JSON strings
            base_url = self.site_url.get(seed['site'])
            if not base_url:
                # Without a base URL the request URL would be "None/s?k=...".
                logging.warning(f"unknown site {seed['site']!r}, seed skipped: {seed}")
                continue
            url = f"{base_url}/s?k={seed['asin']}"
            meta = {
                "asin": seed['asin'],
                "username": seed['username'],
                "cookiejar": int(time.time()),
                "site": seed['site'],
                "sku": seed['sku'],
            }
            # NOTE(review): 'origin' and 'referer' are derived from the full
            # search URL (including "/s?k=..."), not from the site root.
            # Kept as-is to leave the emitted headers unchanged -- confirm
            # whether the site root was intended.
            headers = {
                'Connection': 'close',
                'authority': urlparse(url).hostname,
                'accept': 'text/html,*/*',
                'accept-language': '*',
                'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                'origin': url,
                'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
            }
            meta = self.random_r(meta)
            # Add one randomly chosen extra header from h_dict.
            h_key = random.choice(list(self.h_dict))
            headers[h_key] = self.h_dict.get(h_key)
            logging.info(f'随机添加headers：{h_key}')
            headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
            cookies = self.json_cookies(meta["site"])
            cookies.update({
                'csm-sid': '916-1904410-6680838',
                'x-amz-captcha-1': '1706092664884599',
                'x-amz-captcha-2': '5OU/Q7DyDmtg9QGCbivNXg==',
            })
            self.crawler.engine.crawl(
                scrapy.Request(url=url, cookies=cookies, headers=headers,
                               callback=self.parse, errback=self.err_parse,
                               dont_filter=True, meta=meta), self)

    def parse(self, response, **kwargs):
        """Yield the search outcome for the ASIN carried in ``response.meta``.

        The first ``div`` with class ``a-row`` is inspected: ``results`` is
        ``"n"`` when it contains "No results for", otherwise ``"y"``. When the
        page has no such element the outcome cannot be determined, so a
        warning is logged and no item is yielded.
        """
        meta = response.meta
        # Evaluate the XPath once; .get() returns None when nothing matches.
        first_row = response.xpath(".//div[@class='a-row']").get()
        print(first_row)
        if first_row is None:
            logging.warning(f"no div.a-row in response for {meta.get('asin')}, item skipped")
            return
        item = {
            # "username": meta['username'],
            "state": 3,
            "results": "n" if "No results for" in first_row else "y",
            "asin": meta['asin'],
            "site": meta['site'],
        }
        print(item)
        yield item

    def err_parse(self, failure, **kwargs):
        """Errback: push the failed request's seed back to the Redis seed set.

        The seed is re-serialised from the request meta and added to
        ``<self.site>_search_seed``. The push is retried forever: after a
        Redis connection error it sleeps 5 seconds, after a
        ``FunctionTimedOut`` it retries immediately.

        NOTE(review): ``time.sleep`` is called directly from the errback;
        confirm that blocking here is acceptable.
        """
        logging.info(f"error______ {failure.getErrorMessage()}, {failure.request.meta.get('asin')}")
        # Recover the original request to rebuild its seed.
        request = failure.request
        meta = {
            "asin": request.meta.get("asin"),
            "username": request.meta.get("username"),
            "sku": request.meta.get("sku"),
            "site": request.meta.get("site"),
        }
        k = json.dumps(meta)
        seed_key = f"{self.site}_search_seed"
        while True:
            try:
                sadd(seed_key, k, use_md5=False)
                # zadd(f"{self.site}_real_zset_seed", {k: meta.get("priority")})
            except redis.exceptions.ConnectionError as e:
                logging.info(f"push {seed_key} ConnectionError，afresh push --> T_T {e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"push {seed_key} time out --> T_T {e}")
            else:
                # Report the key that was actually written.
                logging.info(f"push {seed_key} succeed")
                break


if __name__ == '__main__':
    # Run this spider for the US site, exactly as the command line
    # "scrapy crawl amazon_get_search_text -a site=us" would.
    cmdline.execute(['scrapy', 'crawl', 'amazon_get_search_text', '-a', 'site=us'])

# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_get_search > amazon_search1.log 2>&1 &
# nohup scrapy crawl amazon_get_search_text > amazon_search_results1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl amazon_get_search_text" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_search
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_search > amazon_search1.log 2>&1 &
# 