# coding:utf-8
import os
import sys
import json
import time
import redis
import scrapy
import random
import logging
import platform
from urllib.parse import urlparse
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import zpop, zadd
# amazon 变体信息获取
from amazon_spider.utils.read_db_data import ReadCookie
from amazon_spider.items import detail_inner_return_item
from amazon_spider.spiders.yswg_spider import SourceSpider
from amazon_spider.utils.common import field_length_dispose
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor


# time.tzset() is only available on Unix, so it is skipped on Windows,
# where the original marker line is printed instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonGetReturnSpider(SourceSpider):
    """Crawl Amazon ``/dp/<asin>`` pages and emit detail items with return-policy text.

    Work is pulled on demand: every time the engine reports the spider idle,
    ``spider_idle`` pops up to 25 JSON seeds (each carrying ``asin`` and
    ``site``) from the key ``{site}_detail_returns_seed`` and schedules one
    request per seed. Failed requests are pushed back to the same key by
    ``err_parse``.

    ``site_url``, ``h_dict``, ``random_r``, ``page_state``, ``parse_news_page``
    and ``err_`` are not defined in this class; presumably they are inherited
    from ``SourceSpider`` -- verify against that class.
    """

    name = 'amazon_get_detail_returns'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): Scrapy reads ``allowed_domains`` from the spider
        # attribute, not from settings, so this entry most likely has no
        # effect here. Left in place to keep behavior unchanged.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Enable the Redis scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make sure all spiders deduplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',  # never commit real credentials; load from env/config
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the built-in RetryMiddleware is disabled below (set to
        # None), so these RETRY_* values only matter if one of the custom
        # middlewares reads them -- confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.amazon_detail_return_pipe.AmazonVariatSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Store the target site, load the cookie pools and configure logging.

        :param site: marketplace code (e.g. ``-a site=us``); used to build the
            seed key name and passed to ``AmazonDetailExtractor``.
        """
        super(AmazonGetReturnSpider, self).__init__()
        self.site = site

        # Only the 'us' and 'uk' cookie pools are loaded; spider_idle falls
        # back to an empty cookie dict for any other site.
        self.us_cookie_list = ReadCookie('us').get_cookie()
        self.uk_cookie_list = ReadCookie('uk').get_cookie()
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        # Column names of a detail record; not read anywhere in this class.
        self.col = ['asin', 'img_url', 'title', 'title_len', 'price', 'rating', 'total_comments', 'buy_box_seller_type',
                    'page_inventory', 'category', 'volume', 'weight', 'rank', 'launch_time', 'video_url', 'add_url',
                    'material', 'img_num', 'img_type', 'qa_num', 'brand', 'ac_name', 'node_id', 'sp_num', 'mpn',
                    'online_time', 'describe', 'one_star', 'two_star', 'three_star', 'four_star', 'five_star',
                    'low_star', 'asin_type', 'is_coupon', 'search_category', 'weight_str', 'date_info', 'site',
                    'account_name', 'other_seller_name', 'bsr_date_info', 'account_id', 'package_quantity',
                    'pattern_name', 'together_asin', 'activity_type', 'one_two_val', 'three_four_val', 'five_six_val',
                    'eight_val', 'product_description']
        # self.seeds = [{"asin": "B093PJLFG9"}, {"asin": "B0C4GPH8HF"}]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook ``close`` / ``spider_idle`` to crawler signals."""
        spider = super(AmazonGetReturnSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def get_seeds(self):
        """Pop up to 25 seeds from ``{site}_detail_returns_seed``.

        Retries forever: sleeps 5 seconds after a Redis connection error and
        retries immediately after a ``FunctionTimedOut``.

        :return: whatever ``zpop`` returns (iterated and JSON-decoded by
            ``spider_idle``; a falsy value means "no work").
        """
        while True:
            try:
                # seeds = spop(f'{self.site}_variat_seed', 25)
                seeds = zpop(f'{self.site}_detail_returns_seed', 25)
                logging.info("获取任务成功")
                break
            except redis.exceptions.ConnectionError as e:
                logging.info(f"获取任务成功失败{e}")
                time.sleep(5)
                continue
            except FunctionTimedOut as e:
                logging.info(f"获取任务成功超时 {e}")
                continue
        return seeds

    def spider_idle(self, spider):
        """Signal handler for ``spider_idle``: schedule the next batch of seeds.

        With seeds available, one request per seed is handed to the engine.
        Without seeds, a keep-alive request is scheduled, the handler sleeps
        30 seconds and raises ``DontCloseSpider`` so the spider stays open.

        :param spider: the spider passed by the signal (unused; ``self`` is used).
        :raises DontCloseSpider: when no seeds were available.
        """
        logging.debug(f'IDLE------------------{self.site}')
        seeds = self.get_seeds()
        if seeds:
            # Cookie pools that were loaded in __init__, keyed by site code.
            cookie_pools = {
                'us': self.us_cookie_list,
                'uk': self.uk_cookie_list,
            }
            for i in seeds:
                job = json.loads(i)
                url = f"{self.site_url.get(job['site'])}/dp/{job['asin']}"
                headers = {
                    'Connection': 'close',
                    'authority': urlparse(url).hostname,
                    'accept': 'text/html,*/*',
                    'accept-language': 'zh-CN,zh;q=0.9',
                    'cache-control': 'no-cache',
                    'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'origin': url,
                    'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
                }
                cookie_pool = cookie_pools.get(job['site'])
                if cookie_pool is None:
                    # Previously `cookies` was left unbound (first iteration)
                    # or still held the previous job's parsed dict, which made
                    # the handler crash and drop the rest of the popped batch.
                    logging.warning(
                        "no cookie pool loaded for site %r; sending request without stored cookies",
                        job['site'],
                    )
                    cookies = {}
                else:
                    # Draw one random row; its first cell holds a JSON-encoded
                    # cookie mapping.
                    sampled = list(cookie_pool.sample(n=1).values)
                    cookies = json.loads(sampled[0][0])
                meta = {
                    "asin": job['asin'],
                    # One-second resolution: requests built within the same
                    # second share this key. NOTE(review): confirm intended.
                    "cookiejar": int(time.time()),
                    "asin_type": "",
                    "site": job['site'],
                }
                meta = self.random_r(meta)
                # Add one randomly chosen extra header from self.h_dict.
                h_key = random.choice(list(self.h_dict))
                headers[h_key] = self.h_dict.get(h_key)
                logging.info(f'随机添加headers：{h_key}')
                headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
                # Fixed cookie values appended to every request.
                cookies.update({
                    'csm-sid': '916-1904410-6680838',
                    'x-amz-captcha-1': '1706092664884599',
                    'x-amz-captcha-2': '5OU/Q7DyDmtg9QGCbivNXg=='
                })
                self.crawler.engine.crawl(scrapy.Request(url=url, cookies=cookies, headers=headers,
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta), self)
        else:
            logging.info("----------")
            meta = {
                'handle_httpstatus_all': True
            }
            # Keep-alive request scheduled while the seed key is empty.
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, spider=self)
            logging.info('no task sleep 30s')
            # NOTE(review): time.sleep blocks the calling thread; if this
            # handler runs on the reactor thread nothing else progresses for
            # 30 seconds -- confirm this is acceptable.
            time.sleep(30)
            raise DontCloseSpider

    def parse(self, response, **kwargs):
        """Turn a product page response into an item.

        If ``self.page_state(response)`` returns something truthy, that value
        is yielded as-is. Otherwise the page is run through
        ``AmazonDetailExtractor`` and the result is extended with ``returns``
        and ``brand_news`` before being yielded as ``{'inner_item': ...}``.
        """
        if item := self.page_state(response):
            yield item
        else:
            amazon_detail_extractor = AmazonDetailExtractor(self.site)
            items = amazon_detail_extractor.run(response)
            inner_item = detail_inner_return_item(items)
            # Apply field-length limits.
            inner_item = field_length_dispose(inner_item)
            # Normalize every falsy value ('' / 0 / [] ...) to None.
            inner_item = {k: v or None for k, v in inner_item.items()}
            inner_item["is_coupon"] = inner_item["is_coupon"] or "0"
            # First text node matching either of the two return-policy layouts.
            returns = response.xpath("|".join([
                "//div[@id='desktop_qualifiedBuyBox']//a[@data-csa-c-content-id='odf-desktop-return-policy-tabular']//span[@class='a-size-small offer-display-feature-text-message']//text()",
                "//div[@id='returnsInfoFeature_feature_div']/div[@class='offer-display-feature-text']//span[@class='a-size-small offer-display-feature-text-message']//text()",
            ])).get()
            inner_item['returns'] = returns
            brand_new = response.xpath("//tr[@class='a-spacing-small po-brand']//span[@class='a-size-base po-break-word']//text()").get("")
            inner_item['brand_news'] = brand_new.strip()
            print(inner_item)
            yield {'inner_item': inner_item}

    def err_parse(self, failure, **kwargs):
        """Errback: push the failed request's seed back for another attempt.

        The ``asin`` and ``site`` from the request meta are JSON-encoded and
        handed to ``zadd`` on ``{site}_detail_returns_seed`` with the value 1.
        Retries forever: sleeps 5 seconds after a Redis connection error and
        retries immediately after a ``FunctionTimedOut``.

        :param failure: the failure object passed by Scrapy; only
            ``failure.request`` is read.
        """
        request = failure.request
        meta = {
            "asin": request.meta.get("asin"),
            "site": request.meta.get("site"),
        }
        k = json.dumps(meta)
        while True:
            try:
                zadd(f"{self.site}_detail_returns_seed", {k: 1})
                logging.info("爬取失败请求重新，推送到爬取队列")
                break
            except redis.exceptions.ConnectionError as e:
                logging.info(f"爬取失败请求重新，推送到爬取队列失败{e}")
                time.sleep(5)
                continue
            except FunctionTimedOut as e:
                logging.info(f"爬取失败请求重新，推送到爬取队列超时 {e}")
                continue


if __name__ == '__main__':
    # Convenience entry point: same as running
    # `scrapy crawl amazon_get_detail_returns -a site=us` from the shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_get_detail_returns', '-a', 'site=us'])


# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_get_variat -a site=us > variat1.log 2>&1 &
# nohup scrapy crawl amazon_get_detail_returns -a site=us  > returns_us1.log 2>&1 &
# nohup scrapy crawl amazon_get_detail_returns -a site=us  > returns_us2.log 2>&1 &
# nohup scrapy crawl amazon_get_detail_returns -a site=us  > returns_us3.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl amazon_get_detail_returns" |awk '{print $2}' `; do kill -9 $i ; done;

# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# ps -ef|grep "scrapy crawl amazon_get_detail_returns -a site=fr"
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail  > amazon_get_asin_detail1.log 2>&1 &
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_variat -a site=us > variat3.log 2>&1 &
