# coding:utf-8
import os
import sys
import json
import time
import redis
import random
import scrapy
import logging
import platform
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from scrapy import cmdline, signals
from func_timeout.exceptions import FunctionTimedOut
from amazon_spider.db.redis_db import spop, sadd
# Spider that fetches shipment tracking data from the 17track REST API.

# time.tzset() is only available on Unix, so it is skipped on Windows.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class UpsGetDataSpider(scrapy.Spider):
    """Spider that pops tracking seeds from Redis and queries the 17track REST API.

    Seeds are ``"<tracking number>|<provider name>|<id>"`` strings obtained
    through the project's ``spop`` helper whenever the spider goes idle.
    Each seed is looked up with up to three chained requests
    (``parse`` -> ``parse_two`` -> ``parse_three``); a request that fails at
    the download level is pushed back with ``sadd`` by ``err_parse``.
    """

    name = 'ups_datas_spider'

    # Redis key the seeds are popped from / pushed back to.
    SEED_KEY = 'ups_transport'
    # Endpoint and static headers shared by every tracking request.
    TRACK_URL = "https://t.17track.net/track/restapi"
    TRACK_HEADERS = {
        'Host': 't.17track.net',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'origin': 'https://t.17track.net',
        'referer': 'https://t.17track.net/zh-cn',
        'accept-language': 'zh-CN,zh;q=0.9',
    }

    custom_settings = {
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # Retry settings.
        # NOTE(review): the built-in RetryMiddleware is disabled below, so
        # these RETRY_* values only matter if a project middleware reads
        # them -- confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # Downloader middlewares for this spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middleware.17track.AddLastEventIdMiddleware': 450,
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.ups_pipe.UpsSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider.

        ``*args``/``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        arguments passed through ``from_crawler`` (e.g. ``scrapy crawl -a``)
        do not raise ``TypeError``.
        """
        super(UpsGetDataSpider, self).__init__(*args, **kwargs)
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        # Not read inside this class; presumably the column order expected by
        # the item pipeline -- TODO confirm.
        self.col = ['transport_sn', 'provider_name', 'ups_orders', 'transport', 'data_json']
        # Number of idle polls that found no seed.
        # NOTE(review): never reset when seeds are found, so the shutdown
        # threshold applies to the total over the spider's lifetime -- confirm
        # this is intended.
        self.sleep_count = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook ``spider_idle`` so seeds are fetched when idle."""
        spider = super(UpsGetDataSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _build_payload(tracking_no, guid=""):
        """Return the compact JSON request body for one tracking number.

        ``json.dumps`` is used instead of string formatting so that special
        characters in the tracking number cannot produce invalid JSON; for
        ordinary values the output is identical to the hand-written template.
        NOTE(review): the carrier code ``fc`` is always sent as 0; the earlier
        per-provider value (100001 for dhl, 100003 otherwise) was computed but
        never used -- confirm 0 is what the API expects.
        """
        payload = {
            "data": [{"num": str(tracking_no), "fc": 0, "sc": 0}],
            "guid": str(guid),
            "timeZoneOffset": -480,
        }
        return json.dumps(payload, separators=(",", ":"), ensure_ascii=False)

    def _track_request(self, body, meta, callback):
        """Build a POST request to the tracking endpoint with the shared headers."""
        return scrapy.Request(url=self.TRACK_URL, method="POST", headers=self.TRACK_HEADERS,
                              body=body, callback=callback, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def _pop_seeds(self):
        """Pop seeds from Redis, retrying until the call succeeds."""
        while True:
            try:
                seeds = spop(self.SEED_KEY, 1)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"获取任务成功失败{e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"获取任务成功超时 {e}")
            else:
                logging.info("获取任务成功")
                return seeds

    def _requeue_seed(self, meta):
        """Push the seed described by ``meta`` back to Redis, retrying until it succeeds."""
        parts = [meta.get("fh_orders"), meta.get("provider_name"), meta.get("id")]
        if any(part is None for part in parts):
            # Not one of our tracking requests; there is nothing to re-queue.
            logging.warning("cannot re-queue seed, incomplete meta: %r", parts)
            return
        seed = "|".join(str(part) for part in parts)
        while True:
            try:
                sadd(self.SEED_KEY, seed, use_md5=False)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"爬取失败请求重新，推送到爬取队列失败{e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"爬取失败请求重新，推送到爬取队列超时 {e}")
            else:
                logging.info("爬取失败请求重新，推送到爬取队列")
                return

    def _seed_to_request(self, raw_seed):
        """Turn one raw seed into the first tracking request, or ``None`` if malformed."""
        text = raw_seed.decode("utf-8") if isinstance(raw_seed, bytes) else str(raw_seed)
        parts = text.split('|')
        logging.info(f"fedex_dhl_FBA: {parts}")
        if len(parts) < 3:
            logging.warning("skipping malformed seed %r (expected 'number|provider|id')", text)
            return None
        fh_orders, provider_name, seed_id = parts[0], parts[1], parts[2]
        body = self._build_payload(fh_orders)
        meta = {
            "provider_name": provider_name,
            "fh_orders": fh_orders,
            "id": seed_id,
            "fdata": body,
        }
        return self._track_request(body, meta, self.parse)

    def _handle_track_response(self, response, order_label, retry_callback=None,
                               retry_without_guid=False):
        """Shared body of ``parse`` / ``parse_two`` / ``parse_three``.

        :param response: response from the tracking endpoint.
        :param order_label: label used in the status log line.
        :param retry_callback: callback for the follow-up request issued when
            the shipment is reported as NotFound/Failure; ``None`` means no
            further attempt is made.
        :param retry_without_guid: when the response carries no ``guid``,
            re-send the request with an empty guid instead of dropping it.
        :return: generator yielding either a follow-up request or one item
            ``{"ups_data": [...], "id": ...}``.
        """
        meta = response.meta
        fh_orders = meta.get("fh_orders")

        try:
            data_dict = response.json()
        except ValueError as e:
            logging.error(f"response for {fh_orders} is not valid JSON: {e}")
            return
        if not isinstance(data_dict, dict):
            logging.error(f"unexpected JSON payload for {fh_orders}: {data_dict!r}")
            return

        guid = data_dict.get('guid')
        if not guid:
            if retry_without_guid and retry_callback is not None:
                logging.info("not get guid Reassociation Request")
                yield self._track_request(self._build_payload(fh_orders), meta, retry_callback)
            else:
                logging.info(f"no guid in response for {fh_orders}, giving up")
            return

        try:
            shipment = data_dict['shipments'][0]
            prior_status = shipment['prior_status']
            state = shipment['state']
        except (KeyError, IndexError, TypeError) as e:
            logging.error(f"no usable shipment in response for {fh_orders}: {e!r}")
            return

        logging.info(f'{order_label}:{fh_orders}  prior_status:{prior_status}  state: {state}')

        if 'NotFound' in (prior_status or '') and 'Failure' in (state or ''):
            logging.info(f'NotFound-------Failure 请求单号:{fh_orders}')
            # NOTE: this blocks the Twisted reactor; kept as the existing
            # crude throttle between attempts.
            time.sleep(random.uniform(2, 5.5))
            if retry_callback is None:
                return
            logging.info(f"{guid}")
            yield self._track_request(self._build_payload(fh_orders, guid), meta, retry_callback)
            return

        # Every other status (including Delivered/Failure) is stored as-is.
        ups_data = [fh_orders, meta.get("provider_name"), None, prior_status, json.dumps(data_dict)]
        yield {"ups_data": ups_data, "id": meta.get("id")}

    # ---------------------------------------------------------- signal handlers

    def spider_idle(self, spider):
        """Handle ``spider_idle``: schedule new seeds, or poll, or let the spider close.

        When no seed is available the handler sleeps 30 s and schedules a
        keep-alive request so that the idle signal fires again; after a random
        6-15 empty polls it returns without scheduling anything.
        """
        logging.debug(f'IDLE------------------ {time.time()}')
        while True:
            seeds = self._pop_seeds()
            if not seeds:
                break
            requests = [req for req in map(self._seed_to_request, seeds) if req is not None]
            if requests:
                for req in requests:
                    self.crawler.engine.crawl(req, spider=self)
                return
            # Everything popped was malformed; try the next seed instead of
            # letting one bad entry end the crawl.

        self.sleep_count += 1
        if self.sleep_count >= random.randint(6, 15):
            # Returning without scheduling a request lets the engine close
            # the idle spider.
            logging.info(f"no task after {self.sleep_count} idle polls, closing spider")
            return
        logging.info('no task sleep 30s')
        time.sleep(30)
        # Keep-alive request: its only purpose is to make the spider busy
        # again so that another idle signal is emitted afterwards.
        request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_new_page,
                                 errback=self.err_, dont_filter=True,
                                 meta={'handle_httpstatus_all': True})
        self.crawler.engine.crawl(request, spider=self)

    # ---------------------------------------------------------------- callbacks

    def parse_new_page(self, response, **kwargs):
        """Callback of the keep-alive request; emits a marker item."""
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        """Errback of the keep-alive request; emits the same marker item.

        Despite its name, ``response`` is the failure object Scrapy passes to
        errbacks (it is used via ``getErrorMessage()`` and ``.request``).
        """
        print("sleep=========error", response.getErrorMessage(), response.request.meta.get("asin"))
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def parse(self, response, **kwargs):
        """First attempt: store the result, or retry via ``parse_two``."""
        print("-" * 20)
        yield from self._handle_track_response(response, 'requisition number',
                                               retry_callback=self.parse_two,
                                               retry_without_guid=True)

    def parse_two(self, response, **kwargs):
        """Second attempt: store the result, or retry via ``parse_three``."""
        logging.info("第二次请求---------------------")
        yield from self._handle_track_response(response, '请求单号',
                                               retry_callback=self.parse_three)

    def parse_three(self, response, **kwargs):
        """Third and last attempt: store the result; NotFound is dropped."""
        logging.info("第三次请求---------------------")
        yield from self._handle_track_response(response, '请求单号')

    def err_parse(self, failure, **kwargs):
        """Errback of tracking requests: log the failure and re-queue the seed."""
        request = failure.request
        logging.info(f"{failure.getTraceback()}")
        self._requeue_seed(request.meta)


if __name__ == '__main__':
    # Run the spider through Scrapy's command-line entry point when this
    # file is executed directly.
    cmdline.execute('scrapy crawl ups_datas_spider'.split())

# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl ups_datas_spider > ups_17track1.log 2>&1 &
# nohup scrapy crawl ups_datas_spider > ups_17track1.log 2>&1 &
# nohup scrapy crawl ups_datas_spider &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl ups_datas_spider" |awk '{print $2}' `; do kill -9 $i ; done;
