import re
import time
import json
import copy
import scrapy
import logging
import pandas as pd
import os, sys, platform
from urllib.parse import urlparse
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# Amazon zip-code cookie acquisition
from amazon_spider.db.mysql_db import df_to_sql
from amazon_spider.conf.db import selection_table_name
from amazon_spider.amazon_captcha.solver import AmazonCaptcha

# Reset the time conversion rules on every platform except Windows, where
# the script only prints a marker (time.tzset() is documented as Unix-only).
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class GetCookieSpider(scrapy.Spider):
    # Spider name, as used by `scrapy crawl get_cookie_amazon`.
    name = 'get_cookie_amazon'
    # Per-spider setting overrides.
    custom_settings = {
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': True,
        # 'DOWNLOAD_DELAY': 1,
        # Testing
        # 'TWISTED_REACTOR': 'twisted.internet.selectreactor.SelectReactor',
        # Store crawl statistics in redis
        # NOTE(review): a real redis password used to be embedded in the
        # commented-out block below; it has been replaced by a placeholder and
        # should be rotated, since it was committed to source control.
        # 'STATS_CLASS': 'scrapy_redis.stats.RedisStatsCollector',
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',
        #     'db': 0
        # },
        # # Whether to flush the crawl queue when the spider is restarted
        # 'SCHEDULER_FLUSH_ON_START': False,
        # parse_one() inspects 302 responses itself, so they must reach callbacks.
        'HTTPERROR_ALLOWED_CODES': [302],
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {

            # Called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # Must be enabled when h2 is enabled
            # 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': None,
            # 'amazon_spider.middleware.http2.HttpxMiddleware': 470,

            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 470,

            # Called in descending order
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is active; cookies are written to the database
        # directly from parse_zip().
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipelines.AmazonSpiderPipeline': 230
        },
        # Both schemes go through the project's own download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            # 'https': 'amazon_spider.downloadhandlers.http2.HttpxDownloadHandler',
        },
    }

    def __init__(self, site='us', **kwargs):
        """Configure the spider for one Amazon marketplace.

        Args:
            site: Marketplace code such as ``us``, ``de`` or ``uk``; selects
                the base URL and the request headers derived from it.
            **kwargs: Remaining spider arguments, forwarded to the base class.

        Raises:
            ValueError: If ``site`` is not one of the supported marketplaces.
        """
        # `_job` is discarded before the arguments reach the base class
        # (presumably injected by the job scheduler -- confirm).
        kwargs.pop("_job", None)
        # The site code must not be passed positionally: the first positional
        # parameter of scrapy.Spider.__init__ is `name`, so doing that replaced
        # the spider name with the site code.
        super(GetCookieSpider, self).__init__(**kwargs)
        self.cols_list = ["cookies"]
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)

        # Base URL of every supported marketplace.
        site_urls = {
            'us': 'https://www.amazon.com',
            'de': 'https://www.amazon.de',
            'uk': 'https://www.amazon.co.uk',
            'it': 'https://www.amazon.it',
            'es': 'https://www.amazon.es',
            'fr': 'https://www.amazon.fr',
            'mx': 'https://www.amazon.com.mx',
            'ca': 'https://www.amazon.ca',
            'ae': 'https://www.amazon.ae',
            'au': 'https://www.amazon.com.au',
            'tr': 'https://www.amazon.com.tr',
            'be': 'https://www.amazon.com.be',
            'jp': 'https://www.amazon.co.jp',
            'nl': 'https://www.amazon.nl',
            'pl': 'https://www.amazon.pl',
            'se': 'https://www.amazon.se',
        }
        if site not in site_urls:
            # Fail with a clear message instead of an AttributeError on the
            # never-assigned `self.url_` a few lines further down.
            raise ValueError(
                f"unsupported site {site!r}; expected one of {sorted(site_urls)}")
        self.url_ = site_urls[site]

        # Header template shared by the callbacks.
        self.headers = {
            'Host': urlparse(self.url_).hostname,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'referer': self.url_,
            'origin': self.url_,
            # 'Accept-Encoding': 'gzip',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        self.site = site
        # Number of idle rounds handled so far (see spider_idle).
        self.count = 0
        # Zip code posted for each marketplace by parse_anti(). Marketplaces
        # missing here are either special-cased there (ae, au) or use the flow
        # that posts no zip code (tr, be, nl, pl, se); jp has no entry at all.
        self.site_dict = {
            "us": '10010',
            "de": '10115',
            "uk": 'W1S 3PR',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607',
            "ca": 'M5B 2H1'
        }
        # Value stored in the language cookie by parse_zip().
        # NOTE(review): the first key is spelled "lc_main" while parse_zip()
        # writes the cookie "lc-main", and several values (e.g. 'mx_MX',
        # 'jp_JP') look derived from the site code rather than being real
        # locale identifiers -- confirm before relying on them.
        self.country_lc_main = {
            "lc_main": 'en_US',
            "lc-acbde": 'de_DE',
            "lc-acbuk": 'en_GB',
            "lc-acbit": 'it_IT',
            "lc-acbes": 'es_ES',
            "lc-acbfr": 'fr_FR',
            "lc-acbmx": 'mx_MX',
            "lc-acbca": 'ca_CA',
            "lc-acbae": 'ae_AE',
            "lc-acbau": 'au_AU',
            "lc-acbtr": 'tr_TR',
            "lc-acbbe": 'be_BE',
            "lc-acbjp": 'jp_JP',
            "lc-acbnl": 'nl_NL',
            "lc-acbpl": 'pl_PL',
            "lc-acbse": 'se_SE',
        }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and subscribe its handlers to crawler signals."""
        instance = super(GetCookieSpider, cls).from_crawler(crawler, *args, **kwargs)
        # (handler, signal) pairs, connected in this order.
        subscriptions = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_idle(self, spider):
        """Schedule another batch of cookie sessions whenever the spider idles.

        Connected to the ``spider_idle`` signal in ``from_crawler``. Every call
        counts as one round. Until the per-site round limit is reached, 30 new
        requests are scheduled, each with its own cookiejar. After that nothing
        is scheduled, so the spider stays idle and the engine can close it.

        Args:
            spider: The spider instance delivered with the signal (unused).
        """
        logging.info(f'IDLE--------{self.count}---------- site {self.site}---{time.time()}')
        self.count += 1
        if self.site in ("us", "uk", "fr", "es", "it", "ca", "mx", "de"):
            cookie_count = 150
        else:
            cookie_count = 50
        if self.count >= cookie_count:
            # The original used a bare `raise` here, which fails with
            # "RuntimeError: No active exception to re-raise". Returning without
            # scheduling anything stops the crawl without that spurious error.
            logging.info(f"round limit {cookie_count} reached for site {self.site}, no more requests scheduled")
            return

        # Everything below is identical for all requests of this round.
        headers = {
            "Host": urlparse(self.url_).hostname,
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9",
            "referer": f"{self.url_}/dp/B01C837T2S?th=1",
        }
        if self.site in ('de', 'fr', 'uk', 'it', 'es', 'be', 'ae', 'tr', 'pl', 'se', 'nl', 'us', 'ca', 'mx'):
            # These sites start on the home page and go straight to parse_two.
            url = f"https://{urlparse(self.url_).hostname}/"
            callback = self.parse_two
        else:
            # The remaining sites start on the captcha page handled by parse.
            url = f"{self.url_}/errors/validateCaptcha"
            callback = self.parse

        now = int(time.time())
        for i in range(0, 30):
            meta = {
                "curlcffi": True,
                # The round number keeps jar names unique even when two idle
                # rounds happen within the same second.
                "cookiejar": f"{self.count}_{now}{i}",
                "ck": {}
            }
            # NOTE(review): newer Scrapy releases may no longer accept the
            # explicit spider argument of engine.crawl() -- confirm against the
            # installed version before upgrading.
            self.crawler.engine.crawl(
                scrapy.Request(url=url, headers=headers, callback=callback, dont_filter=True, meta=meta), self)

    def parse(self, response, **kwargs):
        """Solve the captcha shown on the page and submit the answer.

        Reads the captcha image URL and the hidden ``amzn`` / ``amzn-r`` form
        fields from the response, solves the image with ``AmazonCaptcha`` and
        requests the validation URL with redirects disabled, so that
        ``parse_one`` receives the raw 302 response.

        Yields:
            scrapy.Request: The captcha submission, handled by ``parse_one``.
        """
        from urllib.parse import urlencode

        meta = response.meta
        captcha_url = response.xpath(".//div[@class='a-row a-text-center']/img/@src").get()
        if not captcha_url:
            logging.info(f"no captcha image found on {response.url}")
            return

        solution = AmazonCaptcha.fromlink(captcha_url, proxy={}).solve()
        logging.info(f"验证码 {solution}")
        amzn = response.xpath(".//input[@name='amzn']/@value").get()
        amzn_r = response.xpath(".//input[@name='amzn-r']/@value").get()
        if amzn is None or amzn_r is None:
            # Without the hidden fields the literal text "None" would be sent.
            logging.info(f"captcha form fields missing on {response.url}")
            return

        headers = {
            "Host": urlparse(self.url_).hostname,
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "referer": f"{self.url_}/dp/B01C837T2S?th=1",
            "accept-language": "zh-CN,zh;q=0.9",
        }
        # Encode the values as a form submission would: characters such as
        # "+", "/", "=" or "&" inside a value would otherwise alter the query.
        query = urlencode({"amzn": amzn, "amzn-r": amzn_r, "field-keywords": solution})
        url = f"{self.url_}/errors/validateCaptcha?{query}"
        meta['dont_redirect'] = True
        # meta['handle_httpstatus_list'] = [302]
        yield scrapy.Request(url=url, headers=headers, callback=self.parse_one, dont_filter=True, meta=meta)

    def parse_one(self, response, **kwargs):
        """Follow the redirect returned after the captcha was submitted.

        Only 302 responses are processed. The cookies set by the response are
        stored in ``meta['ck']`` and sent along with the request to the
        redirect target.

        Yields:
            scrapy.Request: Request for the redirect target, handled by
            ``parse_two``.
        """
        meta = response.meta
        if response.status != 302:
            logging.info(f"captcha submission returned status {response.status}, expected 302")
            return

        response_headers = response.headers.to_unicode_dict()
        location = response_headers.get('Location') or response_headers.get('location')
        if not location:
            logging.info("302 response without Location header")
            return

        cookies = {}
        for raw in response.headers.getlist('Set-Cookie'):
            # Only the leading "name=value" pair matters. partition() keeps
            # values that contain "=" intact and tolerates a missing "=".
            key, sep, value = raw.decode("utf-8").split(";")[0].partition("=")
            # A value of "-" is skipped, as before.
            if sep and value != "-":
                cookies[key] = value

        headers = {
            "Host": urlparse(self.url_).hostname,
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9",
            "referer": f"{self.url_}",
        }
        # The Location header may be relative; scrapy.Request needs an
        # absolute URL.
        url = response.urljoin(location)
        logging.info(f"跳转页面为 {url} {cookies}")
        meta['ck'] = cookies
        yield scrapy.Request(url=url, headers=headers, cookies=cookies, callback=self.parse_two, dont_filter=True, meta=meta)

    def parse_two(self, response, **kwargs):
        """Extract the CSRF token and follow-up URI from the landing page.

        Reads the ``anti-csrftoken-a2z`` value from the location modal
        attribute and the ``uri`` found in the ``GwInstrumentation.markH1Af``
        script. It then merges the response cookies into ``meta['ck']`` and
        requests that URI.

        Yields:
            scrapy.FormRequest: Request for the extracted URI, handled by
            ``parse_data``.
        """
        modal = response.xpath(".//span[@id='nav-global-location-data-modal-action']/@data-a-modal").get()
        if not modal:
            logging.info("验证码页面====")
            # yield get_retry_request(response.request, reason="验证码页面", spider=self.crawler.spider)
            return

        ajax_headers = json.loads(modal).get("ajaxHeaders") or {}
        csrftoken = ajax_headers.get("anti-csrftoken-a2z")
        script = response.xpath(".//script[contains(text(), 'GwInstrumentation.markH1Af')]//text()").get()
        # A missing script or URI used to raise TypeError / IndexError.
        uri = re.findall(r'uri: "(.*?)" }', script or "")
        if not uri:
            logging.info(f"uri not found in page script of {response.url}")
            return

        # Kept on the shared template because parse_data and parse_anti copy
        # self.headers afterwards.
        self.headers["accept"] = "*/*"
        # Send a private copy without any anti-CSRF token that another
        # concurrently running session may have left on the shared dict.
        headers = {k: v for k, v in self.headers.items() if k != "anti-csrftoken-a2z"}

        response.meta['csrftoken'] = csrftoken
        response.meta["handle_httpstatus_all"] = True

        cookies = {}
        for raw in response.headers.getlist('Set-Cookie'):
            # partition() keeps cookie values that contain "=" intact.
            key, sep, value = raw.decode("utf-8").split(";")[0].partition("=")
            if sep and value != "-":
                cookies[key] = value
        # NOTE(review): cookies collected earlier override the ones set by this
        # response, as in the original code -- confirm that order is intended.
        cookies.update(response.meta.get("ck") or {})
        response.meta["ck"] = cookies

        url_get_ubid = self.url_ + uri[0]
        yield scrapy.FormRequest(url=url_get_ubid, headers=headers, callback=self.parse_data, meta=response.meta, cookies=cookies, dont_filter=True)

    def parse_data(self, response):
        """Collect the session cookies and request the address-selection page.

        Merges the cookies set by the response into ``meta['ck']`` and keeps
        only the whitelisted session cookies. It then requests the rendered
        address selections with the CSRF token obtained in ``parse_two``.

        Yields:
            scrapy.Request: Request handled by ``parse_anti``.
        """
        if response.status == 400:
            logging.info("请求400失败")
            # yield get_retry_request(response.request, reason="请求400失败", spider=self.crawler.spider)
            return

        merged = {}
        for raw in response.headers.getlist('Set-Cookie'):
            # partition() keeps cookie values that contain "=" intact and
            # tolerates a pair without "=".
            key, sep, value = raw.decode("utf-8").split(";")[0].partition("=")
            if sep and value != "-":
                merged[key] = value
        merged.update(response.meta.get("ck") or {})
        logging.info(f"ubid -------- {merged}")

        headers = copy.deepcopy(self.headers)
        headers["anti-csrftoken-a2z"] = response.meta['csrftoken']

        # Only these cookies are carried forward and eventually stored.
        ubid_name = "ubid-main" if self.site == "us" else f"ubid-acb{self.site}"
        wanted = {'x-amz-captcha-1', 'x-amz-captcha-2', 'i18n-prefs', 'session-id',
                  'session-id-time', 'sp-cdn', ubid_name}
        c = {k: v for k, v in merged.items() if k in wanted}
        # Hard-coded by the original author; it replaces any value received.
        c['sp-cdn'] = '"L5Z9:HK"'
        response.meta["ck"] = c

        url = f'{self.url_}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal'
        yield scrapy.Request(url=url, headers=headers, dont_filter=True, callback=self.parse_anti, cookies=c, meta=response.meta)

    def parse_anti(self, response):
        """Read the CSRF token from the page and send the address request.

        For ``tr``, ``be``, ``nl``, ``pl`` and ``se`` the address-selection
        page is requested again with GET. Every other site posts an
        address-change payload: a city for ``ae``, a postal code plus city for
        ``au``, and the zip code from ``self.site_dict`` otherwise.

        Yields:
            scrapy.Request: Request handled by ``parse_zip``.
        """
        tokens = re.findall('CSRF_TOKEN : "(.*?)"', response.text)
        if not tokens:
            logging.info("请求获取CSRF_TOKEN失败")
            # yield get_retry_request(response.request, reason="请求失败", spider=self.crawler.spider)
            return

        # The token belongs to this session only, so it goes on a private copy
        # rather than the shared self.headers used by concurrent sessions.
        headers = copy.deepcopy(self.headers)
        headers["anti-csrftoken-a2z"] = tokens[0]
        headers["content-type"] = 'application/json'
        host = urlparse(response.url).hostname
        cookies = response.meta.get("ck")

        if self.site in ('tr', 'be', 'nl', 'pl', 'se'):
            url = f"https://{host}/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
            yield scrapy.Request(url, headers=headers, callback=self.parse_zip,
                                 cookies=cookies, meta=response.meta, dont_filter=True,
                                 method="GET")
            return

        if self.site == 'ae':
            data = '{"city":"Abu Dhabi","deviceType":"web","locationType":"CITY","storeContext":"generic","pageType":"Gateway","actionSource":"glow"}'
        elif self.site == 'au':
            data = '{"locationType":"POSTAL_CODE_WITH_CITY","zipCode":"2170","city":"WARWICK FARM","deviceType":"web","storeContext":"generic","pageType":"Gateway","actionSource":"glow"}'
        else:
            zip_code = self.site_dict.get(self.site)
            if zip_code is None:
                # Sites without a configured zip code used to post the literal
                # text "None" as zip code.
                logging.error(f"no zip code configured for site {self.site}, address change skipped")
                return
            data = '{"locationType":"LOCATION_INPUT","zipCode":"%s","storeContext":"generic","deviceType":"web","pageType":"Gateway","actionSource":"glow"}' % zip_code

        url = f'https://{host}/portal-migration/hz/glow/address-change?actionSource=glow'
        yield scrapy.Request(url, headers=headers, body=data, callback=self.parse_zip, cookies=cookies, meta=response.meta, dont_filter=True, method="POST")

    def parse_zip(self, response):
        """Validate the address response and store the session cookies.

        For ``tr``, ``be``, ``nl``, ``pl`` and ``se`` a 200 status is enough.
        For every other site the JSON body must contain an ``address`` with a
        ``countryCode``, and the language cookie is added before storing.
        Cookies are stored only when ``session-id``, ``session-id-time`` and a
        ``ubid`` cookie are all present.
        """
        cookies = response.meta.get('ck') or {}

        if self.site in ('tr', 'be', 'nl', 'pl', 'se'):
            if response.status != 200:
                logging.info("请求修改cookies地址接口失败")
                return
            if self._has_session_cookies(cookies):
                self._save_cookies(cookies)
            else:
                logging.info(f"incomplete session cookies, nothing stored: {cookies}")
            return

        try:
            items = response.json()
        except ValueError:
            # The body is not JSON (for instance an HTML error page).
            logging.info("请求修改cookies地址接口失败")
            return
        address = items.get('address')
        if not address:
            logging.info("请求修改cookies地址接口失败")
            # yield get_retry_request(response.request, reason="请求失败", spider=self.crawler.spider)
            return
        if not address.get('countryCode'):
            logging.info(f"address without countryCode, nothing stored: {address}")
            return
        if not self._has_session_cookies(cookies):
            logging.info(f"incomplete session cookies, nothing stored: {cookies}")
            return

        # Add the language cookie of the marketplace before storing.
        if self.site == "us":
            cookies["lc-main"] = 'en_US'
        else:
            cookies[f"lc-acb{self.site}"] = self.country_lc_main.get(f"lc-acb{self.site}")
        self._save_cookies(cookies)

    def _has_session_cookies(self, cookies):
        """Return True when the session-id, session-id-time and ubid cookies are set."""
        ubid = cookies.get("ubid-main") or cookies.get(f"ubid-acb{self.site}")
        return bool(cookies.get("session-id") and cookies.get("session-id-time") and ubid)

    def _save_cookies(self, cookies, max_attempts=5, retry_delay=1.0):
        """Write one cookie set to MySQL, retrying a bounded number of times.

        The original retried in an endless loop without any pause, so a
        permanent database error would spin forever.

        Args:
            cookies: Cookie name/value mapping, stored as a JSON string.
            max_attempts: Maximum number of insert attempts.
            retry_delay: Seconds to wait between two attempts.

        Returns:
            bool: True when the row was written, False when every attempt failed.
        """
        c = json.dumps(cookies)
        df = pd.DataFrame([{"cookies": c, 'site': self.site}])
        if self.site in ('us', 'uk', 'fr', 'es', 'it', 'de'):
            # These sites have their own table, which takes no `site` column.
            title_table = selection_table_name.get(f"{self.site}_comment_cookies_table")
            del df['site']
        else:
            title_table = 'other_site_cookies'
        logging.info(f"开始入库 {title_table} {c}")

        # Retry for proxy-server compatibility (per the original comment).
        for attempt in range(1, max_attempts + 1):
            try:
                df_to_sql(title_table, df, site=self.site, db="mysql")
            except OperationalError as e:
                logging.info(f"cookies入库失败  连接错误{e}")
            except FunctionTimedOut as e:
                logging.info(f"cookies入库-超时{e}--")
            else:
                logging.info(f"cookies入库成功-----")
                return True
            if attempt < max_attempts:
                # Short pause instead of hammering the database in a tight loop.
                time.sleep(retry_delay)
        logging.error(f"giving up storing cookies in {title_table} after {max_attempts} attempts")
        return False


if __name__ == '__main__':
    # Manual entry point: run this spider through the Scrapy command line.
    command = 'scrapy crawl get_cookie_amazon -a site=pl'
    cmdline.execute(command.split())

# "lc-acbae": 'ae_AE',
# "lc-acbau": 'au_AU',
# "lc-acbtr": 'tr_TR',
# "lc-acbbe": 'be_BE',
# "lc-acbjp": 'jp_JP',
# "lc-acbnl": 'nl_NL',
# "lc-acbpl": 'pl_PL',
# "lc-acbse": 'se_SE',
# nohup scrapy crawl get_cookie_amazon -a site=us  > amazon_us_cookies1.log 2>&1 &
# nohup scrapy crawl get_cookie_amazon  > amazon_cookies2.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl get_cookie_amazon" |awk '{print $2}' `; do kill -9 $i ; done;

# 0 0 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl get_cookie_amazon  > amazon_cookies1.log 2>&1 &

