# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import re
import sys
import uuid
import json
import time
import httpx
import random
import string
import logging
from scrapy import signals, Selector
from scrapy.http.headers import Headers
from scrapy.exceptions import IgnoreRequest
from scrapy.http import HtmlResponse as Response
from twisted.internet.threads import deferToThread
from scrapy.downloadermiddlewares.retry import RetryMiddleware, get_retry_request
# Put the directory two levels above sys.path[0] on the module search path.
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))


class AmazonSpiderSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook forwards what it receives without modification; the only
    side effect is an info log line when a spider is opened.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the requests/items produced by the spider, unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Take no action on spider exceptions (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests, unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)


class AmazonSpiderDownloaderMiddleware:
    """Pass-through downloader middleware.

    Requests, responses and exceptions are left untouched; the only side
    effect is an info log line when a spider is opened.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Let the request continue through the chain (returns None)."""
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Take no action on download exceptions (returns None)."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)


def get_proxy():
    """Return the proxy mapping for the default tunnel.

    Returns:
        dict: ``{"http": url, "https": url}``; both entries hold the same
        ``http://user:password@host:port/`` URL.
    """
    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from settings or the environment instead.
    tunnel = "i633.kdltps.com:15818"
    # Username/password authentication.
    username = 't17780866032960'
    password = '57b65ww2'
    proxy_url = f"http://{username}:{password}@{tunnel}/"
    return {"http": proxy_url, "https": proxy_url}


class ProxyMiddleware:
    """Downloader middleware that applies the ``get_proxy()`` tunnel on demand."""

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` when ``amazon_proxy`` is truthy in meta.

        Always returns None, so the request continues through the chain.
        """
        if not request.meta.get('amazon_proxy'):
            return None
        logging.info(f'{request} set proxy')
        request.meta['proxy'] = get_proxy().get("http")
        return None


def get_hadoop10_proxy():
    """Return the proxy mapping used by ``Hadoop10ProxyMiddleware``.

    Returns:
        dict: ``{"http": url, "https": url}``; both entries hold the same
        ``http://user:password@host:port/`` URL.
    """
    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from settings or the environment instead.
    tunnel = "i537.kdltps.com:15818"
    # Username/password authentication.
    username = 't16208953323855'
    password = 'ib80eped'
    proxy_url = "http://{}:{}@{}/".format(username, password, tunnel)
    return dict(http=proxy_url, https=proxy_url)


def get_hadoop6_proxy():
    """Return the proxy mapping used by ``Hadoop6ProxyMiddleware``.

    Returns:
        dict: ``{"http": url, "https": url}``; both entries hold the same
        ``http://user:password@host:port/`` URL.
    """
    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from settings or the environment instead.
    tunnel = "tps387.kdlapi.com:15818"
    # Username/password authentication.
    username, password = 't14731073449338', 's9vlzpox'
    proxy_url = f"http://{username}:{password}@{tunnel}/"
    return {scheme: proxy_url for scheme in ("http", "https")}


class Hadoop10ProxyMiddleware:
    """Downloader middleware that applies the ``get_hadoop10_proxy()`` tunnel on demand."""

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` when ``amazon_proxy`` is truthy in meta.

        Always returns None, so the request continues through the chain.
        """
        wants_proxy = request.meta.get('amazon_proxy')
        if not wants_proxy:
            return None
        logging.info(f'{request} set proxy')
        request.meta['proxy'] = get_hadoop10_proxy().get("http")
        return None


class Hadoop6ProxyMiddleware:
    """Downloader middleware that applies the ``get_hadoop6_proxy()`` tunnel on demand."""

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` when ``amazon_proxy`` is truthy in meta.

        Always returns None, so the request continues through the chain.
        """
        request_meta = request.meta
        if request_meta.get('amazon_proxy'):
            logging.info(f'{request} set proxy')
            proxy_url = get_hadoop6_proxy().get("http")
            request_meta['proxy'] = proxy_url
        return None


class RandomUserAgentMiddleware(object):
    """Downloader middleware that stamps a random Chrome User-Agent on each request.

    When ``request.meta['choice_header']`` is truthy, an additional header
    with a random lowercase name and a UUID4 value is attached as well.
    """

    def get_ua(self):
        """Build a random Chrome-style User-Agent string.

        The version numbers are random, so the result is not necessarily
        accepted by the website.

        Returns:
            str: e.g. ``Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36
            (KHTML, like Gecko) Chrome/58.0.1234.56 Safari/537.36``.
        """
        first_num = random.randint(55, 62)
        third_num = random.randint(0, 3200)
        fourth_num = random.randint(0, 140)
        os_type = [
            '(Windows NT 6.1; WOW64)',
            '(Windows NT 10.0; WOW64)',
            '(X11; Linux x86_64)',
            '(Macintosh; Intel Mac OS X 10_12_6)'
        ]
        chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)
        return ' '.join([
            'Mozilla/5.0',
            random.choice(os_type),
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            chrome_version,
            'Safari/537.36',
        ])

    def process_request(self, request, spider):
        """Set a random User-Agent and, if requested, a random extra header.

        Args:
            request: request whose ``headers`` mapping is updated in place.
            spider: unused.

        Returns:
            None, so the request continues through the middleware chain.
        """
        request.headers['User-Agent'] = self.get_ua()
        if request.meta.get("choice_header"):
            # The previous loop iterated over the tuple ``(0, randint(0, 5))``
            # rather than a range, so the name was always exactly two
            # characters long. Draw a real random length; the lower bound of
            # 1 guarantees the header name is never empty.
            name_length = random.randint(1, 5)
            header_name = ''.join(random.choices(string.ascii_lowercase, k=name_length))
            request.headers[header_name] = str(uuid.uuid4())
        return None


class UpdateCookiesUrl(object):
    """Downloader middleware that validates review pages off the reactor thread.

    Flagged responses are inspected in a worker thread. Pages that look
    blocked are either dropped (after bumping the cookie's error counter on
    ``spider.log_cookie_df``) or re-issued with fresh cookies.
    """

    def if_response(self, response):
        """Tell whether the review page looks abnormal.

        Some abnormal pages display "Sorry, no reviews match your current
        selections."; a page may also announce 11 reviews while only 10 can
        be scraped (asin: B0BV5W3V6H).

        Args:
            response: response exposing ``text`` and ``meta``.

        Returns:
            bool: True when the page is considered abnormal, False otherwise.
        """
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace('\\"', '"')
        sel = Selector(text=r, type="html")
        comment_num_data = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()
        logging.info(f"页面判断 {comment_num_data}")
        if not comment_num_data:
            logging.info("未解析到评论数 页面异常")
            return True

        cleaned = re.sub(r"\\u.{4}", '', repr(comment_num_data))
        cleaned = cleaned.replace(",", "").replace(".", "").replace("'", "")
        numbers = re.findall(r'(\d+)', cleaned)
        if len(numbers) < 2:
            # The second number is the review count. Without it the count
            # cannot be determined, which is handled like the "count not
            # parsed" case instead of raising IndexError.
            logging.info("未解析到评论数 页面异常")
            return True

        comment_num = numbers[1]
        comment_data_len = len(sel.xpath("//li[@data-hook='review']"))
        # A non-zero review count with no review nodes means the page was
        # served by the anti-crawling layer.
        if (comment_num != "0") and comment_data_len:
            logging.info("评论数不为0，解析到评论 页面正常")
            return False
        if comment_num == "0":
            logging.info("评论数为0 页面正常")
            return False
        if int(response.meta.get('pageNumber', 0)) == response.meta.get('page_count', 0) and int(
                response.meta.get('comment_num', 0)) % 10 == 1:
            logging.info(f"最后一页  为解析到评论")
            return False
        logging.info(f"评论数不为0，未解析到评论 页面异常")
        return True

    def process_response(self, request, response, spider):
        """Validate flagged responses in a worker thread; pass others through.

        Returns:
            A Deferred firing with the result of ``down`` when the request is
            flagged, otherwise the response itself.
        """
        # NOTE(review): the flag is named 'SelfAllUpdateCookiesUrl' although
        # this class is UpdateCookiesUrl - confirm this is intentional.
        if request.meta.get('SelfAllUpdateCookiesUrl'):
            return deferToThread(self.down, request, response, spider)
        logging.info("SelfAllUpdateCookiesUrl 不用处理")
        return response

    def down(self, request, response, spider):
        """Return the response, a retry request, or raise ``IgnoreRequest``.

        Args:
            request: the request that produced ``response``.
            response: the downloaded response.
            spider: spider providing stats, cookie helpers and
                ``log_cookie_df``.

        Returns:
            ``response`` when the page is healthy; otherwise ``request``
            rebuilt with new cookies, headers and meta.

        Raises:
            IgnoreRequest: for status 404/503, or status 200 with an
                abnormal page, after incrementing the cookie error counter.
        """
        spider.crawler.stats.inc_value("up_requests_num")
        spider_utiles = spider.r_utils(request)
        # Evaluate the page once; the result was previously computed twice,
        # parsing the body and emitting the log lines a second time.
        page_abnormal = self.if_response(response)
        if not page_abnormal and response.status == 200:
            logging.info(f"request asin:{request.meta.get('asin')} page: {request.meta.get('pageNumber')} star {request.meta.get('star')} spider utiles {spider_utiles}")
            return response

        logging.info(f"request error asin:{request.meta.get('asin')} page: {request.meta.get('pageNumber')} star {request.meta.get('star')} spider utiles {spider_utiles}")
        headers = Headers(request.headers or {}, encoding='utf-8').to_unicode_dict()
        logging.info(f"{response.status} {headers}, {request.cookies}")
        if (response.status in [404, 503]) or (response.status == 200 and page_abnormal):
            x_main = request.cookies.get(spider.log_cookie_name[request.meta.get('site')])
            logging.info(f"middlewares 404 or 503 main {x_main}")
            if response.status == 200:
                logging.info(f"response ---> {response.text}")
            # Bump the error counter of the cookie used by this request.
            error_404_count = list(spider.log_cookie_df[spider.log_cookie_df['main'] == x_main]['error_404_count'])[0]
            spider.log_cookie_df.loc[((spider.log_cookie_df['main'] == x_main)), 'error_404_count'] = error_404_count + 1
            raise IgnoreRequest

        # Any other status: rebuild the request with fresh cookies and retry.
        if headers.get("cookie"):
            del headers["cookie"]
        cookies = spider.json_cookies(request.meta.get('site'))
        cookies = spider.update_site_cookie(cookies, request.meta.get('site'))
        x_main = spider.log_cookie_df[spider.log_cookie_df['status'] == 1].iloc[0].values[0]
        cookies.update({spider.log_cookie_name.get(request.meta.get('site'), 'us'): x_main})
        request._meta = spider.random_r(request.meta)
        request.headers = Headers(headers, encoding='utf-8')
        request.cookies = cookies
        return request


class SelfAllUpdateCookiesUrl(object):
    """Downloader middleware that validates review pages off the reactor thread.

    Flagged responses are inspected in a worker thread. Pages that look
    blocked are either dropped (after bumping the cookie's error counter on
    ``spider.log_cookie_df``) or re-issued with fresh cookies.
    """

    def if_response(self, response, request):
        """Tell whether the review page looks abnormal.

        Some abnormal pages display "Sorry, no reviews match your current
        selections."; a page may also announce 11 reviews while only 10 can
        be scraped (asin: B0BV5W3V6H).

        Args:
            response: response exposing ``text``.
            request: request whose ``meta`` carries ``pageNumber``,
                ``page_count``, ``comment_num``, ``asin`` and ``star``.

        Returns:
            bool: True when the page is considered abnormal, False otherwise.
        """
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace('\\"', '"')
        sel = Selector(text=r, type="html")
        raw_text = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()
        if raw_text:
            cleaned = re.sub(r"\\u.{4}", '', repr(raw_text))
            cleaned = cleaned.replace(",", "").replace(".", "").replace("'", "")
        else:
            cleaned = ''
        # The former second normalisation pass ran on the repr() of this
        # list and always reproduced the same digits, so it was dropped.
        comment_num_data = re.findall(r'(\d+)', cleaned)
        logging.info(f"页面判断 {comment_num_data}")
        if len(comment_num_data) < 2:
            # The second number is the review count. Without it the count
            # cannot be determined, which is handled like the "count not
            # parsed" case instead of raising IndexError.
            logging.info("未解析到评论数 页面异常")
            return True

        comment_num = comment_num_data[1]
        comment_data_len = len(sel.xpath("//li[@data-hook='review']"))
        # A non-zero review count with no review nodes means the page was
        # served by the anti-crawling layer.
        if (comment_num != "0") and comment_data_len:
            logging.info("评论数不为0，解析到评论 页面正常")
            return False
        if comment_num == "0":
            logging.info("评论数为0 页面正常")
            return False
        if int(request.meta.get('pageNumber', 0)) == request.meta.get('page_count', 0) and int(
                request.meta.get('comment_num', 0)) % 10 == 1 and (not comment_data_len):
            logging.info(f"最后一页 {request.meta.get('asin')} {request.meta.get('star')} 显示有数据 未解析到评论  参考asin B0BV5W3V6H")
            return False
        if int(request.meta.get('pageNumber', 0)) == 1 and int(comment_num) == 1:
            # First page announces one review but actually contains none.
            logging.info(f"第一页 {request.meta.get('asin')} {request.meta.get('star')} 显示有数据 未解析到评论 参考asin B0C89ZFFSF")
            return False
        logging.info(f"评论数不为0，未解析到评论 页面异常")
        return True

    def process_response(self, request, response, spider):
        """Validate flagged responses in a worker thread; pass others through.

        Returns:
            A Deferred firing with the result of ``down`` when
            ``request.meta['SelfAllUpdateCookiesUrl']`` is truthy, otherwise
            the response itself.
        """
        if request.meta.get('SelfAllUpdateCookiesUrl'):
            return deferToThread(self.down, request, response, spider)
        logging.info("SelfAllUpdateCookiesUrl 不用处理")
        return response

    def down(self, request, response, spider):
        """Return the response, a retry request, or raise ``IgnoreRequest``.

        Args:
            request: the request that produced ``response``.
            response: the downloaded response.
            spider: spider providing stats, cookie helpers and
                ``log_cookie_df``.

        Returns:
            ``response`` when the page is healthy; otherwise ``request``
            rebuilt with new cookies, headers and meta.

        Raises:
            IgnoreRequest: for status 404/503/403, or status 200 with an
                abnormal page, after incrementing the cookie error counter.
        """
        spider.crawler.stats.inc_value("up_requests_num")
        spider_utiles = spider.r_utils(request)
        # Evaluate the page once; the result was previously computed twice,
        # parsing the body and emitting the log lines a second time.
        page_abnormal = self.if_response(response, request)
        if not page_abnormal and response.status == 200:
            logging.info(f"request asin:{request.meta.get('asin')} page: {request.meta.get('pageNumber')} star {request.meta.get('star')} spider utiles {spider_utiles}")
            return response

        logging.info(f"request error asin:{request.meta.get('asin')} page: {request.meta.get('pageNumber')} star {request.meta.get('star')} spider utiles {spider_utiles}")
        headers = Headers(request.headers or {}, encoding='utf-8').to_unicode_dict()
        logging.info(f"{response.status} {headers}, {request.cookies}")
        if (response.status in [404, 503, 403]) or (response.status == 200 and page_abnormal):
            x_main = request.cookies.get(spider.log_cookie_name[request.meta.get('site')])
            logging.info(f"middlewares 404 or 503 main {x_main}")
            if response.status == 200:
                logging.info(f"response ---> {response.text}")
            # Bump the error counter of the cookie used by this request.
            error_404_count = list(spider.log_cookie_df[spider.log_cookie_df['main'] == x_main]['error_404_count'])[0]
            spider.log_cookie_df.loc[((spider.log_cookie_df['main'] == x_main)), 'error_404_count'] = error_404_count + 1
            raise IgnoreRequest

        # Any other status: rebuild the request with fresh cookies and retry.
        if headers.get("cookie"):
            del headers["cookie"]
        cookies = spider.json_cookies(request.meta.get('site'))
        cookies = spider.update_site_cookie(cookies, request.meta.get('site'))
        x_main = spider.log_cookie_df[spider.log_cookie_df['status'] == 1].iloc[0].values[0]
        cookies.update({spider.log_cookie_name.get(request.meta.get('site'), 'us'): x_main})
        request._meta = spider.random_r(request.meta)
        request.headers = Headers(headers, encoding='utf-8')
        request.cookies = cookies
        return request


class Http2Middleware:
    """Downloader middleware that fetches flagged requests over HTTP/2 via httpx.

    Requests carrying a truthy ``meta['Http2']`` are downloaded in a worker
    thread with ``httpx`` and turned into an ``HtmlResponse``; all other
    requests continue through the regular download path.
    """

    def __init__(self, delay=0):
        """Store the base delay used to throttle HTTP/2 downloads.

        Args:
            delay: base delay; each download sleeps for 75%-125% of it.
        """
        self.delay = delay

    def process_request(self, request, spider):
        """Dispatch HTTP/2 requests to a worker thread.

        Returns:
            None for requests without the ``Http2`` flag, otherwise a
            Deferred firing with the response built by ``down``.
        """
        if not request.meta.get('Http2'):
            # ``down`` would return None anyway; skip the thread-pool hop.
            return None
        return deferToThread(self.down, request)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``DOWNLOAD_DELAY`` setting."""
        return cls(crawler.settings.get('DOWNLOAD_DELAY', 0))

    def down(self, request):
        """Download ``request`` with an HTTP/2 httpx client (blocking).

        Args:
            request: request exposing ``url``, ``headers``, ``cookies``,
                ``encoding`` and ``meta`` (``proxy``, ``download_timeout``,
                ``allow_redirects``).

        Returns:
            The built response, or None when ``meta['Http2']`` is falsy.
        """
        if not request.meta.get('Http2'):
            return None

        proxy = request.meta.get('proxy')
        proxies = {'http://': proxy, 'https://': proxy}
        logging.getLogger().info("Start to set http2")

        headers = Headers(request.headers or {}, encoding='utf-8').to_unicode_dict()
        # Use the client as a context manager so its connection pool is
        # released; a new client was previously created and leaked on
        # every request.
        with httpx.Client(http2=True, proxies=proxies, verify=False,
                          headers=headers) as client:
            r = client.get(request.url, timeout=request.meta.get('download_timeout', 15),
                           allow_redirects=request.meta.get('allow_redirects'), cookies=request.cookies)
        # NOTE(review): the response is built with the *request* headers,
        # not ``r.headers`` - confirm this is intentional.
        response = Response(url=str(r.url), status=r.status_code, body=r.content,
                            encoding=request.encoding, request=request, headers=request.headers)
        # Sleep in this worker thread for 75%-125% of the configured delay.
        time.sleep(self.delay * random.randint(75, 125) / 100)
        return response


class CookiesZip(RetryMiddleware):
    """Downloader middleware that checks product ("/dp/") pages.

    A product page is delivered to the spider when the delivery location
    shown on it matches the expected country, or when it is a recognised
    "special" page (not found, error, redirect). Otherwise the request is
    retried, with fresh cookies where the original code refreshed them.
    """

    # Captcha prompts in the supported locales.
    _CAPTCHA_MARKERS = (
        "Enter the characters you see below",
        "Introduce los caracteres que se muestran",
        "Saisissez les caractères que vous voyez",
        "Inserisci i caratteri visualizzati nello spazio",
    )
    # Body phrases identifying pages that are handed to the spider as-is.
    # ("Lo sentimos! No pudimos encontrar la página" was listed as well but
    # is already matched by its shorter prefix below.)
    _SPECIAL_PAGE_MARKERS = (
        "Die eingegebene Webadresse ist keine",
        "The Web address you entered is not a functioning",
        "saisie n'est pas une page fonctionnelle",
        "web inserito non è una pagina funzionante",
        "web que has especificado no es",
        "Page Not Found",
        "We are sorry! This Gift Card is not available",
        "500 - An error occurred",
        "Lo sentimos! No pudimos encontrar la",
        "Onze excuses. Het webadres dat je hebt",
        "Üzgünüz. Girdiğiniz web adresi",
        "Przepraszamy. Wyszukiwana",
        "Vi ber om ursäkt. Webbadressen",
    )
    # Additional phrase checked only when no delivery location was found.
    _NO_ZIP_EXTRA_MARKERS = ("Vous cherchez quelque chose ?",)
    _VIEWPORT_XPATH = "//meta[@name='viewport' and @content!='width=1236']"

    def if_response(self, response):
        """Return True when the body contains one of the captcha prompts."""
        for marker in self._CAPTCHA_MARKERS:
            if marker in response.text:
                logging.info(f" 站点  +   使用代理ip出现验证码：{response.text}")
                return True
        return False

    def process_response(self, request, response, spider):
        """Run ``down`` in a worker thread and return its Deferred."""
        return deferToThread(self.down, request, response, spider)

    def down(self, request, response, spider):
        """Decide whether to deliver the response or retry the request.

        Args:
            request: the request that produced ``response``.
            response: the downloaded response.
            spider: spider providing ``crawler.stats``, ``r_utils``,
                ``country``, ``site`` and ``json_cookies``.

        Returns:
            ``response`` when it may be delivered, otherwise a retry request.

        Raises:
            IgnoreRequest: when a retry is needed but retries are exhausted.
        """
        url = response.url
        if ".baidu.com" in url:
            request.meta["handle_httpstatus_all"] = True
            return response
        if "www.amazon.com/gp/product/ajax" in url:
            request.meta["handle_httpstatus_all"] = True
            logging.info("更买页url")
            return response
        if "/dp/" not in url:
            return response

        stats = spider.crawler.stats
        spider_utiles = spider.r_utils(request)
        zip_code = response.xpath(".//span[@class='nav-line-2 nav-progressive-content']//text()").get() or response.xpath("(.//div[@id='contextualIngressPtLabel_deliveryShortLine']/span)[2]/text()").get()
        if zip_code:
            return self._handle_with_zip(request, response, spider, stats, spider_utiles, zip_code)
        return self._handle_without_zip(request, response, spider, stats, spider_utiles)

    def _handle_with_zip(self, request, response, spider, stats, spider_utiles, zip_code):
        """Handle a product page on which a delivery location was found."""
        suspicious = (
            self.if_response(response)
            or response.status == 404
            or (spider.country.get(request.meta.get("site") or spider.site) not in zip_code)
            or response.xpath(self._VIEWPORT_XPATH).getall()
        )
        if not suspicious:
            reason = f"cookie邮编正确 {spider_utiles} {zip_code}"
            stats.inc_value(reason)
            logging.info(reason)
            return response

        # Body contains one of the phrases: special page.
        if self._has_marker(response, self._SPECIAL_PAGE_MARKERS):
            return self._pass_special_page(request, response, stats, spider_utiles)
        # URL looks like a redirect away from the requested product: special
        # page. (The old '"dp/" not in response.url' test was dropped: it can
        # never hold here because "/dp/" is known to be in the URL.)
        url = response.url
        if ("keywords" in url) or ("ref=" in url and "encoding=" in url) or (
                request.meta.get("asin") not in url):
            return self._pass_special_page(request, response, stats, spider_utiles)
        # Neither product data nor a captcha field: treated as a normal page.
        if self._is_blank_ok_page(response):
            return self._pass_special_page(request, response, stats, spider_utiles)

        if response.xpath(self._VIEWPORT_XPATH).getall():
            reason = f"asin 格式不支持 {spider_utiles}"
        else:
            reason = f"cookie邮编错误 {spider_utiles}"
        request.cookies = spider.json_cookies(request.meta.get("site"))
        return self._retry_or_ignore(request, reason, spider)

    def _handle_without_zip(self, request, response, spider, stats, spider_utiles):
        """Handle a product page on which no delivery location was found."""
        markers = self._SPECIAL_PAGE_MARKERS + self._NO_ZIP_EXTRA_MARKERS
        if self._has_marker(response, markers) or self._is_blank_ok_page(response):
            return self._pass_special_page(request, response, stats, spider_utiles)

        if response.xpath(".//input[@name='amzn']/@value"):
            # Captcha form present: retry with fresh cookies.
            reason = f"验证码重试 {spider_utiles}"
            request.cookies = spider.json_cookies(request.meta.get("site"))
        else:
            reason = f"未知原因 {spider_utiles}"
        return self._retry_or_ignore(request, reason, spider)

    @staticmethod
    def _has_marker(response, markers):
        """Return True when any of ``markers`` occurs in the response body."""
        return any(marker in response.text for marker in markers)

    @staticmethod
    def _is_blank_ok_page(response):
        """Return True for a 200 page with neither a #dp div nor a captcha input."""
        return (not response.xpath("//div[@id='dp']").getall()) and (
            not response.xpath(".//input[@name='amzn']/@value")) and response.status == 200

    @staticmethod
    def _pass_special_page(request, response, stats, spider_utiles):
        """Record a special page in stats/logs and deliver the response."""
        request.meta["handle_httpstatus_all"] = True
        reason = f"特殊页面 {spider_utiles}"
        stats.inc_value(reason)
        logging.info(reason)
        return response

    @staticmethod
    def _retry_or_ignore(request, reason, spider):
        """Return a retry request, or raise ``IgnoreRequest`` when exhausted.

        ``get_retry_request`` returns None once the retry budget is used up;
        returning None from ``process_response`` is not allowed, so the
        request is dropped explicitly instead.
        """
        retry_request = get_retry_request(request, reason=reason, spider=spider)
        if retry_request is None:
            raise IgnoreRequest(reason)
        return retry_request


class SearchCookiesZip(RetryMiddleware):
    """Downloader middleware that validates search/detail responses against the cookie location.

    Each response is inspected off the reactor thread (see ``process_response``).
    Depending on what the page contains, the response is either handed on
    unchanged or the request is rescheduled with a different cookie set taken
    from ``spider.cookie_list``.
    """

    # Substrings that mark a captcha or error page (checked by ``if_response``).
    _BLOCKED_MARKERS = (
        "Enter the characters you see below",
        "Introduce los caracteres que se muestran",
        "Saisissez les caractères que vous voyez",
        "Inserisci i caratteri visualizzati nello spazio",
        "We are sorry! This Gift Card is not available",
        "500 - An error occurred",
        "Sorry! Something went wrong!",
        "Tut uns Leid!",
        "Page Not Found",
    )

    # Substrings that mark a "special" page: it is passed through to the
    # spider as-is instead of being retried.
    _SPECIAL_PAGE_MARKERS = (
        "Die eingegebene Webadresse ist keine",
        "The Web address you entered is not a functioning",
        "saisie n'est pas une page fonctionnelle",
        "web inserito non è una pagina funzionante",
        "web que has especificado no es",
        "Page Not Found",
        "We are sorry! This Gift Card is not available",
        "500 - An error occurred",
        "Lo sentimos! No pudimos encontrar la",
    )

    _VIEWPORT_XPATH = "//meta[@name='viewport' and @content!='width=1236']"
    _CAPTCHA_XPATH = ".//input[@name='amzn']/@value"

    def if_response(self, response):
        """Return True (and log the page body) if the response text contains a blocked-page marker."""
        text = response.text
        if any(marker in text for marker in self._BLOCKED_MARKERS):
            logging.info(f" 站点  +   使用代理ip出现验证码：{response.text}")
            return True
        return False

    def process_response(self, request, response, spider):
        """Run ``down`` in a worker thread and return the resulting Deferred."""
        return deferToThread(self.down, request, response, spider)

    def down(self, request, response, spider):
        """Decide whether to pass the response on or retry the request with new cookies.

        Args:
            request: The request that produced ``response``; its ``cookies`` and
                ``meta`` may be mutated before a retry.
            response: The downloaded response to inspect.
            spider: The running spider. ``spider.cookie_list``, ``spider.country``
                and ``spider.site`` are read here.

        Returns:
            The original response, or a retry copy of ``request``.

        Raises:
            IgnoreRequest: If a retry is wanted but ``get_retry_request`` reports
                that the retry budget is exhausted.
        """
        if ".baidu.com" in response.url:
            request.meta["handle_httpstatus_all"] = True
            return response

        zip_code = (
            response.xpath(".//span[@class='nav-line-2 nav-progressive-content']//text()").get()
            or response.xpath("(.//div[@id='contextualIngressPtLabel_deliveryShortLine']/span)[2]/text()").get()
        )

        if zip_code:
            unsupported_layout = response.xpath(self._VIEWPORT_XPATH).getall()
            # NOTE(review): spider.country presumably maps a site to the text
            # expected inside zip_code; a missing entry yields None and makes
            # the ``not in`` test raise TypeError - confirm against the spider.
            suspicious = (
                self.if_response(response)
                or response.status == 404
                or (spider.country.get(request.meta.get("site") or spider.site) not in zip_code)
                or bool(unsupported_layout)
            )
            if not suspicious:
                logging.info(f"cookie邮编正确 {zip_code}")
                return response

            # Pages containing these texts are special pages: hand them on untouched.
            if self._is_special_page(response):
                request.meta["handle_httpstatus_all"] = True
                return response

            url = response.url
            # NOTE(review): a request without meta["asin"] makes the last test
            # raise TypeError - confirm every request routed here sets it.
            if (
                "keywords" in url
                or "dp/" not in url
                or ("ref=" in url and "encoding=" in url)
                or request.meta.get("asin") not in url
            ):
                request.meta["handle_httpstatus_all"] = True
                return response

            if unsupported_layout:
                reason = f"asin 格式不支持 {request.meta['asin']}"
            else:
                reason = f"cookie邮编错误 {zip_code}"
            self._rotate_cookies(request, spider)
            time.sleep(2)  # runs in the worker thread, so the reactor is not blocked
            return self._retry_or_ignore(request, reason, spider)

        # No location text could be extracted from the page.
        if self._is_special_page(response):
            request.meta["handle_httpstatus_all"] = True
            return response

        if response.xpath(self._CAPTCHA_XPATH):
            reason = f"验证码重试 {zip_code}"
            self._rotate_cookies(request, spider)
        else:
            reason = f"未知原因{response.url}"
        return self._retry_or_ignore(request, reason, spider)

    def _is_special_page(self, response):
        """Return True if the response text contains any special-page marker."""
        text = response.text
        return any(marker in text for marker in self._SPECIAL_PAGE_MARKERS)

    @staticmethod
    def _rotate_cookies(request, spider):
        """Attach a randomly chosen cookie set from ``spider.cookie_list`` to the request."""
        entry = random.choice(spider.cookie_list)
        request.cookies = json.loads(entry[1])
        request.meta["cookie_id"] = entry[0]

    @staticmethod
    def _retry_or_ignore(request, reason, spider):
        """Return a retry request, or raise IgnoreRequest once retries are exhausted.

        ``get_retry_request`` returns None when the retry budget is used up, and
        None is not a valid ``process_response`` result, so the request is
        dropped explicitly instead.
        """
        retry_request = get_retry_request(request, reason=reason, spider=spider)
        if retry_request is None:
            raise IgnoreRequest(reason)
        return retry_request



class Feedback(RetryMiddleware):
    """Downloader middleware that validates feedback-page responses against the cookie location.

    Each response is inspected off the reactor thread (see ``process_response``).
    A response is passed on only when its location text matches the spider's
    site and no captcha or viewport marker is present; otherwise the request
    is rescheduled, usually with a different cookie set from ``spider.cookie_list``.
    """

    # Substrings that mark a captcha page (checked by ``if_response``).
    _CAPTCHA_MARKERS = (
        "Enter the characters you see below",
        "Introduce los caracteres que se muestran",
        "Saisissez les caractères que vous voyez",
        "Inserisci i caratteri visualizzati nello spazio",
    )

    _VIEWPORT_XPATH = "//meta[@name='viewport']"
    _CAPTCHA_XPATH = ".//input[@name='amzn']/@value"

    def if_response(self, response):
        """Return True (and log the page body) if the response text contains a captcha marker."""
        text = response.text
        if any(marker in text for marker in self._CAPTCHA_MARKERS):
            logging.info(f" 站点  +   使用代理ip出现验证码：{response.text}")
            return True
        return False

    def process_response(self, request, response, spider):
        """Run ``down`` in a worker thread and return the resulting Deferred."""
        return deferToThread(self.down, request, response, spider)

    def down(self, request, response, spider):
        """Decide whether to pass the response on or retry the request.

        Args:
            request: The request that produced ``response``; its ``cookies`` and
                ``meta`` may be mutated before a retry.
            response: The downloaded response to inspect.
            spider: The running spider. ``spider.cookie_list``, ``spider.country``
                and ``spider.site`` are read here.

        Returns:
            The original response, or a retry copy of ``request``.

        Raises:
            IgnoreRequest: If a retry is wanted but ``get_retry_request`` reports
                that the retry budget is exhausted.
        """
        zip_code = (
            response.xpath(".//span[@class='nav-line-2 nav-progressive-content']//text()").get()
            or response.xpath("(.//div[@id='contextualIngressPtLabel_deliveryShortLine']/span)[2]/text()").get()
        )
        # Evaluated once; the same lookup drives both the retry decision and the reason text.
        viewport_meta = response.xpath(self._VIEWPORT_XPATH).getall()

        if zip_code:
            # NOTE(review): spider.country presumably maps a site to the text
            # expected inside zip_code; a missing entry yields None and makes
            # the ``not in`` test raise TypeError - confirm against the spider.
            suspicious = (
                self.if_response(response)
                or response.status == 404
                or (spider.country.get(spider.site) not in zip_code)
                or bool(viewport_meta)
            )
            if not suspicious:
                logging.info(f"cookie邮编正确 {zip_code}   {request.meta['cookie_id']}")
                return response

            if viewport_meta:
                reason = f"account_name 格式不支持 {request.meta.get('account_name')}"
            else:
                reason = f"cookie邮编错误 {zip_code}"
            self._rotate_cookies(request, spider)
            time.sleep(2)  # runs in the worker thread, so the reactor is not blocked
            return self._retry_or_ignore(request, reason, spider)

        # No location text could be extracted from the page.
        if response.xpath(self._CAPTCHA_XPATH).get() or viewport_meta:
            if viewport_meta:
                reason = f"account_name 格式不支持 {request.meta.get('account_name')}"
            else:
                reason = f"验证码重试 {zip_code}"
            self._rotate_cookies(request, spider)
        else:
            reason = f"未知原因 {response.url}"
        return self._retry_or_ignore(request, reason, spider)

    @staticmethod
    def _rotate_cookies(request, spider):
        """Attach a randomly chosen cookie set from ``spider.cookie_list`` to the request."""
        entry = random.choice(spider.cookie_list)
        request.cookies = json.loads(entry[1])
        request.meta["cookie_id"] = entry[0]

    @staticmethod
    def _retry_or_ignore(request, reason, spider):
        """Return a retry request, or raise IgnoreRequest once retries are exhausted.

        ``get_retry_request`` returns None when the retry budget is used up, and
        None is not a valid ``process_response`` result, so the request is
        dropped explicitly instead.
        """
        retry_request = get_retry_request(request, reason=reason, spider=spider)
        if retry_request is None:
            raise IgnoreRequest(reason)
        return retry_request


# class FilterMiddleware():
#     # On a successful crawl, add the request fingerprint to redis for de-duplication
#     def __init__(self, server, key, debug, bit, hash_number):
#         """Initialize the duplicates filter.
#
#         Parameters
#         ----------
#         server : redis.StrictRedis
#             The redis server instance.
#         key : str
#             Redis key Where to store fingerprints.
#         debug : bool, optional
#             Whether to log filtered requests.
#
#         """
#         self.server = server
#         self.key = key
#         self.debug = debug
#         self.bit = bit
#         self.hash_number = hash_number
#         self.logdupes = True
#         self.bf = BloomFilter(server, self.key, bit, hash_number)
#
#     @classmethod
#     def from_settings(cls, settings):
#         """Returns an instance from given settings.
#
#         This uses by default the key ``dupefilter:<timestamp>``. When using the
#         ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
#         it needs to pass the spider name in the key.
#
#         Parameters
#         ----------
#         settings : scrapy.settings.Settings
#
#         Returns
#         -------
#         RFPDupeFilter
#             A RFPDupeFilter instance.
#
#
#         """
#         server = get_redis_from_settings(settings)
#         # XXX: This creates one-time key. needed to support to use this
#         # class as standalone dupefilter with scrapy's default scheduler
#         # if scrapy passes spider on open() method this wouldn't be needed
#         # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
#         key = defaults.SCHEDULER_DUPEFILTER_KEY % {'spider': settings.get("name")}
#         debug = settings.getbool('DUPEFILTER_DEBUG', DUPEFILTER_DEBUG)
#         bit = settings.getint('BLOOMFILTER_BIT', BLOOMFILTER_BIT)
#         hash_number = settings.getint('BLOOMFILTER_HASH_NUMBER', BLOOMFILTER_HASH_NUMBER)
#         return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number)
#
#     def process_response(self, request, response, spider):
#         return deferToThread(self.down, request, response, spider)
#
#     def down(self, request, response, spider):
#         if response.status == 200:
#             re_msg = request_fingerprint(request)
#             self.bf.insert(re_msg)
#             return response
#         else:
#             return response







