import json
import time
import random
import scrapy
import logging
import os, sys
from urllib.parse import urlparse
from scrapy import signals
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie


class SourceSpider(scrapy.Spider):
    name = 'source_spider'

    def __init__(self, site=None, **kwargs):
        super(SourceSpider, self).__init__()
        self.site = None
        self.cookie_list = None
        if site == 'us':
            self.url_ = 'https://www.amazon.com'
        elif site == 'de':
            self.url_ = "https://www.amazon.de"
        elif site == 'uk':
            self.url_ = "https://www.amazon.co.uk"
        elif site == 'it':
            self.url_ = "https://www.amazon.it"
        elif site == 'es':
            self.url_ = "https://www.amazon.es"
        elif site == 'fr':
            self.url_ = "https://www.amazon.fr"
        elif site == 'mx':
            self.url_ = "https://www.amazon.com.mx"

        elif site == 'ae':
            self.url_ = "https://www.amazon.ae"
        elif site == 'au':
            self.url_ = "https://www.amazon.com.au"
        elif site == 'tr':
            self.url_ = "https://www.amazon.com.tr"

        elif site == 'be':
            self.url_ = "https://www.amazon.com.be"
        elif site == 'jp':
            self.url_ = "https://www.amazon.jp"
        elif site == 'nl':
            self.url_ = "https://www.amazon.nl"

        elif site == 'pl':
            self.url_ = "https://www.amazon.pl"
        elif site == 'se':
            self.url_ = "https://www.amazon.se"

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        self.h_dict = {
            'rtt': '100',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '1',
            'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '^\\^Windows^\\^',
            'sec-ch-viewport-width': '1587',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'viewport-width': '1587',
            'device-memory': '8',
            'downlink': '10',
            'dpr': '1',
            'ect': '4g',
        }
        self.country_lc_main = {
            "lc_main": 'en_US',
            "lc-acbde": 'de_DE',
            "lc-acbuk": 'en_GB',
            "lc-acbit": 'it_IT',
            "lc-acbes": 'es_ES',
            "lc-acbfr": 'fr_FR',
            "lc-acbmx": 'es_MX',
            "lc-acbca": 'en_CA',
            "lc-acbae": 'en_AED',
        }
        self.site_url = {
            "us": 'https://www.amazon.com',
            'de': "https://www.amazon.de",
            "uk": "https://www.amazon.co.uk",
            "it": "https://www.amazon.it",
            "es": "https://www.amazon.es",
            "fr": "https://www.amazon.fr",
            "mx": "https://www.amazon.com.mx",
            "ca": "https://www.amazon.ca",

            "ae": "https://www.amazon.ae",
            "au": "https://www.amazon.com.au",
            "tr": "https://www.amazon.com.tr",
            "be": "https://www.amazon.com.be",
            "jp": "https://www.amazon.co.jp",
            "nl": "https://www.amazon.nl",
            "pl": "https://www.amazon.pl",
            "se": "https://www.amazon.se",
        }
        self.country = {
            "us": '10010',
            "de": '10115',
            "uk": 'London W1S',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607',
            "ca": 'M5B 2H',
            "ae": 'Ab..., Al R...',
            "au": '2170',
            "tr": "Konumu",
            "be": "Mettre",
            "jp": "540-0002",
            "nl": "bijwerken",
            "pl": "Zaktualizuj",
            "se": "Uppdatera",
        }
        self.comment_headers = None
        self.header_error_count = 0
        self.utils_requests = [{"use_aiohttp": True}, {"curlcffi": True}, {}, {"use_httpx": True}]
        # self.utils_requests = [{"curlcffi": True}]


    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through Scrapy and hook up its signal handlers.

        The spider's ``close`` is connected to ``spider_closed`` and its
        ``spider_idle`` to the ``spider_idle`` signal. ``spider_idle`` is not
        defined in the visible part of this class, so it is expected to come
        from a subclass.

        Returns:
            The spider instance built by the parent ``from_crawler``.
        """
        instance = super(SourceSpider, cls).from_crawler(crawler, *args, **kwargs)
        handler_signal_pairs = (
            ("close", signals.spider_closed),
            ("spider_idle", signals.spider_idle),
        )
        for handler_name, signal in handler_signal_pairs:
            crawler.signals.connect(getattr(instance, handler_name), signal)
        return instance

    def get_comment_headers(self, url, asin='B07RLDWH83'):
        return {
            'host': urlparse(url).hostname,
            'authority': urlparse(url).hostname,
            'accept': 'text/html,*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'cache-control': 'no-cache',
            # 'pragma': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin':  url,
            'referer': f'{url}/Elastische-Einstellbar-Rutschfeste-Schwarz-Fleischfarbig/product-reviews/{asin}/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
            'x-requested-with': 'XMLHttpRequest',
            # 'accept-encoding': 'gzip, deflate, br',
        }

    def get_detail_headers(self, url, asin='B07RLDWH83'):
        return {
            'Connection': 'close',
            'authority': urlparse(url).hostname,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': url,
            'Cache-Control': 'no-cache',
            'referer': f'{url}/{asin}/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }

    def update_comment_headers(self):
        if self.header_error_count >= 3:
            logging.info("错误次数 超过30次 修改heders")
            if self.comment_headers.get("x-requested-with") and self.comment_headers.get("accept-encoding"):
                del self.comment_headers['x-requested-with']
                del self.comment_headers['accept-encoding']
                logging.info("headers 方案1")
            elif not self.comment_headers.get("x-requested-with") and not self.comment_headers.get("accept-encoding"):
                self.comment_headers['x-requested-with'] = "XMLHttpRequest"
                logging.info("headers 方案2")
            elif not self.comment_headers.get("accept-encoding") and self.comment_headers.get("x-requested-with"):
                del self.comment_headers['x-requested-with']
                # self.comment_headers['accept-encoding'] = "gzip, deflate, br"
                logging.info("headers 方案3")
            elif self.comment_headers.get("accept-encoding") and not self.comment_headers.get("x-requested-with") and not self.comment_headers.get("sec-ch-ua-platform-version"):
                self.comment_headers['sec-ch-ua-platform-version'] = '^\\^10.0.0^\\^'
                logging.info("headers 方案4")
            else:
                self.comment_headers['x-requested-with'] = "XMLHttpRequest"
                # self.comment_headers['accept-encoding'] = "gzip, deflate, br"
                if self.comment_headers.get('sec-ch-ua-platform-version'):
                    del self.comment_headers['sec-ch-ua-platform-version']
                logging.info("headers 初始方案")
            self.header_error_count = 0
        else:
            logging.info("初始化错误次数")
            self.header_error_count = 0

    def update_cookies(self):
        """Reload the cookie pools for ``self.site``, retrying until it works.

        Sites in the us/uk/fr/es/it/de group refresh ``self.cookie_list`` via
        ``ReadCookie(self.site)``; ``'us'`` and every site outside that group
        (including an unset site) refresh ``self.cookie_other`` via
        ``ReadCookie('other')``. On ``OperationalError`` or
        ``FunctionTimedOut`` the method logs, sleeps 5 seconds and tries
        again, with no retry limit.
        """
        primary_sites = ('uk', 'fr', 'es', 'it', 'de', 'us')
        while True:
            try:
                uses_primary_pool = self.site in primary_sites
                if uses_primary_pool:
                    self.cookie_list = ReadCookie(self.site).get_cookie()
                if self.site == 'us' or not uses_primary_pool:
                    self.cookie_other = ReadCookie('other').get_cookie()
            except OperationalError as e:
                logging.info(f'get cookies OperationalError sleep 5s T_T --> {e}')
            except FunctionTimedOut as e:
                logging.info(f'get cookies time out sleep 5s T_T --> {e}')
            else:
                logging.info("cookie list news updates o-_-o")
                return
            # Only reached after a failed attempt: back off before retrying.
            time.sleep(5)

    def get_page_num(self, sum, count):
        page_count = 1 if int(sum) // count == 0 else int(sum) // count
        if int(sum) % count > 0 and int(sum) >= count:
            page_count += 1
        return page_count

    def page_state(self, response):
        asin = response.request.meta.get("asin")
        site = response.request.meta.get("site")
        if ("Die eingegebene Webadresse ist keine" in response.text) \
                    or ("The Web address you entered is not a functioning" in response.text) \
                    or ("saisie n'est pas une page fonctionnelle" in response.text) \
                    or ("web inserito non è una pagina funzionante" in response.text) \
                    or ("web que has especificado no es" in response.text) \
                    or ("Page Not Found" in response.text) \
                    or ("We are sorry! This Gift Card is not available" in response.text) \
                    or ("500 - An error occurred" in response.text) \
                    or ("Onze excuses. Het webadres dat je hebt" in response.text)\
                    or ("Üzgünüz. Girdiğiniz web adresi" in response.text)\
                    or ("Przepraszamy. Wyszukiwana" in response.text)\
                    or ("Lo sentimos! No pudimos encontrar la página" in response.text)\
                    or ("Vi ber om ursäkt. Webbadressen" in response.text):
            item = {
                "error_asin": True,
                "asin": {"state": 4, "asin": asin, "site": site},
            }
            # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"页面为空:{item.get('asin')}")
            return item
        elif ("keywords" in response.url) or ("dp/" not in response.url) or (
                "ref=" in response.url and "encoding=" in response.url) or (
                response.meta.get("asin") not in response.url):
            item = {
                "error_asin": True,
                "asin": {"state": 12, "asin": asin, "site": site},
            }
            logging.info(f"跳转页面:{item.get('asin')}")
            return item
        elif not response.xpath("//div[@id='dp']").getall():
            item = {
                "error_asin": True,
                "asin": {"state": 13, "asin": asin, "site": site},
            }
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return item
        else:
            return None

    def update_site_cookie(self, cookies, site='us'):
        if site == "us":
            cookies["lc-main"] = 'en_US'
        else:
            cookies[f"lc-acb{site}"] = self.country_lc_main.get(f"lc-acb{site}")
        if cookies.get("skin"):
            del cookies["skin"]
        if cookies.get("session-token"):
            del cookies["session-token"]
        return cookies

    def parse_news_page(self, response, **kwargs):
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        logging.info(f"requests baidu.com error{response.getErrorMessage()} {response.request.meta.get('asin')}")
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def json_cookies(self, site):
        while True:
            if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
                logging.info(f"开始获取 {site} cookie")
                cookies = list(self.cookie_other[self.cookie_other.site == site].sample(n=1).values)
                logging.info(f"成功获取 {site} cookie")
            else:
                cookies = list(self.cookie_list.sample(n=1).values)
                logging.info(f"获取{site} cookie")
            try:
                cookies = json.loads(cookies[0][0])
                break
            except TypeError as e:
                logging.info(f"cookie json error {cookies}")
                continue
        return cookies

    def random_r(self, meta):
        for i in self.utils_requests:
            k = list(i.keys())
            if k:
                if meta.get(k[0]):
                    del meta[k[0]]
        r = random.choice(self.utils_requests)
        # if r:
        #     if list(r.keys())[0] == 'curlcffi':
        #         r["impersonate"] = 'chrome110'
        logging.info(f"请求库为： {r or 'scrapy'}")
        meta.update(r)
        return meta

    def is_internet_available(self):
        """Probe outbound connectivity with a short GET to baidu.com.

        Returns:
            bool: ``True`` when the request completes within the 3 second
            timeout (the HTTP status is not checked), ``False`` when
            ``requests`` reports a connection, timeout or other request
            error.
        """
        import requests

        headers = {
            'accept': '*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        }
        try:
            requests.get("http://www.baidu.com", headers=headers, timeout=3)
        except requests.RequestException:
            # Only network-level failures mean "offline"; a bare except here
            # would also swallow KeyboardInterrupt and SystemExit.
            return False
        return True

    def r_utils(self, response):
        meta = response.meta
        for i in self.utils_requests:
            k = list(i.keys())
            if k:
                if meta.get(k[0]):
                    return k[0]
        return "scrapy"








