# coding:utf-8
import os
import re
import sys
import json
import time
import nltk
import redis
import queue
import random
import scrapy
import logging
import platform
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import spop, sadd
from amazon_spider.items import detail_inner_item
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor
from amazon_spider.utils.read_db_data import ReadCookie

# Apply the TZ environment variable to the time module on platforms that
# support it; time.tzset() is Unix-only, so Windows just prints a marker.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class TopAsinSpider(scrapy.Spider):
    """Scrapy spider that downloads Amazon ``/dp/<asin>`` pages for queued jobs.

    Jobs are pulled with ``spop`` from the ``<site>_top_asin_spider`` key each
    time the spider goes idle, and every parsed page is reduced to a small
    item holding the image URL, the original title and a filtered title.
    """

    name = 'top_asin_spider'
    # Per-spider overrides of the project settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): in Scrapy `allowed_domains` is a Spider class attribute,
        # not a setting key, so this entry likely has no effect here - confirm.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider is restarted
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis-backed scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate requests through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed-queue connection details
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted - load from secure configuration, never commit credentials>',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the built-in RetryMiddleware is disabled below, so these
        # values are presumably read by a project middleware - verify.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries to attempt
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # None disables Scrapy's built-in retry middleware.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,

            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.keepa_pipe.AmazonKeepaSpiderPipeline': 230,
            'amazon_spider.pipeline.top_asin_pipe.TopAsinPipeline': 200,
        },
        # Route both schemes through the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Load the cookie pools and build the lookup tables used while crawling.

        Blocks until the cookie tables can be read, retrying every 5 seconds
        on ``OperationalError`` or ``FunctionTimedOut``.

        Args:
            site: Marketplace code; the supported values are the keys of
                ``self.site_url`` ('us', 'de', 'uk', 'it', 'es', 'fr', 'mx',
                'ca'). It selects the cookie pool and the Redis queue name.
                For any other value ``self.url_`` is set to None.
        """
        super(TopAsinSpider, self).__init__()
        self.site = site
        # Configure logging before the first logging call: basicConfig() does
        # nothing once the root logger already has a handler, and the
        # module-level logging.info() installs a default one on first use.
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        while True:
            try:
                # Each row is swapped into [row[1], row[0]]; spider_idle reads
                # element 0 as the cookie id and element 1 as the cookie JSON.
                self.cookie_list = [[i[1], i[0]] for i in ReadCookie(self.site).get_cookie().values]
                self.amazon_not_likes = [list(i) for i in ReadCookie(self.site).get_amazon_temu_not_likes().values]
                if self.site == "us":
                    # The US spider also serves ca/mx jobs, so it keeps their pools too.
                    self.ca_cookies_list = [[i[1], i[0]] for i in ReadCookie("ca").get_cookie().values]
                    self.mx_cookies_list = [[i[1], i[0]] for i in ReadCookie("mx").get_cookie().values]
                logging.info("cookie 列表更新")
                break
            except OperationalError as e:
                logging.info(f'get cookies failure in link sleep 5s{e}')
                time.sleep(5)
                continue
            except FunctionTimedOut as e:
                logging.info(f'get cookies time out sleep 5s{e}')
                time.sleep(5)
                continue

        # Marketplace code -> base URL of that Amazon storefront.
        self.site_url = {
            "us": 'https://www.amazon.com',
            'de': "https://www.amazon.de",
            "uk": "https://www.amazon.co.uk",
            "it": "https://www.amazon.it",
            "es": "https://www.amazon.es",
            "fr": "https://www.amazon.fr",
            "mx": "https://www.amazon.com.mx",
            "ca": "https://www.amazon.ca",
        }
        # Base URL of this spider's own marketplace, taken from the table
        # above so the attribute always exists (None for an unknown site).
        self.url_ = self.site_url.get(site)

        # Marketplace code -> postal code string.
        # NOTE(review): presumably the delivery location used for the site - confirm.
        self.country = {
            "us": '10010',
            "de": '10115',
            "uk": 'London W1S 3',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607',
            "ca": 'M5B 2H'
        }
        # self.seeds = self.s.sql_fetch_rows(f'SELECT asin,id,data_type,is_variation FROM {self.site}_self_all_syn LIMIT 30')
        # self.seeds = sql_fetch_rows(f"SELECT asin,id,data_type,is_variation, date_info, site FROM {self.site}_self_all_syn where asin in ('0002158434') LIMIT 30")
        self.qd = queue.Queue()
        self.col = ['asin', 'img_url', 'title', 'title_len', 'price', 'rating', 'total_comments', 'buy_box_seller_type',
                    'page_inventory', 'category', 'volume', 'weight', 'rank', 'launch_time', 'video_url', 'add_url',
                    'material', 'img_num', 'img_type', 'qa_num', 'brand', 'ac_name', 'node_id', 'sp_num', 'mpn',
                    'online_time', 'describe', 'one_star', 'two_star', 'three_star', 'four_star', 'five_star',
                    'low_star', 'asin_type', 'is_coupon', 'search_category', 'weight_str', 'date_info', 'site',
                    'account_name', 'other_seller_name', 'bsr_date_info', 'account_id', 'package_quantity',
                    'pattern_name', 'together_asin', 'activity_type', 'one_two_val', 'three_four_val', 'five_six_val',
                    'eight_val', 'product_description']
        # Number of idle polls that found the job queue empty.
        self.sleep_count = 0
        # Optional client-hint style headers; spider_idle adds one random
        # entry from this table to every product request.
        self.h_dict = {
            'rtt': '100',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '1',
            'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '^\\^Windows^\\^',
            # 'sec-ch-ua-platform-version': '^\\^10.0.0^\\^',
            'sec-ch-viewport-width': '1587',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'viewport-width': '1587',
            'device-memory': '8',
            'downlink': '10',
            'dpr': '1',
            'ect': '4g',
        }
        # while True:
        #     if self.is_internet_available():
        #         self.producer = KafkaProducer(
        #             bootstrap_servers=['61.145.136.61:39092'],
        #             api_version=(2, 4, 1),
        #             value_serializer=lambda v: json.dumps(v).encode('utf-8')
        #         )
        #         break
        #     else:
        #         time.sleep(2)
        #         logging.info("Network connection failure")
        #         continue
        # self.self_asin_detail_topic = f"{self.site}_self_asin_detail"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the base class and hook it to crawler signals.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal, which is what drives job polling.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        subscriptions = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_idle(self, spider):
        """Idle-signal handler: pop the next job and schedule its request.

        Each call pops one job from ``<site>_top_asin_spider``. When the
        handler runs between 09:09:00 and 09:10:00 local time the cookie
        pools are reloaded first. If the queue is empty the handler sleeps
        30 seconds and schedules a keep-alive request; once the count of
        empty polls reaches a random threshold (6-15) it returns without
        scheduling anything so the engine can close the idle spider.

        Args:
            spider: The spider instance passed by the signal (unused).
        """
        logging.debug(f'IDLE------------------{self.site} {time.time()}')
        seeds = self._pop_seeds()

        # Zero-padded HH:MM:SS strings compare in chronological order.
        if "09:09:00" <= time.strftime("%H:%M:%S", time.localtime()) <= "09:10:00":
            self._reload_cookies()

        if seeds:
            for raw_job in seeds:
                request = self._build_detail_request(json.loads(raw_job))
                if request is not None:
                    self.crawler.engine.crawl(request, self)
            return

        self.sleep_count += 1
        if self.sleep_count >= random.randint(6, 15):
            # This used to be a bare `raise`, which with no active exception
            # only produced "RuntimeError: No active exception to reraise".
            # Returning without scheduling a request leaves the spider idle,
            # and Scrapy closes an idle spider unless a handler raises
            # DontCloseSpider - same outcome without the spurious traceback.
            logging.info('no task after repeated polls, letting the spider close')
            return
        logging.info('no task sleep 30s')
        # NOTE(review): time.sleep blocks the thread running this handler
        # (presumably the reactor thread) - confirm this is acceptable.
        time.sleep(30)
        meta = {
            'handle_httpstatus_all': True
        }
        # Keep-alive request: having something scheduled stops the engine from
        # closing the spider while the queue is temporarily empty.
        request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_new_page, errback=self.err_, dont_filter=True, meta=meta)
        self.crawler.engine.crawl(request, spider=self)

    def _pop_seeds(self):
        """Pop one job from the Redis queue, retrying until the call succeeds."""
        while True:
            try:
                seeds = spop(f'{self.site}_top_asin_spider', 1)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"获取任务成功失败{e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"获取任务成功超时 {e}")
            else:
                logging.info("获取任务成功")
                return seeds

    def _reload_cookies(self):
        """Re-read the cookie pools, retrying every 5 seconds until it works."""
        while True:
            try:
                self.cookie_list = [[i[1], i[0]] for i in ReadCookie(self.site).get_cookie().values]
                if self.site == "us":
                    self.ca_cookies_list = [[i[1], i[0]] for i in ReadCookie("ca").get_cookie().values]
                    self.mx_cookies_list = [[i[1], i[0]] for i in ReadCookie("mx").get_cookie().values]
                logging.info("cookie 列表更新")
                return
            except OperationalError as e:
                logging.info(f'get cookies failure in link sleep 5s{e}')
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f'get cookies time out sleep 5s{e}')
                time.sleep(5)

    def _build_detail_request(self, job):
        """Return the product-page request for one job dict, or None to skip it."""
        job_site = job.get('site')
        base_url = self.site_url.get(job_site)
        if not base_url:
            # Filter first so no cookie is drawn for a job that will be dropped.
            logging.info(f"过滤 未知站点 {job_site}")
            return None

        # The ca/mx pools are only loaded when this spider runs for "us";
        # fall back to the spider's own pool instead of raising AttributeError.
        if job_site == "ca":
            pool = getattr(self, "ca_cookies_list", self.cookie_list)
            logging.info("获取ca cookie")
        elif job_site == "mx":
            pool = getattr(self, "mx_cookies_list", self.cookie_list)
            logging.info("获取mx cookie")
        else:
            pool = self.cookie_list
            logging.info(f"获取{self.site} cookie")
        cookies = random.choice(pool)

        url = f"{base_url}/dp/{job['asin']}"
        hostname = urlparse(url).hostname
        headers = {
            'Connection': 'close',
            'host': hostname,
            'authority': hostname,
            'accept': 'text/html,*/*',
            'accept-language':  'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': url,
            'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        meta = {
            # "use_aiohttp": True,
            "curlcffi": True,
            "asin": job['asin'],
            "cookie_id": cookies[0],
            # "amazon_proxy": True,
            "cookiejar": int(time.time()),
            "site": job_site,
            "asin_type": "top"
        }
        # Add one randomly chosen optional header so requests are not identical.
        h_key = random.choice(list(self.h_dict))
        headers[h_key] = self.h_dict.get(h_key)
        logging.info(f'随机添加headers：{h_key}')
        return scrapy.Request(url=url, cookies=json.loads(cookies[1]), headers=headers,
                              callback=self.parse, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def parse_new_page(self, response, **kwargs):
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        print("sleep=========error", response.getErrorMessage(), response.request.meta.get("asin"))
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def is_internet_available(self):
        """Return True when an HTTP GET to baidu.com succeeds within one second.

        Only request failures (``requests.RequestException``: connection
        errors, timeouts, ...) count as "offline". The previous bare
        ``except`` also swallowed KeyboardInterrupt and programming errors.
        """
        import requests
        try:
            requests.get("http://www.baidu.com", timeout=1)
        except requests.RequestException:
            return False
        return True

    def if_page_state(self, response):
        if ("Die eingegebene Webadresse ist keine" in response.text) \
                    or ("The Web address you entered is not a functioning" in response.text) \
                    or ("saisie n'est pas une page fonctionnelle" in response.text) \
                    or ("web inserito non è una pagina funzionante" in response.text) \
                    or ("web que has especificado no es" in response.text) \
                    or ("Page Not Found" in response.text) \
                    or ("We are sorry! This Gift Card is not available" in response.text) \
                    or ("500 - An error occurred" in response.text):
            item = {
                "error_asin": True,
                "asin": (4, response.request.meta.get("asin"), response.request.meta.get("site")),
            }
            # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"页面为空:{item.get('asin')}")
            return item
        elif ("keywords" in response.url) or ("dp/" not in response.url) or (
                "ref=" in response.url and "encoding=" in response.url) or (
                response.meta.get("asin") not in response.url):
            item = {
                "error_asin": True,
                "asin": (12, response.request.meta.get("asin"), response.request.meta.get("site")),
            }
            logging.info(f"跳转页面:{item.get('asin')}")
            return item
        elif not response.xpath("//div[@id='dp']").getall():
            item = {
                "error_asin": True,
                "asin": (13, response.request.meta.get("asin"), response.request.meta.get("site")),
            }
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return item
        else:
            return None

    def filter_key_word(self, x):
        if x:
            tokens = nltk.word_tokenize(x.lower())

            word_tags = nltk.pos_tag(tokens)
            tags = [
                "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT",
                "POS",
                "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
                "WP",
                "WP$",
                "WRB"
            ]

            word = " ".join([i[0] for i in word_tags])
            del_num = []
            for i, v in enumerate(word_tags):
                if v[1] == "CD":
                    try:
                        if word_tags[i + 1][1] == "NNS":
                            del_num.append([word_tags[i][0], word_tags[i + 1][0]])
                        else:
                            del_num.append([word_tags[i][0]])
                    except:
                        del_num.append([word_tags[i][0]])
                else:
                    continue
            del_num += self.amazon_not_likes
            for i in del_num:
                word = word.replace(" ".join(i).lower(), "")
            word_tags = nltk.pos_tag(nltk.word_tokenize(re.sub(r"(\(.*?\))", "", word)))
            return " ".join([i[0] for i in word_tags if i[1] in tags])
        else:
            return ""

    def parse(self, response, **kwargs):
        print("-" * 20)
        # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
        #     f.write(response.text)
        if item := self.if_page_state(response):
            yield item
        else:
            amazon_detail_extractor = AmazonDetailExtractor(self.site)
            items = amazon_detail_extractor.run(response)
            inner_item = detail_inner_item(items)
            # 添加字段长度限制
            new_title = self.filter_key_word(inner_item.get("title")).replace(inner_item.get("brand", ""), "")
            item = {}
            item["state"] = 3
            item["img_url"] = inner_item["img_url"]
            item["new_title"] = new_title
            item["old_title"] = inner_item["title"]
            item["asin"] = inner_item["asin"]
            item["site"] = inner_item["site"]
            pprint(item)

            yield {'inner_item': item}

    def err_parse(self, failure, **kwargs):
        """Errback of product requests: log the failure and re-queue the job.

        The job (``asin`` and ``site`` from the request meta) is serialised to
        JSON and added back to ``<site>_top_asin_spider``; the push is retried
        until it succeeds.
        """
        failed_request = failure.request
        job = {
            "asin": failed_request.meta.get("asin"),
            "site": failed_request.meta.get("site"),
        }
        logging.info(f"{failure.getTraceback()}")
        payload = json.dumps(job)

        pushed = False
        while not pushed:
            try:
                sadd(f"{self.site}_top_asin_spider", payload, use_md5=False)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"爬取失败请求重新，推送到爬取队列失败{e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"爬取失败请求重新，推送到爬取队列超时 {e}")
            else:
                logging.info("爬取失败请求重新，推送到爬取队列")
                pushed = True

    # def close(self, spider, reason):
    #     if self.site == "us":
    #         for i in range(0, 4):
    #             try:
    #                 # When the spider finishes, check whether any rows are still in state 1 or 2; if none are, send the crawl-complete message
    #                 AsinStateFind(self.site).if_bsr_spider_state()
    #                 break
    #             except:
    #                 continue


if __name__ == '__main__':
    # Convenience entry point for running the spider directly; same as the
    # command line `scrapy crawl top_asin_spider -a site=us`.
    cmdline.execute(['scrapy', 'crawl', 'top_asin_spider', '-a', 'site=us'])

# us, uk, fr, de, es, it, mxB0BNVQWTXV
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl amazon_keepa_detail  > keepa_amazon1.log 2>&1 &
# nohup scrapy crawl amazon_keepa_detail -a site=us  > keepa_us_amazon1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl amazon_keepa_detail" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail -a site=uk > amazon_get_asin_detail1.log 2>&1 &


# cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/bin/
