# coding:utf-8
import os
import sys
import json
import time
import queue
import scrapy
import random
import logging
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.mysql_db import sql_connect
from amazon_spider.items import detail_item, variat_item
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor


class AmazonGetAsinDetailSpider(scrapy.Spider):
    """Crawl Amazon product-detail pages for ASINs supplied by the database.

    The spider has no start URLs.  Every time the engine reports that it is
    idle, ``spider_idle`` reads a batch of seeds through ``ReadDb`` and
    schedules one detail-page request per seed.  When a batch comes back
    empty the handler simply returns, which lets Scrapy close the spider.

    Items yielded by the callbacks are plain dicts, each carrying exactly one
    of the payload keys ``detail_item``, ``variat_item``,
    ``seller_account_item``, ``asin_img``, ``bs_category`` or ``error_asin``.
    """

    name = 'amazon_get_asin_detail'

    # Storefront base URL for every supported ``site`` code.
    _SITE_URLS = {
        'us': 'https://www.amazon.com',
        'de': 'https://www.amazon.de',
        'uk': 'https://www.amazon.co.uk',
        'it': 'https://www.amazon.it',
        'es': 'https://www.amazon.es',
        'fr': 'https://www.amazon.fr',
        'mx': 'https://www.amazon.com.mx',
    }

    # Status codes stored in the first slot of the ``error_asin`` tuple.
    _ERR_DOWNLOAD_FAILED = 1    # request never produced a usable response
    _ERR_EMPTY_PAGE = 4         # not-found / error / (almost) empty page
    _ERR_REDIRECTED = 12        # final URL is not the requested detail page
    _ERR_NO_DETAIL_BLOCK = 13   # page has no <div id="dp"> container

    # Text fragments that identify a not-found / error page body.
    _BROKEN_PAGE_MARKERS = (
        "Page Not Found",
        "We are sorry! This Gift Card is not available",
        "500 - An error occurred",
        "Sorry! Something went wrong!",
    )

    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): ``allowed_domains`` is normally a spider attribute,
        # not a setting, so this entry probably has no effect here.  Every
        # request is also sent with ``dont_filter=True``.  Kept unchanged.
        'allowed_domains': ['amazon.com'],
        # NOTE: a commented-out scrapy-redis scheduler configuration used to
        # live here and embedded a plaintext Redis password.  It was removed;
        # load such settings (and credentials) from the environment instead.
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 6,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # Downloader middlewares for this spider.
        'DOWNLOADER_MIDDLEWARES': {
            # The stock retry middleware is switched off.
            # NOTE(review): presumably one of the project middlewares below
            # consumes the RETRY_* settings instead - confirm.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # Project middlewares (GetCookieMiddleware and the aiohttp
            # middleware are currently not enabled).
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipelines.AmazonDetailSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us', **kwargs):
        """Set up database helpers, cookies and request headers for ``site``.

        Args:
            site: Storefront code; one of the keys of ``_SITE_URLS``.
            **kwargs: Any further spider arguments, forwarded unchanged to
                ``scrapy.Spider.__init__``.

        Raises:
            ValueError: If ``site`` is not a supported storefront code.
        """
        # Validate first so a typo fails fast, before any DB connection is
        # opened, instead of surfacing later as a missing ``url_`` attribute.
        if site not in self._SITE_URLS:
            supported = ', '.join(sorted(self._SITE_URLS))
            raise ValueError(f"unsupported site {site!r}; expected one of: {supported}")
        super().__init__(**kwargs)

        self.site = site
        self.url_ = self._SITE_URLS[site]
        self.r_db = ReadDb(self.site)
        self.r_ck = ReadCookie(self.site)
        self.s = sql_connect(self.site)
        self.cookie_list = self._load_cookies()

        self.headers = {
            'Connection': 'close',
            'authority': urlparse(self.url_).hostname,
            'accept': 'text/html,*/*',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': self.url_,
            'referer': f'{self.url_}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        # One postal code per storefront.  Not read inside this class.
        # NOTE(review): presumably consumed by a middleware - confirm.
        self.country = {
            "us": '10010',
            "de": '10115',
            "uk": 'London W1S 3',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607'
        }
        self.qd = queue.Queue()
        # Column names of a detail record.  Not read inside this class.
        self.col = [
            'asin', 'asin_state', 'img_url', 'title', 'title_len', 'price',
            'rating', 'total_comments', 'buy_box_seller_type',
            'page_inventory', 'category', 'volume', 'weight', 'rank',
            'launch_time', 'img_num', 'img_type', 'activity_type',
            'one_two_val', 'three_four_val', 'five_six_val', 'eight_val',
            'qa_num', 'one_star', 'two_star', 'three_star', 'four_star',
            'five_star', 'low_star', 'together_asin', 'brand', 'ac_name',
            'sp_num', 'data_type', 'describe', 'material', 'node_id',
            'weight_str', 'date_info', 'package_quantity', 'pattern_name',
        ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and subscribe it to the close and idle signals."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        # NOTE(review): scrapy.Spider usually wires ``close`` to
        # ``spider_closed`` on its own; this explicit connect looks redundant
        # but is kept - confirm before removing.
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def _load_cookies(self):
        """Return the stored cookies as ``[cookie_id, cookie_json]`` pairs.

        Each row delivered by ``ReadCookie.get_cookie()`` has its first two
        columns swapped, so index 0 is used as the ``cookie_id`` and index 1
        is the JSON text passed to ``json.loads``.
        """
        return [[row[1], row[0]] for row in self.r_ck.get_cookie().values]

    @staticmethod
    def _blank_to_none(mapping):
        """Return a copy of ``mapping`` with every falsy value set to ``None``."""
        return {key: value or None for key, value in mapping.items()}

    @staticmethod
    def _error_item(code, meta):
        """Build the ``error_asin`` item for the request described by ``meta``."""
        return {
            "error_asin": True,
            "asin": (code, meta.get("asin"), meta.get("week")),
        }

    def _build_request(self, row):
        """Turn one seed row into a detail-page request.

        The row is read positionally: index 0 is the ASIN, index 3 the week
        and index 4 the data type.  Index 2 selects the URL form
        (presumably the ``is_variat`` flag - confirm against ``ReadDb``).
        """
        cookie_id, cookie_json = random.choice(self.cookie_list)
        url = f"{self.url_}/dp/{row[0]}"
        if not row[2] == 0:
            url = url + "?th=1&psc=1"
        meta = {
            "use_aiohttp": True,
            "asin": row[0],
            "cookie_id": cookie_id,
            "week": row[3],
            "data_type": row[4],
            "amazon_proxy": True,
            # Second-resolution timestamp: requests built within the same
            # second receive the same cookiejar key.
            "cookiejar": int(time.time()),
            "site": self.site,
        }
        return scrapy.Request(url=url, cookies=json.loads(cookie_json), headers=self.headers,
                              callback=self.parse, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def spider_idle(self, spider):
        """Feed the engine with the next batch of seeds when it runs dry.

        Handler for the ``spider_idle`` signal.  When the database returns no
        more seeds (or no cookie is available) nothing is scheduled and the
        handler returns normally, so the engine goes on to close the spider.
        """
        logging.debug(f'IDLE------------------{self.site} {time.time()}')
        df = self.r_db.read_db_detail()

        # Reload the cookie list when the handler happens to run between
        # 09:09:00 and 09:10:00 UTC (``time.gmtime``).
        utc_clock = time.strftime("%H:%M:%S", time.gmtime(int(time.time())))
        if "09:09:00" <= utc_clock <= "09:10:00":
            self.cookie_list = self._load_cookies()
            logging.info("cookie 列表更新")

        if df.shape[0] <= 0:
            # Previously a bare ``raise`` here produced a spurious
            # "No active exception to reraise" traceback on every shutdown.
            logging.info(f"no more seeds for site {self.site}; letting the spider close")
            return
        if not self.cookie_list:
            # ``random.choice`` would raise IndexError on an empty list.
            logging.error(f"cookie list for site {self.site} is empty; {df.shape[0]} seeds not scheduled")
            return

        for row in df.values:
            # NOTE(review): newer Scrapy releases deprecate the ``spider``
            # argument of ``engine.crawl`` - confirm against the pinned
            # Scrapy version before changing this call.
            self.crawler.engine.crawl(self._build_request(row), self)

    def if_page_state(self, response):
        """Classify a response that is not a usable product-detail page.

        Returns:
            An ``error_asin`` item when the page is an error/empty page
            (code 4), when the final URL is not the requested detail page
            (code 12) or when the ``<div id="dp">`` container is missing
            (code 13); ``None`` when the page looks like a detail page.
        """
        meta = response.meta
        body = response.text
        url = response.url

        if any(marker in body for marker in self._BROKEN_PAGE_MARKERS) or \
                len(response.xpath("//div").getall()) < 2:
            item = self._error_item(self._ERR_EMPTY_PAGE, meta)
            logging.info(f"页面为空:{item.get('asin')}")
            return item

        if ("keywords" in url) or ("dp/" not in url) or (
                "ref=" in url and "encoding=" in url) or (meta.get("asin") not in url):
            item = self._error_item(self._ERR_REDIRECTED, meta)
            logging.info(f"跳转页面:{item.get('asin')}")
            return item

        if not response.xpath("//div[@id='dp']").getall():
            item = self._error_item(self._ERR_NO_DETAIL_BLOCK, meta)
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return item

        return None

    def parse(self, response, **kwargs):
        """Extract every item type from a product-detail response.

        Yields the ``error_asin`` item alone when ``if_page_state`` rejects
        the page.  Otherwise yields the detail item, one item per variation,
        and - when the extractor returns data for them - the seller-account,
        image and best-seller-category items.
        """
        if item := self.if_page_state(response):
            yield item
            return

        week = response.meta.get("week")
        extractor = AmazonDetailExtractor(self.site, self.url_)

        details_item = self._blank_to_none(detail_item(extractor.run(response)))
        logging.info(f"详情数据 {details_item}")
        yield {"detail_item": details_item, "week": week}

        for variation in extractor.variat_msgs(response):
            yield {"variat_item": variat_item(variation), "week": week}

        seller_account = extractor.asin_seller_account_syn(response)
        images = extractor.asin_image(response)
        bs_category = extractor.asin_bs_category_asin_detail(response)

        if seller_account:
            yield {"seller_account_item": self._blank_to_none(seller_account), "week": week}
        else:
            logging.info("asin_seller_account_item 没有 %s", response.url)

        if images:
            # Unlike the other items this one is tagged with the site
            # instead of the week.
            yield {"asin_img": images, "site": response.meta["site"]}
        else:
            logging.info("asin_img 没有 %s", response.url)

        if bs_category:
            yield {"bs_category": self._blank_to_none(bs_category), "week": week}
        else:
            logging.info("asin_bs_category_asin_detail 没有 %s", response.url)

    def err_parse(self, response, **kwargs):
        """Errback: report a failed download as an ``error_asin`` item.

        Despite its name, ``response`` is the failure object handed to the
        errback; only its ``getErrorMessage()`` method and its ``request``
        attribute are used.
        """
        meta = response.request.meta
        logging.info(f"error___________  {response.getErrorMessage()}, {meta.get('asin')}")
        item = self._error_item(self._ERR_DOWNLOAD_FAILED, meta)
        logging.info(f"爬取失败asin:{item.get('asin')}")
        yield item


if __name__ == '__main__':
    # Local debugging entry point: same as running
    # ``scrapy crawl amazon_get_asin_detail -a site=us`` from the shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_get_asin_detail', '-a', 'site=us'])

# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_get_asin_detail > amazon.log 2>&1 &
# nohup scrapy crawl amazon_get_asin_detail -a site=it  > amazon_it_detail1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl amazon_get_asin_detail -a site=de" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail -a site=uk > amazon_get_asin_detail1.log 2>&1 &
