# coding:utf-8
import json
import logging
import os
import platform
import random
import re
import sys
import time
from pprint import pprint
from urllib.parse import quote_plus, urlparse

import scrapy
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # add the directory two levels above sys.path[0] to the import path (original note: "parent directory"); presumably so `amazon_spider` resolves when this file is run directly
# Amazon page data acquisition (the original note says "detail page"; the spider in this module requests search-result URLs)
from amazon_spider.items import self_ama_temu_item
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie

# time.tzset() exists only on Unix, so it is called on every platform except
# Windows; on Windows the module just prints a marker instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonGetSearchSpider(scrapy.Spider):
    """Crawl Amazon search-result pages and collect competitor ASINs per seed.

    Seeds are read through ``ReadDb.read_db_serch_keyword`` every time the
    engine goes idle.  For each seed the spider requests page 1 of the search
    results, follows the "next" pagination link until that link points at
    page 6, and then yields one dict ``{'asin', 'site', 'items'}`` holding
    every row gathered for the seed.  Requests that fail to download are
    reported through ``err_parse``.
    """

    name = 'amazon_get_search'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): ``allowed_domains`` is normally a spider attribute, not
        # a setting, so this entry most likely has no effect.  It is kept
        # unchanged because turning it into a real attribute could start
        # filtering the non-.com marketplaces.
        'allowed_domains': ['amazon.com'],
        # Redis-backed scheduling/dedup (the project's scrapy_redis scheduler)
        # is disabled for this spider.  If it is re-enabled, load the Redis
        # host and password from deployment settings instead of committing
        # them to this file.
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 18,  # maximum number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # original note: "called in ascending order"
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            'amazon_spider.middlewares.SearchCookiesZip': 480,
            # original note: "called in descending order"
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.amazon_search_pip.AmazonSearchSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
    }

    # Pulls the ASIN out of a product link, plain ("/dp/<asin>/") or
    # percent-encoded ("%2Fdp%2F<asin>%2F").
    _ASIN_RE = re.compile("/dp/(.*?)/|%2Fdp%2F(.*?)%2F")

    def __init__(self, site='us', **kwargs):
        """Set up the DB reader, the site tables and the per-site cookie pools.

        Args:
            site: Site code handed to ``ReadDb`` (``-a site=...``).
            **kwargs: Any further spider arguments, forwarded to
                ``scrapy.Spider.__init__`` so extra ``-a`` options do not
                raise ``TypeError``.
        """
        super().__init__(**kwargs)
        self.site = site
        self.r_db = ReadDb(self.site)
        self.site_url = {
            "us": 'https://www.amazon.com',
            'de': "https://www.amazon.de",
            "uk": "https://www.amazon.co.uk",
            "it": "https://www.amazon.it",
            "es": "https://www.amazon.es",
            "fr": "https://www.amazon.fr",
            "mx": "https://www.amazon.com.mx",
            "ca": "https://www.amazon.ca",
        }
        # One cookie pool per site.  Each entry is [row[1], row[0]] of the
        # rows returned by ReadCookie; the code below uses element 0 as the
        # cookie id and element 1 as a JSON-encoded cookie payload.
        self.site_cookies = {
            code: [[row[1], row[0]] for row in ReadCookie(code).get_cookie().values]
            for code in self.site_url
        }
        # Not referenced inside this class; presumably postal codes per site
        # consumed elsewhere in the project (TODO confirm).
        self.country = {
            "us": '10010',
            "de": '10115',
            "uk": 'London W1S 3',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607',
            "ca": 'M5B 2H'
        }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook it to the close and idle signals."""
        spider = super(AmazonGetSearchSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    @staticmethod
    def _build_headers(base_url):
        """Return the request headers for a site whose root URL is *base_url*."""
        return {
            'Connection': 'close',
            'authority': urlparse(base_url).hostname,
            'accept': 'text/html,*/*',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': base_url,
            'referer': f'{base_url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }

    @classmethod
    def _extract_asin(cls, href):
        """Return the ASIN embedded in *href*, or ``""`` when none is found."""
        match = cls._ASIN_RE.search(href)
        if match is None:
            return ""
        # Only one alternative takes part in a match; the other group is None.
        return match.group(1) or match.group(2) or ""

    def spider_idle(self, spider):
        """Schedule page-1 search requests for the next batch of seeds.

        Connected to Scrapy's ``spider_idle`` signal.  When the database
        returns no rows nothing is scheduled and the handler simply returns,
        which lets the engine close the idle spider.

        Each row is read positionally: ``row[1]`` is the search term,
        ``row[2]`` the seed's own ASIN and ``row[3]`` the site code.
        """
        logging.debug(f'IDLE------------------{self.site}')
        df = self.r_db.read_db_serch_keyword()
        if df.shape[0] == 0:
            # Previously a bare ``raise`` (RuntimeError: no active exception);
            # the outcome is the same - the spider closes - minus the traceback.
            logging.info("no pending search keywords for site %s; letting the spider close", self.site)
            return

        for row in df.values:
            search_term, self_asin, site = row[1], row[2], row[3]
            base_url = self.site_url.get(site)
            cookie_pool = self.site_cookies.get(site)
            if not base_url or not cookie_pool:
                # Skip only this seed instead of crashing the whole handler.
                logging.warning("skipping seed %r: unknown site or no cookies for %r", search_term, site)
                continue

            cookie_id, cookie_json = random.choice(cookie_pool)
            # Encode the keyword so characters such as '&', '#' or '+' cannot
            # break the query string.
            url = f"{base_url}/s?k={quote_plus(str(search_term))}&page=1"
            meta = {
                "search_term": search_term,
                "cookie_id": cookie_id,
                "amazon_proxy": True,
                # NOTE(review): seeds scheduled within the same second share
                # one cookiejar id - confirm this is intended.
                "cookiejar": int(time.time()),
                "site": site,
                "page": 1,
                "self_asin": self_asin,
                "msgs": [],
            }
            self.crawler.engine.crawl(
                scrapy.Request(url=url, cookies=json.loads(cookie_json),
                               # Headers come from the site root, not from the
                               # search URL, so origin/referer are well-formed.
                               headers=self._build_headers(base_url),
                               callback=self.parse, errback=self.err_parse,
                               dont_filter=True, meta=meta), self)

    def parse(self, response, **kwargs):
        """Collect competitor rows from one result page and paginate.

        Rows gathered on this page are appended to ``response.meta['msgs']``.

        Yields:
            A ``scrapy.Request`` for the next page, or - when the next link
            targets page 6, or there is no next link but the page carries the
            expected heading - a dict with keys ``asin``, ``site``, ``items``.

        Raises:
            RuntimeError: The page has neither a next link nor the expected
                heading, so it is not a recognisable search-result page.
        """
        meta = response.meta
        self_asin = meta.get("self_asin")
        site = meta.get("site")

        datas = response.xpath(".//div[@data-uuid and @data-asin]") or response.xpath(".//div[@class='sg-row']")
        logging.info(f'{response.url} asin数为 {len(datas)}')
        items_lists = []
        for index, node in enumerate(datas):
            title = node.xpath(".//h2[contains(@class, 'a-size-mini a-spacing-none a-color-base s-line-clamp')]//text()").get()
            if not title:
                logging.info(f"没有title {response.url}")
                continue
            asin_url = node.xpath(".//a/@href").get()
            if not asin_url:
                logging.info(f"没有asin_url {response.url}")
                continue
            asin = self._extract_asin(asin_url)
            if not asin:
                logging.info(f"没获取到asin 跳过  {self_asin}")
                continue
            if asin == self_asin:
                # The seed's own listing is not a competitor.
                logging.info(f"asin 等于 self_asin {asin}  {self_asin}")
                continue
            item = self_ama_temu_item({
                "asin": self_asin,
                "asin_compet": asin,
                "img_url": node.xpath(".//img/@src").get(),
                "page": meta.get("page"),
                "page_row": index + 1,
                "state": 1,
                "site": site,
            })
            items_lists.append(list(item.values()))
            pprint(item)

        # Every branch below works on the accumulated rows.
        meta["msgs"] += items_lists

        next_href = response.xpath("//span[@class='s-pagination-strip']/a[@class='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator']/@href").get()
        if next_href:
            if "page=6" in next_href:
                # Stop once the next link would lead to page 6.
                logging.info("第六页不用爬取")
                logging.info(f"{self_asin}")
                yield {'asin': self_asin, 'site': site, 'items': meta["msgs"]}
            else:
                cookies = random.choice(self.site_cookies.get(site))
                base_url = self.site_url.get(site)
                next_meta = dict(meta)
                next_meta["page"] = meta.get("page") + 1
                # Give every page its own retry budget instead of inheriting
                # the retries already spent on the previous page.
                next_meta.pop("retry_times", None)
                # NOTE(review): ``cookie_id`` in meta still names the cookie
                # picked for page 1, not the one chosen here - confirm whether
                # downstream code relies on that.
                yield scrapy.Request(url=base_url + next_href, cookies=json.loads(cookies[1]),
                                     headers=self._build_headers(base_url),
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=next_meta)
            return

        need = response.xpath("//span[@class='a-size-medium-plus a-color-base']//text()").get()
        if need:
            logging.info("正常页面")
            logging.info(f"{need}")
            logging.info(f"{self_asin}")
            logging.info("未获取到页面链接")
            yield {'asin': self_asin, 'site': site, 'items': meta["msgs"]}
        else:
            logging.info(meta.get("search_term"))
            # Was a bare ``raise``; keep the RuntimeError but say what failed.
            raise RuntimeError(
                f"unrecognised search page for {meta.get('search_term')!r}: {response.url}"
            )

    def err_parse(self, response, **kwargs):
        """Errback: report the seed whose request failed.

        Args:
            response: Despite its name, the object passed to the errback
                (expected to be a Twisted ``Failure``); only its ``request``
                attribute is read here.

        Yields:
            A dict flagged with ``error_asin`` that carries
            ``(1, self_asin, site)`` under ``search_term``.
        """
        request_meta = response.request.meta
        item = {
            "error_asin": True,
            "search_term": (1, request_meta.get("self_asin"), request_meta.get("site")),
        }
        logging.info(f"爬取失败关键词:{item.get('search_term')}")
        yield item


if __name__ == '__main__':
    # Launch the spider in-process; same as running
    # ``scrapy crawl amazon_get_search -a site=us`` from a shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_get_search', '-a', 'site=us'])

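# Operational notes
# Site codes: us, uk, fr, de, es, it, mx (the spider's site_url table also contains ca)
# Run in the background from the project directory:
#   cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_get_search > amazon_search1.log 2>&1 &
# Run in the background from the current directory:
#   nohup scrapy crawl amazon_get_search > amazon_search1.log 2>&1 &
# Activate the conda environment first:
#   source activate pyspark
# Kill every running instance:
#   for i in `ps -ef|grep "scrapy crawl amazon_get_search" |awk '{print $2}' `; do kill -9 $i ; done;
# Windows invocation:
#   C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_search
# Crontab entry (daily at 21:00):
#   0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_search > amazon_search1.log 2>&1 &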
