# coding:utf-8
import os
import sys
import platform
import time
import scrapy
import logging
import random
from urllib.parse import urlparse
from sqlalchemy.exc import OperationalError
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# mx站点  详情页产品描述获取
from amazon_spider.utils.read_db_data import ReadDb
from amazon_spider.spiders.yswg_spider import SourceSpider
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor

# Reset the process's time-conversion rules from the TZ environment variable.
# time.tzset() is only available on Unix, so Windows just prints a marker.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonMxAsinSpider(SourceSpider):
    """Spider that downloads Amazon detail pages for seed ASINs.

    Seeds are read from the database every time the engine reports the
    spider as idle (``spider_idle``).  Each detail page is reduced to an
    item with title / description / selling-point fields in ``parse``;
    failed downloads are reported as ``error_asin`` items by ``err_parse``.
    """
    name = 'mx_self_asin'
    custom_settings = {
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): Scrapy reads ``allowed_domains`` from the spider
        # attribute, not from settings - confirm whether this entry is used.
        'allowed_domains': ['amazon.com'],
        # Optional Redis-backed scheduling (currently disabled):
        # # whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # use the Redis scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # make every spider deduplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted - load from secret storage, never commit>',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings.
        # NOTE(review): the built-in RetryMiddleware is disabled below, so
        # these RETRY_* values only matter if another component reads them.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # invoked in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.amazon_mx_pip.AmazonMxSpiderPipeline': 230,
        },
        # Project-specific download handler used for both schemes.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Set up per-run state.

        Args:
            site: Short site code (e.g. ``'mx'``), normally supplied with
                ``-a site=...``.  It is passed to ``ReadDb`` when reading
                seeds and to ``AmazonDetailExtractor`` when parsing.
        """
        super().__init__()
        self.site = site
        self.update_cookies()
        self.sleep_count = 0

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        self.col = ['asin', 'title', 'describe', 'selling_point', 'site']
        # Long site name (as stored in the seed rows) -> short site code.
        self.long_site_to_dict = {
            'Amazon.com.mx': "mx",
            'Amazon.nl': "nl",
            'Amazon.com.be': "be",
        }
        # Short site code -> long site name; kept as the exact inverse of the
        # mapping above so the two can never drift apart.
        self.short_site_to_dict = {
            short: long_name for long_name, short in self.long_site_to_dict.items()
        }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook it up to the close and idle signals."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        """Handle the ``spider_idle`` signal by scheduling a new seed batch.

        Args:
            spider: Spider instance delivered with the signal (unused).

        Raises:
            DontCloseSpider: When the seeds could not be read, or when no
                request could be scheduled from the batch, so that the engine
                keeps the spider open and asks again later.
        """
        logging.debug(f'IDLE------------------{self.site}')
        # Refresh cookies when the spider happens to be idle inside this
        # one-minute window; gmtime() means the window is expressed in UTC.
        str_time = time.strftime("%H:%M:%S", time.gmtime(int(time.time())))
        if "09:09:00" <= str_time <= "09:10:00":
            self.update_cookies()
        try:
            df = ReadDb(self.site).read_db_mx_asin()
        except OperationalError as e:
            logging.info(f'get seeds failure in link sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider
        except FunctionTimedOut as e:
            logging.info(f'get seeds time out sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider

        scheduled = 0
        for row in df.values:
            request = self._build_detail_request(row)
            if request is None:
                # Unsupported row: skip it instead of aborting the whole batch.
                continue
            # NOTE(review): newer Scrapy releases deprecate passing the spider
            # to engine.crawl() - confirm against the installed version.
            self.crawler.engine.crawl(request, self)
            scheduled += 1

        if not scheduled:
            # Nothing to do (empty batch, or every row was skipped): wait and
            # keep the spider alive so the next idle signal polls again.
            logging.info('no task sleep 30s----------')
            time.sleep(30)
            raise DontCloseSpider()

    def _build_detail_request(self, row):
        """Build the detail-page request for one seed row.

        Args:
            row: One row of the seed frame; ``row[0]`` is used as the ASIN and
                ``row[2]`` as the long site name (a key of
                ``long_site_to_dict``).

        Returns:
            A ``scrapy.Request`` for the product page, or ``None`` when the
            row's site is not mapped or has no base URL configured.
        """
        asin, long_site = row[0], row[2]
        # Only the sites listed in long_site_to_dict are supported for now.
        site = self.long_site_to_dict.get(long_site)
        base_url = self.site_url.get(site) if site is not None else None
        if not base_url:
            logging.warning(f"skip seed with unsupported site {long_site!r} asin:{asin}")
            return None

        url = f"{base_url}/dp/{asin}"
        meta = {
            "use_aiohttp": True,
            "asin": asin,
            "site": site,
            # "amazon_proxy": True,
            "cookiejar": int(time.time()),
        }
        headers = {
            'Connection': 'close',
            'authority': urlparse(url).hostname,
            'accept': 'text/html,*/*',
            'accept-language':  'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': url,
            'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        meta = self.random_r(meta)
        # Add one randomly chosen extra header from the spider's header pool.
        h_key = random.choice(list(self.h_dict))
        headers[h_key] = self.h_dict.get(h_key)
        logging.info(f'随机添加headers：{h_key}')
        headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
        cookies = self.json_cookies(site)
        return scrapy.Request(url=url, cookies=cookies, headers=headers,
                              callback=self.parse, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def parse(self, response, **kwargs):
        """Turn a detail-page response into a single item.

        A page whose ``<title>`` is "Documento no encontrado" yields an item
        with empty text fields; any other page is handed to
        ``AmazonDetailExtractor``.

        Args:
            response: Downloaded page; ``response.meta`` carries ``asin`` and
                the short ``site`` code set when the request was built.

        Yields:
            dict with ``asin``, ``title``, ``describe``, ``selling_point`` and
            ``site`` (the long site name).
        """
        asin = response.meta.get("asin")
        if response.xpath("//title/text()").get("").strip() == "Documento no encontrado":
            logging.info(f"页面变狗 {asin}")
            item = {
                "asin": asin,
                "title": "",
                "describe": "",
                "selling_point": "",
                "site": self.short_site_to_dict[response.meta.get("site")]
            }
        else:
            # NOTE(review): the extractor is built with self.site rather than
            # the per-request site from response.meta - confirm this is
            # intended when seeds belong to a different site.
            datas_msg = AmazonDetailExtractor(self.site).run(response)
            item = {
                'asin': asin,
                'title': datas_msg['title'],
                'describe': datas_msg['product_description'],
                'selling_point': datas_msg['describe'],
                'site': self.short_site_to_dict[response.meta.get("site")],
            }
            # if item.get("title") and (not item.get("describe")) and (not item.get("selling_point")):
            #     with open(f"../../../../pythonProject/{item.get('asin')}.html", "w", encoding="utf-8")as f:
            #         f.write(response.text)
        logging.info(f"详情数据 {item}")
        yield item

    def err_parse(self, failure, **kwargs):
        """Errback: report a failed download as an ``error_asin`` item.

        Args:
            failure: Failure whose ``request.meta`` carries ``asin`` and
                ``site`` of the request that failed.

        Yields:
            dict with ``error_asin=True``, ``asin`` and ``site``.  Here
            ``site`` is the short code from the request meta, whereas
            ``parse`` emits the long site name.
        """
        request_meta = failure.request.meta
        item = {
            "error_asin": True,
            "asin": request_meta.get("asin"),
            "site": request_meta.get("site"),
        }
        logging.info(f"爬取失败asin:{item.get('asin')}")
        yield item


if __name__ == '__main__':
    # Convenience entry point: run this spider for the mx site exactly as the
    # `scrapy crawl` command line would.
    cmdline.execute(['scrapy', 'crawl', 'mx_self_asin', '-a', 'site=mx'])

#
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl mx_self_asin -a site=mx > mx_amazon1.log 2>&1 &
# nohup scrapy crawl mx_self_asin -a site=mx  > mx_amazon.log 2>&1 &
# source activate pyspark 
# for i in `ps -ef|grep "scrapy crawl mx_self_asin" |awk '{print $2}' `; do kill -9 $i ; done;

# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl mx_self_asin

# 0 0 * * * cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl mx_self_asin -a site=mx > mx_asin1.log 2>&1 &

# cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl mx_self_asin -a site=mx  > mx_asin1.log 2>&1 &
