# coding:utf-8
import os
import sys
import json
import time
import redis
import scrapy
import random
import logging
import pandas as pd
from urllib.parse import urlparse
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import spop, sadd
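# MX site: fetch the product description from detail pages.
# NOTE(review): this comment looks carried over from another spider; nothing in this module reads product descriptions — confirm and update.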
from amazon_spider.db.mysql_db import df_to_sql
from amazon_spider.spiders.yswg_spider import SourceSpider


class AmazonBsrCateSpider(SourceSpider):
    """Spider ``bsr_list_spider``: crawls the list pages named by Redis seeds.

    Seeds are popped from the ``{site}_upc_cate_video`` Redis key
    (``get_seeds``), each seed's ``bsr_url`` is requested (``spider_idle``),
    and the entries found in the page's ``data-client-recs-list`` attribute
    are written to the ``asin_video_bsr_spider`` table (``parse``). Failed
    requests are pushed back onto the same Redis key (``err_parse``).
    """
    name = 'bsr_list_spider'
    # Per-spider overrides of the project-wide Scrapy settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): Scrapy documents ``allowed_domains`` as a spider
        # attribute, not a setting, so this entry may have no effect — confirm.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis-backed scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate requests through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Connection info for the seed queue
        # NOTE(review): a real-looking Redis password was committed in this
        # commented-out block; it is redacted here and should be rotated.
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the built-in RetryMiddleware is disabled in
        # DOWNLOADER_MIDDLEWARES below, so these RETRY_* values only matter if
        # one of the project middlewares reads them — confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries to attempt
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # None disables Scrapy's built-in retry middleware for this spider.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # Called in increasing order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # NOTE(review): the next three entries share priority 490, so their
            # relative order should not be relied on — confirm this is intended.
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Called in decreasing order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is enabled; ``parse`` writes to the database itself.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.bsr_video_num_pipe.AmazonUpcSpiderPipeline': 230,
        },
        # Both schemes are routed through the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Set up per-run state for one marketplace.

        Args:
            site: Marketplace code. It is interpolated into the Redis key
                (``{site}_upc_cate_video``) that seeds are popped from and
                failed seeds are pushed back to.
        """
        super().__init__()
        self.site = site
        # ``update_cookies`` is not defined in this class (presumably inherited
        # from SourceSpider); ``site`` is assigned before it is called.
        self.update_cookies()
        self.sleep_count = 0
        self.col = ['asin', 'page_state', 'upc']
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(name)s %(levelname)s %(message)s',
        )

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider via the parent factory and subscribe it to signals.

        Args:
            crawler: The Scrapy crawler creating the spider.
            *args: Positional spider arguments, forwarded to the parent factory.
            **kwargs: Keyword spider arguments, forwarded to the parent factory.

        Returns:
            The spider instance, with ``close`` connected to ``spider_closed``
            and ``spider_idle`` connected to the ``spider_idle`` signal.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        subscribe = crawler.signals.connect
        subscribe(instance.close, signal=signals.spider_closed)
        # The idle handler is what keeps feeding seeds into the running crawl.
        subscribe(instance.spider_idle, signal=signals.spider_idle)
        return instance

    def get_seeds(self):
        """Pop a batch of seeds from Redis, retrying until the pop succeeds.

        Calls ``spop`` on the ``{site}_upc_cate_video`` key with a count of 25.
        A Redis connection error is retried after a 5 second sleep; a
        ``FunctionTimedOut`` is retried immediately. There is no retry limit.

        Returns:
            Whatever ``spop`` returned for the batch.
        """
        seed_key = f'{self.site}_upc_cate_video'
        while True:
            try:
                batch = spop(seed_key, 25)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"get seeds error T_T --> {e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"get seeds timeout T_T -->{e}")
            else:
                logging.info("get seeds ok o-_-o")
                return batch

    def spider_idle(self, spider):
        logging.debug(f'IDLE------------------{self.site}')
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # 更新cookies
        if str_time >= "09:09:00" and str_time <= "09:10:00":
            self.update_cookies()
        seeds = self.get_seeds()
        if seeds:
            for i in seeds:
                job = json.loads(i)
        # try:
        #     df = ReadDb("us").read_db_video_bsr_cate_asin(self.site)
        # except OperationalError as e:
        #     logging.info(f'get seeds failure in link sleep 30s{e}')
        #     time.sleep(30)
        #     raise DontCloseSpider
        # except FunctionTimedOut as e:
        #     logging.info(f'get seeds time out sleep 30s{e}')
        #     time.sleep(30)
        #     raise DontCloseSpider

        # if df.shape[0] > 0:
        #     for job in df.values:
        #         url = f"{self.site_url.get(job[2])}{job[4]}"
                url = f"{self.site_url.get(job['site'])}{job['bsr_url']}"
                headers = {
                    'Connection': 'close',
                    'authority': urlparse(url).hostname,
                    'accept': 'text/html,*/*',
                    'accept-language':  'zh-CN,zh;q=0.9',
                    'cache-control': 'no-cache',
                    'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'origin': url,
                    'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
                }

                meta = {
                    # "use_aiohttp": True,
                    "asin": job["asin"],
                    "site": job["site"],
                    "rank": job["rank"],
                    "bsr_url": job["bsr_url"],
                    "choice_header": True,
                    "date_info": "",
                    # "cookie_id": cookies[0],
                    # "amazon_proxy": True,
                    "cookiejar": int(time.time()),
                }
                meta = self.random_r(meta)
                h_key = random.choice(list(self.h_dict))
                headers[h_key] = self.h_dict.get(h_key)
                logging.info(f'随机添加headers：{h_key}')
                headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
                cookies = self.json_cookies(meta.get('site'))
                print(cookies)
                cookies.update({
                    'csm-sid': '916-1904410-6680838',
                    'x-amz-captcha-1': '1706092664884599',
                    'x-amz-captcha-2': '5OU/Q7DyDmtg9QGCbivNXg=='
                })
                self.crawler.engine.crawl(scrapy.Request(url=url, cookies=cookies, headers=headers,
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta), self)
        else:
            # meta = {
            #     'handle_httpstatus_all': True
            # }
            # request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_, dont_filter=True, meta=meta)
            # self.crawler.engine.crawl(request, spider=self)
            # logging.info('no task sleep 30s')
            # time.sleep(30)
            # raise DontCloseSpider()
            raise

    def page_state(self, response):
        asin = response.request.meta.get("asin")
        site = response.request.meta.get("site")
        if ("Die eingegebene Webadresse ist keine" in response.text) \
                    or ("The Web address you entered is not a functioning" in response.text) \
                    or ("saisie n'est pas une page fonctionnelle" in response.text) \
                    or ("web inserito non è una pagina funzionante" in response.text) \
                    or ("web que has especificado no es" in response.text) \
                    or ("Page Not Found" in response.text) \
                    or ("We are sorry! This Gift Card is not available" in response.text) \
                    or ("500 - An error occurred" in response.text):
            item = {
                "error_asin": True,
                "asin": {"state": 4, "asin": asin, "site": site},
            }
            # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"页面为空:{item.get('asin')}")
            return item
        elif ("keywords" in response.url) or ("gp/" not in response.url):
            item = {
                "error_asin": True,
                "asin": {"state": 12, "asin": asin, "site": site},
            }
            logging.info(f"跳转页面:{item.get('asin')}")
            return item
        elif not response.xpath("//div[@class='p13n-desktop-grid']/@data-client-recs-list").getall():
            item = {
                "error_asin": True,
                "asin": {"state": 13, "asin": asin, "site": site},
            }
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return item
        else:
            return None

    def save_db(self, table, df, site, db, retry_delay=1):
        """Write ``df`` through ``df_to_sql``, retrying until it reports success.

        A falsy return value, an ``OperationalError`` or a ``FunctionTimedOut``
        all count as a failed attempt. There is no retry limit; the method
        only returns once ``df_to_sql`` returns a truthy value.

        Args:
            table: Target table name, forwarded to ``df_to_sql``.
            df: DataFrame to write.
            site: Forwarded to ``df_to_sql`` as ``site``.
            db: Forwarded to ``df_to_sql`` as ``db``.
            retry_delay: Seconds to wait after a failed attempt. Previously a
                failure was retried immediately, which spins in a tight loop
                while the database is unreachable. NOTE(review): this is a
                blocking sleep, like the one in ``get_seeds``.
        """
        while True:
            try:
                saved = df_to_sql(table, df, site=site, db=db)
            except OperationalError as e:
                logging.info(f"更新 {db} 数据库 {table} 失败  连接错误{e}")
            except FunctionTimedOut as e:
                logging.info(
                    f"更新 {db} 数据库 {table} -超时-{e}---{df.shape}---------{df.head()}")
            else:
                if saved:
                    # An empty frame has no first row; indexing it used to raise
                    # IndexError here *after* the write had already succeeded.
                    first_row = df.values[0] if len(df) else None
                    logging.info(
                        f"更新 {db} 数据库 {table} -----{df.shape}---------{df.head()} {first_row}")
                    return
                logging.info(f"更新 {db} 数据库 {table} -----失败")
            time.sleep(retry_delay)

    def parse(self, response, **kwargs):
        """Extract the listed entries from a page and store them.

        If ``page_state`` flags the response, its error item is yielded and
        nothing else happens. Otherwise the JSON in the
        ``data-client-recs-list`` attribute of the ``p13n-desktop-grid`` div is
        decoded, one row is built per entry (entry ``id`` and
        ``metadataMap['render.zg.rank']`` plus the seed fields from
        ``response.meta``) and the rows are written to the
        ``asin_video_bsr_spider`` table with ``df_to_sql``.

        Args:
            response: The downloaded page.
            **kwargs: Accepted for callback compatibility; unused.

        Yields:
            The error item from ``page_state`` when the page is unusable.
        """
        if error_item := self.page_state(response):
            yield error_item
            return

        # Evaluate the XPath once; ``page_state`` has already confirmed the
        # attribute exists.
        grid_json = response.xpath("//div[@class='p13n-desktop-grid']/@data-client-recs-list").get()
        meta = response.meta
        rows = [
            {
                "asin": entry['id'],
                "page_row": entry['metadataMap']['render.zg.rank'],
                "source_asin": meta.get("asin"),
                "rank": meta.get("rank"),
                "bsr_url": meta.get("bsr_url"),
                "site": meta.get("site"),
                "state": 1,
            }
            for entry in json.loads(grid_json)
        ]
        if not rows:
            # Nothing to store; skip the database round-trip for an empty list.
            logging.info(f"no entries in data-client-recs-list for {meta.get('asin')}")
            return

        df = pd.DataFrame(rows)
        logging.info(f"df --->{df.head()}")
        # NOTE(review): ``site`` is hard-coded to "us" here even though the
        # spider takes a ``site`` argument — confirm this is intended.
        if not df_to_sql("asin_video_bsr_spider", df, site="us", db="mysql"):
            # ``save_db`` treats a falsy result as a failed write; surface it
            # here too instead of dropping it silently.
            logging.warning(f"df_to_sql reported failure for {meta.get('asin')}, {len(rows)} rows not stored")

    def err_parse(self, failure, **kwargs):
        """Errback: log the failure and push its seed back onto the Redis key.

        The seed is rebuilt from the failed request's meta (``asin``, ``site``,
        ``rank``, ``bsr_url``), JSON-encoded and added to
        ``{site}_upc_cate_video`` with ``sadd``. A Redis connection error is
        retried after a 5 second sleep, a ``FunctionTimedOut`` immediately;
        there is no retry limit.

        Args:
            failure: The failure passed to the errback; its ``request``
                attribute carries the original request.
            **kwargs: Accepted for errback compatibility; unused.
        """
        logging.info(f"error______ {failure.getErrorMessage()}, {failure.request.meta.get('asin')}")

        failed_meta = failure.request.meta
        seed_fields = ("asin", "site", "rank", "bsr_url")
        payload = json.dumps({field: failed_meta.get(field) for field in seed_fields})

        while True:
            try:
                sadd(f"{self.site}_upc_cate_video", payload, use_md5=False)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"push {self.site}_upc_cate_video ConnectionError，afresh push --> T_T {e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"push {self.site}_upc_cate_video time out --> T_T {e}")
            else:
                logging.info(f"push {self.site}_upc_cate_video succeed")
                return


if __name__ == '__main__':
    # Run this spider in-process, equivalent to the shell command
    # ``scrapy crawl bsr_list_spider -a site=us``.
    cmdline.execute(['scrapy', 'crawl', 'bsr_list_spider', '-a', 'site=us'])


# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl bsr_video_spider -a site=us  > mx_amazon1.log 2>&1 &
# nohup scrapy crawl bsr_list_spider -a site=us  > upc_amazon1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl bsr_video_spider" |awk '{print $2}' `; do kill -9 $i ; done;

# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl mx_self_asin

# 0 0 * * * cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl mx_self_asin -a site=mx > mx_asin1.log 2>&1 &

# cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl mx_self_asin -a site=mx  > mx_asin1.log 2>&1 &
