# coding:utf-8
import os
import re
import sys
import json
import time
import nltk
import scrapy
import random
import logging
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.items import detail_inner_item
from amazon_spider.db.mysql_db import get_country_engine
from amazon_spider.utils.common import field_length_dispose
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie


class ImgSearchSpider(scrapy.Spider):
    """Scrapy spider that requests Amazon ``/dp/<asin>`` pages for one marketplace.

    Seeds are read from the database whenever the spider goes idle (see
    ``spider_idle``); the settings below override the project settings for
    this spider only.
    """
    name = 'img_search_spider'
    # Per-spider overrides of the project-wide Scrapy settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): Scrapy reads ``allowed_domains`` from a spider class
        # attribute, not from settings, so this key is presumably ignored
        # here -- confirm before relying on it.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider is restarted
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Enable the Redis-backed scheduler that stores the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate requests through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection details
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',  # NOTE(review): a literal credential was committed here; rotate it and load it from secure config
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the built-in RetryMiddleware is disabled in
        # DOWNLOADER_MIDDLEWARES below, so these values presumably only take
        # effect if a custom middleware reads them -- confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries to attempt
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.amazon_upc_pip.AmazonUpcSpiderPipeline': 230,
        },
        # Route both schemes through the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us', *args, **kwargs):
        """Prepare per-marketplace state: cookies, base URL, headers, stop words.

        Args:
            site: Marketplace code; one of ``us``, ``de``, ``uk``, ``it``,
                ``es``, ``fr`` or ``mx``.
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__`` so
                additional ``-a key=value`` spider arguments do not raise.

        Raises:
            ValueError: If ``site`` is not one of the supported codes.
        """
        super(ImgSearchSpider, self).__init__(*args, **kwargs)
        # Configure logging first so the start-up messages below are formatted.
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)

        site_urls = {
            'us': 'https://www.amazon.com',
            'de': "https://www.amazon.de",
            'uk': "https://www.amazon.co.uk",
            'it': "https://www.amazon.it",
            'es': "https://www.amazon.es",
            'fr': "https://www.amazon.fr",
            'mx': "https://www.amazon.com.mx",
        }
        # Fail fast with a clear message instead of an AttributeError on
        # ``self.url_`` further down.
        if site not in site_urls:
            raise ValueError(
                f"unsupported site {site!r}; expected one of {sorted(site_urls)}")
        self.site = site
        self.url_ = site_urls[site]

        # Block until the cookie table can be read. Each row is stored as
        # [second column, first column]; spider_idle uses element 0 as the
        # cookie id and element 1 as the JSON cookie payload.
        while True:
            try:
                self.cookie_list = [[row[1], row[0]]
                                    for row in ReadCookie(self.site).get_cookie().values]
                logging.info("cookie 列表更新")
                break
            except OperationalError as e:
                logging.info(f'get seeds failure in link sleep 30s{e}')
            except FunctionTimedOut as e:
                logging.info(f'get seeds time out sleep 30s{e}')
            # Back off as the log messages announce, rather than hammering
            # the database in a tight loop.
            time.sleep(30)

        self.headers = {
            'Connection': 'close',
            'authority': urlparse(self.url_).hostname,
            'accept': 'text/html,*/*',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': self.url_,
            'referer': f'{self.url_}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        # Site code -> string that looks like a postal code; not read anywhere
        # in this class (presumably consumed by middlewares -- confirm).
        self.country = {
            "us": '10010',
            "de": '10115',
            "uk": 'W1S 3PR',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607'
        }
        # Locale value per key; the keys look like Amazon language-cookie
        # names (assumption -- confirm against the consumer of this mapping).
        self.country_lc_main = {
            "lc_main": 'en_US',
            "lc-acbde": 'de_DE',
            "lc-acbuk": 'en_GB',
            "lc-acbit": 'it_IT',
            "lc-acbes": 'es_ES',
            "lc-acbfr": 'fr_FR',
            "lc-acbmx": 'es_MX'
        }
        # DataFrame of stop words applied to titles in ``parse``.
        self.not_like_keyword = self.get_amazon_temu_not_likes()
        self.col = ['asin', 'page_state', 'upc']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider via Scrapy and subscribe it to crawler signals.

        The spider's ``close`` is connected to ``spider_closed`` and its
        ``spider_idle`` to ``spider_idle``, in that order.

        Returns:
            The spider instance produced by the parent ``from_crawler``.
        """
        instance = super(ImgSearchSpider, cls).from_crawler(crawler, *args, **kwargs)
        subscriptions = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_idle(self, spider):
        """Queue the next batch of product-page requests when the spider is idle.

        Reads a batch of seeds from the database and schedules one
        ``/dp/<asin>`` request per row, each with a randomly chosen cookie set.
        When the batch is empty the handler simply returns, which lets Scrapy
        close the idle spider.

        Args:
            spider: The spider passed by the ``spider_idle`` signal (unused;
                ``self`` is the same spider).

        Raises:
            DontCloseSpider: When the seed query fails or times out, so the
                spider stays alive and the handler runs again after a 30 s
                pause.
        """
        logging.debug(f'IDLE------------------{self.site}')
        utc_clock = time.strftime("%H:%M:%S", time.gmtime(int(time.time())))
        # Refresh the cookie pool if this handler fires inside the
        # 09:09:00-09:10:00 UTC window; on failure the previous pool is kept.
        if "09:09:00" <= utc_clock <= "09:10:00":
            try:
                self.cookie_list = [[row[1], row[0]]
                                    for row in ReadCookie(self.site).get_cookie().values]
                logging.info("cookie 列表更新")
            except OperationalError as e:
                logging.info(f'get seeds failure in link sleep 30s{e}')
            except FunctionTimedOut as e:
                logging.info(f'get seeds time out sleep 30s{e}')

        try:
            df = ReadDb(self.site).read_db_temu_img_search_keyword()
        except OperationalError as e:
            logging.info(f'get seeds failure in link sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider
        except FunctionTimedOut as e:
            logging.info(f'get seeds time out sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider

        if df.shape[0] == 0:
            # Previously a bare ``raise`` sat here; with no active exception
            # that produced a RuntimeError on every empty batch. Returning
            # normally lets Scrapy close the spider without a spurious error.
            logging.info('no seeds returned; allowing the spider to close')
            return

        for row in df.values:
            asin = row[0]
            cookies = random.choice(self.cookie_list)
            meta = {
                "curlcffi": True,
                "asin": asin,
                "cookie_id": cookies[0],
                # Requests created within the same second share this key.
                "cookiejar": int(time.time()),
            }
            request = scrapy.Request(url=f"{self.url_}/dp/{asin}",
                                     cookies=json.loads(cookies[1]),
                                     headers=self.headers,
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta)
            # NOTE(review): newer Scrapy releases deprecate the second
            # ``spider`` argument of ``engine.crawl`` -- confirm against the
            # pinned Scrapy version before upgrading.
            self.crawler.engine.crawl(request, self)

    def if_page_state(self, response):
        if ("Page Not Found" in response.text) or (
                "We are sorry! This Gift Card is not available" in response.text) or (
                "500 - An error occurred" in response.text) or ("Sorry! Something went wrong!" in response.text) or \
                len(response.xpath("//div").getall()) < 2:
            item = {
                "error_asin": True,
                "asin": (4, response.request.meta.get("asin")),
            }
            logging.info(f"页面为空:{item.get('asin')}")
            return {"asin": response.meta.get("asin"), "page_state": "狗头页面", "upc": None}
        elif ("keywords" in response.url) or ("dp/" not in response.url) or (
                "ref=" in response.url and "encoding=" in response.url) or (
                response.meta.get("asin") not in response.url):
            item = {
                "error_asin": True,
                "asin": (12, response.request.meta.get("asin")),
            }
            logging.info(f"跳转页面:{item.get('asin')}")
            return {"asin": response.meta.get("asin"), "page_state": "跳转页面", "upc": None}
        elif not response.xpath("//div[@id='dp']").getall():
            item = {
                "error_asin": True,
                "asin": (13, response.request.meta.get("asin")),
            }
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return {"asin": response.meta.get("asin"), "page_state": "跳转页面", "upc": None}
        else:
            return None


    def filter_key_word(self, x):
        if x:
            tokens = nltk.word_tokenize(x.lower())

            word_tags = nltk.pos_tag(tokens)
            word = " ".join([i[0] for i in word_tags])
            del_num = []
            for i, v in enumerate(word_tags):
                if v[1] == "CD":
                    try:
                        if word_tags[i + 1][1] == "NNS":
                            del_num.append([word_tags[i][0], word_tags[i + 1][0]])
                        else:
                            del_num.append([word_tags[i][0]])
                    except:
                        del_num.append([word_tags[i][0]])
                else:
                    continue

            for i in del_num:
                word = word.replace(" ".join(i), "")
            return re.sub(r"(\(.*?\))", "", word)
        else:
            return ""

    def filter_fuhao(self, x):
        """Keep only tokens whose POS tag is on the allow-list below.

        The text is lower-cased, tokenized and POS-tagged with NLTK; tokens
        whose tag is not listed (for example punctuation tags) are dropped.

        Args:
            x: Text to filter; must be a string.

        Returns:
            The surviving tokens joined by single spaces.
        """
        allowed_tags = {
            "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
            "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
            "RB", "RBR", "RBS", "RP", "TO", "UH", "VB", "VBD", "VBG", "VBN",
            "VBP", "VBZ", "WDT", "WP", "WP$", "WRB",
        }
        tagged = nltk.pos_tag(nltk.word_tokenize(x.lower()))
        survivors = []
        for pair in tagged:
            if pair[1] in allowed_tags:
                survivors.append(pair[0])
        return " ".join(survivors)

    def get_amazon_temu_not_likes(self):
        """Load the ``text`` column of table ``amazon_temu_not_likes``.

        The engine for ``self.site`` is now requested once; the earlier extra
        ``get_country_engine`` call discarded its result.

        Returns:
            A pandas DataFrame with a single ``text`` column.
        """
        import pandas as pd  # kept as a function-local import, as before

        query = "SELECT text from amazon_temu_not_likes;"
        return pd.read_sql(query, con=get_country_engine(self.site))

    def parse(self, response, **kwargs):
        """Extract product details and print a cleaned title for the page.

        Unusable pages (see ``if_page_state``) are yielded as status items.
        For usable pages the detail fields are extracted, printed, and a
        ``new_title`` is derived from the title by removing numbers,
        bracketed text, non-word tokens, the brand name and the stop words
        loaded at start-up (plus "silver"). Nothing is yielded for usable
        pages; the result is only printed.

        Fixes relative to the previous version:
          * ``new_title`` is now the filtered string (it was rebound to a
            lambda that was never called, so a function object was printed);
          * a missing brand no longer raises ``AttributeError``;
          * ``site`` falls back to ``self.site`` because the request meta
            built in ``spider_idle`` has no ``site`` key.

        Args:
            response: The downloaded product page.
            **kwargs: Ignored; accepted for Scrapy callback compatibility.

        Yields:
            The status dict from ``if_page_state`` for unusable pages.
        """
        if item := self.if_page_state(response):
            yield item
            return

        extractor = AmazonDetailExtractor(self.site)
        inner_item = detail_inner_item(extractor.run(response))
        # Apply the per-field length limits.
        inner_item = field_length_dispose(inner_item)
        # Normalise every falsy value to None, then default the coupon flag.
        inner_item = {k: v or None for k, v in inner_item.items()}
        inner_item["is_coupon"] = inner_item.get("is_coupon") or "0"
        pprint(inner_item)

        new_title = self.filter_fuhao(self.filter_key_word(inner_item.get("title")))
        brand = inner_item.get("brand")
        if brand:
            new_title = new_title.replace(brand.lower(), "")

        # Stop-word removal; a set gives O(1) membership tests per word.
        stop = {row[0] for row in self.not_like_keyword.values}
        stop.add("silver")
        new_title = " ".join(word for word in new_title.split() if word not in stop)

        print({
            "title": inner_item.get("title"),
            "img_url": inner_item.get("img_url"),
            "site": response.meta.get("site", self.site),
            "new_title": new_title
        })

    def err_parse(self, failure, **kwargs):
        """Errback: log the failed ASIN and emit an error marker item.

        Args:
            failure: The failure passed by Scrapy; its request meta carries
                ``asin``.
            **kwargs: Ignored.

        Yields:
            ``{"error_asin": True, "asin": <asin>}``.
        """
        failed_asin = failure.request.meta.get("asin")
        logging.info(f"爬取失败asin:{failed_asin}")
        yield {"error_asin": True, "asin": failed_asin}
        # A disabled variant used to re-yield ``failure.request`` when the
        # error message mentioned retry exhaustion, a user timeout or a proxy
        # CONNECT tunnel failure; it is intentionally not active.


if __name__ == '__main__':
    # Run this spider for the US marketplace when the file is executed directly.
    cmdline.execute(['scrapy', 'crawl', 'img_search_spider', '-a', 'site=us'])


# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl mx_self_asin -a site=mx > mx_amazon1.log 2>&1 &
# nohup scrapy crawl upc_spider -a site=us  > upc_amazon.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl upc_spider" |awk '{print $2}' `; do kill -9 $i ; done;

# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl mx_self_asin

# 0 0 * * * cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl mx_self_asin -a site=mx > mx_asin1.log 2>&1 &

# cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl mx_self_asin -a site=mx  > mx_asin1.log 2>&1 &
