import json
import re
import time
import scrapy
import logging
import random
import sys, os
from lxml import etree
from scrapy import cmdline, signals
from urllib.parse import urlparse
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# Amazon: fetch daily-updated reviews
from scrapy.exceptions import DontCloseSpider
from scrapy.utils.project import get_project_settings
from amazon_spider.utils.common import md5
from amazon_spider.utils.read_db_data import ReadCookie
from amazon_spider.utils.utils import time_ch


class CommentDaySpider(scrapy.Spider):
    """Spider ``redis_day_comment``: fetches Amazon product reviews per ASIN.

    Requests are scheduled and de-duplicated through the Redis-backed
    scheduler configured in ``custom_settings``; scraped review batches go to
    ``DayAmazonSpiderPipeline``.
    """
    name = 'redis_day_comment'
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): allowed_domains is normally a spider attribute, not a
        # setting — presumably this entry has no effect here; confirm.
        'allowed_domains': ['amazon.com'],
        # # When enabled, fingerprints and the request queue are kept in Redis permanently (default False)
        # 'SCHEDULER_PERSIST': True,
        # Whether to flush the crawl queue when the spider restarts
        'SCHEDULER_FLUSH_ON_START': False,
        # Use the Redis-backed scheduler to store the request queue
        'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # Make every spider de-duplicate requests through Redis
        'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # Connection details for the seed queue
        'REDIS_URL': None,
        'REDIS_HOST': '192.168.10.224',
        'REDIS_PORT': 6379,
        'REDIS_PARAMS': {
            # 'password': 'HCL1zcUgQesaaXNLbL37O5KhpSAy0c',
            'db': 0
        },
        # # 6379
        # 'FILTER_URL': None,
        # 'FILTER_HOST': '127.0.0.1',
        # 'FILTER_PORT': 6379,
        # # 6379
        # 'FILTER_DB': 0,
        # Retry settings
        'RETRY_ENABLED': True,
        # Default queue key; from_crawler overwrites it with "<site>_day_seed".
        'SCHEDULER_QUEUE_KEY': "day_seed",
        'RETRY_TIMES': 5,  # number of retries to allow
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # invoked in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # invoked in descending order
            'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipelines.DayAmazonSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Prepare per-site state: column names, cookie pool, base URL, headers.

        Blocks until the cookie pool has been read successfully, retrying
        every 30 seconds on database connection errors or timeouts.

        Args:
            site: Marketplace code; one of ``us``, ``de``, ``uk``, ``it``,
                ``es``, ``fr`` or ``mx``.

        Raises:
            ValueError: If ``site`` is not one of the supported codes.
        """
        super(CommentDaySpider, self).__init__()
        # NOTE(review): xpath_res emits 21 values per review and has no
        # 'variat_asin' entry, while this list names 22 columns — confirm how
        # the pipeline maps values to columns.
        self.cols_list = ['asin', 'parent_asin', 'variat_asin', 'title', 'content', 'is_vp', 'model', 'rating', 'agree_num', 'img_num', 'img_url', 'is_video', 'video_url', 'comment_url', 'user_name', 'user_img', 'country', 'user_page', 'is_earns_commissions', 'comment_time', 'page', 'md5_unique']
        self.site = site

        # Fail fast on an unknown site instead of hitting an AttributeError
        # on self.url_ further down.
        site_urls = {
            'us': 'https://www.amazon.com',
            'de': "https://www.amazon.de",
            'uk': "https://www.amazon.co.uk",
            'it': "https://www.amazon.it",
            'es': "https://www.amazon.es",
            'fr': "https://www.amazon.fr",
            'mx': "https://www.amazon.com.mx",
        }
        if site not in site_urls:
            raise ValueError(
                f"unsupported site {site!r}; expected one of {sorted(site_urls)}")
        self.url_ = site_urls[site]

        # Configure logging before the first logging.info() call: that call
        # would otherwise auto-configure the root logger and make this a no-op.
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)

        # Load the cookie pool; each entry is [row[1], row[0]] of the table
        # returned by ReadCookie.
        while True:
            try:
                rows = ReadCookie(self.site).get_cookie().values
                self.cookie_list = [[row[1], row[0]] for row in rows]
            except OperationalError as e:
                logging.info(f'get seeds failure in link sleep 30s{e}')
            except FunctionTimedOut as e:
                logging.info(f'get seeds time out sleep 30s{e}')
            else:
                logging.info("cookie 列表更新")
                break
            # Back off as the log message promises rather than retrying in a
            # tight loop against the database.
            time.sleep(30)

        host = urlparse(self.url_).hostname
        self.headers = {
            'host': host,
            'authority': host,
            'accept': 'text/html,*/*',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': self.url_,
            'referer': f'{self.url_}/Boao-Ausschnitt-Gepolstert-Schlaf-BH-elastischen/product-reviews/B08CNLH1KC/ref=cm_cr_getr_d_paging_btm_next_5?ie=UTF8&reviewerType=all_reviews&pageNumber=5',
            'x-requested-with': 'XMLHttpRequest',
            'accept-encoding': 'gzip, deflate, br',
        }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider with project settings overlaid by ``custom_settings``.

        The scheduler queue key is set to ``"<site>_day_seed"`` (``site``
        defaults to ``us``) on the class-level ``custom_settings`` dict before
        the merged settings are installed on the crawler. The spider's
        ``close`` and ``spider_idle`` handlers are then connected to the
        matching crawler signals.
        """
        site_code = kwargs.get('site', 'us')
        cls.custom_settings["SCHEDULER_QUEUE_KEY"] = f"{site_code}_day_seed"

        merged_settings = get_project_settings()
        merged_settings.update(cls.custom_settings)
        crawler.settings = merged_settings

        instance = super(CommentDaySpider, cls).from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in handlers:
            crawler.signals.connect(handler, signal)
        return instance

    def update_headers(self):
        """Advance ``self.headers`` to the next variant of a fixed rotation.

        Variants are told apart by which of ``x-requested-with``,
        ``accept-encoding`` and ``sec-ch-ua-platform-version`` are present:
        both -> neither -> x-requested-with only -> accept-encoding only ->
        accept-encoding plus platform version -> back to both.
        """
        hdrs = self.headers
        has_xrw = bool(hdrs.get("x-requested-with"))
        has_encoding = bool(hdrs.get("accept-encoding"))

        if has_xrw and has_encoding:
            for key in ('x-requested-with', 'accept-encoding'):
                del hdrs[key]
            logging.info("headers 方案1")
            return

        if not has_xrw and not has_encoding:
            hdrs['x-requested-with'] = "XMLHttpRequest"
            logging.info("headers 方案2")
            return

        if has_xrw:
            # Only x-requested-with is set: swap it for accept-encoding.
            del hdrs['x-requested-with']
            hdrs['accept-encoding'] = "gzip, deflate, br"
            logging.info("headers 方案3")
            return

        # From here on only accept-encoding is set.
        if not hdrs.get("sec-ch-ua-platform-version"):
            hdrs['sec-ch-ua-platform-version'] = '^\\^10.0.0^\\^'
            logging.info("headers 方案4")
            return

        # Platform-version variant was the last one: restore the initial set.
        hdrs['x-requested-with'] = "XMLHttpRequest"
        hdrs['accept-encoding'] = "gzip, deflate, br"
        del hdrs['sec-ch-ua-platform-version']
        logging.info("headers 初始方案")

    def spider_idle(self):
        """Idle-signal handler: keep the spider alive while the queue is empty.

        Between 09:09:00 and 09:10:00 UTC the cookie pool is re-read (failures
        are only logged). The handler then sleeps 30 seconds and raises
        ``DontCloseSpider``.

        Raises:
            DontCloseSpider: Always.
        """
        utc_clock = time.strftime("%H:%M:%S", time.gmtime(int(time.time())))
        # Refresh the cookie pool inside the daily one-minute window.
        if "09:09:00" <= utc_clock <= "09:10:00":
            try:
                rows = ReadCookie(self.site).get_cookie().values
                self.cookie_list = [[row[1], row[0]] for row in rows]
                logging.info("cookie 列表更新")
            except OperationalError as e:
                logging.info(f'get seeds failure in link sleep 30s{e}')
            except FunctionTimedOut as e:
                logging.info(f'get seeds time out sleep 30s{e}')
        logging.info('no task sleep 30s----------')
        time.sleep(30)
        raise DontCloseSpider()

    def parse(self, response, **kwargs):
        """Handle the first review page of an ASIN and schedule further pages.

        Reads the "with reviews" count from the page text, compares it with
        ``history_comment_count`` from the request meta, yields the reviews
        found on this page and then requests up to a few more pages sorted by
        most recent.

        Yields:
            The dict built by ``xpath_res`` for page 1 (with
            ``save_comment_num`` set to ``True``), followed by one
            ``scrapy.FormRequest`` per additional page.
        """
        meta = response.meta
        asin = meta.get("asin")
        # A missing history count is treated as 0 instead of crashing int(None).
        history_count = int(meta.get("history_comment_count") or 0)

        found = re.findall("(rating|ratings|Gesamtbewertungen|totales|total|totali|totale), (.*?) (with|mit|con|avec) (recensioni|avis|review|Rezensionen|reseñas|con)", response.text)
        # NOTE(review): as before, an unmatched count is the int 0 and a
        # matched one is a digit string, so the '"0"' branch below only fires
        # when the page literally reports zero — confirm this is intended.
        comment_count = 0
        if found:
            raw_count = found[0][1]
            # Drop thousands separators plus any space or invisible
            # (non-printable) character used for digit grouping.
            digits = "".join(
                ch for ch in raw_count if ch.isprintable() and ch not in " ,.'")
            if re.fullmatch(r"[0-9]+", digits):
                comment_count = digits
            else:
                logging.warning("unparseable review count %r for asin %s", raw_count, asin)

        if history_count > int(comment_count):
            logging.info(f'评论数小于历史评论数,历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_count}')
            return

        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        logging.info(f'历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_count}')
        if comment_count == "0":
            logging.info(f'页面变狗：{response.meta.get("asin")}')
            return

        # Un-escape the payload so the review markup can be parsed as HTML.
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace(
            '\\"', '"')
        total = int(comment_count)
        # ceil(total / 20), but never fewer than one page.
        page_count = max(1, -(-total // 20))
        meta["pageNumber"] = str(1)
        meta["page_count"] = page_count
        meta["comment_num"] = comment_count
        item = self.xpath_res(r, response)
        item["save_comment_num"] = True
        yield item

        # Beyond four pages only pages 2 and 3 are requested; otherwise all
        # remaining pages are.
        last_page = 3 if page_count > 4 else page_count
        for page in range(2, last_page + 1):
            # cookie_list entries are [cookie, id] pairs; json.loads needs the
            # cookie string itself, not the pair.
            entry = random.choice(self.cookie_list)
            cookie_json = entry[0] if isinstance(entry, (list, tuple)) else entry
            data = {
                'sortBy': 'recent',
                'reviewerType': 'all_reviews',
                'pageNumber': str(page),
                'shouldAppend': 'undefined',
                'deviceType': 'desktop',
                'canShowIntHeader': 'undefined',
                'reftag': 'cm_cr_arp_d_paging_btm_next_1',
                'pageSize': '20',
                'asin': asin,
                'scope': 'reviewsAjax0'
            }
            yield scrapy.FormRequest(
                url=url + f'#asin={data.get("asin")}#pageNumber={str(page)}',
                headers=self.headers,
                cookies=json.loads(cookie_json),
                formdata=data,
                callback=self.parse_data,
                errback=self.err_parse,
                meta=dict(meta, pageNumber=str(page)),
                dont_filter=True,
            )

    def parse_data(self, response):
        """Callback for paginated review requests.

        Un-escapes the response text and returns the dict produced by
        ``xpath_res``.
        """
        cleaned = response.text
        substitutions = (
            ('\\n', ''),
            ('data-hook=\\"review\\"', 'data-hook="review"'),
            ('\\"', '"'),
        )
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)
        return self.xpath_res(cleaned, response)

    def xpath_res(self, r, response):
        """Extract every review found in an HTML fragment.

        A review whose date line is missing or cannot be split is skipped with
        a warning instead of aborting the whole page; other missing fields
        fall back to an empty string.

        Args:
            r: Un-escaped HTML text containing the review blocks.
            response: The response the text came from; ``asin``,
                ``pageNumber`` and ``comment_num`` are read from its meta.

        Returns:
            dict with ``comment_count`` (meta ``comment_num``), ``data`` (one
            list of field values per review, in item insertion order, ending
            with the md5 fingerprint) and ``asin``.
        """
        tree = etree.HTML(r) if r else None
        review_nodes = [] if tree is None else tree.xpath("//div[@class='a-section review aok-relative']")
        parent_asin = response.meta.get("asin")
        page_number = response.meta.get("pageNumber")

        items = []
        for node in review_nodes:
            # Review date (the country part of the line is discarded).
            date_nodes = node.xpath(".//span[@data-hook='review-date']//text()")
            date_text = self._review_date_text(date_nodes[0]) if date_nodes else None
            if date_text is None:
                logging.warning("skip review without a parsable date line, asin=%s: %r",
                                parent_asin, date_nodes[:1])
                continue

            title_nodes = node.xpath(".//*[@data-hook='review-title']/span/text()")
            title = title_nodes[0].strip() if title_nodes else ""
            # Reviewer name
            name_nodes = node.xpath(".//span[@class='a-profile-name']//text()")
            user_name = name_nodes[0].strip() if name_nodes else ""
            # Reviewer avatar URL
            user_img = node.xpath(".//div[@class='a-profile-avatar']/img/@data-src")
            content = node.xpath(".//span[@data-hook='review-body']/span/text()")
            # Helpful-vote count
            agree_num = self._helpful_votes(
                node.xpath(".//span[@data-hook='helpful-vote-statement']//text()"))
            # Verified purchase flag: 1 when the badge is present, else 2
            verified_badge = node.xpath(".//span[@class='a-size-mini a-color-state a-text-bold']//text()")
            is_vp = 1 if verified_badge else 2
            # Images attached to the review
            comment_img = node.xpath(".//div[@class='a-section a-spacing-top-mini cr-lightbox-image-thumbnails']/img/@src")
            # Video URLs attached to the review
            video_url_list = node.xpath('.//input[contains(@class,"video-url")]/@value')
            video_url = ';'.join(video_url_list) if video_url_list else None
            # Link to the reviewer's profile page
            user_page = node.xpath(".//div[@data-hook='genome-widget']/a/@href")
            # Star rating
            rating_nodes = node.xpath(".//span[@class='a-icon-alt']//text()")
            rating = self._star_rating(rating_nodes[0]) if rating_nodes else ""
            # Link to the review itself
            comment_url = node.xpath(".//*[@data-hook='review-title']/@href")
            # Variant ASIN (from the format strip link) and variant description
            variant_href = node.xpath(".//a[@data-hook='format-strip']/@href")
            variat_asin = variant_href[0].split("/")[-2] if variant_href else ""
            model_nodes = node.xpath(".//a[@data-hook='format-strip']//text()")

            # Key order matters: values are emitted positionally below.
            item = {
                "asin": variat_asin or parent_asin,
                "parent_asin": parent_asin,
                "title": title,
                "content": " ".join(content).strip() if content else "",
                # Verified purchase (1 yes, 2 no)
                "is_vp": str(is_vp),
                # Variant such as color, size, style
                "model": " ".join(model_nodes).strip() if model_nodes else "",
                # Star rating
                "rating": rating,
                # Helpful-vote count
                "agree_num": str(agree_num),
                # Number of review images
                "img_num": str(len(comment_img)),
                # Image URLs
                "img_url": ",".join(comment_img),
                # Has video (1 yes, 2 no)
                "is_video": str(1 if video_url else 2),
                # Video URLs
                "video_url": video_url,
                # Review link
                "comment_url": comment_url[0] if comment_url else None,
                # Reviewer name
                "user_name": user_name,
                # Reviewer avatar URL
                "user_img": user_img[0] if user_img else "",
                # Marketplace the review was scraped from
                "country": self.site,
                # Reviewer profile URL
                "user_page": user_page[0] if user_page else "",
                # Influencer-program flag (1 yes, 2 no); not populated here
                "is_earns_commissions": "",
                # Review time
                "comment_time": time_ch(self.site, date_text),
                "page": page_number,
            }
            # Fingerprint used to recognise the same review across crawls.
            s_md5 = title+item.get("comment_time", "")+item.get("user_name", "")+item.get("parent_asin", "")+item.get("content", "")+item.get("model", "")+item.get("is_vp", "")
            item["md5_unique"] = md5(s_md5)
            items.append(list(item.values()))

        msgs = {
            "comment_count": response.meta.get("comment_num"),
            "data": items,
            "asin": parent_asin,
        }
        logging.info(f"成功爬取数据为: {items}")
        return msgs

    def _review_date_text(self, raw):
        """Cut the date portion out of a review-date line; None if it cannot be split."""
        try:
            if self.site == "de":
                parts = raw.split(".")
                return (parts[0][-2:] + "." + parts[1]).strip()
            if self.site == "es":
                return raw.split("el ")[-1].strip()
            if self.site == "fr":
                return raw.split("le ")[1].strip()
            if self.site == "it":
                return raw.split("il ")[1].strip()
        except IndexError:
            return None
        # Remaining sites use the "... on <date>" layout. Split on the last
        # " on " so a country name containing "on" cannot truncate the date.
        # NOTE(review): 'mx' lands here too; if its pages are not in English
        # there is no " on " marker and the review is skipped — confirm.
        if " on " not in raw:
            return None
        return raw.rsplit(" on ", 1)[1].strip()

    @staticmethod
    def _helpful_votes(statement_nodes):
        """Return the helpful-vote count: 0 without a statement, 1 for a spelled-out singular."""
        if not statement_nodes:
            return 0
        found = re.search(r"\d+(?:[.,]\d+)*", statement_nodes[0])
        if found is None:
            # Presumably a statement without digits is the singular form
            # ("One person ..."), as the original English-only check assumed.
            return 1
        return re.sub(r"\D", "", found.group())

    @staticmethod
    def _star_rating(alt_text):
        """Return the leading integer of the star-rating text, e.g. '5' from '5.0 out of 5 stars'."""
        found = re.search(r"\d+", alt_text)
        if found:
            # Works for both '.' and ',' decimal separators.
            return found.group()
        return alt_text.split(".")[0].replace("stars", "").strip()

    def err_parse(self, failure, **kwargs):
        """Errback: log the failure, rotate the headers and re-queue the request.

        Args:
            failure: The failure passed to the errback; its ``request``
                attribute is the request that failed.
        """
        # The request that failed
        request = failure.request
        self.logger.error(f"error msg {failure.getErrorMessage()}")
        self.logger.error(f"Err_Parse request: {request.url}")
        self.update_headers()
        # Build a copy carrying the rotated headers. Assigning the spider's
        # dict to request.headers would replace the Headers object with a
        # plain dict shared by every request, so any later change made to one
        # request's headers would leak into self.headers.
        retry_request = request.replace(headers=self.headers)
        self.logger.error(f"失败时更换headers{self.headers}")
        # Hand the failed request back to the engine so it is scheduled again
        self.crawler.engine.crawl(retry_request, spider=self)


if __name__ == '__main__':
    # Debug entry point: same as running
    # `scrapy crawl redis_day_comment -a site=es` from the shell.
    cmdline.execute(['scrapy', 'crawl', 'redis_day_comment', '-a', 'site=es'])



# nohup scrapy crawl redis_day_comment -a site=de  > amazon_de_day1.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl redis_day_comment" |awk '{print $2}' `; do kill -9 $i ; done;

# 0 0 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl test  > day_amazon1.log 2>&1 &

# ps -ef|grep "scrapy crawl redis_day_comment"
