# coding:utf-8
import os
import re
import sys
import json
import time
import queue
import scrapy
import random
import logging
from urllib.parse import urlparse
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# Amazon seller feedback data collection
from amazon_spider.db.mysql_db import sql_connect
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie


class AmazonFeedbackSpider(scrapy.Spider):
    """Scrapy spider that collects seller feedback counts and storefront listings.

    Work items are read from the database whenever the spider goes idle
    (see ``spider_idle``); each seller profile page is parsed for feedback
    counts and then the seller's storefront search page is parsed for its
    listed products.
    """
    name = 'amazon_feedback'
    # Per-spider overrides of the project-wide Scrapy settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): `allowed_domains` is normally a spider class attribute,
        # not a settings key; placed here it presumably has no effect — confirm.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        # NOTE(review): Scrapy's built-in RetryMiddleware is disabled below, so
        # these RETRY_* keys are presumably read by a project middleware — confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 6,  # number of retry attempts
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # Mapping a middleware to None disables it.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # request path: middlewares are called in increasing order of their number
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.Feedback': 480,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,

            # response path: middlewares are called in decreasing order of their number
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipelines.AmazonFeedbackSpiderPipeline': 230,
        },
        # Both schemes are served by the project's own download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us', *args, **kwargs):
        """Set up per-site configuration, database helpers and the cookie pool.

        Args:
            site: Marketplace code; one of ``us``, ``de``, ``uk``, ``it``,
                ``es``, ``fr`` or ``mx``. Passed on the command line as
                ``-a site=<code>``.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__`` (any additional
                ``-a key=value`` spider arguments).

        Raises:
            ValueError: If ``site`` is not a supported marketplace code.
        """
        site_urls = {
            'us': 'https://www.amazon.com',
            'de': 'https://www.amazon.de',
            'uk': 'https://www.amazon.co.uk',
            'it': 'https://www.amazon.it',
            'es': 'https://www.amazon.es',
            'fr': 'https://www.amazon.fr',
            'mx': 'https://www.amazon.com.mx',
        }
        # Fail fast with a clear message instead of an AttributeError on
        # `self.url_` further down. Validating here also guarantees that the
        # table name interpolated into the SQL below is one of the known codes.
        if site not in site_urls:
            raise ValueError(
                f"unsupported site {site!r}; expected one of {sorted(site_urls)}"
            )

        super().__init__(*args, **kwargs)
        self.site = site
        self.url_ = site_urls[site]
        self.r_db = ReadDb(self.site)
        self.r_ck = ReadCookie(self.site)
        # NOTE(review): the connection is always opened for "us" regardless of
        # `site`; presumably all per-site tables live in that database — confirm.
        self.s = sql_connect("us")
        # Each entry is [row[1], row[0]] of a cookie row; elsewhere in this
        # class entry[0] is used as the cookie id and entry[1] is JSON-decoded
        # into the request cookies.
        self.cookie_list = [[row[1], row[0]] for row in self.r_ck.get_cookie().values]
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)

        # Default request headers; callers should copy rather than mutate them.
        self.headers = {
            'Connection': 'close',
            'authority': urlparse(self.url_).hostname,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': self.url_,
            'referer': self.url_,
        }

        # Per-site postal codes. Not referenced by the methods of this class;
        # presumably read by a middleware or pipeline — confirm before removing.
        self.country = {
            "us": '10010',
            "de": '10115',
            "uk": 'London W1S 3',
            "it": '00185',
            "es": '28001',
            "fr": '75019',
            "mx": '54607'
        }
        # One row holding the current reporting week and its creation time;
        # `parse_seller_num` reads the "week" and "created_time" keys from it.
        self.week = self.s.sql_fetch_one(f"SELECT `week`, created_time FROM {self.site}_search_term;")
        # `qd` and `col` are not used by the methods of this class; kept because
        # other project components may read them from the spider — confirm.
        self.qd = queue.Queue()
        self.col = ['asin', 'asin_state', 'img_url', 'title', 'title_len', 'price', 'rating', 'total_comments',
                    'buy_box_seller_type', 'page_inventory', 'category', 'volume', 'weight', 'rank', 'launch_time',
                    'img_num', 'img_type', 'activity_type', 'one_two_val', 'three_four_val', 'five_six_val',
                    'eight_val', 'qa_num', 'one_star', 'two_star', 'three_star', 'four_star', 'five_star', 'low_star',
                    'together_asin', 'brand', 'ac_name', 'material', 'node_id']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider via Scrapy's factory and subscribe its signal handlers.

        The spider's ``close`` handler is connected to ``spider_closed`` and
        ``spider_idle`` to the signal of the same name, in that order.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        subscriptions = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_idle(self, spider):
        """Handle the ``spider_idle`` signal by scheduling the next batch of seller pages.

        Reads pending rows from ``self.r_db.read_db_feedback()`` and schedules
        one seller-profile request per row. When there are no rows the handler
        simply returns; because it does not raise ``DontCloseSpider`` the
        engine is then free to close the idle spider.

        Args:
            spider: The spider instance passed by the signal (unused; the
                handler works on ``self``).
        """
        logging.debug(f'IDLE------------------{self.site}')
        now_utc = time.strftime("%H:%M:%S", time.gmtime())
        # Refresh the cookie pool if the idle signal fires inside this
        # one-minute UTC window.
        if "09:09:00" <= now_utc <= "09:10:00":
            self.cookie_list = [[row[1], row[0]] for row in self.r_ck.get_cookie().values]
            logging.info("cookie 列表更新")

        df = self.r_db.read_db_feedback()
        if df.shape[0] == 0:
            # The original code used a bare `raise` here, which fails with
            # "RuntimeError: No active exception to reraise" and only produced
            # a logged traceback. Returning normally has the same outcome
            # (the engine closes the idle spider) without the spurious error.
            logging.info(f"no pending feedback rows for site {self.site}; nothing scheduled")
            return

        for row in df.values:
            # Row layout as used below: [0] account id, [1] account name,
            # [2] seller URL containing a `seller=<id>` query parameter.
            cookies = random.choice(self.cookie_list)
            seller_id = row[2].split("seller=")[-1].split("&")[0]
            meta = {
                "use_aiohttp": True,
                "id": row[0],
                "account_name": row[1],
                "cookie_id": cookies[0],
                "amazon_proxy": True,
                "cookiejar": int(time.time()),
                "seller_id": seller_id,
            }
            request = scrapy.Request(
                url=f"{self.url_}/sp?seller={seller_id}",
                cookies=json.loads(cookies[1]),
                headers=self.headers,
                callback=self.parse,
                errback=self.err_parse,
                dont_filter=True,
                meta=meta,
            )
            # NOTE(review): passing the spider as second argument matches the
            # engine API this project was written against; newer Scrapy
            # releases may expect `engine.crawl(request)` — confirm version.
            self.crawler.engine.crawl(request, self)

    def asin_detail(self, response):
        """Extract the feedback summary from a seller profile page.

        Args:
            response: Response for a seller profile page; ``response.meta["id"]``
                is copied into the item as ``account_id``.

        Returns:
            * ``None`` when the page has no seller name.
            * ``(None, seller_url)`` when neither feedback counts nor the
              site-name text are present.
            * ``(item, seller_url)`` otherwise, where ``item`` holds
              ``site_name``, ``account_id``, the three feedback counts and a
              UTC ``created_at`` timestamp. ``seller_url`` may itself be
              ``None`` when the storefront link is missing.
        """
        account_id = response.meta.get("id")
        seller_url = response.xpath("//div[@id='seller-info-storefront-link']//a/@href").get()
        seller_name = response.xpath("//h1[@id='seller-name']/text()").get()
        if not seller_name:
            logging.info(f"未获取到店铺名称 {response.url}")
            return None

        site_texts = response.xpath("//div[@class='a-row a-spacing-none indent-left']//text()").getall()
        site_name = site_texts[-1] if site_texts else None
        counts = response.xpath("//span[@class='ratings-reviews-count']//text()").getall()
        if not counts and not site_name:
            logging.info(f"店铺 刚刚推出 {response.url}")
            return None, seller_url

        def count_at(index):
            """Return the count cell at *index* without separators, or None if absent."""
            # Guarding the index avoids an IndexError when the page renders
            # fewer count cells than expected.
            if index >= len(counts):
                return None
            return counts[index].replace(",", "").replace(".", "")

        # Cells 0, 2 and 3 are used; cell 1 is skipped (presumably the 90-day
        # column — TODO confirm against the page layout).
        item = {
            "site_name": site_name,
            "account_id": account_id,
            "count_30_day": count_at(0),
            "count_1_year": count_at(2),
            "count_lifetime": count_at(3),
            "created_at": time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()),
        }
        return item, seller_url

    def parse(self, response, **kwargs):
        """Parse a seller profile page and follow up with the storefront listing.

        Yields the feedback summary as ``{"feedback_detail": item}`` when one
        was extracted, then a request for the seller's storefront search page
        (handled by ``parse_seller_num``) when the profile exposes a
        storefront link.

        Args:
            response: Seller profile response; its ``meta`` must carry
                ``seller_id`` and ``account_name`` and is passed on unchanged.
        """
        result = self.asin_detail(response)
        if not result:
            # asin_detail already logged why the page was rejected.
            return
        detail_item, storefront_link = result

        if detail_item:
            logging.info(f"feedback_detail :{detail_item}")
            yield {"feedback_detail": detail_item}

        if not storefront_link:
            logging.info(f'{response.meta["account_name"]}  没有店铺url {response.url}')
            return

        cookies = random.choice(self.cookie_list)
        # Build headers per request. Mutating the shared `self.headers` (as the
        # previous code did) leaked this seller's URL as the referer of every
        # later, unrelated request.
        headers = dict(self.headers)
        headers["accept"] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
        headers["referer"] = response.url
        # The scraped storefront link is only used as an existence check; the
        # listing URL itself is rebuilt from the seller id.
        listing_url = self.url_ + f"/s?me={response.meta.get('seller_id')}"
        yield scrapy.Request(url=listing_url, cookies=json.loads(cookies[1]), headers=headers,
                             callback=self.parse_seller_num, errback=self.err_parse,
                             dont_filter=True, meta=response.meta)

    def parse_seller_num(self, response, **kwargs):
        """Parse a seller storefront search page.

        Yields ``{"commodity_num": {"num": <count or None>, "id": <account id>}}``
        followed by ``{"seller": [<one dict per result row>]}``. Nothing is
        yielded when the page has no result rows.

        Args:
            response: Storefront search response; ``response.meta["id"]`` is the
                account id attached to every emitted record.
        """
        rows = response.xpath("//div[@class='s-main-slot s-result-list s-search-results sg-row']/div[@data-uuid]")
        if not rows:
            logging.info(f"空白页    :{response.url}")
            return

        account_id = response.meta.get("id")
        header_texts = response.xpath("//h1[@class='a-size-base s-desktop-toolbar a-text-normal']//div[@class='a-section a-spacing-small a-spacing-top-small']//text()").getall()
        commodity_num = {
            "num": self._results_count(header_texts),
            "id": account_id
        }
        logging.info(f"commodity_num :{commodity_num}")
        yield {"commodity_num": commodity_num}

        # Loop-invariant values, computed once.
        week = self.week["week"]
        month = str(int(str(self.week["created_time"]).split("-")[1]))
        # Strip currency symbols / non-breaking spaces and use "." as the
        # decimal mark. "£" is included so uk prices are cleaned like the rest.
        price_table = str.maketrans({"$": None, "\xa0": None, "€": None, "£": None, ",": "."})

        items = []
        for row_num, row in enumerate(rows, start=1):
            raw_title = row.xpath(".//h2//text()").get()
            # NOTE(review): cleaning goes through repr(), so the stored title
            # keeps repr's surrounding quotes and a missing title becomes the
            # string 'None'. Kept unchanged because downstream consumers may
            # rely on this format — confirm before simplifying.
            title = re.sub(r"\\u.{4}", '', repr(raw_title)).replace("\\xa0", "")
            img_url = row.xpath(".//div[@class='a-section aok-relative s-image-fixed-height']/img/@src").get()
            price = row.xpath(".//span[@class='a-price']/span/text()").get("").translate(price_table)
            rating = row.xpath(".//div[@class='a-section a-spacing-none a-spacing-top-micro']//"
                               "span[@class='a-size-base']//text()").get()
            total_comments = row.xpath(".//span[@class='a-size-base s-underline-text']//text()").get("")\
                .replace("(", "").replace(",", "").replace(")", "")
            items.append({
                "account_id": account_id,
                "asin": row.xpath("./@data-asin").get() or None,
                "title": title or None,
                "img_url": img_url or None,
                "price": price or None,
                "rating": rating or None,
                "total_comments": total_comments or None,
                "week": week or None,
                "row_num": row_num,
                "month": month,
            })
        logging.info(f"seller :{items}")
        yield {"seller": items}

    def _results_count(self, text_nodes):
        """Return the total result count from the toolbar text as a digit string.

        Args:
            text_nodes: Text nodes of the results toolbar; the first non-blank
                one is treated as the summary sentence.

        Returns:
            The count with ``,`` and ``.`` separators removed, or ``None`` when
            no summary text or no count could be found.
        """
        texts = [text for text in text_nodes if text.strip()]
        if not texts:
            return None
        summary = texts[0]

        if self.site == "de":
            # Take the first number after the last "von". Searching for a
            # number (rather than the first word) still works when words
            # precede it, e.g. "... von mehr als 1.000 ...".
            found = re.search(r"\d[\d.,]*", summary.split("von")[-1])
            token = found.group() if found else ""
        else:
            # The count is the last word before the localized "result" word.
            # us/uk/es and mx all share the " result" prefix ("results",
            # "resultados"); mx previously had no branch at all.
            marker = {"it": " risult", "fr": " résult"}.get(self.site, " result")
            token = summary.split(marker)[0].split(" ")[-1]

        token = token.replace(",", "").replace(".", "")
        return token or None

    def err_parse(self, failure, **kwargs):
        """Errback: log the failed request and emit an error marker item.

        The yielded item carries ``error_asin=True`` and the account name of
        the failed request wrapped as ``(1, account_name)``.
        """
        failed_request = failure.request
        error_item = {
            "error_asin": True,
            "account_name": (1, failed_request.meta.get("account_name")),
        }
        account_name = failed_request.meta.get('account_name')
        logging.info(f"爬取失败 account_name :{account_name}, {failed_request.url}")
        yield error_item


if __name__ == '__main__':
    # Convenience entry point: run this spider for the German site when the
    # file is executed directly.
    crawl_command = ['scrapy', 'crawl', 'amazon_feedback', '-a', 'site=de']
    cmdline.execute(crawl_command)


# Supported sites: us, fr, it, de, uk, es, mx
# Run in the background from the project directory:
#   cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_feedback  > amazon_de_feedback1.log 2>&1 &
#   nohup scrapy crawl amazon_feedback -a site=de  > amazon_de_feedback1.log 2>&1 &
# Activate the conda environment first:
#   source activate pyspark
# Kill every running instance of this spider:
#   for i in `ps -ef|grep "scrapy crawl amazon_feedback" |awk '{print $2}' `; do kill -9 $i ; done;
# Windows invocation (example taken from another spider):
#   C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# Example crontab entry, daily at 21:00 (taken from another spider):
#   0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail -a site=uk > amazon_get_asin_detail1.log 2>&1 &
