import json
import time
import scrapy
import random
import logging
import pandas as pd
import os, sys, platform
from urllib.parse import urlparse
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
# Project helpers used by the Amazon GPSR spider: cookie loading and the shared base spider
from amazon_spider.utils.read_db_data import ReadCookie
from amazon_spider.spiders.yswg_spider import SourceSpider

# time.tzset() is only available on Unix, so on Windows the platform is
# just reported instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonGPSRSpider(SourceSpider):
    name = 'amazon_gpsr_api'
    # Per-spider Scrapy settings; they take precedence over the project-wide settings.
    custom_settings = {
        # Proxy settings for curl_cffi (currently disabled)
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # Depth / breadth
        # 'DOWNLOAD_DELAY': 2,
        # When enabled, Scrapy waits a random interval between requests to the
        # same site: a random factor between 0.5 and 1.5 multiplied by DOWNLOAD_DELAY.
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): allowed_domains is normally a spider attribute, not a
        # setting -- confirm this entry has any effect here.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # how many times a failed request is retried
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # Called in increasing order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # NOTE(review): the three client middlewares below share priority 490,
            # so their relative order is not fixed by this dict -- confirm that is intended.
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Called in decreasing order
            # 'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is enabled; results are written to the seed table by the callbacks.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_comment_pipe_text.AmazonCommentSpiderPipeline': 230,
        },
        # Both schemes use the project's own download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
        # 'DOWNLOAD_HANDLERS': {
        #     'http': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        #     'https': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        # }
    }

    def __init__(self, site='us'):
        """Prepare spider state: cookie pools, site map and the seed table.

        :param site: site code stored on ``self.site``; ``update_cookies``
            branches on it to decide which cookie pools to load.
        """
        super().__init__()
        self.site = site
        # Must run after ``self.site`` is assigned: ``update_cookies`` reads it.
        self.update_cookies('other')

        # Alternative request-meta flag sets. Not read inside this class;
        # presumably consumed by ``self.random_r`` of the base spider -- verify.
        self.utils_requests = [
            {},
            {"use_aiohttp": True},
            {"curlcffi": True},
            {"use_httpx": True},
        ]

        # Site code -> marketplace name; callbacks compare the name with the
        # seeds' ``site`` column.
        marketplace_names = [
            ("us", "Amazon.com"),
            ("uk", "Amazon.co.uk"),
            ("de", "Amazon.de"),
            ("es", "Amazon.es"),
            ("it", "Amazon.it"),
            ("fr", "Amazon.fr"),
            ("mx", "Amazon.com.mx"),
            ("ca", "Amazon.ca"),
            ("nl", "Amazon.nl"),
            ("be", "Amazon.com.be"),
            ("se", "Amazon.se"),
            ("pl", "Amazon.pl"),
            ("tr", "Amazon.com.tr"),
            ("au", "Amazon.com.au"),
        ]
        self.sites = dict(marketplace_names)

        # Number of idle rounds in which ``spider_idle`` found no pending seed.
        self.sleep_count = 0

        # Seed workbook (rewritten in ``close``) and a results path that is
        # not used by the code visible in this class.
        self.seeds_file = './gpsr_job.xlsx'
        self.save_file = './gpsr_datas.xlsx'
        self.seeds = pd.read_excel(
            self.seeds_file,
            dtype={'asin': str, 'sku': str},
        )

        # NOTE(review): these look like hard-coded tokens; they are not read
        # in this class -- confirm whether they belong in configuration.
        self.log_cookie = [
            'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
            '0WabUGUu8uSgALAoRpAiD5US1c6Am8syTSOFv@LYMWYou@5TfBXLeQr9N4TP@aYM',
            '9Ixyrh@Iq450FZ@srt2FrAEXZaeHSGqjZmKu8v0FQRIPPcsLEUlMWvFwJpCb@n3N',
            'P71hK2UZXye@VN4ml6PMMTvS9krpXt4IOf?LUCb1p??mtPEVFWsJTBAOiDJsQm31'
        ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider via the parent factory and register its signal handlers.

        ``close`` is connected to the crawler's ``spider_closed`` signal and
        ``spider_idle`` to its ``spider_idle`` signal.

        :param crawler: crawler the spider is bound to.
        :return: the configured spider instance.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        receivers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for receiver, signal in receivers:
            crawler.signals.connect(receiver, signal)
        return instance

    def update_cookies(self, site):
        """Load the cookie pools that ``json_cookies`` samples from.

        ``self.cookie_other`` is always loaded, because ``json_cookies`` uses
        it for every site outside the six main ones regardless of
        ``self.site``. ``self.cookie_list`` depends on ``self.site``:

        * one of the six main sites: the pool of that site only;
        * anything else (including an empty value): the pools of all six
          main sites, each tagged with a ``site`` column and concatenated.

        Database errors and timeouts are retried every 5 seconds without limit.

        :param site: unused; kept so existing callers keep working. The pools
            loaded are selected by ``self.site`` only.
        """
        main_sites = ('us', 'de', 'uk', 'fr', 'it', 'es')
        while True:
            try:
                self.cookie_other = ReadCookie('other').get_cookie()
                if self.site in main_sites:
                    # NOTE(review): ``json_cookies`` filters on a ``site``
                    # column; presumably ``get_cookie()`` already provides it
                    # for a single-site pool -- verify.
                    self.cookie_list = ReadCookie(self.site).get_cookie()
                else:
                    frames = []
                    for code in main_sites:
                        frame = ReadCookie(code).get_cookie()
                        frame['site'] = code
                        frames.append(frame)
                    self.cookie_list = pd.concat(frames, axis=0)
                logging.info("cookie list news updates o-_-o")
                break
            except OperationalError as e:
                logging.info(f'get cookies OperationalError sleep 5s T_T --> {e}')
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f'get cookies time out sleep 5s T_T --> {e}')
                time.sleep(5)

    def json_cookies(self, site):
        while True:
            if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
                cookies = list(self.cookie_other[self.cookie_other.site == site].sample(n=1).values)
                logging.info(f"获取 {site} cookie")
            else:
                cookies = list(self.cookie_list[self.cookie_list.site == site].sample(n=1).values)
                logging.info(f"获取{site} cookie")
            try:
                cookies = json.loads(cookies[0][0])
                break
            except TypeError as e:
                logging.info(f"cookie json error {cookies}")
                continue
        return cookies

    def spider_idle(self, spider):
        logging.debug(f'IDLE------{self.site}------------')

        self.seeds_new = self.seeds[(self.seeds['status'] == '评论数为0或者变狗') & ~(self.seeds['asin'].isnull())]
        logging.info(f"-----长度{self.seeds_new.shape}")
        if self.seeds_new.shape[0] > 0:
            num = 0
            for i in self.seeds_new.values:
                site = i[2].split(".")[-1]
                site = 'us' if site == 'com' else site
                # de fr it es nl pl se tr
                if site not in ['de', 'fr', 'it', 'es', 'nl', 'pl', 'se', 'tr']:
                    self.seeds.loc[((self.seeds['asin'] == i[1]) & (
                                self.seeds['site'] == self.sites.get(site))), 'status'] = '3'
                    self.seeds.loc[((self.seeds['asin'] == i[1]) & (
                            self.seeds['site'] == self.sites.get(site))), 'gpsr_img'] = '不需要处理的站点'
                    continue
                url = f"{self.site_url.get(site)}/dp/{i[1]}"
                headers = {
                    'Connection': 'close',
                    'authority': urlparse(url).hostname,
                    'accept': 'text/html,*/*',
                    'accept-language':  'zh-CN,zh;q=0.9',
                    'accept-encoding': 'gzip, deflate, br, zstd',
                    'cache-control': 'no-cache',
                    'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                    'origin': url,
                    'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
                }
                meta = {
                    # "use_aiohttp": True,
                    "asin": i[1],
                    # "amazon_proxy": True,
                    "cookiejar": int(time.time()),
                    "asin_type": "",
                    "site": site,
                    # "choice_header": True,
                }
                meta = self.random_r(meta)
                h_key = random.choice(list(self.h_dict))
                headers[h_key] = self.h_dict.get(h_key)
                logging.info(f'随机添加headers：{h_key}')
                headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
                cookies = self.json_cookies(site)
                logging.info(f"{url}: {cookies}")
                # if meta['site'] in ['au', 'tr', 'nl']:
                #     del headers['referer']
                self.crawler.engine.crawl(scrapy.Request(url=url, cookies=cookies, headers=headers,
                                                         callback=self.parse, errback=self.err_parse,
                                                         dont_filter=True, meta=meta), self)
            # self.seeds.to_excel('./cantonfair_玩具及孕婴童max.xlsx', encoding='utf-8')
            # self.seeds.to_csv('./cantonfair_2900_main.csv', encoding='utf-8')
        else:
            self.sleep_count += 1
            if self.sleep_count >= random.randint(6, 15):
                raise
            logging.info('no task sleep 30s')
            time.sleep(30)
            meta = {
                'handle_httpstatus_all': True
            }
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, spider=self)

    def page_state(self, response):
        asin = response.request.meta.get("asin")
        site = response.request.meta.get("site")
        if ("Die eingegebene Webadresse ist keine" in response.text) \
                    or ("The Web address you entered is not a functioning" in response.text) \
                    or ("saisie n'est pas une page fonctionnelle" in response.text) \
                    or ("web inserito non è una pagina funzionante" in response.text) \
                    or ("web que has especificado no es" in response.text) \
                    or ("Page Not Found" in response.text) \
                    or ("We are sorry! This Gift Card is not available" in response.text) \
                    or ("500 - An error occurred" in response.text) \
                    or ("Onze excuses. Het webadres dat je hebt" in response.text)\
                    or ("Üzgünüz. Girdiğiniz web adresi" in response.text)\
                    or ("Przepraszamy. Wyszukiwana" in response.text)\
                    or ("Lo sentimos! No pudimos encontrar la página" in response.text)\
                    or ("Vi ber om ursäkt. Webbadressen" in response.text):
            item = {
                "error_asin": True,
                "asin": {"state": 4, "asin": asin, "site": site},
            }
            # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"页面为空:{item.get('asin')}")
            return item
        elif ("keywords" in response.url) or ("dp/" not in response.url) or (
                "ref=" in response.url and "encoding=" in response.url) or (
                response.meta.get("asin") not in response.url):
            item = {
                "error_asin": True,
                "asin": {"state": 12, "asin": asin, "site": site},
            }
            logging.info(f"跳转页面:{item.get('asin')}")
            return item
        elif not response.xpath("//div[@id='dp']").getall():
            item = {
                "error_asin": True,
                "asin": {"state": 13, "asin": asin, "site": site},
            }
            # with open(f"{response.meta.get('asin')}_{response.meta.get('site')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return item
        else:
            return None

    def parse(self, response, **kwargs):
        meta = response.meta
        if item := self.page_state(response):
            stats = self.crawler.stats
            stats.inc_value("other_page")
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == self.sites.get(meta['site']))), 'status'] = '4'
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (
                    self.seeds['site'] == self.sites.get(meta['site']))), 'gpsr_img'] = "变狗页"
            yield item
        else:
            gpsr_url = response.xpath("//div[contains(@data-acp-path, 'buffet-disclaimers-card/')]/@data-acp-path").get() or ""
            if 'acp/buffet-disclaimers-card' not in gpsr_url:
                self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (
                            self.seeds['site'] == self.sites.get(meta['site']))), 'status'] = '3'
                self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (
                            self.seeds['site'] == self.sites.get(meta['site']))), 'gpsr_img'] = "没有gpsr_url"
                logging.info(
                    f"-----长度{self.seeds[(self.seeds['status'] == '评论数为0或者变狗') & ~(self.seeds['asin'].isnull())].shape}")
            else:
                x_amz_acp_params = response.xpath("//div[contains(@data-acp-path, 'buffet-disclaimers-card/')]/@data-acp-params").get()
                url = f"{self.site_url.get(meta.get('site'))}{gpsr_url}getPsiContent"
                headers = {
                    # 'Connection': 'close',
                    'authority': urlparse(url).hostname,
                    "Host": urlparse(url).hostname,
                    # "x-amz-acp-params": "tok=grm68wkkVUPG8KOjRoc6gjQEInMUk6nq6sAioZ01UDI;ts=1733724138425;rid=F1X630A04DD6SJ4SSAKZ;d1=261;d2=0",
                    "x-requested-with": "XMLHttpRequest",
                    "accept": "text/html, application/json",
                    "content-type": "application/json",
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
                    "origin": urlparse(url).hostname,
                    "referer": f"https://{urlparse(url).hostname}/dp/{meta.get('asin')}?th=1",
                    "accept-language": "zh-CN,zh;q=0.9,de;q=0.8",
                }
                headers.update({'x-amz-acp-params': x_amz_acp_params})
                data = {
                    "asin": meta.get("asin")
                }
                cookies = self.json_cookies(meta.get('site'))
                meta['choice_header'] = False
                yield scrapy.FormRequest(url=url, method="POST", headers=headers, body=json.dumps(data), cookies=cookies, callback=self.parse_data, errback=self.err_parse, dont_filter=True, meta=meta)

    def parse_data(self, response):
        """解析详情页数据"""
        logging.info("爬取完成")
        meta = response.meta
        img_etree = response.xpath("//ol[@class='a-carousel']//img/@src").getall()
        url = f"{self.site_url.get(response.meta.get('site'))}/dp/{response.meta.get('asin')}"
        print(url, "get img_url_list :", img_etree)
        self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == self.sites.get(meta['site']))), 'status'] = '3'
        self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == self.sites.get(meta['site']))), 'gpsr_img'] = ",".join(img_etree)
        logging.info(f"-----长度{self.seeds[(self.seeds['status'] == '评论数为0或者变狗') & ~(self.seeds['asin'].isnull())].shape}")

    def err_parse(self, failure, **kwargs):
        request = failure.request
        logging.info(f"error______ {failure.getErrorMessage()}")

    def close(self, spider, reason):
        logging.info("spider finish")
        self.seeds.to_excel(self.seeds_file, index=False)
        # raise
        # if self.q.qsize():
        #     d = []
        #     for i in range(self.q.qsize()):
        #         d += self.q.get()
        #     df_1 = pd.DataFrame(d)
        #     print(df_1.shape)
        #     try:
        #         df_2 = pd.read_excel(self.save_file, dtype={'shop_id': str, 'commodity_id': str})
        #         print(df_2.shape)
        #         df_save = pd.concat([df_1, df_2])
        #     except FileNotFoundError as e:
        #         df_save = pd.concat([df_1])
        #         print("没有该文件")
        #     print(df_save.shape)
        #     df_save.drop_duplicates(['commodity_id'], inplace=True)
        #     print(df_save.shape)
        #     df_save.to_excel(self.save_file, index=False, encoding='utf-8')
        # self.seeds.to_excel(self.seeds_file, index=False, encoding='utf-8')


if __name__ == '__main__':
    # Same as running `scrapy crawl amazon_gpsr_api -a site=other` in a shell.
    cmdline.execute(['scrapy', 'crawl', 'amazon_gpsr_api', '-a', 'site=other'])

# source activate pyspark
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl amazon_comment_history -a site=us  > amazon.log 2>&1 &
# nohup scrapy crawl amazon_comment_all -a site=us > amazon_history_comment_us1.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl amazon_comment_all" |awk '{print $2}' `; do kill -9 $i ; done;
# nohup /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
# nohup scrapy crawl amazon_comment -a site=de > amazon_de1.log 2>&1 &



