import json
import time
import scrapy
import random
import logging
import pandas as pd
import os, sys, platform
from urllib.parse import urlparse
from scrapy import cmdline, signals
# Make the directory two levels above sys.path[0] importable so that the
# ``amazon_spider`` package imports below can be resolved.
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # ancestor directory (two levels up)
if "Windows" == platform.system():
    print("windows")
else:
    # time.tzset() is only invoked on non-Windows platforms.
    time.tzset()
from sqlalchemy.exc import OperationalError
# Amazon historical review fetching (NOTE: stale comment - the spider below collects EU representative data)
from amazon_spider.utils.read_db_data import ReadCookie
from amazon_spider.spiders.yswg_spider import SourceSpider
from func_timeout.exceptions import FunctionTimedOut


class AmazonKontakteSpider(SourceSpider):
    """
    Fetch EU representative ("欧代") data for Amazon listings.
    """
    name = 'amazon_kontakte_api'
    # Per-spider setting overrides.
    custom_settings = {
        # Proxy configuration for curl_cffi (currently disabled)
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # depth / breadth
        # 'DOWNLOAD_DELAY': 2,
        # # When enabled, Scrapy waits a random interval (0.5 to 1.5 times DOWNLOAD_DELAY)
        # # between requests to the same website
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): 'allowed_domains' is normally a spider attribute rather than
        # a setting - confirm this entry has any effect.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        # NOTE(review): the stock RetryMiddleware is set to None in
        # DOWNLOADER_MIDDLEWARES below - confirm which component reads these values.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries wanted
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is enabled for this spider.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_comment_pipe_text.AmazonCommentSpiderPipeline': 230,
        },
        # Both schemes are routed through the project's ja3 download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
        # 'DOWNLOAD_HANDLERS': {
        #     'http': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        #     'https': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        # }
    }

    def __init__(self, site='us'):
        """Load cookies, prepare request-mode options and read the seed workbook.

        Args:
            site: marketplace code supplied on the command line (``-a site=...``).
        """
        super().__init__()
        self.site = site
        # update_cookies() reads self.site, so it must run after the assignment above.
        self.update_cookies('other')

        # Alternative request-mode flag sets (plain, aiohttp, curl_cffi, httpx).
        self.utils_requests = [{}, {"use_aiohttp": True}, {"curlcffi": True}, {"use_httpx": True}]
        self.sleep_count = 0

        # Marketplace code -> site label.
        self.sites = {
            "us": "Amazon.com",
            "uk": "Amazon.co.uk",
            "de": "Amazon.de",
            "es": "Amazon.es",
            "it": "Amazon.it",
            "fr": "Amazon.fr",
            "mx": "Amazon.com.mx",
            "ca": "Amazon.ca",
            "nl": "Amazon.nl",
            "be": "Amazon.com.be",
            "se": "Amazon.se",
            "pl": "Amazon.pl",
            "tr": "Amazon.com.tr",
            'au': "Amazon.com.au",
        }
        self.log_cookie = [
            'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
            '0WabUGUu8uSgALAoRpAiD5US1c6Am8syTSOFv@LYMWYou@5TfBXLeQr9N4TP@aYM',
            '9Ixyrh@Iq450FZ@srt2FrAEXZaeHSGqjZmKu8v0FQRIPPcsLEUlMWvFwJpCb@n3N',
            'P71hK2UZXye@VN4ml6PMMTvS9krpXt4IOf?LUCb1p??mtPEVFWsJTBAOiDJsQm31'
        ]

        # Input workbook with the seeds and output workbook written on close.
        self.seeds_file = '/mnt/amazon_spider/amazon_spider/amazon_spider/spiders/Amazon.it欧盟sku.xlsx'
        self.save_file = '/mnt/amazon_spider/amazon_spider/amazon_spider/spiders/sku_datas_it.xlsx'

        seed_table = pd.read_excel(self.seeds_file, dtype={'asin': str})
        current_status = seed_table['status']
        # Rows that previously ended without a kontakte URL, or in state 13,
        # are reset to 1 so they are crawled again.
        needs_retry = (current_status == '没有kontakte_url') | (current_status == 13)
        seed_table.loc[needs_retry, 'status'] = 1
        self.seeds = seed_table

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the parent factory and attach its signal handlers.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        signal of the same name.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for receiver, signal in handlers:
            crawler.signals.connect(receiver, signal)
        return instance

    def update_cookies(self, site):
        """Reload ``self.cookie_list`` and ``self.cookie_other``, retrying until it works.

        The "other" cookie table is now loaded for every value of ``self.site``.
        Previously it was skipped when ``self.site`` was uk/fr/es/it/de, which
        left ``self.cookie_other`` undefined for ``json_cookies``.

        When ``self.site`` is one of the six core marketplaces only that
        marketplace's table is loaded into ``self.cookie_list``. Otherwise the
        six core tables are tagged with a ``site`` column and concatenated.

        Both attributes are assigned only after every read succeeded, so a
        failed attempt leaves the previously loaded cookies untouched.

        Args:
            site: kept for backward compatibility; the marketplace is taken
                from ``self.site`` and this argument is not used.
        """
        core_sites = ('us', 'de', 'uk', 'fr', 'it', 'es')
        while True:
            try:
                cookie_other = ReadCookie('other').get_cookie()
                if self.site in core_sites:
                    cookie_list = ReadCookie(self.site).get_cookie()
                else:
                    frames = []
                    for code in core_sites:
                        frame = ReadCookie(code).get_cookie()
                        frame['site'] = code
                        frames.append(frame)
                    cookie_list = pd.concat(frames, axis=0)
            except OperationalError as e:
                logging.info(f'get cookies OperationalError sleep 5s T_T --> {e}')
            except FunctionTimedOut as e:
                logging.info(f'get cookies time out sleep 5s T_T --> {e}')
            else:
                self.cookie_other = cookie_other
                self.cookie_list = cookie_list
                logging.info("cookie list news updates o-_-o")
                return
            # Only reached after a failed attempt: wait before trying again.
            time.sleep(5)

    def json_cookies(self, site):
        while True:
            if site not in ['us', 'uk', 'fr', 'es', 'it', 'de']:
                cookies = list(self.cookie_other[self.cookie_other.site == site].sample(n=1).values)
                logging.info(f"获取 {site} cookie")
            else:
                cookies = list(self.cookie_list[self.cookie_list.site == site].sample(n=1).values)
                logging.info(f"获取{site} cookie")
            try:
                cookies = json.loads(cookies[0][0])
                break
            except TypeError as e:
                logging.info(f"cookie json error {cookies}")
                continue
        return cookies

    def spider_idle(self, spider):
        """Schedule the next batch of pending seeds when the engine goes idle.

        Connected to the ``spider_idle`` signal in ``from_crawler``. Up to 30
        seed rows whose ``status`` is 1 and whose ``asin`` is not null are
        turned into product-page requests. When no such row is left the handler
        sleeps and schedules a placeholder request; after a few such rounds it
        returns without scheduling anything, so the idle state persists.

        Args:
            spider: spider passed by the signal; ``self`` is used instead.
        """
        logging.debug(f'IDLE------{self.site}------------')

        self.seeds_new = self.seeds[(self.seeds['status'] == 1) & ~(self.seeds['asin'].isnull())]
        logging.info(f"-----长度{self.seeds_new.shape}")

        if self.seeds_new.shape[0] == 0:
            self.sleep_count += 1
            if self.sleep_count >= random.randint(2, 4):
                # A bare ``raise`` used to sit here. With no active exception it
                # only produced "RuntimeError: No active exception to reraise";
                # returning without new requests ends the handler cleanly.
                logging.info('no task left, nothing scheduled')
                return
            logging.info('no task sleep 30s')
            time.sleep(30)
            # Placeholder request that keeps the engine busy for one more round.
            # parse_news_page / err_ are not defined in this class - presumably
            # inherited from SourceSpider.
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page,
                                     errback=self.err_, dont_filter=True,
                                     meta={'handle_httpstatus_all': True})
            self.crawler.engine.crawl(request, spider=self)
            return

        # Only the rows that are actually scheduled are converted to dicts.
        for seed in self.seeds_new.head(30).to_dict(orient='records'):
            site = seed['site']
            asin = seed['asin']
            if not asin:
                self.seeds.loc[((self.seeds['asin'] == asin) & (
                        self.seeds['site'] == site)), 'status'] = '没有asin不需要处理'
                continue

            url = f"{self.site_url.get(site)}/dp/{asin}?th=1&psc=1"
            headers = {
                'Connection': 'close',
                'authority': urlparse(url).hostname,
                'accept': 'text/html,*/*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'accept-encoding': 'gzip, deflate, br, zstd',
                'cache-control': 'no-cache',
                'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                'origin': url,
                'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
            }
            meta = {
                "asin": asin,
                # NOTE(review): one-second resolution, so requests built in the
                # same second share a cookiejar key - confirm this is intended.
                "cookiejar": int(time.time()),
                "asin_type": "",
                "site": site,
            }
            meta = self.random_r(meta)

            # Add one randomly chosen extra header from self.h_dict.
            h_key = random.choice(list(self.h_dict))
            headers[h_key] = self.h_dict.get(h_key)
            logging.info(f'随机添加headers：{h_key}')
            headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'

            cookies = self.json_cookies(site)
            logging.info(f"{url}: {cookies}")
            self.crawler.engine.crawl(scrapy.Request(url=url, cookies=cookies, headers=headers,
                                                     callback=self.parse, errback=self.err_parse,
                                                     dont_filter=True, meta=meta), self)

    def page_state(self, response):
        asin = response.request.meta.get("asin")
        site = response.request.meta.get("site")
        if ("Die eingegebene Webadresse ist keine" in response.text) \
                    or ("The Web address you entered is not a functioning" in response.text) \
                    or ("saisie n'est pas une page fonctionnelle" in response.text) \
                    or ("web inserito non è una pagina funzionante" in response.text) \
                    or ("web que has especificado no es" in response.text) \
                    or ("Page Not Found" in response.text) \
                    or ("We are sorry! This Gift Card is not available" in response.text) \
                    or ("500 - An error occurred" in response.text) \
                    or ("Onze excuses. Het webadres dat je hebt" in response.text)\
                    or ("Üzgünüz. Girdiğiniz web adresi" in response.text)\
                    or ("Przepraszamy. Wyszukiwana" in response.text)\
                    or ("Lo sentimos! No pudimos encontrar la página" in response.text)\
                    or ("Vi ber om ursäkt. Webbadressen" in response.text)\
                    or ("Lo sentimos. La dirección" in response.text) \
                    or ("Vous cherchez quelque chose ?" in response.text):
            item = {
                "error_asin": True,
                "asin": {"state": 4, "asin": asin, "site": site},
            }
            # with open(f"{response.meta.get('asin')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"页面为空:{item.get('asin')}")
            return item
        elif response.xpath("//input[@id='captchacharacters']").get():
            item = {
                "error_asin": True,
                "asin": {"state": '验证码', "asin": asin, "site": site},
            }
            logging.info(f"验证码:{item.get('asin')}")
            return item
        elif ("keywords" in response.url) or ("dp/" not in response.url) or (
                "ref=" in response.url and "encoding=" in response.url) or (
                response.meta.get("asin") not in response.url):
            item = {
                "error_asin": True,
                "asin": {"state": 12, "asin": asin, "site": site},
            }
            logging.info(f"跳转页面:{item.get('asin')}")
            return item
        elif not response.xpath("//div[@id='dp']").getall():
            item = {
                "error_asin": True,
                "asin": {"state": 13, "asin": asin, "site": site},
            }
            # with open(f"{response.meta.get('asin')}_{response.meta.get('site')}.html", "w", encoding="utf-8")as f:
            #     f.write(response.text)
            logging.info(f"跳转或者视频页面:{item.get('asin')}")
            return item
        else:
            return None

    def parse(self, response, **kwargs):
        meta = response.meta
        if item := self.page_state(response):
            stats = self.crawler.stats
            stats.inc_value("other_page")
            # self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == self.sites.get(meta['site']))), 'status'] = '4'
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (
                    self.seeds['site'] == meta['site'])), 'status'] = item['asin']['state']
            yield item
        else:
            kontakte_url = response.xpath("//div[contains(@data-acp-path, 'acp/buffet-high')]/@data-acp-path").get() or ""
            if 'acp/buffet-high' not in kontakte_url:
                # self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (
                #             self.seeds['site'] == self.sites.get(meta['site']))), 'status'] = '3'
                self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (
                            self.seeds['site'] == meta['site'])), 'status'] = "没有kontakte_url"
                logging.info(
                    f"-----长度{self.seeds[(self.seeds['status'] == 1) & ~(self.seeds['asin'].isnull())].shape}")
            else:
                x_amz_acp_params = response.xpath("//div[contains(@data-acp-path, 'buffet-disclaimers-card/')]/@data-acp-params").get()
                url = f"{self.site_url.get(meta.get('site'))}{kontakte_url}getRspManufacturerContent"
                print(url)
                headers = {
                    # 'Connection': 'close',
                    'authority': urlparse(url).hostname,
                    "Host": urlparse(url).hostname,
                    # "x-amz-acp-params": "tok=grm68wkkVUPG8KOjRoc6gjQEInMUk6nq6sAioZ01UDI;ts=1733724138425;rid=F1X630A04DD6SJ4SSAKZ;d1=261;d2=0",
                    "x-requested-with": "XMLHttpRequest",
                    "accept": "text/html, application/json",
                    "content-type": "application/json",
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
                    "origin": urlparse(url).hostname,
                    "referer": f"https://{urlparse(url).hostname}/dp/{meta.get('asin')}?th=1",
                    "accept-language": "zh-CN,zh;q=0.9,de;q=0.8",
                }
                headers.update({'x-amz-acp-params': x_amz_acp_params})
                data = {
                    "asin": meta.get("asin")
                }
                cookies = self.json_cookies(meta.get('site'))
                meta['choice_header'] = False
                yield scrapy.FormRequest(url=url, method="POST", headers=headers, body=json.dumps(data), cookies=cookies, callback=self.parse_data, errback=self.err_parse, dont_filter=True, meta=meta)

    def parse_data(self, response):
        """解析详情页数据"""
        logging.info("爬取完成")
        meta = response.meta
        box_etree = response.xpath("//div[@id='rspComponent']//div[@class='a-box-inner']//ul")
        if box_etree:
            logging.info(f"box_etree len {box_etree}")
            name_list = ['UE Fast Refund GmbH', 'EMMS Trading GmbH', 'AMZLAB GmbH']
            datas = ["\n".join(i.xpath(".//text()").getall()).replace('\xa0', '') for i in box_etree[0].xpath(".//li")]
            logging.info(f"数据 {datas} 长度 {len(datas)}")
            item = {
                '欧代公司': datas[0],
                '欧代地址': datas[1],
                '欧代电话': datas[-2],
                '欧代邮箱': datas[-1],
            }
            logging.info(f"{item}")
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), 'status'] = 3
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), '欧代公司'] = datas[0]
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), '欧代地址'] = datas[1]
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), '欧代电话'] = datas[-2]
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), '欧代邮箱'] = datas[-1]
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), '是否是欧代'] = '是' if datas[0] in name_list else '否'
            logging.info(f"-----长度{self.seeds[(self.seeds['status'] == 1) & ~(self.seeds['asin'].isnull())].shape}")
        else:
            logging.info(f"无欧代数据")
            #
            self.seeds.loc[((self.seeds['asin'] == meta['asin']) & (self.seeds['site'] == meta['site'])), 'status'] = '无欧代数据'

    def err_parse(self, failure, **kwargs):
        request = failure.request
        logging.info(f"error______ {failure.getErrorMessage()}")

    def close(self, spider, reason):
        logging.info("spider finish")
        self.seeds.to_excel(self.save_file, index=False)


if __name__ == '__main__':
    # Run this spider through Scrapy's command-line entry point.
    cmdline.execute('scrapy crawl amazon_kontakte_api -a site=other'.split())

# source activate pyspark 
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl amazon_comment_history -a site=us  > amazon.log 2>&1 &
# nohup scrapy crawl amazon_kontakte_api -a site=other > amazon_kontakte_api_it.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl amazon_comment_all" |awk '{print $2}' `; do kill -9 $i ; done;
# nohup /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
# nohup scrapy crawl amazon_comment -a site=de > amazon_de1.log 2>&1 &


