# coding:utf-8
import os
import sys
import json
import time
import redis
import scrapy
import logging
from pprint import pprint
from kafka import KafkaProducer
from kafka.errors import KafkaError
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import spop, sadd


class TemuPostDetailSpider(scrapy.Spider):
    """Scrapy spider ``temu_post_detail``.

    The methods of this class pull goods-id seeds from Redis when the spider
    goes idle, request the Temu render endpoint, parse the embedded
    ``rawData`` JSON and publish the resulting item to Kafka.
    ``custom_settings`` below holds the per-spider Scrapy configuration.
    """
    name = 'temu_post_detail'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # One request at a time, with a (randomized) 2 second delay between requests.
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        'DOWNLOAD_DELAY': 2,
        'DEPTH_PRIORITY': 1,

        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): allowed_domains is normally a spider class attribute, not a
        # settings key, so this entry probably has no effect here -- confirm.
        'allowed_domains': ['temu.com'],

        # NOTE(review): the stock RetryMiddleware is disabled in DOWNLOADER_MIDDLEWARES
        # below, so these RETRY_* values presumably only matter if one of the custom
        # middlewares reads them -- confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries to attempt
        "COOKIES_ENABLED": False,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],

        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            # 'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middleware.temu.ForeignProxyMiddleware': 450,
            'amazon_spider.middleware.temu.AddAntiMiddleware': 470,
            'amazon_spider.middleware.temu.TemuIsPageError': 480,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Http2Middleware
            # 'amazon_spider.middlewares.Http2Middleware': 490,
            # called in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # Items yielded by the spider go through this pipeline.
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.temu_detail_pipe.TemuDetailSpiderTextPipeline': 230,
        },
        # Both schemes are served by the project's own download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us', **kwargs):
        """Initialise lookup tables, default headers and the Kafka producer.

        Args:
            site: Site code kept on ``self.site``; the Redis seed key names
                are built from it.
            **kwargs: Any further spider arguments (for example extra
                ``-a key=value`` options). They are forwarded to
                ``scrapy.Spider.__init__`` instead of raising ``TypeError``.

        The constructor blocks, polling every 2 seconds, until
        ``is_internet_available()`` returns True, and only then creates the
        Kafka producer.
        """
        super().__init__(**kwargs)
        self.site = site
        # Base URL per site code.
        self.site_url = {
            "us": 'https://www.temu.com',
            'de': "https://www.temu.de",
            "uk": "https://www.temu.co.uk",
            "it": "https://www.temu.it",
            "es": "https://www.temu.es",
            "fr": "https://www.temu.fr",
            "mx": "https://www.temu.mx",
            "ca": "https://www.temu.ca",
        }

        self.headers = {
            'authority': 'www.temu.com',
            'Host': 'www.temu.com',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        # Field names of a parsed item.
        # NOTE(review): not read anywhere in this class; presumably consumed by the pipeline -- confirm.
        self.col = ['rating', 'one_star', 'two_star', 'three_star', 'four_star', 'thumbUrl', 'five_star', 'low_star', 'img_num', 'goodid', 'title', 'img_url', 'title_len', 'price', 'total_comments', 'category', 'followers', 'sold', 'items', 'describe', 'shop_comments', 'url']

        # Do not create the producer before the connectivity probe succeeds.
        while not self.is_internet_available():
            logging.info("Network connection failure")
            time.sleep(2)
        self.producer = KafkaProducer(bootstrap_servers=['113.100.143.162:39092'], api_version=(2, 4, 1))

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider via the parent ``from_crawler`` and hook its signals.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal of the given crawler.

        Returns:
            The spider instance created by the parent implementation.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        hooks = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in hooks:
            crawler.signals.connect(handler, signal)
        return instance

    def hamc_encrypt(self, message, key):
        import hmac, hashlib
        result1 = hmac.new(key.encode(), message.encode(), hashlib.md5).hexdigest()
        return result1

    def spider_idle(self, spider):
        """Handle the ``spider_idle`` signal by scheduling requests for new seeds.

        Takes seeds from ``"{site}_temu_detail_seed"`` through ``spop`` and
        schedules one render-API request per usable seed (a JSON object with a
        ``goodid`` key). Malformed seeds are logged and skipped instead of
        aborting the handler.

        Args:
            spider: The spider sent with the signal (unused; ``self`` is used).

        Raises:
            DontCloseSpider: Whenever no request was scheduled, so the spider
                stays alive and the idle signal can fire again.
        """
        logging.debug(f'IDLE------------------{self.site}')
        # Keep trying until the seed store answers; connection errors back off 5s.
        while True:
            try:
                seeds = spop(f"{self.site}_temu_detail_seed", 1)
                logging.info("get seed succeed")
                break
            except redis.exceptions.ConnectionError as e:
                logging.info(f"get seed error {e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"get seed timeout {e}")

        scheduled = 0
        for raw_seed in seeds or ():
            try:
                goodid = json.loads(raw_seed)['goodid']
            except (ValueError, KeyError, TypeError) as e:
                # The seed is already removed from the store; retrying it cannot help.
                logging.warning(f"skip malformed seed {raw_seed!r}: {e}")
                continue

            meta = {
                "bee": True,
                "goodid": goodid,
                "cookiejar": int(time.time()),
                "method": "GET",
                "curlcffi": True,
                "page": 1,
            }
            # Headers and cookies are built per request so no dict is shared between requests.
            headers = {
                'Host': 'www.temu.com',
                'authority': 'www.temu.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'zh-CN,zh;q=0.9',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            }
            cookies = {
                'currency': 'USD',
                'language': 'en',
                'region': '211',
            }
            request = scrapy.Request(url="https://www.temu.com/api/oak/integration/render",
                                     headers=headers, cookies=cookies,
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, self)
            scheduled += 1

        if scheduled:
            return
        if not seeds:
            logging.info('no task sleep 30s')
            # NOTE(review): this sleep runs inside the signal handler and blocks it for 30s.
            time.sleep(30)
        # Nothing was scheduled: keep the spider open so the next idle signal polls again.
        raise DontCloseSpider()

    def is_internet_available(self):
        """Probe connectivity with a 1-second GET to ``http://www.baidu.com``.

        Returns:
            True when the request completes without raising, False when it
            raises any ordinary exception.
        """
        import requests
        try:
            requests.get("http://www.baidu.com", timeout=1)
        except Exception:
            # Best-effort probe: any failure counts as "offline". Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit through, so the
            # polling loops that call this method can still be interrupted.
            return False
        return True

    def if_page_state(self, response):
        if response.xpath("//div[@class='baseContent']//div[contains(text(), 'This item is sold out. View more details')]")\
                or response.xpath("//div[@class='baseContent']//div[contains(text(), 'This item was discontinued')]"):
            item = {
                "error_asin": True,
                "status": {"status": 4, "goodid": response.request.meta.get("goodid")},
            }
            logging.info(f"页面为空:{item.get('asin')}")
            return item
        elif "scoreNumInfoList" not in response.text:
            # self.cookie_bee = None
            # 没有星级 one_star
            item = {
                "error_asin": True,
                "status": {"status": 4, "goodid": response.request.meta.get("goodid")},
            }
            logging.info(f"页面异常:{item.get('asin')}")
            return item
        else:
            return None

    def k_seed(self, topic, message):
        if self.is_internet_available():
            if self.producer.bootstrap_connected():
                future = self.producer.send(topic, bytes(json.dumps(message), 'utf-8'), partition=0)
                try:
                    record_metadata = future.get(timeout=10)
                    logging.info('Message {} sent to partition {} with offset {}'.format(message, record_metadata.partition,
                                                                                  record_metadata.offset))
                    return True
                except KafkaError as e:
                    logging.info('Failed to send message {}: {}'.format(message, e))
                    return False
            else:
                self.producer.close()
                self.producer = KafkaProducer(bootstrap_servers=['113.100.143.162:39092'], api_version=(2, 4, 1))
                logging.info("kafka producer disconnect")
                time.sleep(2)
                return False
        else:
            time.sleep(2)
            logging.info("Network connection failure")
            return False

    def parse(self, response, **kwargs):
        # print("-" * 20)
        # with open(f"{response.meta.get('goodid')}.html", "w", encoding="utf-8")as f:
        #     f.write(response.text)
        # if 'Just a moment' in response.text:  # tls指纹被检测到，会返回这个信息
        #     print('被检测')
        # else:
        #     print('成功绕过')
        if item := self.if_page_state(response):
            yield item
        else:
            print("----------------------")
            datas_json = json.loads(
                response.xpath("//script[contains(text(), 'rawData')]//text()").get().split("rawData=")[-1].split(
                    ";document.dispatchEvent")[0])
            items = {}
            # 商品评论
            # datas_json["store"]["reviewStore"]["reviewNumStr"]
            # if datas_json["store"].get("reviewStore"):
            total_comments = str(datas_json["store"]["reviewStore"].get("reviewNumStr", "")).replace(",", "")
            shop_comments = datas_json["store"]["reviewStore"]["mallReviewNum"]
            items["rating"] = datas_json["store"]["reviewStore"]["showScoreStr"]

            items["one_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][4]["percent"]
            items["two_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][3]["percent"]
            items["three_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][2]["percent"]
            items["four_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][1]["percent"]
            items["five_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][0]["percent"]
            items["low_star"] = items["one_star"] + items["two_star"] + items["three_star"]
            # else:
                # total_comments = response.xpath("//h2[contains(@text, 'Item reviews')]//text()").get("").replace(
                #     "Item reviews (", "").replace(")", "")
                # # 商店评论
                # shop_comments = response.xpath("//h2[contains(@text, 'Shop reviews')]//text()").get("").replace(
                #     "Shop reviews (", "").replace(")", "") or datas_json["store"]["review"]["reviewData"]["mallReview"]["reviewNumStr"]
                # items["rating"] = datas_json["store"]["review"]["reviewData"]["mallReview"]["mallScore"]
                # with open(f"{response.meta.get('goodid')}.html", "w", encoding="utf-8")as f:
                #     f.write(response.text)
                #     print("未获取到 star")
                # items["one_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][4]["percent"]
                # items["two_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][3]["percent"]
                # items["three_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][2]["percent"]
                # items["four_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][1]["percent"]
                # items["five_star"] = datas_json["store"]["reviewStore"]["scoreNumInfoList"][0]["percent"]
                # items["low_star"] = items["one_star"] + items["two_star"] + items["three_star"]
            if datas_json["store"].get("preloadFirstImgList"):
                img_url = ",".join([i["url"] for i in datas_json["store"]["preloadFirstImgList"]])
                items["img_num"] = len([i["url"] for i in datas_json["store"]["preloadFirstImgList"]])
                items["thumbUrl"] = [i["url"] for i in datas_json["store"]["preloadFirstImgList"]][0]
            else:
                img_url = ",".join([i["items"][0]["url"] for i in datas_json["store"]["productDetail"]["floorList"]])
                items["img_num"] = len([i["items"][0]["url"] for i in datas_json["store"]["productDetail"]["floorList"]])
                items["thumbUrl"] = [i["items"][0]["url"] for i in datas_json["store"]["productDetail"]["floorList"]][0]

            followers = datas_json["store"]["mall"]["mallData"].get("followerNumUnit")[0] if datas_json["store"]["mall"]["mallData"].get("followerNumUnit") else ""
            SalesNum = datas_json["store"]["mall"]["mallData"]["goodsSalesNumUnit"][0] if datas_json["store"]["mall"]["mallData"].get("goodsSalesNumUnit") else ""
            items_num = datas_json["store"]["mall"]["mallData"]["goodsNumUnit"][0] if datas_json["store"]["mall"]["mallData"].get("goodsNumUnit") else ""
            price1 = datas_json["store"]["goods"]["minOnSalePriceStr"].replace("$", "").strip()
            # source_price = datas_json["store"]["sku"][0]["normalLinePriceStr"].replace("$", "").strip()
            # copue = datas_json["store"]["sku"][0]["reduction"]["textRich"][0]["text"]
            items["goodid"] = response.meta.get("goodid")
            items["title"] = response.xpath("//h1//text()").get()
            items["img_url"] = img_url
            items["title_len"] = len(items["title"] or "")
            price2 = response.xpath('//div[@style="align-items:center"]//text()').getall()
            items["price"] = price2[1] if price2 else price1

            items["total_comments"] = total_comments
            # page_inventory = list(
            #     datas_json["store"]["skuModuleMap"]["skuSpecSelectorModule"]["data"]["skuStockQuantityTip"].values())
            # print(page_inventory)
            # items["page_inventory"] = page_inventory[0].split(" ")[1].strip() if page_inventory else ""
            items["category"] = ">".join(response.xpath('//nav//li//text()').getall())
            # items["node_id"] = ",".join(response.xpath('//nav//li//a/@href').getall())
            # followers = [i for i in response.xpath('//div[@style="text-align:left"]//text()').getall() if i.strip()]
            items["followers"] = float(followers.lower().replace("k+", "")) * 1000 if "k" in followers.lower() else followers
            items["sold"] = float(SalesNum.lower().replace("k+", "")) * 1000 if "k" in SalesNum.lower() else SalesNum
            items["items"] = float(items_num.lower().replace("k+", "")) * 1000 if "k" in items_num.lower() else items_num
            items["describe"] = " ".join(response.xpath('//*[text()="Description"]/parent::div/div//text()').getall())
            items["shop_comments"] = shop_comments
            items["url"] = response.url
            pprint(items)
            print(list(items.keys()))
            # self.cookie_bee = response.meta.get("bee")
            while True:
                if self.k_seed("us_temu_detail", items):
                    logging.info("seed kafka ok")
                    break
                else:
                    logging.info("seed kafka error")
                    continue
            logging.info(f"spider goodid {response.meta.get('goodid')}")
            yield {"inner_item": items}

    def err_parse(self, failure, **kwargs):
        """Errback: put the failed request's goods id back into the seed set.

        The ``goodid`` from the failed request's meta is serialised as JSON and
        added to ``"{site}_temu_detail_seed"`` through ``sadd``. The push is
        repeated until it succeeds; connection errors wait 3 seconds first.

        Args:
            failure: The failure passed to the errback; ``failure.request`` is
                the request that failed.
            **kwargs: Unused.
        """
        failed_request = failure.request
        seed = {"goodid": failed_request.meta.get("goodid")}
        logging.info(f"spider get error ---：{failure.getErrorMessage()}")
        payload = json.dumps(seed)

        pushed = False
        while not pushed:
            try:
                sadd(f"{self.site}_temu_detail_seed", payload, use_md5=False)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"request push redis error {e}")
                time.sleep(3)
            except FunctionTimedOut as e:
                logging.info(f"request push redis timeout {e}")
            else:
                logging.info("spider get data error")
                pushed = True


if __name__ == '__main__':
    # Run this spider for the US site through Scrapy's command-line entry point.
    args = ['scrapy', 'crawl', 'temu_post_detail', '-a', 'site=us']
    cmdline.execute(args)


# us, uk, fr, de, es, it, mx
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl temu_get_detail  > temu_detail.log 2>&1 &
# nohup scrapy crawl temu_get_detail -a site=us  > temu_detail.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl temu_get_detail -a site=us" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl temu_get_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl temu_get_detail -a site=us > temu_detail.log 2>&1 &

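# SECURITY: an account e-mail and its password were recorded here in plain text and have been redacted.
# Rotate that password and keep credentials in a secrets store or environment variables, never in source.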