# coding:utf-8
import os
import sys
import json
import time
import redis
import queue
import scrapy
import logging
import pandas as pd
from scrapy import cmdline, signals
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import spop, sadd


class TemuGetSearchSpider(scrapy.Spider):
    name = 'temu_get_search'
    # `allowed_domains` is a spider attribute, not a Scrapy setting.  It used to
    # live inside `custom_settings`, where Scrapy stores it as an unknown key
    # and never consults it, so it is declared here instead.
    allowed_domains = ['temu.com']
    # Per-spider overrides of the project settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # One request at a time with a randomized ~3 s delay between requests.
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        'DOWNLOAD_DELAY': 3,
        'DEPTH_PRIORITY': 1,

        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_TIMEOUT': 10,
        # Disabled scrapy-redis scheduler configuration, kept for reference.
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Enable the Redis-backed scheduler that stores the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection details
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted - load from the environment, never commit it>',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the built-in RetryMiddleware is disabled below, so these
        # RETRY_* values only matter if a project middleware reads them - confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 1,  # number of retries wanted
        "COOKIES_ENABLED": False,
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middleware.temu.AddAntiMiddleware': 470,
            'amazon_spider.middleware.temu.TemuIsPageError': 480,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            # 'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Http2Middleware
            # 'amazon_spider.middlewares.Http2Middleware': 490,
            # called in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.temu_search_pip.TemuSearchSpiderTextPipeline': 230,
        },
        # Route both schemes through the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us', *args, **kwargs):
        """Create the spider for one Temu marketplace.

        Args:
            site: Marketplace code; it is used to build the Redis seed-set key
                and is copied into each request's meta.
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__`` so that
                additional ``-a key=value`` spider arguments no longer raise
                ``TypeError``.
        """
        super(TemuGetSearchSpider, self).__init__(*args, **kwargs)
        self.site = site
        # Base URL of each supported marketplace, keyed by site code.
        self.site_url = {
            "us": 'https://www.temu.com',
            'de': "https://www.temu.de",
            "uk": "https://www.temu.co.uk",
            "it": "https://www.temu.it",
            "es": "https://www.temu.es",
            "fr": "https://www.temu.fr",
            "mx": "https://www.temu.mx",
            "ca": "https://www.temu.ca",
        }

        # Default browser-like headers.  The request builders in this class
        # define their own header dicts; this one is kept as a public attribute.
        self.headers = {
            'authority': 'www.temu.com',
            'Host': 'www.temu.com',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
        }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the parent factory and hook up its signals.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal of the given crawler.

        Returns:
            The spider instance created by the parent ``from_crawler``.
        """
        instance = super(TemuGetSearchSpider, cls).from_crawler(crawler, *args, **kwargs)
        wiring = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for receiver, event in wiring:
            crawler.signals.connect(receiver, event)
        return instance

    def spider_idle(self, spider):
        """Pull the next seed from Redis when the engine has nothing left to do.

        Connected to the ``spider_idle`` signal.  One seed is popped from the
        ``<site>_temu_search_seed`` set (retrying on Redis connection errors
        and timeouts) and a page-1 search request is handed to the engine for
        every seed returned.

        Args:
            spider: The spider the signal was sent for (unused; ``self`` is used).
        """
        # Local import so this method does not depend on a file-level import.
        from urllib.parse import quote

        logging.debug(f'IDLE------------------{self.site}')
        while True:
            try:
                # seeds = zpop(f'{self.site}_real_zset_seed', 25)
                seeds = spop(f"{self.site}_temu_search_seed", 1)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"get seed error {e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"get seed timeout {e}")
            else:
                logging.info("get seed succeed")
                break

        if not seeds:
            # The original code used a bare `raise` here, which has no active
            # exception to re-raise and therefore fails with RuntimeError.
            # Returning without scheduling anything lets the engine finish.
            logging.info(f"no seed left in {self.site}_temu_search_seed")
            return

        # NOTE(review): the base URL is always the 'us' entry regardless of
        # self.site; the Host header and cookies below are US-specific as well,
        # so this is left unchanged - confirm whether other sites are intended.
        base_url = self.site_url.get('us')
        headers = {
            'authority': 'www.temu.com',
            'Host': 'www.temu.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        }
        cookies = {
            'currency': 'USD',
            'language': 'en',
            'region': '211',
        }
        for raw_seed in seeds:
            job = json.loads(raw_seed)
            search_term = job['search_term']
            # Percent-encode the term so spaces, '&', '#', '+' and non-ASCII
            # characters cannot corrupt the query string, e.g.
            # https://www.temu.com/search_result.html?search_key=feelers%20gauge
            url = f"{base_url}/search_result.html?search_key={quote(str(search_term), safe='')}"
            meta = {
                "search_term": search_term,
                "cookiejar": int(time.time()),
                "page": 1,
                "site": self.site,
                "bee": True,
                "method": "GET"
                # "curlcffi": True,
                # 'proxy': "http://127.0.0.1:9900"
            }
            request = scrapy.Request(url=url, headers=headers, cookies=dict(cookies),
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, self)

    def parse(self, response, **kwargs):
        """Parse page 1 of a search and request result pages 2 and 3.

        The first page's products are read from the ``rawData`` JSON embedded
        in a ``<script>`` tag.  When no products can be extracted (script
        missing, invalid JSON, or empty ``goodsList``) the search term is pushed
        back onto the Redis seed set.  Otherwise the page-1 items are placed on
        a fresh ``queue.Queue`` that travels in ``meta['q']`` to ``parse_tow``,
        and two POST requests (offsets 120 and 240) are yielded.

        Args:
            response: Response for the search result page; ``meta`` carries
                ``search_term`` and ``page``.

        Yields:
            ``scrapy.Request`` objects for the search API, one per extra page.
        """
        search_term = response.meta.get("search_term")
        goods_list = self._extract_goods_list(response)
        if not goods_list:
            self._requeue_search_term(search_term)
            return

        page = response.meta.get("page")
        items = [
            {
                "search_term": search_term,
                "title": goods["data"]["title"],
                "goodid": str(goods["data"]["goodsId"]),
                "img_url": goods["data"]["thumbUrl"],
                "page": page,
                "state": 1,
            }
            for goods in goods_list
        ]
        # Per-term page counter; parse_tow saves the results once it reaches 3.
        self.crawler.stats.inc_value(search_term)
        q = queue.Queue()
        q.put(items)

        url = 'https://www.temu.com/api/poppy/v1/search?scene=search_result_rec'
        # NOTE(review): the referer and listId are fixed values rather than
        # derived from the current search - confirm this is intentional.
        headers = {
            'authority': 'www.temu.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.temu.com',
            'referer': 'https://www.temu.com/search_result.html?search_key=feelers%20gauge',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        }
        for offset in range(120, 245, 120):
            # json.dumps escapes quotes/backslashes in the term, which the old
            # '%s' template did not; compact separators and ensure_ascii=False
            # keep the body byte-identical for ordinary terms.
            data = json.dumps(
                {
                    "scene": "search_result_rec",
                    "pageSn": 10009,
                    "listId": "1b9086ea1a2d4a1a9b2179f38457cea4",
                    "offset": offset,
                    "pageSize": 120,
                    "query": search_term,
                    "searchMethod": "",
                    "sprefix": "",
                },
                separators=(",", ":"),
                ensure_ascii=False,
            )
            cookies = {
                'currency': 'USD',
                'language': 'en',
                'region': '211',
            }
            # Work on a copy so the meta of the page-1 request is not mutated.
            meta = dict(response.meta)
            meta["anti"] = True
            meta["method"] = "POST"
            meta["page"] = 2 if offset == 120 else 3
            meta["q"] = q
            yield scrapy.Request(url=url, headers=headers, method="POST", callback=self.parse_tow,
                                 body=data, cookies=cookies, errback=self.err_parse,
                                 dont_filter=True, meta=meta)

    def _extract_goods_list(self, response):
        """Return the ``goodsList`` from the page's rawData script, or ``[]`` if unusable."""
        script = response.xpath("//script[contains(text(), 'rawData')]//text()").get()
        if not script:
            return []
        payload = script.split("rawData=")[-1].split(";document.dispatchEvent")[0]
        try:
            raw_data = json.loads(payload)
        except ValueError as e:
            logging.info(f"rawData is not valid JSON {e}")
            return []
        store = raw_data.get("store") if isinstance(raw_data, dict) else None
        if not isinstance(store, dict):
            return []
        return store.get("goodsList") or []

    def _requeue_search_term(self, search_term):
        """Push ``search_term`` back onto the Redis seed set, retrying until it is accepted."""
        seed = json.dumps({"search_term": search_term})
        while True:
            try:
                sadd(f"{self.site}_temu_search_seed", seed, use_md5=False)
            except redis.exceptions.ConnectionError as e:
                logging.info(f"request push redis error {e}")
                time.sleep(5)
            except FunctionTimedOut as e:
                logging.info(f"request push redis timeout {e}")
            else:
                logging.info("spider get data error")
                return

    def parse_tow(self, response, **kwargs):
        """Parse one search-API page and save the search once all pages are in.

        The page's items are appended to the queue shared through
        ``meta['q']`` and the per-term stats counter is incremented.  When the
        counter reaches 3 it is reset to 0, the queue is drained and all items
        are appended to ``<search_term>.csv``.

        Args:
            response: JSON response of the search API; ``meta`` carries
                ``search_term``, ``page`` and the shared queue ``q``.
        """
        stats = self.crawler.stats
        search_term = response.meta.get("search_term")
        page = response.meta.get("page")
        pending = response.meta.get("q")
        logging.info(f"第{page}页")
        # NOTE(review): a payload without result/data/rec_list raises KeyError
        # here and the seed is not requeued - consider handling that case.
        items = [
            {
                "search_term": search_term,
                "title": rec["title"],
                "goodid": str(rec["goods_id"]),
                "img_url": rec["thumb_url"],
                "page": page,
                "state": 1,
            }
            for rec in response.json()["result"]["data"]["rec_list"]
        ]
        stats.inc_value(search_term)
        pending.put(items)
        if int(stats.get_value(search_term)) != 3:
            return

        stats.set_value(search_term, 0)
        logging.info(f"Page spider succeed search_term : {search_term}")
        datas = []
        while not pending.empty():
            datas += pending.get()

        # The last column used to be a second "img_url", which duplicated that
        # column and silently dropped every item's "state" value.
        df = pd.DataFrame(datas, columns=["search_term", "title", "goodid", "img_url", "page", "state"])
        print(df)
        # Path separators in a term (e.g. "1/2 inch") would otherwise point the
        # output at a different or non-existent directory.
        safe_name = str(search_term).replace("/", "_").replace("\\", "_")
        # NOTE(review): mode='a' writes a header row on every append.
        df.to_csv(f'{safe_name}.csv', mode='a')
        logging.info("to_csv save good")
        # yield {"datas": datas, "search_term": response.meta.get("search_term")}

    def err_parse(self, failure, **kwargs):
        """Errback: reset the term's page counter and put the seed back in Redis.

        The stats value stored under the failed request's search term is set to
        0, the failure message is printed, and a ``{"search_term": ...}`` seed
        is re-added to the ``<site>_temu_search_seed`` set.  The push is retried
        until it succeeds, sleeping 5 seconds after a Redis connection error.

        Args:
            failure: The failure passed to the errback; its ``request`` attribute
                supplies the meta of the request that failed.
        """
        failed_request = failure.request
        term = failed_request.meta.get("search_term")
        self.crawler.stats.set_value(term, 0)
        print(failure.getErrorMessage())

        payload = json.dumps({"search_term": term})
        seed_key = f"{self.site}_temu_search_seed"
        requeued = False
        while not requeued:
            try:
                sadd(seed_key, payload, use_md5=False)
                # zadd(f"{self.site}_real_zset_seed", {k: meta.get("priority")})
            except redis.exceptions.ConnectionError as exc:
                logging.info(f"request push redis error {exc}")
                time.sleep(5)
            except FunctionTimedOut as exc:
                logging.info(f"request push redis timeout {exc}")
            else:
                logging.info("spider get data error")
                requeued = True


if __name__ == '__main__':
    # Run this spider for the US site through Scrapy's command-line entry point.
    cmdline.execute(['scrapy', 'crawl', 'temu_get_search', '-a', 'site=us'])

# us, uk, fr, de, es, it, mx
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl amazon_get_asin_detail  > amazon.log 2>&1 &
# nohup scrapy crawl temu_get_search -a site=us  > temu_us_search1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl temu_get_search -a site=us" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl amazon_get_asin_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin_detail -a site=uk > amazon_get_asin_detail1.log 2>&1 &
# mini astronautas paracaidistas con paracaídas juguete volador favor de fiesta espacial regalos voladores requeridos ni montaje sin batería rellenos para bolsas de regalos

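# 4. Account: <redacted>
# Password: <redacted> - credentials must not be stored in source control; rotate this password and keep it in a secrets store.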