# coding:utf-8
import os
import sys
import json
import time
import redis
import queue
import random
import scrapy
import logging
from kafka.errors import KafkaError
from scrapy import cmdline, signals
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.redis_db import sadd


class TemuGetImgSearchSpider(scrapy.Spider):
    """Scrapy spider that fetches Temu search-result pages for seed search terms.

    For each seed it collects the goods ids and thumbnail URLs listed on the
    first result page and on two follow-up API pages.  ``custom_settings``
    overrides the project settings for this spider only.
    """
    name = 'temu_img_get_search'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # One request at a time, with a randomized delay of about 2 seconds.
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        'DOWNLOAD_DELAY': 2,
        'DEPTH_PRIORITY': 1,

        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): ``allowed_domains`` is normally a spider class attribute,
        # not a setting; as a settings key it presumably has no effect - confirm.
        'allowed_domains': ['temu.com'],
        # # Whether to flush the crawl queue when the spider is restarted
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Enable the Redis scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate requests through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Connection info for the seed queue
        # NOTE(review): the commented-out block below contains a plaintext Redis
        # password; credentials should not live in source control.
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': 'HCL1zcUgQesaaXNLbL37O5KhpSAy0c',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the stock RetryMiddleware is disabled in
        # DOWNLOADER_MIDDLEWARES below, so these RETRY_* values presumably only
        # matter if one of the custom middlewares reads them - confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries to attempt
        "COOKIES_ENABLED": False,
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # ``None`` disables Scrapy's built-in retry middleware.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in ascending order
            # 'amazon_spider.middleware.temu.RandomUserAgentMiddleware': 460,
            'amazon_spider.middleware.temu.AddAntiMiddleware': 470,
            'amazon_spider.middleware.temu.TemuIsPageError': 480,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            # 'amazon_spider.middleware.temu.CurlCffiRequests': 490,

            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Http2Middleware
            # 'amazon_spider.middlewares.Http2Middleware': 490,
            # invoked in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is enabled for this spider.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.temu_search_pip.TemuSearchSpiderTextPipeline': 230,
        },
        # Both schemes are downloaded through the project's custom handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us', **kwargs):
        """Initialise the per-site URL table, default headers and shared state.

        :param site: site code, e.g. passed on the command line as ``-a site=us``;
            stored on ``self.site`` and used to build the Redis seed-set name.
        :param kwargs: any further spider arguments; forwarded to
            ``scrapy.Spider.__init__`` so that extra ``-a key=value`` options do
            not raise ``TypeError`` when ``from_crawler`` instantiates the spider.
        """
        super(TemuGetImgSearchSpider, self).__init__(**kwargs)
        self.site = site
        # Base URL of the Temu storefront for each supported site code.
        self.site_url = {
            "us": 'https://www.temu.com',
            'de': "https://www.temu.de",
            "uk": "https://www.temu.co.uk",
            "it": "https://www.temu.it",
            "es": "https://www.temu.es",
            "fr": "https://www.temu.fr",
            "mx": "https://www.temu.mx",
            "ca": "https://www.temu.ca",
        }

        # Default browser-like headers for www.temu.com.
        self.headers = {
            'authority': 'www.temu.com',
            'Host': 'www.temu.com',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        # Last ``_bee`` cookie value seen; sent with the next seed request.
        self.cookie_bee = None
        # The Kafka producer setup is currently commented out; define the
        # attribute anyway so code reading ``self.producer`` sees ``None``
        # instead of failing on a missing attribute.
        self.producer = None
        # while True:
        #     if self.is_internet_available():
        #         self.producer = KafkaProducer(
        #             bootstrap_servers=['113.100.143.162:39092'],
        #             api_version=(2, 4, 1),
        #             value_serializer=lambda v: json.dumps(v).encode('utf-8')
        #         )
        #         break
        #     else:
        #         time.sleep(2)
        #         logging.info("Network connection failure")
        #         continue

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider via the base factory and register its signal handlers.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal of the given crawler.

        :param crawler: crawler the spider is bound to.
        :return: the spider instance.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for receiver, signal in handlers:
            crawler.signals.connect(receiver, signal)
        return instance

    def is_internet_available(self):
        """Probe outbound connectivity with a short HTTP GET.

        :return: ``True`` if a GET to ``http://www.baidu.com`` completes within
            the one-second timeout, ``False`` if requests reports a failure.
        """
        import requests
        try:
            requests.get("http://www.baidu.com", timeout=1)
        except requests.RequestException:
            # Only request-level failures (connection error, timeout, ...) mean
            # "offline".  A bare ``except`` would also swallow KeyboardInterrupt
            # and SystemExit, making the retry loops that call this hard to stop.
            return False
        return True

    def spider_idle(self, spider):
        """Handle the ``spider_idle`` signal by scheduling one search request per seed.

        Each seed is a JSON string with ``asin``, ``search_term`` and ``site``.
        The seed list is currently hard-coded; the commented-out code shows the
        Redis-based source it stands in for.

        :param spider: the spider passed by the signal (unused; ``self`` is used).
        """
        logging.debug(f'IDLE------------------{self.site}')
        # while True:
        #     try:
        #         # seeds = zpop(f'{self.site}_real_zset_seed', 25)
        #         seeds = spop(f"{self.site}_temu_img_search_seed", 1)
        #         logging.info("get seed succeed")
        #         break
        #     except redis.exceptions.ConnectionError as e:
        #         logging.info(f"get seed error {e}")
        #         time.sleep(5)
        #         continue
        #     except FunctionTimedOut as e:
        #         logging.info(f"get seed timeout {e}")
        #         continue
        # df = ReadDb(self.site).read_db_temu_serch_keyword()
        seeds = [
            '''{
                "asin": "B0B3MM5D3J", 
                "search_term": "papier anhänger schön dass du da präsent anhänger schilder cm runde weiße blätter anhängerschilder etiketten mit schnur für hochzeit party verlobung diy kunsthandwerk projekt",
                "site": "us"
            }'''
        ]
        if not seeds:
            # The original code used a bare ``raise`` here, which has no active
            # exception to re-raise and therefore fails with RuntimeError.
            # Returning without scheduling anything lets the engine finish idling.
            logging.info("no seed available, nothing scheduled")
            return

        # Characters removed from the search term before it is put in the URL.
        strip_chars = str.maketrans('', '', '#/±%')
        for raw_seed in seeds:
            job = json.loads(raw_seed)
            # NOTE(review): the US base URL is always used, regardless of
            # ``self.site`` or the seed's own ``site`` - confirm this is intended.
            base_url = self.site_url.get('us')
            # https://www.temu.com/search_result.html?search_key=feelers%20gauge
            search_key = job['search_term'].translate(strip_chars)
            url = f"{base_url}/search_result.html?search_key={search_key}"
            print(url)
            meta = {
                "search_term": job['search_term'],
                "asin": job["asin"],
                "site": job["site"],
                "cookiejar": int(time.time()),
                "page": 1,
                # "site": self.site,
                "bee": True,
                "method": "GET",
                # "curlcffi": True,
                'proxy': "http://127.0.0.1:7890"
            }
            headers = {
                'authority': 'www.temu.com',
                'Host': 'www.temu.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'zh-CN,zh;q=0.9',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
            }
            cookies = {
                'currency': 'USD',
                'language': 'en',
                'region': '211',
                '_bee': self.cookie_bee
            }
            request = scrapy.Request(url=url, headers=headers, cookies=cookies,
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, self)

    def parse(self, response, **kwargs):
        """Parse the first search-result page and request result pages 2 and 3.

        The goods list is read from the ``window.rawData`` JSON embedded in a
        ``<script>`` element.  When it is missing or empty the seed is pushed
        back to the Redis set ``{site}_temu_img_search_seed`` and nothing is
        yielded.  Otherwise the page-1 items are stored in a ``queue.Queue``
        that is handed to the follow-up requests through ``meta['q']``.

        :param response: response of the ``search_result.html`` request; its
            ``meta`` carries ``asin``, ``search_term``, ``site`` and ``page``.
        :return: generator of POST ``scrapy.Request`` objects for pages 2 and 3.
        """
        stats = self.crawler.stats
        meta = response.meta

        # ``.get()`` returns None when no matching script exists, and the
        # extracted text may not be valid JSON.  Both cases are treated like an
        # empty goods list so the seed is re-queued instead of being lost to an
        # uncaught exception in the callback.
        goods_list = None
        raw_script = response.xpath("//script[contains(text(), 'window.rawData')]//text()").get()
        if raw_script is not None:
            raw_json = raw_script.split("rawData=")[-1].split(";document.dispatchEvent")[0]
            try:
                datas_json = json.loads(raw_json)
            except ValueError as e:
                logging.info(f"rawData is not valid JSON: {e}")
            else:
                store = datas_json.get("store") if isinstance(datas_json, dict) else None
                if isinstance(store, dict):
                    goods_list = store.get("goodsList")

        # Per-seed counter of successfully parsed pages, kept in the crawler stats.
        stats_key = meta.get("asin") + f"_{meta.get('site')}"

        if not goods_list:
            logging.info("not get goodsList")
            k = json.dumps({
                "asin": meta.get("asin"),
                "search_term": meta.get("search_term"),
                "site": meta.get("site"),
            })
            stats.set_value(stats_key, 0)
            # Retry until the seed has been written back to Redis.
            while True:
                try:
                    sadd(f"{self.site}_temu_img_search_seed", k, use_md5=False)
                    # zadd(f"{self.site}_real_zset_seed", {k: meta.get("priority")})
                    logging.info("spider get data error")
                    break
                except redis.exceptions.ConnectionError as e:
                    logging.info(f"request push redis error {e}")
                    time.sleep(3)
                    continue
                except FunctionTimedOut as e:
                    logging.info(f"request push redis timeout {e}")
                    continue
            return

        page = meta.get("page")
        logging.info(f"第{page}页数据长度为：{len(goods_list)}")
        items = [
            {
                "asin": meta.get("asin"),
                "asin_compet": str(goods["data"]["goodsId"]),
                "img_url": goods["data"]["thumbUrl"],
                "page": page,
                "page_row": row,
                "state": 1,
                "site": meta.get("site"),
            }
            for row, goods in enumerate(goods_list, start=1)
        ]
        logging.info(f"第{page}页 data finish")
        stats.inc_value(stats_key)
        # Shared by both follow-up requests so parse_tow can merge all pages.
        q = queue.Queue()
        q.put(items)

        # Double quotes and the literal two-character sequence ``\s`` are
        # removed from the query ("\\s" is the same string the original "\s"
        # produced, without the invalid-escape warning).
        query = meta.get('search_term').replace('"', "").replace("\\s", "")
        url = 'https://www.temu.com/api/poppy/v1/search?scene=search_result_rec'
        # Offsets 120 and 240 are requested as pages 2 and 3.
        for page_no, offset in enumerate(range(120, 245, 120), start=2):
            headers = {
                'authority': 'www.temu.com',
                'accept': 'application/json, text/plain, */*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'content-type': 'application/json;charset=UTF-8',
                'origin': 'https://www.temu.com',
                'referer': 'https://www.temu.com/search_result.html?search_key=feelers%20gauge',
                'user-agent': f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(50, 102)}.0.{random.randint(1000, 5000)}.{random.randint(1, 181)} Safari/537.36',
            }
            # json.dumps escapes backslashes/control characters in the query,
            # which the previous %-formatted template did not; for ordinary
            # terms the compact, non-ASCII-preserving output is byte-identical.
            data = json.dumps(
                {
                    "scene": "search_result_rec",
                    "pageSn": 10009,
                    "listId": "1b9086ea1a2d4a1a9b2179f38457cea4",
                    "offset": offset,
                    "pageSize": 120,
                    "query": query,
                    "searchMethod": "",
                    "sprefix": "",
                },
                separators=(",", ":"),
                ensure_ascii=False,
            )
            logging.info(f"post data: {data}")
            cookies = {
                'currency': 'USD',
                'language': 'en',
                'region': '211',
                '_bee': meta.get("bee"),
                'api_uid': meta.get("api_uid"),
            }
            # Build a fresh meta per request instead of mutating response.meta.
            next_meta = dict(meta)
            next_meta.update({
                "anti": True,
                "method": "POST",
                "page": page_no,
                "q": q,
                "curlcffi": True,
            })
            # NOTE(review): this sleep blocks the whole event loop for a second;
            # DOWNLOAD_DELAY already throttles requests - consider removing.
            time.sleep(1)
            yield scrapy.Request(url=url, headers=headers, method="POST", callback=self.parse_tow,
                                 body=data, cookies=cookies, errback=self.err_parse,
                                 dont_filter=True, meta=next_meta)

            # index_url = self.site_url.get(self.site) + i["data"]["seoLinkUrl"].split("?")[0]
            # data = '{"goods_id":"601099512381169","_oak_query_app_only":1,"_oak_stage":4}'
            #
            # r = requests.post(url, verify=False, headers=headers, proxies=proxies, timeout=12, data=data)
            # img_url = "https://www.temu.com/api/oak/integration/render"
            #
            # headers = {
            #     'Host': urlparse(img_url).hostname,
            #     'authority': urlparse(img_url).hostname,
            #     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            #     'accept': 'application/json, text/plain, */*',
            #     'Content-Type': 'application/json;charset=UTF-8',
            #     'accept-language': 'zh-CN,zh;q=0.9',
            #     'referrer': index_url
            # }
            # print(index_url)
            # data = {"goods_id":str(i["data"]["goodsId"]),"_oak_query_app_only":1,"_oak_stage":4}
            # yield scrapy.Request(url=img_url, headers=headers, method="POST", callback=self.parse_img, body=json.dumps(data),
            #                                                     errback=self.err_parse,
            #                                                     dont_filter=True, meta=response.meta)
        # df = pd.DataFrame(items, columns=["new_title", "page", "total_rank", "asin", "data_type", "site"])
        # df.to_sql(name=f"amazon_temu_items", con=self.conn, if_exists='append', index=False)
        # df.to_csv('result.csv', mode='a')

    def time_network_if(self, switch_time):
        """

        :param switch_time: 定时切换ip时间
        :return:
        """
        logging.info(f"{time.localtime()}")
        min = time.localtime().tm_min
        min = 60 if min == 0 else min
        sec = time.localtime().tm_sec
        # sec = 60 if sec == 0 else sec
        if min % switch_time == switch_time - 1 and sec >= 50:
            logging.info(f"% switch time 为Switch time -1 等待切换代理  等待{60 - (60 - sec)}")
            time.sleep(60 - (60 - sec))
        elif min % switch_time == 0 and sec <= 50:
            logging.info(f"% switch time 为0 等待切换代理  等待{60 - sec}")
            time.sleep(60 - sec)
        else:
            logging.info("开始入库")
            return True

    def divide(self, lst, size):
        from math import ceil
        if size <= 0:
            return [lst]
        return [lst[i * size:(i + 1) * size] for i in range(0, ceil(len(lst) / size))]

    def parse_tow(self, response, **kwargs):
        """Parse a follow-up search API page and publish the seed's items when complete.

        Items of this page are appended to the queue shared through
        ``meta['q']``.  Once the per-seed page counter in the crawler stats
        reaches 3, all queued items are merged and sent to the Kafka topic
        ``it_asin_detail``.  A response without a ``result`` (or that is not
        valid JSON) pushes the seed back to the Redis set
        ``{site}_temu_img_search_seed``.

        :param response: response of the search API POST request; its ``meta``
            carries ``asin``, ``search_term``, ``site``, ``page``, ``bee`` and ``q``.
        """
        stats = self.crawler.stats
        meta = response.meta
        page = meta.get("page")
        logging.info(f"第{page}页 {meta.get('bee')}")

        # Decode the body once; a non-JSON body is handled like a missing
        # result so the seed is re-queued rather than lost to an exception.
        try:
            payload = response.json()
        except ValueError as e:
            logging.info(f"page {page} response is not valid JSON: {e}")
            payload = None
        result = payload.get("result") if isinstance(payload, dict) else None

        # Per-seed counter of successfully parsed pages, kept in the crawler stats.
        stats_key = meta.get("asin") + f"_{meta.get('site')}"

        if not result:
            logging.info(f"page {page} not get result")
            k = json.dumps({
                "asin": meta.get("asin"),
                "search_term": meta.get("search_term"),
                "site": meta.get("site"),
            })
            stats.set_value(stats_key, 0)
            # Retry until the seed has been written back to Redis.
            while True:
                try:
                    sadd(f"{self.site}_temu_img_search_seed", k, use_md5=False)
                    # zadd(f"{self.site}_real_zset_seed", {k: meta.get("priority")})
                    logging.info("spider get data error")
                    break
                except redis.exceptions.ConnectionError as e:
                    logging.info(f"request push redis error {e}")
                    time.sleep(3)
                    continue
                except FunctionTimedOut as e:
                    logging.info(f"request push redis timeout {e}")
                    continue
            return

        rec_list = result["data"]["rec_list"]
        logging.info(f"第{page}页数据长度为：{len(rec_list)}")
        items = [
            {
                "asin": meta.get("asin"),
                "asin_compet": str(rec["goods_id"]),
                "img_url": rec["thumb_url"],
                "page": page,
                "page_row": row,
                "state": 1,
                "site": meta.get("site"),
            }
            for row, rec in enumerate(rec_list, start=1)
        ]
        logging.info(f"第{page}页 data finish")
        stats.inc_value(stats_key)
        page_queue = meta.get("q")
        page_queue.put(items)

        # Page 1 plus the two follow-up pages make 3; wait for the last one.
        if int(stats.get_value(stats_key)) != 3:
            return

        stats.set_value(stats_key, 0)
        logging.info(f"Page spider succeed search_term : {meta.get('search_term')}")
        datas = []
        for _ in range(page_queue.qsize()):
            datas += page_queue.get()
        self.cookie_bee = meta.get('bee')

        # The producer may be undefined or None while its setup is disabled;
        # report that explicitly instead of failing with AttributeError.
        producer = getattr(self, "producer", None)
        if producer is None:
            logging.error(
                f"kafka producer is not configured, {len(datas)} items of asin {meta.get('asin')} not sent")
            return

        start_time = int(time.time())
        logging.info(f"statr_time: {start_time}")
        for chunk in self.divide(datas, len(datas) // 3):
            logging.info(f"data len {len(chunk)}")
            # Keep trying this chunk until it is sent outside the proxy-switch
            # window, with connectivity, and Kafka acknowledges it.
            while True:
                if not self.time_network_if(2):
                    continue
                if not self.is_internet_available():
                    logging.info("work error")
                    continue
                if self._send_chunk_to_kafka(producer, chunk):
                    break
        logging.info(f"time consuming : {int(time.time()) - start_time}")
        logging.info(f"to_csv save good asin {meta.get('asin')} {len(datas)}")

    def _send_chunk_to_kafka(self, producer, chunk):
        """Send every record of ``chunk`` and wait for the first, middle and last acks.

        :param producer: object exposing ``send(topic, value, partition=...)``
            that returns a future with ``get(timeout=...)``.
        :param chunk: list of item dicts to send to topic ``it_asin_detail``.
        :return: ``True`` when the awaited acks arrived, ``False`` on ``KafkaError``.
        """
        first_ack = middle_ack = last_ack = None
        # The midpoint is relative to this chunk; the original compared against
        # the length of the full item list, so it practically never matched.
        middle_index = len(chunk) // 2
        try:
            for index, record in enumerate(chunk):
                ack = producer.send("it_asin_detail", record, partition=3)
                if index == 0:
                    first_ack = ack
                elif index == middle_index:
                    middle_ack = ack
                else:
                    last_ack = ack
            for ack in (first_ack, middle_ack, last_ack):
                if ack is not None:
                    ack.get(timeout=10)
        except KafkaError as e:
            logging.info(f'end data seed error {e}')
            return False
        logging.info('end data seed succeed')
        return True

    def err_parse(self, failure, **kwargs):
        """Errback: reset the state of a failed request and re-queue its seed.

        Clears ``self.cookie_bee``, zeroes the per-seed page counter in the
        crawler stats and keeps trying until the seed (``asin``,
        ``search_term``, ``site``) is added to the Redis set
        ``{site}_temu_img_search_seed``.

        :param failure: failure object whose ``request`` is the failed request.
        """
        crawl_stats = self.crawler.stats
        failed_meta = failure.request.meta
        self.cookie_bee = None
        crawl_stats.set_value(failed_meta.get("asin") + f"_{failed_meta.get('site')}", 0)
        seed = {field: failed_meta.get(field) for field in ("asin", "search_term", "site")}
        logging.info(f"spider get error ---：{failure.getErrorMessage()}")
        payload = json.dumps(seed)
        pushed = False
        while not pushed:
            try:
                sadd(f"{self.site}_temu_img_search_seed", payload, use_md5=False)
                # zadd(f"{self.site}_real_zset_seed", {k: meta.get("priority")})
            except redis.exceptions.ConnectionError as exc:
                logging.info(f"request push redis error {exc}")
                # Give Redis a moment before the next attempt.
                time.sleep(3)
            except FunctionTimedOut as exc:
                logging.info(f"request push redis timeout {exc}")
            else:
                logging.info("spider get data error")
                pushed = True

    # def k_seed(self, topic, message, partition):
    #     # if self.is_internet_available():
    #         # if self.producer.bootstrap_connected():
    #     try:
    #         future = self.producer.send(topic, message, partition=partition)
    #     except kafka.errors.KafkaTimeoutError as err:
    #         logging.info(f"kafka error {err}")
    #         return False
    #         # self.producer.flush()
    #         # logging.info("send kafka -----------")
    #         # return True
    #     try:
    #         record_metadata = future.get(timeout=10)
    #         logging.info('Message {} sent to partition {} with offset {}'.format(message, record_metadata.partition,
    #                                                                       record_metadata.offset))
    #         return True
    #     except KafkaError as e:
    #         logging.info('Failed to send message {}: {}'.format(message, e))
    #         return False
    #         # else:
    #         #     # self.producer.close()
    #         #     # self.producer = KafkaProducer(bootstrap_servers=['113.100.143.162:39092'], api_version=(2, 4, 1))
    #         #     logging.info("kafka producer disconnect")
    #         #     time.sleep(1)
    #         #     return False
    #     # else:
    #     #     time.sleep(2)
    #     #     logging.info("Network connection failure")
    #     #     return False


if __name__ == '__main__':
    # Run this spider for the US site through Scrapy's command-line entry point.
    cmdline.execute(['scrapy', 'crawl', 'temu_img_get_search', '-a', 'site=us'])

# us, uk, fr, de, es, it, mx
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl temu_img_get_search  > amazon.log 2>&1 &
# nohup scrapy crawl temu_img_get_search -a site=us  > temu_img_get_search.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl temu_img_get_search -a site=us" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl temu_img_get_search
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl temu_img_get_search -a site=uk > temu_img_get_search.log 2>&1 &
# mini astronautas paracaidistas con paracaídas juguete volador favor de fiesta espacial regalos voladores requeridos ni montaje sin batería rellenos para bolsas de regalos

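# NOTE(security): account credentials should not be kept in source control; move them to a secret store and rotate them.
# 4. Account: highiwgfra@outlook.com
# Password: dvfdsad487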