# coding:utf-8
import copy
import os
import sys
import json
import queue
import time
import scrapy
import logging
from pprint import pprint
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.utils.read_db_data import ReadDb
from amazon_spider.spiders.yswg_spider import SourceSpider


class TemuPostBsrSpider(SourceSpider):
    """Scrapy spider that POSTs to Temu's best-seller recommendation API.

    Work items are read from the database each time the engine goes idle
    (see ``spider_idle``); the goods returned for each category are merged
    across pages in ``parse`` and handed to the configured item pipeline.
    """

    name = 'temu_post_bsr'
    # Per-spider overrides of the project-wide Scrapy settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # Strictly serial crawling: one request in flight at a time.
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        'DOWNLOAD_DELAY': 5,
        'DEPTH_PRIORITY': 0,  # author's note: 1 = breadth-first, 0 = depth-first

        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_TIMEOUT': 10,
        # NOTE(review): `allowed_domains` is normally a spider class attribute,
        # not a settings key; placed here it is probably ignored -- confirm.
        'allowed_domains': ['temu.com'],

        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # set this to the number of retries wanted
        "COOKIES_ENABLED": False,
        'COOKIES_DEBUG': True,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],

        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # The built-in retry middleware is switched off (value None).
            # NOTE(review): with it disabled, the RETRY_* settings above only
            # matter if one of the custom middlewares reads them -- confirm.
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in ascending order
            # 'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middleware.temu.ForeignProxyMiddleware': 450,
            'amazon_spider.middleware.temu.AddAntiMiddleware': 470,
            'amazon_spider.middleware.temu.TemuIsPageError': 480,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Http2Middleware
            # 'amazon_spider.middlewares.Http2Middleware': 490,
            # invoked in descending order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # Pipeline that receives the items yielded by this spider.
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.temu_bsr_pipe.TemuBsrSpiderTextPipeline': 230,
        },
        # Project-specific download handler used for both schemes
        # (module name suggests JA3/TLS fingerprint handling -- TODO confirm).
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Initialise the spider for one Temu marketplace.

        :param site: marketplace code (e.g. ``'us'``); stored on ``self.site``
            and passed to ``ReadDb`` when tasks are loaded.
        """
        super().__init__()
        self.site = site
        # Marketplace code -> storefront base URL.
        self.site_url = dict(
            us='https://www.temu.com',
            de="https://www.temu.de",
            uk="https://www.temu.co.uk",
            it="https://www.temu.it",
            es="https://www.temu.es",
            fr="https://www.temu.fr",
            mx="https://www.temu.mx",
            ca="https://www.temu.ca",
        )

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider through the parent factory and hook up its signals.

        ``spider.close`` is connected to ``spider_closed`` and
        ``spider.spider_idle`` to ``spider_idle``, in that order.

        :param crawler: the crawler the spider is bound to.
        :return: the newly created spider instance.
        """
        spider = super().from_crawler(crawler, *args, **kwargs)
        subscriptions = (
            (spider.close, signals.spider_closed),
            (spider.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return spider

    def hamc_encrypt(self, message, key):
        import hmac, hashlib
        result1 = hmac.new(key.encode(), message.encode(), hashlib.md5).hexdigest()
        return result1

    def spider_idle(self, spider):
        """Refill the scheduler when the engine has run out of requests.

        Connected to the ``spider_idle`` signal in ``from_crawler``. Each call
        reads the pending rows via ``ReadDb(self.site).read_db_temu_bsr()``:

        * rows present -> for every row, three POST requests (pages 1-3) are
          scheduled; the three share one ``queue.Queue`` through ``meta['q']``
          so that ``parse`` can merge their results;
        * no rows -> a keep-alive request is scheduled, the handler sleeps
          30 seconds and raises ``DontCloseSpider`` so the spider stays open.

        (Commented-out code previously here fetched seeds from Redis via
        ``spop``; the database is the only task source now.)

        NOTE(review): the ``time.sleep`` calls run inside a signal handler and
        therefore presumably block the Twisted reactor while sleeping -- kept
        as-is to preserve the existing pacing; confirm this is intended.

        :param spider: the spider the signal was sent for (unused; ``self``
            is used instead).
        :raises DontCloseSpider: when there are no rows to process.
        """
        logging.debug(f'IDLE------------------{self.site}')
        df = ReadDb(self.site).read_db_temu_bsr()

        if not df.shape[0] > 0:
            # Nothing to do: schedule a keep-alive request and wait.
            # NOTE(review): `parse_news_page` and `err_` are not defined in
            # this class; presumably inherited from SourceSpider -- confirm.
            keepalive = scrapy.Request(
                url="https://www.baidu.com/",
                callback=self.parse_news_page,
                errback=self.err_,
                dont_filter=True,
                meta={'handle_httpstatus_all': True},
            )
            self.crawler.engine.crawl(keepalive, spider=self)
            logging.info('no task sleep 30s')
            time.sleep(30)
            raise DontCloseSpider()

        for row in df.values:
            # One queue per row: collects the item lists of its three pages.
            q = queue.Queue()
            for page in range(0, 241, 120):
                time.sleep(5)
                request = self._build_bsr_request(row, page, q)
                self.crawler.engine.crawl(request, self)

    def _build_bsr_request(self, row, page, q):
        """Build the best-seller POST request for one row and one page slot.

        :param row: one record from the task table; indices 1, 2 and 3 are
            used as opt_name, opt_type and opt_id respectively.
        :param page: page slot offset (0, 120 or 240); converted to a 1-based
            page number for ``meta`` and negated for the request priority.
        :param q: queue shared by the three page requests of ``row``.
        :return: the ready-to-schedule ``scrapy.Request``.
        """
        url = "https://www.temu.com/api/poppy/v1/title_bar_recommend?scene=home_title_bar_recommend"
        page_no = page // 120 + 1
        # Flags such as "anti"/"curlcffi"/"impersonate" are presumably read by
        # the custom downloader middlewares -- verify against their code.
        meta = {
            "anti": True,
            "cookiejar": int(time.time()),
            "curlcffi": True,
            "impersonate": 'chrome110',
            "opt_name": row[1],
            "opt_type": row[2],
            "opt_id": row[3],
            "page": page_no,
            "q": q,
            "priority": page_no,
        }
        headers = {
            "Host": "www.temu.com",
            "accept": "application/json, text/plain, */*",
            "content-type": "application/json;charset=UTF-8",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "origin": "https://www.temu.com",
            "referer": "https://www.temu.com/channel/best-sellers.html?filter_items=1%3A1&scene=home_title_bar_recommend&refer_page_el_sn=201341&refer_page_name=bgn_verification&refer_page_id=10017_1710131084558_ho4hb596j9&refer_page_sn=10017&_x_sessn_id=euqaunw2e6",
            "accept-language": "zh-CN,zh;q=0.9"
        }
        cookies = {
            "region": "211",
            "language": "en",
            "currency": "USD",
            "api_uid": "CmxcDmXgP2UJygD8XVrlAg==",
            "timezone": "Asia%2FShanghai",
            "shipping_city": "211",
        }
        # Author's note on the API: when page 3 is requested repeatedly with
        # identical parameters, the first call returns page 1, the second
        # page 2 and the third page 3. That is presumably why the offset is
        # fixed at 240 and the same body is sent for all three page slots.
        payload = {
            "scene": "home_title_bar_recommend",
            "pageSn": 10125,
            "offset": 240,
            "pageSize": 120,
            "listId": "best_sellers_list",
            "filterItems": "1:1",
            "optId": meta['opt_id'],
            "optType": meta['opt_type'],
        }
        logging.info(f"data ---> {payload}")
        body = json.dumps(payload, separators=(',', ':'))

        return scrapy.Request(
            url=url,
            headers=headers,
            method="POST",
            callback=self.parse,
            body=body,
            cookies=cookies,
            priority=-page,
            errback=self.err_parse,
            dont_filter=True,
            meta=meta,
        )

    def is_internet_available(self):
        """Report whether an HTTP GET to http://www.baidu.com completes within 1 second.

        Any HTTP response counts as "available" (the status code is not
        checked). Only errors raised by ``requests`` itself (connection
        failure, timeout, ...) are treated as "offline"; unrelated exceptions
        such as ``KeyboardInterrupt`` are no longer swallowed.

        :return: ``True`` if the request completed, ``False`` on a
            ``requests.RequestException``.
        """
        import requests

        try:
            requests.get("http://www.baidu.com", timeout=1)
        except requests.RequestException:
            return False
        return True

    def parse(self, response, **kwargs):
        # print("-" * 20)
        # with open(f"{response.meta.get('goodid')}.html", "w", encoding="utf-8")as f:
        #     f.write(response.text)
        # if 'Just a moment' in response.text:  # tls指纹被检测到，会返回这个信息
        #     print('被检测')
        # else:
        #     print('成功绕过')
        logging.info(f"{response.meta.get('opt_name')} 爬取第{response.meta.get('page')}页")
        stats = self.crawler.stats
        page_row = 1
        if response.json()["result"].get("data"):
            datas = response.json()["result"]['data']['goods_list']
            items = []
            for i in datas:
                img = i['thumb_url']
                title = i['title']
                sales_tip = i['sales_tip']
                price = i['price_info']['price_str']
                goodid = i['goods_id']
                goods_score = i['comment'].get('goods_score')
                comment_num_tips = i['comment'].get('comment_num_tips')
                item = {
                    "title": title,
                    "img_url": img,
                    "sales_tip": sales_tip,
                    "price": price,
                    "goodid": goodid,
                    "goods_score": goods_score,
                    "comment_num_tips": comment_num_tips,
                    "opt_name": response.meta['opt_name'],
                    "page": response.meta['page'],
                    "state": 1,
                    "page_row": page_row
                }
                items.append(item)
                pprint(item)
                page_row += 1
            stats.inc_value(response.meta.get("opt_name"))
            response.meta.get('q').put(items)
            if 3 == int(stats.get_value(response.meta.get("opt_name"))):
                logging.info(
                    f"页数爬取成功opt_name为: {response.meta.get('opt_name')} 数据长度： {response.meta.get('q').qsize()}")
                stats.set_value(response.meta.get("opt_name"), 0)
                max_datas = []
                for i in range(response.meta.get("q").qsize()):
                    max_datas += response.meta.get("q").get()

                yield {"inner_item": max_datas}
        else:
            stats.set_value(response.meta.get("opt_name"), 0)
            item = {
                "error_asin": True,
                "status": (1, response.meta.get("opt_name")),
            }
            logging.info(f"爬取失败 opt_name: {response.meta.get('opt_name')}")
            yield item

    def err_parse(self, failure, **kwargs):
        stats = self.crawler.stats
        request = failure.request
        stats.set_value(request.meta.get("opt_name"), 0)
        item = {
            "error_asin": True,
            "status": (1, request.meta.get("opt_name")),
        }
        logging.info(f"爬取失败 opt_name: {request.meta.get('opt_name')}")
        yield item


if __name__ == '__main__':
    # Run this spider for the US site when the module is executed directly.
    cmdline.execute(['scrapy', 'crawl', 'temu_post_bsr', '-a', 'site=us'])


# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl temu_get_detail > temu_detail.log 2>&1 &
# nohup scrapy crawl temu_post_bsr -a site=us  > temu_bsr.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl temu_get_detail -a site=us" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl temu_get_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl temu_get_detail -a site=us > temu_detail.log 2>&1 &

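# SECURITY NOTE(review): plaintext account credentials are kept in this source file;
# move them to a secret store, rotate the password and remove them from version control.
# 4. Account: highiwgfra@outlook.com
# Password: dvfdsad487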