import time
import queue
import scrapy
import logging
import pandas as pd
from urllib.parse import urlencode
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider


class CantonfairBsrSpider(scrapy.Spider):
    """Collect the "热门推荐" (hot recommendation) products of Canton Fair shops.

    Workflow, as implemented below:

    * ``__init__`` loads the seed workbook and keeps the rows whose ``state``
      column equals ``STATE_PENDING``.
    * ``spider_idle`` turns every pending row into one request against the
      ``shopProducts/searchByVariables`` endpoint.
    * ``parse`` flattens each listed product into a dict, buffers the dicts in
      ``self.q`` and marks the shop as ``STATE_DONE`` in ``self.seeds``.
    * ``close`` merges the buffered rows into ``save_file`` and writes the
      updated seed table back to ``seeds_file``.
    """

    name = 'b2b_shop_bsr_api'
    # ``allowed_domains`` is a spider attribute; Scrapy does not read it from
    # ``custom_settings``, so it is declared here instead.
    allowed_domains = ['cantonfair.org.cn']

    # Default input/output workbooks; both can be overridden per run through
    # the ``seeds_file`` / ``save_file`` spider arguments.
    DEFAULT_SEEDS_FILE = './cantonfair_五金工具_2088_1.xlsx'
    DEFAULT_SAVE_FILE = './cantonfair_五金工具_bsr_max1.xlsx'

    # Values of the ``state`` column of the seed workbook used by this spider.
    STATE_PENDING = 1
    STATE_DONE = 3

    # Positional index of the shop code inside a seed row.
    # NOTE(review): ``parse`` matches this value against the ``shop_id``
    # column, so index 6 is presumably that column -- confirm against the
    # workbook layout before switching to access by name.
    SHOP_CODE_COLUMN = 6

    API_URL = "https://www.cantonfair.org.cn/b2bshop/api/themeRos/public/shopProducts/searchByVariables"
    API_HEADERS = {
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://www.cantonfair.org.cn/zh-CN/shops/492417923772864?keyword=",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "x-user-lan": "zh-CN"
    }

    # Translation table mapping ASCII control characters (everything below
    # 0x20 except tab, LF and CR) to a space.  The original code replaced only
    # \x01, \x05 and \x08 in titles -- presumably because the Excel writer
    # rejects such characters; this table covers the whole family.
    _CONTROL_CHARS = dict.fromkeys(
        [*range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20)], ' ')

    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # Proxy settings for curl_cffi (disabled).
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        # 'DOWNLOAD_DELAY': 2,
        # # When enabled, Scrapy waits a random time (0.5x to 1.5x of
        # # DOWNLOAD_DELAY) between requests to the same site.
        # 'RANDOMIZE_DOWNLOAD_DELAY': True,
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # invoked in increasing order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # invoked in decreasing order
            # 'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_comment_pipe_text.AmazonCommentSpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
        # 'DOWNLOAD_HANDLERS': {
        #     'http': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        #     'https': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        # }
    }

    def __init__(self, *args, seeds_file=None, save_file=None, **kwargs):
        """Load the seed workbook and prepare the in-memory result buffer.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            seeds_file: Path of the seed workbook; it is read here and
                overwritten by ``close``.  Defaults to ``DEFAULT_SEEDS_FILE``.
            save_file: Path of the result workbook written by ``close``.
                Defaults to ``DEFAULT_SAVE_FILE``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__`` so
                that ``-a key=value`` spider arguments no longer raise
                ``TypeError``.
        """
        super().__init__(*args, **kwargs)
        self.utils_requests = [{"use_aiohttp": True}, {"curlcffi": True}, {"use_httpx": True}]

        self.seeds_file = seeds_file or self.DEFAULT_SEEDS_FILE
        self.save_file = save_file or self.DEFAULT_SAVE_FILE
        # Identifier columns are read as text so they are not converted to numbers.
        self.seeds = pd.read_excel(self.seeds_file, dtype={'shop_id': str, 'commodity_id': str})
        # Snapshot of the rows that still have to be crawled.
        self.seeds_new = self.seeds[self.seeds['state'] == self.STATE_PENDING]
        # Set once the snapshot has been handed to the engine.
        self._seeds_scheduled = False
        # Each entry is the list of product dicts produced by one ``parse`` call.
        self.q = queue.Queue()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and subscribe it to the closed and idle signals."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def _build_request(self, shop_code):
        """Return the hot-recommendation product request for one shop."""
        params = {
            "lang": "zh-CN,en-US",
            "industrySiteId": "461110967833538560",
            "shopCode": shop_code,
            "unbox": "true",
            "filter": "salesInfo.udfs.productGroup in ['热门推荐'] and salesInfo.status eq 'LISTED'",
            "orderBy": "salesInfo.udfs.sequence asc, salesInfo.lastModifiedTime desc",
            "page": "0",
            "size": "10"
        }
        meta = {
            "shopCode": shop_code,
            'choice_header': True,
            'index_url': f'https://www.cantonfair.org.cn/zh-CN/shops/{shop_code}'
        }
        return scrapy.Request(
            self.API_URL + '?' + urlencode(params),
            headers=self.API_HEADERS,
            callback=self.parse,
            errback=self.err_parse,
            meta=meta,
        )

    def spider_idle(self, spider):
        """Feed the pending seeds to the engine when it runs out of work.

        Args:
            spider: The spider instance delivered with the signal (unused).

        Raises:
            DontCloseSpider: When the seed snapshot is empty, after sleeping
                for 30 seconds, so that the engine keeps the spider open.
        """
        logging.debug('IDLE------------------')
        if self.seeds_new.shape[0] > 0:
            if self._seeds_scheduled:
                # Everything was already submitted on an earlier idle signal.
                # Returning normally lets the engine close the spider instead
                # of rebuilding every request only to have it rejected as a
                # duplicate.
                return
            for row in self.seeds_new.values:
                shop_code = f"{row[self.SHOP_CODE_COLUMN]}"
                self.crawler.engine.crawl(self._build_request(shop_code), self)
            self._seeds_scheduled = True
        else:
            # NOTE(review): ``self.seeds_new`` is never refreshed, so with an
            # empty snapshot this branch repeats on every idle signal and the
            # spider never closes on its own -- confirm this is intended.
            logging.info('no task sleep 30s')
            time.sleep(30)
            raise DontCloseSpider()

    def parse(self, response, **kwargs):
        """Buffer the products listed in one API response and mark the shop done.

        Args:
            response: JSON response of the request built by ``_build_request``;
                ``response.meta["shopCode"]`` identifies the shop.
            **kwargs: Ignored.

        Raises:
            KeyError: When the payload lacks one of the keys read below; the
                shop then keeps its current ``state``.
        """
        shop_code = response.meta["shopCode"]
        shop_url = f"https://www.cantonfair.org.cn/zh-CN/shops/{shop_code}?keyword=#/"

        items = []
        for product in response.json()['_embedded']['b2b:listedProducts']:
            sales_info = product['salesInfo']
            item = {
                'shop_id': shop_code,
                'commodity_id': product['id'],
                'minimumOrderQuantity': product['minimumOrderQuantity'],
                'price': str(product['udfs']),
                'uom': str(product['uom']),
                'material': sales_info['material'],
                'title': product['sku']['name'].translate(self._CONTROL_CHARS),
                'MainMarkets': ','.join(sales_info['udfs']['MainMarkets']),
                'commodity_url': f'https://www.cantonfair.org.cn/zh-CN/shops/{shop_code}/listedProducts/{product["id"]}?search=',
                'login': sales_info['isDisplayPrice'],
                'shop_url': shop_url
            }
            items.append(item)
            logging.debug('parsed product: %s', item)

        self.q.put(items)
        self.seeds.loc[self.seeds['shop_id'] == shop_code, 'state'] = self.STATE_DONE

    def err_parse(self, failure, **kwargs):
        """Errback: log which request failed and why.

        Args:
            failure: The failure passed to the errback; Scrapy attaches the
                originating request to it as ``failure.request``.
            **kwargs: Ignored.
        """
        request = failure.request
        logging.info(
            "error______ shop=%s url=%s %s",
            request.meta.get('shopCode'), request.url, failure.getErrorMessage(),
        )

    def close(self, spider, reason):
        """Persist the buffered products and the updated seed table.

        New rows are placed before the rows already stored in ``save_file`` so
        that ``drop_duplicates`` keeps the freshly scraped version of a
        product.  The seed table is written last: if saving the products
        fails, the ``state`` changes are not persisted either.

        Args:
            spider: The spider instance delivered with the signal (unused).
            reason: The close reason delivered with the signal (unused).
        """
        logging.info("spider finish")

        rows = []
        while True:
            try:
                rows += self.q.get_nowait()
            except queue.Empty:
                break

        # Skipping the merge when nothing was scraped avoids a KeyError from
        # ``drop_duplicates`` on a frame that has no ``commodity_id`` column.
        if rows:
            df_new = pd.DataFrame(rows)
            # The stored workbook is read back with ``commodity_id`` as text;
            # use the same type here so duplicates compare equal.
            df_new['commodity_id'] = df_new['commodity_id'].astype(str)
            logging.info("new rows: %s", df_new.shape)
            try:
                df_old = pd.read_excel(self.save_file, dtype={'shop_id': str, 'commodity_id': str})
            except FileNotFoundError:
                logging.info("没有该文件")
                df_save = df_new
            else:
                logging.info("stored rows: %s", df_old.shape)
                df_save = pd.concat([df_new, df_old])
            logging.info("merged rows: %s", df_save.shape)
            df_save = df_save.drop_duplicates(['commodity_id'])
            logging.info("rows after de-duplication: %s", df_save.shape)
            # ``to_excel`` no longer accepts ``encoding`` (removed in pandas
            # 2.0); passing it raised TypeError and discarded the whole crawl.
            df_save.to_excel(self.save_file, index=False)

        self.seeds.to_excel(self.seeds_file, index=False)


if __name__ == '__main__':
    # Running this file directly hands the same argument vector to Scrapy's
    # command-line entry point that `scrapy crawl b2b_shop_bsr_api` would
    # produce in a shell.
    cmdline.execute('scrapy crawl b2b_shop_bsr_api'.split())

# Deployment notes (shell commands kept for reference; they launch the amazon_comment* spiders):
# source activate pyspark
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_comment_history -a site=us > amazon.log 2>&1 &
# nohup scrapy crawl amazon_comment_all -a site=us > amazon_history_comment_us1.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl amazon_comment_all" |awk '{print $2}' `; do kill -9 $i ; done;
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
# nohup scrapy crawl amazon_comment -a site=de > amazon_de1.log 2>&1 &



