# coding:utf-8
import os
import sys
import time
import queue
import scrapy
import logging
import platform
from urllib.parse import quote
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
# Append the directory two levels above sys.path[0] to the import path
# (presumably so the `amazon_spider` package resolves when this file is run
# directly -- confirm against the project layout).
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # grandparent of sys.path[0]
# Project-local DB helper; spider_idle uses it to read the seed keywords.
from amazon_spider.utils.read_db_data import ReadDb

# time.tzset() is only available on Unix, so it is skipped on Windows.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class Get1688SearchSpider(scrapy.Spider):
    """Collect factory listings from the 1688 company-search JSON service.

    Whenever the engine goes idle, the spider reads a batch of seed keywords
    through ``ReadDb`` and schedules, for every keyword, two requests per
    result page (``asyncCount`` 6 and 14).  Each successful response is parsed
    into ``company_name`` / ``memberId`` rows; once every response expected
    for a keyword has arrived, the rows are emitted as one ``inner_item``.
    """

    name = 'get_1688_search'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        'DOWNLOAD_DELAY': 2,
        # NOTE(review): ``allowed_domains`` is normally a spider attribute, not
        # a Scrapy setting; placed here it is presumably ignored -- confirm
        # before moving it (the idle probe below targets another domain).
        'allowed_domains': ['1688.com'],
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',  # supply via environment/secret store, never commit
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries per request
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # Downloader middlewares for this spider.
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # (original note: called in ascending order)
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.SearchCookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # (original note: called in descending order)
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.detail_1688_pipe.Search1688SpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, page_spider=5, **kwargs):
        """Initialise the spider.

        :param page_spider: exclusive upper bound of the page range; pages
            ``1 .. page_spider - 1`` are requested for every keyword.  Given
            as a string when passed with ``-a page_spider=N``.
        :param kwargs: any further spider arguments, forwarded to
            ``scrapy.Spider.__init__`` so that extra ``-a key=value`` options
            no longer raise ``TypeError``.
        """
        super().__init__(**kwargs)
        self.page_spider = int(page_spider)
        self.site = "us"
        # NOTE(review): 'memberId' is listed twice; not read inside this class
        # -- confirm against the pipeline before changing.
        self.col = ['company_name', 'memberId', 'memberId', 'page', 'search_term']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and subscribe it to the close and idle signals."""
        spider = super(Get1688SearchSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        """Refill the scheduler with seed keywords when the engine is idle.

        Reads the keyword batch via ``ReadDb``.  When rows are returned, the
        search requests for each keyword are scheduled and the method returns
        normally.  When the read fails, or the batch is empty, the method
        sleeps 30 seconds and raises ``DontCloseSpider`` to keep the spider
        alive.

        NOTE(review): ``time.sleep`` runs in the reactor thread, so no
        download progresses while it waits.

        :param spider: the spider instance delivered with the signal (unused).
        :raises DontCloseSpider: on a DB error, a timeout, or an empty batch.
        """
        logging.debug('IDLE------------------')
        try:
            df = ReadDb("us").read_db_1688_serch_keyword()
        except OperationalError as e:
            logging.info(f'get seeds failure in link sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider
        except FunctionTimedOut as e:
            logging.info(f'get seeds time out sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider

        if df.shape[0] > 0:
            for row in df.values:
                # Column 1 of each row is used as the search keyword.
                self._schedule_keyword(row[1])
            return

        # Empty batch: schedule a probe request whose callbacks emit a
        # ``finish_spider`` item (presumably consumed by the pipeline --
        # confirm), then wait before the next poll.
        meta = {
            'handle_httpstatus_all': True
        }
        request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_new_page,
                                 errback=self.err_, dont_filter=True, meta=meta)
        self.crawler.engine.crawl(request, spider=self)
        logging.info('no task sleep 30s')
        time.sleep(30)
        raise DontCloseSpider()

    def _schedule_keyword(self, keyword):
        """Schedule every page/asyncCount request for one search keyword."""
        # One queue per keyword batch; parse_json collects the parsed rows of
        # each response in it until the whole batch has arrived.
        dq = queue.Queue()
        for page in range(1, self.page_spider):
            for async_count in [6, 14]:
                request = self._build_search_request(keyword, page, async_count, dq)
                # NOTE(review): support for the ``spider`` argument of
                # engine.crawl varies by Scrapy version -- confirm on upgrade.
                self.crawler.engine.crawl(request, self)

    def _build_search_request(self, keyword, page, async_count, dq):
        """Build one request against the company-search JSON endpoint."""
        start_index = 0 if async_count == 6 else 6
        # quote() leaves "/" unescaped by default; any slash in the keyword is
        # then sent as "%5C%2F".
        encoded_keyword = quote(keyword, encoding='gbk').replace("/", "%5C%2F")
        url = (
            'https://search.1688.com/service/companyInfoSearchDataService'
            f'?keywords={encoded_keyword}&async=true&asyncCount={async_count}'
            f'&beginPage={page}&pageSize=20&startIndex={start_index}'
            '&pageName=findPCFactory'
        )
        headers = {
            'authority': 'search.1688.com',
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'referer': 'https://search.1688.com/company/pc/factory_search.html?spm=a260k.19776607.kyttf5s0.3.1c5c4d84sYaCQA&charset=utf8&hideMainTab=1&keywords=&pagesource=sem_a0c5fccd75056341b8789b8cc8dcf77ea43f&beginPage=2',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        }
        meta = {
            "use_aiohttp": True,
            "search_term": keyword,
            # "amazon_proxy": True,
            # "cookiejar": int(time.time()),
            "page": page,
            "asyncCount": async_count,
            "dq": dq
        }
        return scrapy.Request(url=url, headers=headers,
                              callback=self.parse_json, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def parse_new_page(self, response, **kwargs):
        """Callback of the idle probe request: emit the ``finish`` marker item."""
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        """Errback of the idle probe request.

        ``response`` is the failure object passed to the errback; its error
        message is printed and the same ``finish`` marker item is emitted.
        """
        print("sleep=========error", response.getErrorMessage(), response.request.meta.get("asin"))
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def parse_json(self, response, **kwargs):
        """Parse one company-search JSON response.

        Yields an ``error_asin`` item with code 4 when page 1 reports a
        ``totalCount`` of 0.  Otherwise the company rows are stored in the
        keyword's queue, and once ``(page_spider - 1) * 2`` responses have
        been counted for the keyword, all queued rows are yielded together
        as ``{"inner_item": [...]}``.
        """
        stats = self.crawler.stats
        meta = response.meta
        search_term = meta.get("search_term")
        page = meta.get("page")
        payload = response.json()["data"]["data"]

        if payload["totalCount"] == 0 and page == 1:
            print("没有数据")
            yield {
                "error_asin": True,
                "search_term": (4, search_term),
            }
            return

        rows = []
        for entry in payload["companyWithOfferLists"]:
            factory = entry["factoryInfo"]
            print("---------------------------------------")
            print("company name", factory["company"])
            # Earlier debug output also read factoryLevel, factorySize,
            # repeatRate and wwResponseRate from factoryInfo.
            member_id = factory["factoryDetailUrl"].split("memberId=")[-1].split("&")[0]
            rows.append({
                "company_name": factory["company"],
                "memberId": member_id,
                "page": page,
                "search_term": search_term
            })

        dq = meta.get("dq")
        dq.put(rows)
        # The crawler stats double as a per-keyword counter of parsed responses.
        # NOTE(review): the counter is keyed by keyword while the queue is per
        # batch; after err_parse resets it, late responses of the same batch
        # may leave a stale count for the next batch -- confirm intended.
        stats.inc_value(search_term)
        if int(stats.get_value(search_term)) == ((self.page_spider - 1) * 2):
            merged_rows = []
            for _ in range(dq.qsize()):
                merged_rows += dq.get()
            stats.set_value(search_term, 0)
            yield {"inner_item": merged_rows}

    def err_parse(self, response, **kwargs):
        """Errback of the search requests.

        ``response`` is the failure object passed to the errback.  Resets the
        keyword's response counter and yields an ``error_asin`` item with
        code 1 for the keyword.
        """
        stats = self.crawler.stats
        search_term = response.request.meta.get("search_term")
        stats.set_value(search_term, 0)
        item = {
            "error_asin": True,
            "search_term": (1, search_term),
        }
        logging.info(f"爬取失败关键词:{item.get('search_term')}")
        yield item


if __name__ == '__main__':
    # Launch the spider through Scrapy's command line when run as a script.
    command = 'scrapy crawl get_1688_search -a page_spider=3'
    cmdline.execute(command.split())

# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl get_1688_search > 1688_search1.log 2>&1 &
# nohup scrapy crawl get_1688_search -a page_spider=5 > 1688_search1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl get_1688_search" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl get_1688_search
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl get_1688_search > 1688_search1.log 2>&1 &
