# coding:utf-8
import os
import re
import sys
import time
import queue
import scrapy
import logging
import platform
from urllib.parse import quote
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.utils.read_db_data import ReadDb

# Module for the 1688 company-search spider defined below.
# Apply the TZ environment variable to the process via time.tzset() on every
# platform except Windows (time.tzset is a Unix-only function); on Windows
# just announce the platform on stdout.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class Post1688SearchSpider(scrapy.Spider):
    """Search 1688 for companies by name and fetch contact data for the matches.

    Flow, as implemented below:

    1. ``spider_idle`` runs whenever the engine has no pending requests. It
       reads seed rows through ``ReadDb``, derives search keywords from each
       row's name (``key_word``) and schedules company-search requests.
    2. ``parse_json`` keeps the search hits whose company name contains the
       seed name or the search keyword and requests the contact module for
       each of them.
    3. ``parse_info`` converts the contact response into an item.

    Failures are reported as ``{"error_asin": True, "search_term": (code, name)}``
    items (code 4: no search results on page 1, code 1: request failed).
    """

    name = 'post_1688_search_company'
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        'DOWNLOAD_DELAY': 4,
        'DEPTH_PRIORITY': 1,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): ``allowed_domains`` is normally a spider attribute, not
        # a Scrapy setting, so this entry probably has no effect here. Kept
        # unchanged; confirm before moving it.
        'allowed_domains': ['search.1688.com'],
        # # Whether to flush the request queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis-backed scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate requests through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted - load from secret storage, never commit>',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries per request
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in increasing order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.SearchCookiesZip': 480,
            'amazon_spider.middleware.contact_info_1688.AddSignMiddleware': 470,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,

            # invoked in decreasing order
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.one_688_get_address.ContactInfo1688SpiderPipeline': 230,
            # 'amazon_spider.pipeline.detail_1688_pipe.Search1688SpiderPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    # Patterns used by ``key_word``; compiled once instead of on every call.
    # Captures the part of a name that precedes a legal-form / shop-type suffix.
    _COMPANY_SUFFIX_RE = re.compile(
        r".*?(?:市|上海|县|区|乡)?(.*)(?:商务商行|用品厂|有限公司|厂|实力供应商|经营部|商行)+")
    # Administrative-region markers (city, county, district, ...).
    _REGION_RE = re.compile('市|上海|县|区|乡')
    # Captures whatever follows the first run of region markers.
    _REGION_TAIL_RE = re.compile(r".*?(?:市|上海|县|区|乡)+(.*)")
    # Generic words removed from the keyword before searching.
    _NOISE_RE = re.compile('供应链|实力供应商|官方旗舰店|实力旗舰店|五金制品|义乌|深圳|徐州|玻璃制品')

    # Headers sent with every company-search request.
    _SEARCH_HEADERS = {
        'authority': 'search.1688.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': 'https://search.1688.com/company/pc/factory_search.html?spm=a260k.19776607.kyttf5s0.3.1c5c4d84sYaCQA&charset=utf8&hideMainTab=1&keywords=&pagesource=sem_a0c5fccd75056341b8789b8cc8dcf77ea43f&beginPage=2',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Endpoint and headers of the contact-info request issued per matched company.
    _CONTACT_URL = 'https://h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.pc.moduleasyncservice/1.0/'
    _CONTACT_HEADERS = {
        'Host': 'h5api.m.1688.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'referer': 'https://kszwyr.1688.com/',
        'accept-language': 'zh-CN,zh;q=0.9',
    }

    def __init__(self, page_spider=5, *args, **kwargs):
        """Create the spider.

        :param page_spider: exclusive upper bound of the result pages to
            request; pages ``1 .. page_spider - 1`` are crawled per keyword.
            Accepts an int or a numeric string (``-a page_spider=2``).
        :param args: forwarded to ``scrapy.Spider.__init__``.
        :param kwargs: forwarded to ``scrapy.Spider.__init__`` so additional
            ``-a`` arguments no longer raise ``TypeError``.
        """
        super().__init__(*args, **kwargs)
        self.page_spider = int(page_spider)
        self.site = "us"
        # NOTE(review): 'memberId' is listed twice and ``col`` is not read in
        # this class - confirm against the pipeline before changing it.
        self.col = ['company_name', 'memberId', 'memberId', 'page', 'search_term']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook it to the close and idle signals."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def get_seeds(self):
        """Read the pending seed rows from the database.

        :return: whatever ``ReadDb(self.site).read_db_1688_address()`` returns
            (used by ``spider_idle`` as a DataFrame: ``.shape`` / ``.values``).
        :raises DontCloseSpider: after a 30 s pause when the read fails with
            ``OperationalError`` or ``FunctionTimedOut``, so that the engine
            keeps the spider open and fires the idle signal again.
        """
        try:
            return ReadDb(self.site).read_db_1688_address()
        except OperationalError as e:
            logging.info(f'get seeds failure in link sleep 30s{e}')
        except FunctionTimedOut as e:
            logging.info(f'get seeds time out sleep 30s{e}')
        # Only reached when one of the handlers above ran.
        # NOTE(review): time.sleep blocks the whole process while waiting.
        time.sleep(30)
        raise DontCloseSpider

    def key_word(self, i):
        """Derive the search keywords for one seed row.

        ``i[1]`` holds one or more company names separated by ``<append>``
        (or by ``/`` when ``<append>`` is absent). For every name the legal
        form suffix, leading region prefixes and a few generic words are
        removed so that only the distinctive part is searched.

        :param i: seed row; only ``i[1]`` (the company name string) is read.
        :return: set of non-empty keywords.
        """
        raw_name = i[1]
        separator = "<append>" if "<append>" in raw_name else "/"
        keywords = set()
        for segment in (part.strip() for part in raw_name.split(separator)):
            if not segment:
                # An empty keyword would be a substring of every company name
                # and make every search hit count as a match.
                continue
            found = self._COMPANY_SUFFIX_RE.findall(segment)
            core = found[0] if found else segment
            # Drop leading region prefixes one at a time; ``core`` shrinks on
            # every pass, so the loop terminates.
            while self._REGION_RE.search(core):
                tails = self._REGION_TAIL_RE.findall(core)
                if not tails:
                    core = segment
                    break
                core = tails[0]
            core = self._NOISE_RE.sub("", core)
            # If stripping removed everything, search for the name as given.
            keywords.add(core or segment)
        return keywords

    def _build_search_requests(self, row):
        """Yield the company-search requests for one seed row."""
        source_name = row[1]
        for keyword in self.key_word(row):
            # One queue object is shared by all requests of a keyword; it is
            # not read in this class (presumably consumed by a middleware).
            dq = queue.Queue()
            str_encoding = quote(keyword, encoding='gbk').replace("/", "%5C%2F")
            for page in range(1, self.page_spider):
                for async_count in (6, 14):
                    start_index = 0 if async_count == 6 else 6
                    url = (
                        'https://search.1688.com/service/companyInfoSearchDataService'
                        f'?keywords={str_encoding}&async=true&asyncCount={async_count}'
                        f'&beginPage={page}&pageSize=20&startIndex={start_index}'
                        '&pageName=findPCFactory'
                    )
                    meta = {
                        "name": source_name,
                        "cookiejar": int(time.time()),
                        "curlcffi": True,
                        "search_keyword": keyword,
                        "page": page,
                        "asyncCount": async_count,
                        "dq": dq,
                        "impersonate": 'chrome101',
                    }
                    yield scrapy.Request(url=url, headers=self._SEARCH_HEADERS,
                                         callback=self.parse_json, errback=self.err_parse,
                                         dont_filter=True, meta=meta)

    def spider_idle(self, spider):
        """Idle-signal handler: feed new seeds or keep the spider alive.

        With seeds available, the search requests are scheduled on the engine.
        Without seeds, a placeholder request is scheduled, the process sleeps
        30 s and ``DontCloseSpider`` is raised so the engine does not shut the
        spider down.

        :raises DontCloseSpider: when there are no seeds or reading them failed.
        """
        logging.debug('IDLE------------------')
        df = self.get_seeds()
        if df.shape[0] > 0:
            for row in df.values:
                for request in self._build_search_requests(row):
                    # NOTE(review): newer Scrapy releases may no longer accept
                    # the spider argument of ExecutionEngine.crawl - confirm
                    # against the installed version.
                    self.crawler.engine.crawl(request, self)
        else:
            placeholder = scrapy.Request(url="https://www.baidu.com/",
                                         callback=self.parse_new_page, errback=self.err_,
                                         dont_filter=True,
                                         meta={'handle_httpstatus_all': True})
            self.crawler.engine.crawl(placeholder, spider=self)
            logging.info('no task sleep 30s')
            time.sleep(30)
            raise DontCloseSpider()

    def parse_new_page(self, response, **kwargs):
        """Callback of the placeholder request: emit the finish marker item."""
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        """Errback of the placeholder request.

        :param response: despite its name this is used as the failure object
            handed to errbacks (``getErrorMessage()`` and ``.request`` are
            read from it).
        :return: the same finish marker item as ``parse_new_page``.
        """
        logging.info("sleep=========error %s %s",
                     response.getErrorMessage(), response.request.meta.get("asin"))
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    @staticmethod
    def _seed_name(meta):
        """Return the seed company name stored in a request's meta.

        Contact requests carry it as ``search_term``; search requests only
        carry it as ``name``.
        """
        return meta.get("search_term") or meta.get("name")

    def parse_json(self, response, **kwargs):
        """Handle a company-search response.

        Yields an error item (code 4) when page 1 reports no results, or one
        contact-info request per company whose name contains the seed name or
        the search keyword. A captcha page is only logged.
        """
        if "window._config_ =" in response.text:
            logging.info("验证码")
            return

        meta = response.meta
        source_name = meta.get("name")
        keyword = meta.get("search_keyword")
        payload = response.json()["data"]["data"]

        if payload["totalCount"] == 0 and meta.get("page") == 1:
            logging.info("没有数据")
            yield {
                "error_asin": True,
                "search_term": (4, self._seed_name(meta)),
            }
            return

        matches = []
        returned_names = []  # every company name in the response, for the log
        for entry in payload.get("companyWithOfferLists") or []:
            factory = entry["factoryInfo"]
            company = factory["company"]
            returned_names.append(company)
            if (source_name in company) or (keyword in company):
                member_id = factory["factoryDetailUrl"].split("memberId=")[-1].split("&")[0]
                matches.append({
                    "name": company,
                    "source_name": source_name,
                    "search_keyword": keyword,
                    "member_id": member_id,
                })

        if not matches:
            logging.info(
                f"通过公司名称 {keyword} {source_name} 搜索  匹配不符合的为：{returned_names}")
            return

        logging.info(f"符合： {source_name} {matches}")
        logging.info(
            f"通过公司名称 {keyword} {source_name} 搜索  匹配符合的为：{matches}")
        for match in matches:
            contact_meta = {
                "sign_1688": True,
                "memberId": match.get("member_id"),
                # "amazon_proxy": True,
                "cookiejar": int(time.time()),
                "company_name": match.get("name"),
                'search_term': source_name,
            }
            yield scrapy.Request(url=self._CONTACT_URL, headers=self._CONTACT_HEADERS,
                                 callback=self.parse_info, errback=self.err_parse,
                                 dont_filter=True, meta=contact_meta)

    def parse_info(self, response, **kwargs):
        """Turn a contact-info response into ``{"inner_item": {...}}``.

        Missing fields (or a missing ``data`` object) produce ``None`` values;
        a missing fax number is stored as an empty string.
        """
        data_dict = response.json()
        data = data_dict.get("data") or {}
        fax = data.get("faxNum")
        logging.debug("传真 %s", fax)
        logging.debug("%s", data_dict)
        items = {
            "company_name": data.get("companyName"),
            "mobileNo": data.get("mobileNo"),
            "phoneNum": data.get("phoneNum"),
            "fax": fax or "",
            "contact_name": data.get("name"),
            "position": data.get("jobTitle"),
            "memberId": response.meta.get("memberId"),
            "search_term": response.meta.get("search_term"),
        }
        yield {"inner_item": items}

    def err_parse(self, response, **kwargs):
        """Errback of search and contact requests: report the seed as failed.

        :param response: used as the failure object handed to errbacks; only
            ``response.request.meta`` is read.
        """
        seed_name = self._seed_name(response.request.meta)
        self.crawler.stats.set_value(seed_name, 0)
        item = {
            "error_asin": True,
            "search_term": (1, seed_name),
        }
        logging.info(f"爬取失败关键词:{item.get('search_term')}")
        yield item


if __name__ == '__main__':
    # Convenience entry point: same as running
    # ``scrapy crawl post_1688_search_company -a page_spider=2`` from a shell.
    cmdline.execute(['scrapy', 'crawl', 'post_1688_search_company', '-a', 'page_spider=2'])

# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl get_1688_search > 1688_search1.log 2>&1 &
# nohup scrapy crawl get_1688_search -a page_spider=5 > 1688_search1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl get_1688_search" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl get_1688_search
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl get_1688_search > 1688_search1.log 2>&1 &
