# coding:utf-8
import os
import re
import sys
import json
import time
import queue
import scrapy
import random
import logging
import platform
import Levenshtein
from urllib.parse import quote
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from scrapy.exceptions import DontCloseSpider
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from func_timeout.exceptions import FunctionTimedOut
# Database reader used by this spider to load the 1688 company-name seeds
from amazon_spider.utils.read_db_data import ReadDb

# time.tzset() is only called on non-Windows platforms; on Windows the
# script just announces the platform.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class Get1688SearchCompanySpider(scrapy.Spider):
    """Scrapy spider that looks up company contact details on 1688.

    When the engine goes idle the spider reads company names through
    ``ReadDb``, searches ``search.1688.com`` for each cleaned-up name, and for
    every matching company requests the contact information from
    ``h5api.m.1688.com``.
    """
    name = 'get_1688_search_company'
    # Per-spider overrides of the project settings.
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # Strictly sequential crawling: one request in flight, 6 s apart.
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        'DOWNLOAD_DELAY': 6,
        'DEPTH_PRIORITY': 1,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): allowed_domains is normally a spider class attribute,
        # not a settings key, so this entry is presumably ignored by Scrapy --
        # TODO confirm. If it were enforced, the h5api.m.1688.com requests
        # issued by parse_json would fall outside it.
        'allowed_domains': ['search.1688.com'],
        # # Whether to flush the request queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Store the request queue in the Redis-backed scheduler
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed-queue connection details
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted: keep credentials out of source control>',
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries per request
        'COOKIES_ENABLED': False,
        'COOKIES_DEBUG': True,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # invoked in ascending order
            # 'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.SearchCookiesZip': 480,
            'amazon_spider.middleware.contact_info_1688.AddSignMiddleware': 470,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,

            # invoked in descending order
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.one_688_get_address.ContactInfo1688SpiderPipeline': 230,
            # 'amazon_spider.pipeline.detail_1688_pipe.Search1688SpiderPipeline': 230,
        },
        # Both schemes are served by the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the per-run crawl state.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``.
        ``from_crawler`` passes them through, so without this a spider
        argument such as ``scrapy crawl ... -a key=value`` would fail with
        ``TypeError``.
        """
        super().__init__(*args, **kwargs)
        self.site = "us"
        # NOTE(review): 'memberId' appears twice; kept as-is because nothing
        # in this file reads self.col -- confirm against the pipeline.
        self.col = ['company_name', 'memberId', 'memberId', 'page', 'search_term']
        # "GET" -> HTML search page; anything else -> JSON search service
        # (see spider_idle). parse_json re-rolls this after a captcha.
        self.method = "GET"
        # Captcha pages seen since the last long pause in spider_idle.
        self.reCaptcha_count = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider through Scrapy and subscribe it to crawler signals.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        signal of the same name.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        subscriptions = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in subscriptions:
            crawler.signals.connect(handler, signal)
        return instance

    def get_seeds(self):
        """Read the pending seed rows from the database.

        Returns:
            The result of ``ReadDb(self.site).read_db_1688_address()``.

        Raises:
            DontCloseSpider: After a 30-second pause, when the query raises
                ``OperationalError`` or ``FunctionTimedOut``.
        """
        try:
            seeds = ReadDb(self.site).read_db_1688_address()
        except (OperationalError, FunctionTimedOut) as exc:
            if isinstance(exc, OperationalError):
                logging.info(f'get seeds failure in link sleep 30s{exc}')
            else:
                logging.info(f'get seeds time out sleep 30s{exc}')
            # Back off before the next idle signal retries the query.
            time.sleep(30)
            raise DontCloseSpider
        else:
            return seeds

    def key_word(self, i):
        s_ = "<append>" if "<append>" in i[1] else "/"
        keyword = [k.strip() for k in i[1].split(s_)]
        k_list = []
        for d in keyword:
            if d:
                group = re.findall(
                    r".*?(?:市|上海|县|区|乡|镇|省)?(.*)(?:商务商行|用品厂|有限公司|工厂|实力供应商|经营部|商行|销售部|电子商务)+",
                    d)
                group = group if group else [d]
            else:
                group = None
            while group:
                pattern_site = re.compile('市|上海|县|区|乡|镇')
                if pattern_site.search(group[0]):
                    group = re.findall(r".*?(?:市|上海|县|区|乡|镇)+(.*)",
                                       group[0])
                else:
                    break
            k = group or [d] or [i[1]]
            pattern_2 = re.compile(
                '株洲|漳州|扬州|滨州|常州|东莞|宁波|宿迁|温州|台州|汕头|广州|南通|龙港|惠州|杭州|南京|东阳|浙江|苍南|随州|坂田|青岛|石家庄|厂家|贸易|供应链|配件厂|源头|实力供应商|旗舰店|品牌旗舰店|官方旗舰店|店|实力|实力旗舰店|五金制品|进出口|义乌|深圳|徐州|玻璃制品|电子商务|电子')
            while pattern_2.search(k[0]):
                k[0] = re.sub(pattern_2, "", k[0])
            k_list.append(k[0])
        return set(k_list)

    def spider_idle(self, spider):
        """Schedule a new batch of search requests when the engine is idle.

        Handler for Scrapy's ``spider_idle`` signal. It loads the seed rows
        through ``get_seeds``, turns every non-blank company-name segment into
        a search keyword and feeds the resulting requests straight into the
        engine. With no seeds it schedules a placeholder request, waits 30
        seconds and keeps the spider open.

        Args:
            spider: The idle spider passed by the signal (unused; ``self`` is
                used instead).

        Raises:
            DontCloseSpider: When there are no seeds, or (from ``get_seeds``)
                when reading the seeds fails.
        """
        logging.debug('IDLE------------------')
        # get_seeds sleeps 30 s and raises DontCloseSpider on database errors.
        df = self.get_seeds()
        # NOTE: time.sleep blocks the whole process; it is only used here
        # because the handler runs while the engine has nothing else to do.
        time.sleep(10)
        if self.reCaptcha_count >= 50:
            self.reCaptcha_count = 0
            logging.info("出现验证码次数达到50次以上 等待30分钟 开始等待.......")
            time.sleep(1800)

        if df.shape[0] == 0:
            # Nothing to do: schedule a placeholder request whose callbacks
            # emit the "finish" marker item, then keep the spider alive.
            placeholder = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_new_page,
                                         errback=self.err_, dont_filter=True,
                                         meta={'handle_httpstatus_all': True})
            self.crawler.engine.crawl(placeholder, spider=self)
            logging.info('no task sleep 30s')
            time.sleep(30)
            raise DontCloseSpider()

        for row in df.values:
            company_name = row[1]
            separator = "<append>" if "<append>" in company_name else "/"
            for segment in (part.strip() for part in company_name.split(separator)):
                if not segment:
                    # Blank segments would produce an empty search keyword,
                    # which parse_json would match against every company.
                    continue
                keyword = self._search_keyword(segment)
                try:
                    encoded = quote(keyword, encoding='gbk').replace("/", "%5C%2F")
                except UnicodeEncodeError:
                    # One name that GBK cannot represent must not abort the batch.
                    logging.warning("skip keyword %r of %r: not encodable as GBK",
                                    keyword, company_name)
                    continue
                if self.method == "GET":
                    batch = [self._page_search_request(company_name, segment, keyword, encoded)]
                else:
                    batch = self._api_search_requests(company_name, segment, keyword, encoded)
                for request in batch:
                    self.crawler.engine.crawl(request, self)

    @staticmethod
    def _search_keyword(segment):
        """Reduce one non-blank company-name segment to its search keyword."""
        found = re.findall(
            r".*?(?:市|上海|县|区|乡|镇|省)?(.*)(?:商务商行|用品厂|有限公司|工厂|实力供应商|经营部|商行|销售部|电子商务)+",
            segment)
        core = found[0] if found else segment
        # Repeatedly cut everything up to and including a location marker.
        site_re = re.compile('市|上海|县|区|乡|镇')
        while site_re.search(core):
            remainder = re.findall(r".*?(?:市|上海|县|区|乡|镇)+(.*)", core)
            core = remainder[0] if remainder else ""
        filler_re = re.compile(
            '株洲|漳州|扬州|滨州|常州|东莞|宁波|宿迁|温州|台州|汕头|广州|南通|龙港|惠州|杭州|南京|东阳|浙江|苍南|随州|坂田|青岛|石家庄|厂家|贸易|供应链|配件厂|源头|实力供应商|旗舰店|品牌旗舰店|官方旗舰店|店|实力|实力旗舰店|五金制品|文具|进出口|义乌|深圳|徐州|玻璃制品|电子商务|电子')
        # Removing one filler word can expose another, hence the loop.
        while filler_re.search(core):
            core = filler_re.sub("", core)
        # Fall back to the raw segment when cleaning stripped everything.
        return core or segment

    def _page_search_request(self, company_name, segment, keyword, encoded):
        """Build the request for the HTML factory-search page."""
        url = f"https://search.1688.com/company/pc/factory_search.htm?keywords={encoded}&hideMainTab=1&spm=a26352.13672862.searchbox.input"
        headers = {
            "authority": "search.1688.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "referer": url,
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        meta = {
            "name": company_name,
            "cookiejar": int(time.time()),
            "curlcffi": True,
            "search_keyword": keyword,
            "split_search": segment,
        }
        # No cookies are sent (the recorded browser session that used to sit
        # here as commented-out code was removed).
        return scrapy.Request(url=url, headers=headers, cookies={},
                              callback=self.parse_json, errback=self.err_parse,
                              dont_filter=True, meta=meta)

    def _api_search_requests(self, company_name, segment, keyword, encoded):
        """Build the two JSON search-service requests for the first result page."""
        shared_queue = queue.Queue()  # one queue shared by this keyword's requests
        headers = {
            'authority': 'search.1688.com',
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'referer': 'https://search.1688.com/company/pc/factory_search.html?spm=a260k.19776607.kyttf5s0.3.1c5c4d84sYaCQA&charset=utf8&hideMainTab=1&keywords=&pagesource=sem_a0c5fccd75056341b8789b8cc8dcf77ea43f&beginPage=2',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        }
        requests = []
        for n in range(1, 2):
            for asyncCount in [6, 14]:
                startIndex = 0 if asyncCount == 6 else 6
                url = f'https://search.1688.com/service/companyInfoSearchDataService?keywords={encoded}&async=true&asyncCount={asyncCount}&beginPage={n}&pageSize=20&startIndex={startIndex}&pageName=findPCFactory'
                meta = {
                    "name": company_name,
                    "cookiejar": int(time.time()),
                    "curlcffi": True,
                    "search_keyword": keyword,
                    "page": n,
                    "asyncCount": asyncCount,
                    "dq": shared_queue,
                    "impersonate": 'chrome101',
                    "split_search": segment,
                }
                requests.append(scrapy.Request(url=url, headers=dict(headers),
                                               callback=self.parse_json, errback=self.err_parse,
                                               dont_filter=True, meta=meta))
        return requests

    def parse_new_page(self, response, **kwargs):
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        print("sleep=========error", response.getErrorMessage(), response.request.meta.get("asin"))
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    # def parse(self, response, **kwargs):
    #     stats = self.crawler.stats
    #     data_list = re.findall(r"window.data.offerresultData = successDataCheck\((.*?)\);", response.text)
    #     item = []
    #     if data_list:
    #         datas = json.loads(data_list[0])
    #         for i in datas["data"]["companyWithOfferLists"]:
    #             memberId = i["factoryInfo"]["factoryDetailUrl"].split("memberId=")[-1].split("&")[0]
    #             item.append({
    #                 "company_name": i["factoryInfo"]["company"],
    #                 "memberId": memberId,
    #                 "page": response.meta.get("page"),
    #                 "search_term": response.meta.get("search_term")
    #             })
    #             print(i["factoryInfo"]["company"])
    #             print(memberId)
    #             print("页数", response.meta.get("page"))
    #             print("搜索词", response.meta.get("search_term"))
    #         stats.inc_value(response.meta.get("search_term"))
    #
    #     response.meta["items"] = item

    def text_contrast(self, text1, text2):
        distance = Levenshtein.distance(text1, text2)
        similarity = 1 - distance / max(len(text1), len(text2))
        print(f"Similarity: {similarity}")
        return similarity

    def parse_json(self, response, **kwargs):
        if "window._config_ =" in response.text:
            logging.info("验证码")
            self.reCaptcha_count += 1
            self.method = random.choice(["GET", "POST"])
            item = {
                "error_asin": True,
                "search_term": {"state": 1, "search_term": response.request.meta.get("name")},
            }
            logging.info(f"爬取失败关键词:{item.get('search_term')}")
            yield item
        else:
            if "search.1688.com/service/companyInfoSearchDataService" in response.url:
                datas = response.json()["data"]
            else:
                str_json = \
                response.xpath(".//script[contains(text(), 'window.data.offerresultData = successDataCheck')]").get().split(
                    "window.data.offerresultData = successDataCheck(")[-1].split(");")[0].strip()
                datas = json.loads(str_json)
            info = []
            memberid_list = []
            factory_name_list = []
            for i in datas['data']['companyWithOfferLists']:
                if (response.meta.get('name') in i["factoryInfo"]["company"]) or (
                        response.meta.get('search_keyword') in i["factoryInfo"]["company"]):
                    memberId = i["factoryInfo"]["factoryDetailUrl"].split("memberId=")[-1].split("&")[0]
                    if i["factoryInfo"]["company"] in factory_name_list:
                        # 过滤重复数据
                        continue
                    info.append({
                        "name": i["factoryInfo"]["company"],
                        "source_name": response.meta.get("name"),
                        "search_keyword": response.meta.get('search_keyword'),
                        "member_id": memberId,
                        "text_contrast": self.text_contrast(response.meta.get('split_search'), i["factoryInfo"]["company"])
                    })
                    memberid_list.append(memberId)
                    factory_name_list.append(i["factoryInfo"]["company"])
            if info:
                # 只匹配匹配率最高的一个
                sorted_list = sorted(info, key=lambda x: x['text_contrast'])
                # i = sorted_list[-1]
                logging.info(
                    f"通过搜索词 {response.meta.get('search_keyword')} 搜索公司名称 {response.meta.get('name')} 匹配符合的为：{sorted_list}")
                for i in info:
                    url = f'https://h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.pc.moduleasyncservice/1.0/'
                    meta = {
                        "sign_1688": True,
                        "memberId": i.get("member_id"),
                        "curlcffi": True,
                        "cookiejar": int(time.time()),
                        "company_name": i.get("name"),
                        'search_term': response.meta.get('name'),
                        "text_contrast": i.get('text_contrast')
                    }

                    headers = {
                        'Host': 'h5api.m.1688.com',
                        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                        'referer': 'https://kszwyr.1688.com/',
                        'accept-language': 'zh-CN,zh;q=0.9',
                    }
                    yield scrapy.Request(url=url, headers=headers,
                                         callback=self.parse_info, errback=self.err_parse,
                                         dont_filter=True, meta=meta)
            else:
                logging.info(
                    f"通过搜索词 {response.meta.get('search_keyword')} 搜索公司名称 {response.meta.get('name')} 匹配不符合")

    def parse_info(self, response, **kwargs):
        if "令牌为空" in response.text:
            logging.info("页面爬取失败")
        else:
            data_dict = response.json()
            company_name = data_dict["data"].get("companyName")
            mobileNo = data_dict.get("data").get("mobileNo")
            phoneNum = data_dict.get("data").get("phoneNum")
            fax = data_dict.get("data").get("faxNum")
            address = data_dict.get("data").get("address")
            print("传真", fax)
            contact_name = data_dict.get("data").get("name")
            position = data_dict.get("data").get("jobTitle")
            print(response.json())
            items = {
                "company_name": company_name,
                "mobileNo": mobileNo,
                "phoneNum": phoneNum,
                "fax": fax or "",
                "contact_name": contact_name,
                "position": position,
                "memberId": response.meta.get("memberId"),
                "search_term": response.meta.get("search_term"),
                "address": address,
                "text_contrast": response.meta.get("text_contrast")
            }
            yield {"inner_item": items}

    def err_parse(self, response, **kwargs):
        stats = self.crawler.stats
        stats.set_value(response.request.meta.get("search_term"), 0)
        item = {
            "error_asin": True,
            "search_term": {"state": 1, "search_term": response.request.meta.get("name")},
            # "search_term": (1, response.request.meta.get("search_term")),
        }
        logging.info(f"爬取失败关键词:{item.get('search_term')}")
        yield item


if __name__ == '__main__':
    # Same as running `scrapy crawl get_1688_search_company` from the shell.
    cmdline.execute(['scrapy', 'crawl', 'get_1688_search_company'])

# us, uk, fr, de, es, it, mx
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl get_1688_search > 1688_search1.log 2>&1 &
# nohup scrapy crawl get_1688_search -a page_spider=5 > 1688_search1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl get_1688_search" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl get_1688_search
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl get_1688_search > 1688_search1.log 2>&1 &
