# coding:utf-8
import os
import re
import sys
import json
import time
import scrapy
import logging
import platform
from scrapy import cmdline, signals
from sqlalchemy.exc import OperationalError
from scrapy.exceptions import DontCloseSpider
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
# 亚马逊详情页数据获取
from amazon_spider.utils.read_db_data import ReadDb

# time.tzset() is only available on Unix, so it is skipped on Windows,
# where a marker line is printed instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class Get1688ContactInfoSpider(scrapy.Spider):
    """Collect contact details of 1688.com suppliers.

    Work is pulled on demand: every time the engine goes idle,
    ``spider_idle`` reads a batch of member ids through ``ReadDb`` and
    schedules one contact-API request per id.  ``parse`` turns the API
    answer into an item and requests the supplier's factory card page;
    ``parse_index`` adds the address and shop URL found there and emits the
    finished item wrapped as ``{'inner_item': ...}``.

    Failures are reported as ``{"error_asin": True, "memberId": (code, id)}``
    items.  The meaning of the numeric codes (1, 3, 4) is defined by the
    consumer of these items and is not visible in this file.
    """

    name = 'get_1688_contact_info'
    custom_settings = {
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): Scrapy reads ``allowed_domains`` from a spider
        # attribute; as a settings key it is presumably ignored - confirm.
        'allowed_domains': ['1688.com'],
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        # # Whether to flush the crawl queue when the spider restarts
        # 'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis scheduler to store the request queue
        # 'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make every spider de-duplicate through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # Seed queue connection info
        # 'REDIS_URL': None,
        # 'REDIS_HOST': '192.168.10.224',
        # 'REDIS_PORT': 6379,
        # 'REDIS_PARAMS': {
        #     'password': '<redacted>',  # credentials must not be committed; load from env/config
        #     'db': 0
        # },
        # # # 6379
        # # 'FILTER_URL': None,
        # # 'FILTER_HOST': '127.0.0.1',
        # # 'FILTER_PORT': 6379,
        # # # 6379
        # # 'FILTER_DB': 0,
        # 'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries per request
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in ascending order
            # 'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middleware.contact_info_1688.AddSignMiddleware': 470,
            # 'amazon_spider.middlewares.Hadoop10ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.SearchCookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # called in descending order
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipeline.contact_info_pipe.ContactInfoPipeline': 230,
        },
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    # Contact-card API endpoint requested once per member id.
    _CONTACT_API_URL = 'https://h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.pc.moduleasyncservice/1.0/'
    _CONTACT_API_HEADERS = {
        'Host': 'h5api.m.1688.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'referer': 'https://kszwyr.1688.com/',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    # Factory card page that carries the address and shop URL.
    _CARD_URL_TEMPLATE = "https://sale.1688.com/factory/card.html?memberId={}"
    _CARD_HEADERS = {
        'authority': 'sale.1688.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # ``ret`` entry with which the API asks the client to retry later.
    _BUSY_RET = "RGV587_ERROR::SM::哎哟喂,被挤爆啦,请稍后重试!"
    # Captures the JSON object assigned to window.$$pageData in the card page.
    _PAGE_DATA_RE = re.compile(r"window\.\$\$pageData=(.*?});.*?window\.\$\$pageSeed", re.S)

    def __init__(self, site='us', *args, **kwargs):
        """Create the spider.

        :param site: site code handed to ``ReadDb`` when seeds are read.
        :param args: forwarded to ``scrapy.Spider.__init__``.
        :param kwargs: forwarded to ``scrapy.Spider.__init__`` so that
            additional ``-a key=value`` options no longer raise ``TypeError``.
        """
        super(Get1688ContactInfoSpider, self).__init__(*args, **kwargs)
        self.site = site
        # Not read inside this class; presumably the column order expected by
        # the item pipeline - verify against the pipeline.
        self.col = ['company_name', 'mobileNo', 'phoneNum', 'fax', 'contact_name', 'position', 'memberId', 'search_term']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and subscribe it to the closed and idle signals."""
        spider = super(Get1688ContactInfoSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.close, signals.spider_closed)
        crawler.signals.connect(spider.spider_idle, signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        """Refill the request queue whenever the engine runs out of work.

        :param spider: spider instance delivered with the signal (unused).
        :raises DontCloseSpider: when reading seeds fails or returns no rows,
            so the spider stays alive and the handler is called again.

        NOTE(review): ``time.sleep`` runs inside the signal handler and
        therefore blocks the Twisted reactor for 30 seconds; consider a
        non-blocking delay.
        """
        logging.debug(f'IDLE------------------{self.site}')
        try:
            df = ReadDb(self.site).read_db_1688_memberId()
        except OperationalError as e:
            logging.info(f'get seeds failure in link sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider
        except FunctionTimedOut as e:
            logging.info(f'get seeds time out sleep 30s{e}')
            time.sleep(30)
            raise DontCloseSpider

        if df.shape[0] == 0:
            # No seeds: send a placeholder request whose callbacks emit a
            # "finish_spider" item, then wait before the next poll.
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_new_page,
                                     errback=self.err_, dont_filter=True,
                                     meta={'handle_httpstatus_all': True})
            self.crawler.engine.crawl(request, spider=self)
            logging.info('no task sleep 30s')
            time.sleep(30)
            raise DontCloseSpider()

        for row in df.values:
            meta = {
                # Column 1 of every seed row is used as the member id.
                "memberId": row[1],
                # "amazon_proxy": True,
                "cookiejar": int(time.time()),
                "sign_1688": True,
            }
            self.crawler.engine.crawl(
                scrapy.Request(url=self._CONTACT_API_URL, headers=self._CONTACT_API_HEADERS,
                               callback=self.parse, errback=self.err_parse,
                               dont_filter=True, meta=meta), self)

    def parse_new_page(self, response, **kwargs):
        """Return the "finish" marker item for the placeholder request."""
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    def err_(self, response, **kwargs):
        """Errback of the placeholder request; also returns the "finish" marker.

        :param response: despite its name this is what Scrapy hands to an
            errback; the code calls ``getErrorMessage()`` and ``.request`` on it.
        """
        logging.info("sleep=========error %s %s",
                     response.getErrorMessage(), response.request.meta.get("asin"))
        return {
            "finish_spider": True,
            "asin": "finish",
        }

    @staticmethod
    def _failed_item(code, member_id):
        """Build the item that reports a failed member id under ``code``."""
        return {
            "error_asin": True,
            "memberId": (code, member_id),
        }

    def parse(self, response, **kwargs):
        """Handle the contact-API answer for one member id.

        Yields a failure item (code 4) when the API asks to retry later or
        when the payload cannot be used; otherwise yields a request for the
        factory card page carrying the collected fields in ``meta['items']``.
        """
        member_id = response.meta.get("memberId")
        try:
            data_dict = response.json()
        except (ValueError, AttributeError):
            # ValueError: body is not valid JSON; AttributeError: presumably
            # a non-text response.  Either way there is nothing to parse.
            data_dict = None

        payload = data_dict if isinstance(data_dict, dict) else {}
        # ``or ()`` keeps the membership test from failing when ``ret`` is absent.
        if self._BUSY_RET in (payload.get('ret') or ()):
            logging.info(f"异常返回 {response.text}")
            item = self._failed_item(4, member_id)
            logging.info(f"哎哟喂,被挤爆啦,请稍后重试! 爬取失败工厂id:{item.get('memberId')}")
            yield item
            return

        data = payload.get("data")
        if not isinstance(data, dict):
            # Previously this raised and the member id was silently dropped.
            # NOTE(review): code 4 is reused from the branch above - confirm
            # this is the right code for an unusable payload.
            logging.warning("unusable contact payload for memberId %s (HTTP %s)",
                            member_id, response.status)
            yield self._failed_item(4, member_id)
            return

        card_url = self._CARD_URL_TEMPLATE.format(member_id)
        items = {
            "company_name": data.get("companyName"),  # company name
            "mobileNo": data.get("mobileNo"),  # mobile phone
            "phoneNum": data.get("phoneNum"),  # landline
            "fax": data.get("faxNum") or "",  # fax
            "contact_name": data.get("name"),  # contact person
            "position": data.get("jobTitle"),  # job title
            "memberId": member_id,  # factory / member id
            "card_url": card_url,
        }

        # Work on a copy so the finished request's meta is left untouched, and
        # drop the retry counter so the card request gets its own retry budget.
        next_meta = dict(response.meta)
        next_meta.pop('retry_times', None)
        next_meta['items'] = items
        next_meta['sign_1688'] = False
        yield scrapy.Request(url=card_url, headers=self._CARD_HEADERS,
                             callback=self.parse_index, errback=self.err_parse,
                             dont_filter=True, meta=next_meta)

    def _extract_page_data(self, html):
        """Return the ``window.$$pageData`` object of ``html`` as a dict.

        An empty dict is returned when the assignment is missing, is not
        valid JSON, or does not decode to an object.
        """
        match = self._PAGE_DATA_RE.search(html)
        if match is None:
            return {}
        try:
            page_data = json.loads(match.group(1))
        except ValueError:
            logging.warning("window.$$pageData is not valid JSON")
            return {}
        return page_data if isinstance(page_data, dict) else {}

    def parse_index(self, response, **kwargs):
        """Add address and shop URL from the card page and emit the item.

        Missing or malformed page data no longer raises: the contact fields
        gathered earlier are still emitted, with ``home_url`` and ``address``
        falling back to ``''``.
        """
        items = response.meta['items']
        for section in self._extract_page_data(response.text).values():
            shop_info = section.get("initShopInfo") if isinstance(section, dict) else None
            if not isinstance(shop_info, dict):
                continue
            shop_url = shop_info.get('shopPcWpIndexUrl')
            address = shop_info.get("factoryDetailedAddress", '')
            home_url = shop_url + '/' if shop_url else None
            logging.debug("地址 %s", address)
            logging.debug("url %s", home_url)
            items['home_url'] = home_url
            items['address'] = address
        items['home_url'] = items.get('home_url', '')
        items['address'] = items.get('address', '')
        items['state'] = 3
        items['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        logging.debug("%s", items)
        yield {'inner_item': items}

    def err_parse(self, response, **kwargs):
        """Errback of the API and card requests: report the member id with code 1.

        :param response: the object Scrapy passes to errbacks; only its
            ``request.meta`` is read here.
        """
        item = self._failed_item(1, response.request.meta.get("memberId"))
        logging.info(f"爬取失败工厂id:{item.get('memberId')}")
        yield item


if __name__ == '__main__':
    # Launch this spider through the scrapy CLI with the US site selected.
    cmdline.execute(['scrapy', 'crawl', 'get_1688_contact_info', '-a', 'site=us'])

# Site codes (presumably the accepted values for `-a site=...`): us, uk, fr, de, es, it, mx
# Run in the background from the project directory:
#   cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl get_1688_contact_info > 1688_contact_info1.log 2>&1 &
#   nohup scrapy crawl get_1688_contact_info > 1688_contact_info1.log 2>&1 &
# Activate the pyspark environment first:
#   source activate pyspark
# Kill every running instance of this spider:
#   for i in `ps -ef|grep "scrapy crawl get_1688_contact_info" |awk '{print $2}' `; do kill -9 $i ; done;
# Run on Windows:
#   C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl get_1688_contact_info
# Crontab entry (every day at 21:00):
#   0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl get_1688_contact_info > 1688_contact_info1.log 2>&1 &
