# coding:utf-8
import os
import re
import sys
import json
import time
import queue
import random
import scrapy
import logging
import platform
from pprint import pprint
from urllib.parse import urlparse
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.scrapy_redis.spiders import RedisSpider
from amazon_spider.utils.common import field_length_dispose
from amazon_spider.extractor.amazon_detail_extractor import AmazonDetailExtractor
from amazon_spider.items import detail_inner_item, self_asin_seller_item, variat_item

# time.tzset() is only called off Windows; on Windows just print a marker.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class TextRedisDetail(RedisSpider):
    """Test spider (``redis_detail_text``) for Amazon product detail pages.

    The class-level ``custom_settings`` below configure concurrency, the
    project's Redis scheduler, retry-related settings, downloader
    middlewares and a custom download handler for this spider only.
    """
    name = 'redis_detail_text'
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'
    custom_settings = {
        'CONCURRENT_REQUESTS': 25,
        'DOWNLOAD_TIMEOUT': 20,
        # NOTE(review): `allowed_domains` is normally a spider attribute, not a
        # setting -- confirm this entry has any effect here.
        'allowed_domains': ['amazon.com'],
        # # Whether to flush the crawl queue when the spider restarts
        'SCHEDULER_FLUSH_ON_START': False,
        # # Use the Redis-backed scheduler to store the request queue
        'SCHEDULER': "amazon_spider.scrapy_redis.scheduler.Scheduler",
        # # Make sure every spider de-duplicates through Redis
        # 'DUPEFILTER_CLASS': "amazon_spider.scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER_QUEUE_CLASS': 'amazon_spider.scrapy_redis.queue.SpiderPriorityQueue',
        # # # 6379
        # # 'FILTER_DB': 0,
        'SCHEDULER_QUEUE_KEY': "detail_seed",
        # Retry settings
        # NOTE(review): the stock RetryMiddleware is disabled in
        # DOWNLOADER_MIDDLEWARES below, so these RETRY_* values presumably only
        # matter if another middleware reads them -- confirm.
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 2,  # number of retries to attempt
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # called in increasing order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # 'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            # 'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            # 'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # called in decreasing order
            # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is enabled for this test spider.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.real_keepa_pipe.AmazonRealKeepaSpiderPipeline': 230,
            # 'amazon_spider.pipeline.cs_count.FidleMonitorPipeline': 200,
            # 'amazon_spider.pipeline.news_real_pipe.AmazonRealKeepaSpiderPipeline': 200,
        },
        # Both schemes go through the project's custom download handler.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        }
    }

    def __init__(self, site='us'):
        """Initialise lookup tables, logging and the hard-coded test seeds.

        :param site: marketplace code stored on the spider as ``self.site``.
        """
        super().__init__()
        self.site = site
        # Called right after ``self.site`` is set; defined outside this class.
        self.update_cookies()
        self.sleep_count = 0

        # Per-site label that precedes the seller name on the offer listing.
        self.seller_work = dict(
            us="Sold by",
            uk="Sold by",
            fr="Vendu par",
            de="Verkäufer",
            es="Vendedor",
            it="Venditore",
            mx="Vendedor",
            ca="Sold by",
        )
        # Per-site label that precedes the shipper on the offer listing.
        self.seller_type = dict(
            us="Ships from",
            uk="Dispatches from",
            fr="Expéditeur",
            de="Versand",
            es="Envía por",
            it="Spedito da",
            mx="Envío por",
            ca="Ships from",
        )

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s', level=logging.INFO)

        # ASINs crawled by this test spider.
        self.asins = ['B0C68HFRCJ', 'B0993KNJ8Q', 'B0D73QPH2L', 'B09SPSVQG8']
        # Debug records collected by ``parse``.
        self.datas = []

        # Every seed shares these fields; only the ASIN differs.
        seed_template = {
            "site": "us",
            "asin_type": "3,7",
            "is_variation": 1,
            "date_info": "",
            "account_id": None,
            "priority": "1",
            "updated_at": "2024-05-21 08:52:44",
            "other_sellers_id": "",
            "other_seller_name": "",
            "other_seller_buy_boy_type": "",
        }
        self.seeds = [{"asin": f"{asin}", **seed_template} for asin in self.asins]
        # ASINs whose page showed the "Item Package" marker (filled by ``parse``).
        self.variat_list = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook its ``close`` / ``spider_idle`` handlers.

        :param crawler: crawler the spider is being created for.
        :return: the spider instance produced by the parent ``from_crawler``.
        """
        spider = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (spider.close, signals.spider_closed),
            (spider.spider_idle, signals.spider_idle),
        )
        for receiver, signal in handlers:
            crawler.signals.connect(receiver, signal)
        return spider

    def get_seeds(self):
        return [self.seeds.pop()]

    def spider_idle(self, spider):
        """Idle-signal handler: schedule detail-page requests for the next seeds.

        Clears the crawler stats, fetches the next batch from ``get_seeds()``
        and hands one ``scrapy.Request`` per seed to the engine. When no seed
        is left the handler simply returns, which lets Scrapy close the spider
        (raise ``scrapy.exceptions.DontCloseSpider`` here instead if the
        spider should stay alive waiting for more work).

        :param spider: spider passed by the signal; unused, ``self`` is used.
        """
        logging.debug(f'IDLE------------------{self.site} {time.time()} {len(self.seeds)}')
        stats = self.crawler.stats
        stats.clear_stats()
        try:
            seeds = self.get_seeds()
        except IndexError:
            # get_seeds() pops from self.seeds, which raises once it is empty.
            seeds = []
        if not seeds:
            # Previously a bare `raise` with no active exception (RuntimeError).
            logging.info("no seeds left, spider will be closed")
            return

        for job in seeds:
            base_url = self.site_url.get(job['site'])
            if not base_url:
                logging.info(f"过滤 未知站点 {job['site']}")
                continue
            url = f"{base_url}/dp/{job['asin']}"
            if job["is_variation"] != 2:
                url += "?th=1&psc=1"
            headers = {
                'Connection': 'close',
                'authority': urlparse(url).hostname,
                'accept': 'text/html,*/*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'no-cache',
                'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                'origin': url,
                'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
            }
            try:
                raw_account_id = job["account_id"]
            except KeyError:
                account_id = ""
            else:
                # Compare after str() so a float NaN is filtered like the
                # literal string "nan" instead of ending up in the URL.
                account_text = str(raw_account_id)
                account_id = account_text if raw_account_id and account_text != "nan" else None
            if account_id:
                # Seller-specific listing replaces the plain detail URL.
                url = f"{base_url}/dp/{job['asin']}?m={account_id}"
            meta = {
                "asin": job["asin"],
                "choice_header": True,
                "cookiejar": int(time.time()),
                "asin_type": job["asin_type"],
                "date_info": job["date_info"],
                "site": job["site"] or self.site,
                "account_id": account_id,
                "is_variation": job["is_variation"],
                "priority": job["priority"],
                "updated_at": job.get("updated_at", ""),
                "other_sellers_id": job.get("other_sellers_id", ""),
                "other_seller_name": job.get("other_seller_name", ""),
                "other_seller_buy_boy_type": job.get("other_seller_buy_boy_type", ""),
            }
            meta = self.random_r(meta)
            # Add one randomly chosen extra header from self.h_dict.
            h_key = random.choice(list(self.h_dict))
            headers[h_key] = self.h_dict.get(h_key)
            logging.info(f'随机添加headers： {meta["site"]} {h_key}')
            cookies = self.json_cookies(meta["site"])
            if meta['site'] in ['au', 'tr', 'nl']:
                del headers['referer']
            request = scrapy.Request(url=url, cookies=cookies, headers=headers,
                                     callback=self.parse, errback=self.err_parse,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, self)


    def seller_history_msg_add(self, response, self_asin_seller):
        # 判断更新时间是否在  24小时内  在24小时内则  将历史 更买和最新更买  去重合并
        if response.meta["other_seller_name"]:
            history_other_sellers_id = response.meta["other_sellers_id"].split("|-|")
            history_other_sellers_name = response.meta["other_seller_name"].split("|-|")
            history_other_seller_buy_boy_type = response.meta["other_seller_buy_boy_type"].split("|-|")
            name = self_asin_seller["other_seller_name"].split("|-|") if self_asin_seller["other_seller_name"] else ""
            id = self_asin_seller["other_sellers_id"].split("|-|") if self_asin_seller["other_sellers_id"] else ""
            seller_buy_boy_type = self_asin_seller["other_seller_buy_boy_type"].split("|-|") if self_asin_seller[
                "other_seller_buy_boy_type"] else ""

            for i in range(0, len(name)):
                if name[i] in history_other_sellers_name:
                    continue
                else:
                    history_other_sellers_name.append(name[i].strip())
                    history_other_sellers_id.append(id[i].strip())
                    history_other_seller_buy_boy_type.append(seller_buy_boy_type[i].strip())
            if len(history_other_sellers_id) == len(history_other_sellers_name) == len(
                    history_other_seller_buy_boy_type):
                self_asin_seller["other_sellers_id"] = "|-|".join(history_other_sellers_id)
                self_asin_seller["other_seller_name"] = "|-|".join(history_other_sellers_name)
                self_asin_seller["other_seller_buy_boy_type"] = "|-|".join(history_other_seller_buy_boy_type)
                return self_asin_seller
            else:
                logging.info(
                    f'history_other_sellers {response.meta["other_sellers_id"]} {response.meta["other_seller_name"]} {response.meta["other_seller_buy_boy_type"]}')
                logging.info(f"self_asin_seller: {self_asin_seller}")
                self_asin_seller["other_sellers_id"] = ""
                self_asin_seller["other_seller_name"] = ""
                self_asin_seller["other_seller_buy_boy_type"] = ""
                return self_asin_seller
        else:
            return self_asin_seller

    def seller_old(self, self_asin_seller):
        name = self_asin_seller.get("other_seller_name").split("|-|") if self_asin_seller.get(
            "other_seller_name") else []
        s_id = self_asin_seller.get("other_sellers_id").split("|-|") if self_asin_seller.get("other_sellers_id") else []
        s_type = self_asin_seller.get("other_seller_buy_boy_type").split("|-|") if self_asin_seller.get(
            "other_seller_buy_boy_type") else []
        seller_old = [
            {"asin": self_asin_seller.get("asin"), "seller_id": s_id[i].strip(), "seller_name": name[i].strip(),
             "buy_boy_type": s_type[i].strip(), "site": self_asin_seller.get("site")} for i in range(len(name))]
        logging.info(f"seller old --> {seller_old}")
        return seller_old

    def parse(self, response, **kwargs):
        """Parse a product detail page and yield items / follow-up requests.

        Flow, driven by the ``asin_type`` carried in the extracted item:

        * if ``page_state`` returns an item, only that item is yielded;
        * ``"7"``: seller data. When the page advertises two or more offers,
          one offer-listing request per page is yielded (handled by
          ``parse_buy``); otherwise the detail item, the seller item and the
          expanded seller list are yielded directly;
        * ``"8"`` / ``"9"``: the detail item enriched with best-seller-rank
          data is yielded;
        * for the six listed sites the variation items are yielded as well.

        :param response: detail-page response; ``response.meta`` carries the
            seed fields set when the request was created.
        """
        meta = response.meta
        print("-" * 20)
        if item := self.page_state(response):
            yield item
            return

        amazon_detail_extractor = AmazonDetailExtractor(meta.get("site"))
        items = amazon_detail_extractor.run(response)
        inner_item = detail_inner_item(items)
        # Normalise every falsy value to None.
        inner_item = {k: v or None for k, v in inner_item.items()}
        inner_item["is_coupon"] = inner_item["is_coupon"] or "0"
        pprint(inner_item)

        # Debug record kept on the spider.
        self.datas.append({
            'asin': response.meta.get('asin'),
            '品牌': inner_item['brand'],
            '材质': inner_item['material'],
            'product_json': inner_item['product_json'],
            'productdetail_json': inner_item['productdetail_json']
        })
        # Apply the field-length limits.
        inner_item = field_length_dispose(inner_item)
        self_asin_seller = self_asin_seller_item(items)

        # Variation data.
        variat_list = []
        for variation in amazon_detail_extractor.variat_msgs(response):
            logging.info(f'变体数据为：{variation}')
            variat_list.append(variat_item(variation))
        parent_asin = variat_list[0].get("parent_asin") if variat_list else None
        print(variat_list)

        # The value may be None after the normalisation above, so
        # `.get("asin_type", "")` is not enough for the `in` tests below.
        asin_type = inner_item.get("asin_type") or ""

        if "7" in asin_type:
            # Seller ("other offers") handling.
            self_asin_seller = {k: v or None for k, v in self_asin_seller.items()}
            logging.info("每3小时数据")
            offers_text = response.xpath(
                "//span[@class='a-declarative']/span[@class='a-color-base']/text()").get()
            offer_numbers = re.findall(r"\d+", offers_text) if offers_text else []
            offer_count = int(offer_numbers[0]) if offer_numbers else 0

            if offer_count >= 2:
                page_count = self.get_page_num(offer_count, 10)
                # Shared between all page requests; parse_buy aggregates once
                # every page has reported in.
                # NOTE(review): this puts a live queue.Queue into request meta;
                # confirm it survives the Redis scheduler's serialization.
                buy_q = queue.Queue()
                base_url = self.site_url.get(meta['site']) or self.url_
                for page in range(1, page_count + 1):
                    url = f"{base_url}/gp/product/ajax/ref=dp_aod_NEW_mbc?asin={meta['asin']}&pc=dp&experienceId=aodAjaxMain&isonlyrenderofferlist=true&pageno={page}"
                    headers = {
                        'Connection': 'close',
                        'authority': urlparse(url).hostname,
                        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'accept-language': '*',
                        'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
                        'origin': url,
                        'Cache-Control': 'no-cache',
                        'referer': f'{url}/{meta["asin"]}/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
                    }
                    # Separate name so the response meta is not shadowed
                    # between iterations.
                    page_meta = {
                        "asin": meta["asin"],
                        "cookiejar": int(time.time()),
                        "asin_type": meta["asin_type"],
                        "date_info": meta["date_info"],
                        "site": meta["site"],
                        "account_id": meta["account_id"],
                        "is_variation": meta["is_variation"],
                        "priority": meta["priority"],
                        "updated_at": meta["updated_at"],
                        "other_sellers_id": meta["other_sellers_id"],
                        "other_seller_name": meta["other_seller_name"],
                        "other_seller_buy_boy_type": meta["other_seller_buy_boy_type"],
                        "buy_num": offer_count,
                        "buy_q": buy_q,
                        "buy_page": page_count,
                        "page": page,
                        "sellers_data": {'inner_item': [inner_item],
                                         'self_asin_seller': [self_asin_seller]},
                    }
                    page_meta = self.random_r(page_meta)
                    cookies = self.json_cookies(page_meta['site'])
                    yield scrapy.Request(url=url, cookies=cookies, headers=headers,
                                         callback=self.parse_buy, errback=self.err_parse,
                                         dont_filter=True, meta=page_meta)
            else:
                # No offer count, no digits, or fewer than two offers: emit the
                # seller data straight from the detail page.
                if response.meta.get("other_seller_name"):
                    self_asin_seller = self.seller_history_msg_add(response, self_asin_seller)
                pprint(inner_item)
                pprint(self_asin_seller)
                seller_old_data = self.seller_old(self_asin_seller)
                yield {'inner_item': [inner_item], 'self_asin_seller': [self_asin_seller],
                       "seller_old": seller_old_data}

        if "9" in asin_type or "8" in asin_type:
            logging.info("用户收藏数据")
            inner_item["parent_asin"] = parent_asin
            bsr_datas = amazon_detail_extractor.asin_bs_category_asin_detail(response)
            if bsr_datas:
                # Drop parenthesised parts, then re-join the ranks on "&&&&#".
                best_sellers_rank = re.sub(r"(\(.*?\))", "", bsr_datas.get("best_sellers_rank") or "")
                best_sellers_rank = '&&&&#'.join(best_sellers_rank.split(" #"))
                all_best_sellers_href = "&&&&".join(bsr_datas.get("last_herf") or "")
                inner_item["best_sellers_rank"] = best_sellers_rank or None
                inner_item["all_best_sellers_href"] = all_best_sellers_href or None
            else:
                inner_item["best_sellers_rank"] = None
                inner_item["all_best_sellers_href"] = None
            inner_item["data_type"] = inner_item.get("asin_type", "")
            yield {
                'inner_item': [inner_item],
                "self_asin_seller": None,
                "seller_old": None
            }

        if response.meta.get("site") in ["us", "es", "fr", "it", "uk", "de"]:
            # .get() returns None when the span is absent; guard before `in`.
            secondary_text = response.xpath(
                ".//span[@class='a-size-base a-color-secondary']//text()").get()
            if secondary_text and "Item Package" in secondary_text:
                print(secondary_text)
                self.variat_list.append(response.meta.get("asin"))
            if variat_list:
                yield {"variat_item": variat_list, "site": response.meta["site"]}
        else:
            logging.info("非6大站点变体信息 ")

    def parse_buy(self, response, **kwargs):
        """Parse one offer-listing page and aggregate sellers across pages.

        Each page pushes its sellers (name, id link, ship-from type) onto the
        queue stored in ``meta["buy_q"]``. Once the queue holds
        ``meta["buy_page"]`` entries, all pages are merged, passed through the
        extractor helpers, combined with the historical sellers from the seed
        and yielded as ``meta["sellers_data"]`` with an extra ``seller_old``
        list.

        Offers without a name node are skipped, and a missing ship-from entry
        becomes ``""`` so the three lists stay parallel.

        :param response: offer-listing response created by ``parse``.
        """
        meta = response.meta
        logging.info(f"{meta['asin']} get `New ({meta['buy_num']}) from` > 2 to buy more page {meta['page']}")
        s_work = self.seller_work.get(meta["site"])
        s_ = self.seller_type.get(meta["site"])
        seller = response.xpath(
            f"//span[contains(text(), '{s_work}')]/parent::div/parent::div[@class='a-fixed-left-grid-inner']")
        seller_type = response.xpath(
            f"//span[contains(text(), '{s_}')]/parent::div/parent::div[@class='a-fixed-left-grid-inner']//span[@class='a-size-small a-color-base']//text()").getall()

        # Node holding the seller name: plain text span or a profile link.
        name_node = ".//*[(@class='a-size-small a-color-base') or (@aria-label and @class='a-size-small a-link-normal' and @role)]"
        page_names = []
        page_ids = []
        page_types = []
        for index, offer in enumerate(seller):
            raw_name = offer.xpath(f"{name_node}//text()").get()
            if raw_name is None:
                # No name node in this offer block; nothing to record.
                continue
            s_name = raw_name.strip()
            s_id = offer.xpath(f"{name_node}/@href").get() or None
            if s_name in page_names:
                continue
            # Only keep offers whose link points at a seller profile.
            if "seller" not in (s_id or ""):
                continue
            # The ship-from texts are matched to offers by position.
            s_type = seller_type[index] if index < len(seller_type) else ""
            page_names.append(s_name)
            page_ids.append(s_id.strip())
            page_types.append((s_type or "").strip())

        meta["buy_q"].put({
            'other_seller_name': page_names,
            'other_sellers_id': page_ids,
            'other_seller_buy_boy_type': page_types
        })

        if meta["buy_q"].qsize() == meta["buy_page"]:
            # Every page has reported: drain the queue and merge.
            other_sellers_name = []
            other_sellers_id = []
            other_sellers_type = []
            for _ in range(meta["buy_q"].qsize()):
                sellers = meta["buy_q"].get()
                other_sellers_id += sellers["other_sellers_id"]
                other_sellers_name += sellers["other_seller_name"]
                other_sellers_type += sellers["other_seller_buy_boy_type"]
            logging.info(
                f"{meta['asin']} seller len {len(other_sellers_name)}   {len(other_sellers_id)} {len(other_sellers_type)}")

            # NOTE(review): built with self.site while parse() uses the
            # per-request site -- confirm which one is intended.
            extractor = AmazonDetailExtractor(self.site)
            other_sellers_name = extractor.extract_other_seller_name(other_sellers_name, meta["site"])
            other_sellers_id = extractor.extract_other_sellers_id(other_sellers_id, meta["site"])
            # The type list goes through the name helper, as before.
            other_sellers_type = extractor.extract_other_seller_name(other_sellers_type, meta["site"])

            seller_item = meta["sellers_data"]["self_asin_seller"][0]
            seller_item["other_seller_name"] = other_sellers_name or None
            seller_item["other_sellers_id"] = other_sellers_id or None
            seller_item["other_seller_buy_boy_type"] = other_sellers_type or None
            if response.meta.get("other_seller_name"):
                seller_item = self.seller_history_msg_add(response, seller_item)
                meta["sellers_data"]["self_asin_seller"][0] = seller_item
            logging.info(f"sellers data -->{meta['sellers_data']['self_asin_seller']}")
            pprint(meta['sellers_data'])

            meta['sellers_data']["seller_old"] = self.seller_old(meta['sellers_data']['self_asin_seller'][0])
            yield meta['sellers_data']

    def err_parse(self, failure, **kwargs):
        """Errback: log the failure and rebuild the seed of the failed request.

        The seed fields are copied from the failed request's meta and
        serialised to JSON. Pushing that payload back into the
        ``{site}_real_zset_seed`` Redis sorted set (scored by priority) is
        currently disabled, so nothing is re-queued.

        :param failure: failure object exposing ``getErrorMessage()`` and
            ``request``.
        """
        logging.info(f"error______ {failure.getErrorMessage()}, {failure.request.meta.get('asin')}")
        failed_meta = failure.request.meta
        seed_keys = (
            "asin",
            "asin_type",
            "is_variation",
            "date_info",
            "site",
            "account_id",
            "priority",
            "updated_at",
            "other_sellers_id",
            "other_seller_name",
            "other_seller_buy_boy_type",
        )
        meta = {key: failed_meta.get(key) for key in seed_keys}
        # Serialised seed; kept ready for the (disabled) Redis re-queue step.
        k = json.dumps(meta)

    def close(self, spider, reason):
        logging.info("spider finish")
        logging.info(f"self.variat_list {self.variat_list}")


if __name__ == '__main__':
    # Run this spider through Scrapy's command line with site=us.
    cmdline.execute('scrapy crawl redis_detail_text -a site=us'.split())

# us, uk, fr, de, es, it, mx
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl real_new_detail  > real_redis_amazon1.log 2>&1 &
# nohup scrapy crawl real_new_detail -a site=us  > real_redis_amazon1.log 2>&1 &
# source activate pyspark
# for i in `ps -ef|grep "scrapy crawl real_new_detail" |awk '{print $2}' `; do kill -9 $i ; done;
# C:\Users\Administrator\AppData\Local\Programs\Python\Python38\scrapy crawl real_redis_detail
# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl real_redis_detail -a site=uk > real_redis_amazon1.log 2>&1 &
