import re
import time
import queue
import scrapy
import random
import logging
import pandas as pd
from lxml import etree
import os, sys, platform
from scrapy import cmdline, signals
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.utils.utils import time_ch
from amazon_spider.items import amazon_comment_item
from amazon_spider.spiders.yswg_spider import SourceSpider

# time.tzset() is only available on Unix, so it is skipped on Windows,
# where the platform name is printed instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class CommentCriticalSpider(SourceSpider):
    """Spider that requests Amazon review AJAX pages, one star filter at a time.

    ASINs are read from a seed spreadsheet; reviews are collected per
    ASIN/star combination and written to Excel files by ``xpath_res``.
    """
    name = 'comment_critical_api'
    # Per-spider overrides of the project settings.
    custom_settings = {
        # Proxy settings for curl_cffi (currently disabled)
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'CONCURRENT_REQUESTS': 3,
        'DOWNLOAD_TIMEOUT': 10,
        # Depth / breadth (crawl order) - nothing is configured for it here
        # 'DOWNLOAD_DELAY': 2,
        # When enabled, Scrapy waits a random interval (0.5x to 1.5x of
        # DOWNLOAD_DELAY) between requests to the same website.
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): allowed_domains is normally a spider attribute rather
        # than a setting - confirm that something actually reads it from here.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        # NOTE(review): three client middlewares share priority 490 - verify
        # that their relative order does not matter.
        'DOWNLOADER_MIDDLEWARES': {
            # Called in increasing priority order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # Called in decreasing priority order
            'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is enabled; results are written by the spider itself.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_comment_pipe_text.AmazonCommentSpiderPipeline': 230,
        },
        # Project-specific download handler for both schemes.
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
        # Alternative handler (disabled):
        # 'DOWNLOAD_HANDLERS': {
        #     'http': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        #     'https': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        # }
    }

    def __init__(self, site='us'):
        """Set up the site configuration, the header pool and the seed table.

        Args:
            site: Marketplace key looked up in ``self.site_url``
                (default ``'us'``).

        Raises:
            ValueError: If ``site`` has no entry in ``self.site_url``.
        """
        super().__init__()
        self.site = site
        base_url = self.site_url.get(self.site)
        if not base_url:
            raise ValueError(f"{type(self).__name__} site error")
        self.url_ = base_url
        self.update_cookies()
        self.comment_headers = None
        # Pool of optional headers; one randomly chosen entry is added to
        # every review request.
        # NOTE(review): the '^\\^' sequences look like shell-escaped quotes
        # copied from a curl command - confirm these are the intended values.
        self.h_dict = {
            'rtt': '100',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '1',
            'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '^\\^Windows^\\^',
            # 'sec-ch-ua-platform-version': '^\\^10.0.0^\\^',
            'sec-ch-viewport-width': '1587',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'viewport-width': '1587',
            'device-memory': '8',
            # 'downlink': '10',
            'dpr': '1',
            'ect': '4g',
        }
        self.utils_requests = [{"use_aiohttp": True}, {"curlcffi": True}, {"use_httpx": True}]

        self.seeds_file = './comment.xlsx'
        self.save_file = './comment_critical.xlsx'
        self.seeds = pd.read_excel(self.seeds_file, dtype={'asin': str, 'sku': str})
        # 1 marks a seed as "not crawled yet". The parse callbacks later store
        # string statuses in this column, so it is created with object dtype:
        # writing a str into an int64 column is deprecated in recent pandas.
        self.seeds['status'] = pd.Series(1, index=self.seeds.index, dtype=object)
        # Sample ASINs for manual runs:
        # B0CQYPM3NV  B0CQYP1ZPG  B0CQYPXX8R B0CQYQ5DL4 B0CZLK31K5 B0CZLJY49H B0CZLKYYYV
        # Values used for the 'x-main' cookie; one is picked at random per request.
        self.log_cookie = [
            'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
            '0WabUGUu8uSgALAoRpAiD5US1c6Am8syTSOFv@LYMWYou@5TfBXLeQr9N4TP@aYM',
            '9Ixyrh@Iq450FZ@srt2FrAEXZaeHSGqjZmKu8v0FQRIPPcsLEUlMWvFwJpCb@n3N',
            'P71hK2UZXye@VN4ml6PMMTvS9krpXt4IOf?LUCb1p??mtPEVFWsJTBAOiDJsQm31'
        ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and connect its close/idle handlers to crawler signals."""
        instance = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for handler, signal in handlers:
            crawler.signals.connect(handler, signal)
        return instance

    def spider_idle(self, spider):
        """Schedule page-1 review requests for every pending seed when the engine idles.

        A seed is pending while its ``status`` is ``1`` and its ``asin`` is
        not null. For each pending ASIN one request per star filter is handed
        to the engine. When nothing is pending the seed table is written back
        to ``self.seeds_file`` and the handler simply returns.

        Args:
            spider: The spider passed by the ``spider_idle`` signal (unused).
        """
        logging.debug(f'IDLE------{self.site}------------')
        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        self.update_comment_headers()
        # Form fields shared by every request; asin and filterByStar are added
        # per request below.
        base_form = {
            'sortBy': 'recent',
            'reviewerType': 'all_reviews',
            'pageNumber': '1',
            'shouldAppend': 'undefined',
            'deviceType': 'desktop',
            'canShowIntHeader': 'undefined',
            'reftag': 'cm_cr_arp_d_viewopt_srt',
            'pageSize': '10',
            'scope': 'reviewsAjax',
        }
        star_list = ["one_star", "two_star", "three_star", "four_star", "five_star"]

        self.seeds_new = self.seeds[(self.seeds['status'] == 1) & ~(self.seeds['asin'].isnull())]
        logging.info(f"-----长度{self.seeds_new.shape}")
        if self.seeds_new.shape[0] == 0:
            logging.info("爬取完成")
            logging.info("spider finish")
            self.seeds.to_excel(self.seeds_file, index=False)
            # Returning without scheduling anything lets the engine close the
            # spider. A bare ``raise`` here has no active exception and would
            # only produce a RuntimeError inside the signal handler.
            return

        # The ASIN is read from the named column so that it always matches the
        # column used by the pending filter and by the status updates.
        pending = zip(self.seeds_new.values, self.seeds_new['asin'])
        for seed_no, (row, asin) in enumerate(pending):
            logging.info(f"num {seed_no}  长度{self.seeds_new.shape}")
            print(row)
            for star_no, star in enumerate(star_list):
                queue_name = asin + star
                # Fresh dict per request instead of mutating one shared dict.
                form = dict(base_form, asin=asin, filterByStar=star)
                meta = {
                    'priority': star_no,
                    "queue_name": queue_name,
                    "star": star,
                    "asin": asin,
                    "pageNumber": form["pageNumber"],
                    "ck": True,
                    # Queue that collects the parsed pages of this ASIN/star pair.
                    queue_name: queue.Queue(),
                    "history_comment_count": 0,
                    'choice_header': True,
                }
                meta = self.random_r(meta)
                cookies = self.update_site_cookie(self.json_cookies(self.site))
                self.comment_headers = self.get_comment_headers(self.url_, asin)
                h_key = random.choice(list(self.h_dict))
                self.comment_headers[h_key] = self.h_dict.get(h_key)
                self.comment_headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'

                cookies.update({'x-main': random.choice(self.log_cookie)})
                self.update_comment_headers()
                request = scrapy.FormRequest(
                    url=url + f'#star={star}#asin={asin}#pageNumber={form["pageNumber"]}',
                    headers=self.comment_headers, formdata=form, callback=self.parse, meta=meta,
                    cookies=cookies, dont_filter=True, errback=self.err_parse)
                # NOTE(review): newer Scrapy releases deprecate the second
                # ``spider`` argument - confirm against the pinned version.
                self.crawler.engine.crawl(request, self)

    def parse(self, response, **kwargs):
        comment_num = re.findall(
            "(rating|ratings|Gesamtbewertungen|totales|total|totali|totale|总评分), (.*?) (带评论|with|mit|con|avec)",
            response.text)
        if not comment_num:
            # 反爬重试
            logging.info(f'页面变狗：{response.meta.get("star")} {response.meta.get("asin")}')
            item = {
                "count_max": True,
                "sql_data": (3, response.meta.get("asin")),
                "asin": response.meta.get("asin"),
                "comment_count": 0,
                "star": response.meta.get("star"),
            }
            # yield item
            self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'status'] = '评论数为0或者变狗'
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'title'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'content'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'is_vp'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'model'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'rating'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'agree_num'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'img_num'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'img_url'] = ''
            #
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'is_video'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'video_url'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'comment_url'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'user_name'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'user_img'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'user_page'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'comment_time'] = ''
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'page'] = ''
            #
            # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'comment_id'] = ''

        else:
            comment_count = re.sub(r"\\u.{4}", '', comment_num[0][1].__repr__()).replace(",", "").replace(".",
                                                                                                          "").replace(
                "'", "") if comment_num else 0

            total_num = re.findall(
                "(\d+) total",
                response.text)

            url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
            logging.info(
                f'{response.meta.get("asin")}历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_count}')

            if not comment_count == "0":
                r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace(
                    '\\"', '"')
                page_count = 1 if response.meta.get('error_page') else self.get_page_num(int(comment_count), 10)
                response.meta["pageNumber"] = response.meta.get('pageNumber')
                response.meta["page_count"] = 10 if page_count >= 10 else page_count
                response.meta["comment_num"] = comment_count
                response.meta['total_num'] = total_num[0] if total_num else None
                item = self.xpath_res(r, response)
                item["save_comment_num"] = True
                # if int(response.meta.get("history_comment_count")) > int(comment_count):
                #     # 判断评论最新时间
                #     logging.info(
                #         f'评论数小于历史评论数,历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_count}')
                #     return
                yield item
                ran = range(2, 11) if page_count >= 10 else range(2, page_count + 1)
                for i in ran:
                    cookies = self.json_cookies(self.site)
                    # cookies = json.loads(random.choice(self.cookie_list)[1])
                    cookies = self.update_site_cookie(cookies)
                    data = {
                        'sortBy': 'recent',
                        'reviewerType': 'all_reviews',
                        'pageNumber': str(i),
                        'shouldAppend': 'undefined',
                        'deviceType': 'desktop',
                        'canShowIntHeader': 'undefined',
                        'reftag': 'cm_cr_arp_d_viewopt_srt',
                        'pageSize': '10',
                        'asin': response.meta.get("asin"),
                        'scope': 'reviewsAjax',
                        'filterByStar': response.meta["star"],
                    }
                    meta = self.random_r(response.meta)
                    meta["pageNumber"] = str(i)
                    meta["priority"] = i
                    meta['total_num'] = total_num[0] if total_num else None
                    self.comment_headers = self.get_comment_headers(self.url_, data['asin'])
                    h_key = random.choice(list(self.h_dict))
                    self.comment_headers[h_key] = self.h_dict.get(h_key)
                    self.comment_headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
                    cookies.update({'x-main': random.choice(self.log_cookie)})
                    self.update_comment_headers()
                    yield scrapy.FormRequest(
                        url=url + f'#star={response.meta.get("star")}#asin={data.get("asin")}#pageNumber={str(i)}',
                        headers=self.comment_headers, cookies=cookies, formdata=data, callback=self.parse_data,
                        errback=self.err_parse, meta=meta, dont_filter=True)

            elif comment_count == "0":
                logging.info(f'评论数 为0 页面变狗：{response.meta.get("asin")}')
                # item = {
                #     "count_max": True,
                #     "sql_data": (3, response.meta.get("asin")),
                #     "comment_count": 0,
                #     "asin": response.meta.get("asin"),
                #     "star": response.meta.get("star"),
                # }
                # yield item
                self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'status'] = '评论数为0或者变狗'

                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'title'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'content'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'is_vp'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'model'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'rating'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'agree_num'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'img_num'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'img_url'] = ''
                #
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'is_video'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'video_url'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'comment_url'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'user_name'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'user_img'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'user_page'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'comment_time'] = ''
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'page'] = ''
                #
                # self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'comment_id'] = ''

    def parse_data(self, response):
        """Parse a follow-up review page.

        Returns ``None`` (after logging) when the unescaped body contains the
        ``@@@...___`` marker; otherwise returns the result of ``xpath_res``.
        """
        cleaned = response.text.replace('\\n', '')
        cleaned = cleaned.replace('data-hook=\\"review\\"', 'data-hook="review"')
        cleaned = cleaned.replace('\\"', '"')
        if "@@@@@@@@@@@@@@@@@@@@___" not in cleaned:
            return self.xpath_res(cleaned, response)
        request_info = self.r_utils(response)
        logging.info(
            f"页面 @@@@@@@@@@@@@ asin为: {response.meta.get('asin')} {response.meta.get('star')} 爬取页数 {response.meta.get('pageNumber')} 请求： {request_info}")
        return None

    def comment_time(self, time_msg):
        if self.site == "de":
            time_msg = (time_msg[0].split(".")[0][-2:] + "." + time_msg[0].split(".")[1]).strip()
        elif self.site == "es":
            time_msg = time_msg[0].split("el ")[-1].strip()
        elif self.site == "fr":
            time_msg = time_msg[0].split("le ")[1].strip()
        elif self.site == "it":
            time_msg = time_msg[0].split("il ")[1].strip()
        else:
            if "年" in time_msg[0]:
                from datetime import datetime
                date_object = datetime.strptime(time_msg[0].split(" ")[0], "%Y年%m月%d日")
                time_msg = date_object.strftime("%Y-%m-%d")
            else:
                time_msg = time_msg[0].split("on")[1].strip()
        return time_msg

    def xpath_res(self, r, response):
        """Parse one page of review HTML and flush the results once all pages arrived.

        Every review block on the page is converted to an item dict. The page's
        items are put on the queue stored in ``response.meta`` under
        ``meta["queue_name"]`` and a stats counter with the same name is
        incremented. When the counter equals ``meta["page_count"]`` the queue
        is drained into ``save_<asin>_<star>.xlsx`` and the seed's status is
        set to ``'3'``.

        Args:
            r: Unescaped HTML text of the response.
            response: The Scrapy response the text came from.

        Returns:
            dict: ``{"asin": <asin from response.meta>}``.
        """
        stats = self.crawler.stats
        ret = etree.HTML(r)
        # One node per review on the page.
        datas = ret.xpath("//div[@class='a-section review aok-relative']")
        items = [self._review_item(node, response) for node in datas]

        msg = self.r_utils(response)
        logging.info(f"成功爬取数据为: {msg} {len(items)} {items}")
        queue_name = response.meta.get("queue_name")
        stats.inc_value(queue_name)
        page_queue = response.meta.get(queue_name)
        page_queue.put(items)
        msgs = {}
        if int(stats.get_value(queue_name)) == int(response.meta.get('page_count')):
            # All expected pages of this ASIN/star pair have been parsed.
            stats.set_value(queue_name, 0)
            logging.info(
                f"页数爬取成功asin为: {response.meta.get('asin')} {msg} {response.meta.get('star')} 爬取页数 {page_queue.qsize()} 数据长度： {response.meta.get('comment_num')}")
            data = []
            for _ in range(page_queue.qsize()):
                data += page_queue.get()

            df = pd.DataFrame(data)
            df.to_excel(f'save_{response.meta.get("asin")}_{response.meta.get("star")}.xlsx')
            self.seeds.loc[((self.seeds['asin'] == response.meta['asin'])), 'status'] = '3'
            print("剩余未爬取数", self.seeds[(self.seeds['status'] == 1) & ~(self.seeds['asin'].isnull())].shape)

        msgs["asin"] = response.meta.get("asin")
        return msgs

    def _review_item(self, node, response):
        """Build the item dict for a single review node of the page."""
        title_nodes = node.xpath(".//*[@data-hook='review-title']/span/text()")
        title = title_nodes[0].strip() if title_nodes else ""
        # Reviewer name and avatar; both may be missing, so they are guarded.
        name_nodes = node.xpath(".//span[@class='a-profile-name']//text()")
        user_name = name_nodes[0].strip() if name_nodes else ""
        user_img = node.xpath(".//div[@class='a-profile-avatar']/img/@data-src")
        content = [part.strip() for part in node.xpath(".//span[@data-hook='review-body']/span/text()")]

        # Helpful votes: the count is the first word of the statement (the
        # second word on the "es" site); "One" stands for a single vote.
        helpful = node.xpath(".//span[@data-hook='helpful-vote-statement']//text()")
        review = 0
        if helpful:
            token = helpful[0].split(" ")[1 if self.site == "es" else 0].strip()
            if token == "One":
                review = 1
            else:
                try:
                    # Drop thousands separators such as "1,234" / "1.234".
                    review = int(token.replace(",", "").replace(".", ""))
                except ValueError:
                    review = 0

        # Verified-purchase badge: 1 when present, 2 otherwise.
        size_mini = node.xpath(".//span[@class='a-size-mini a-color-state a-text-bold']//text()")
        mini = 1 if size_mini else 2
        # Review date line, e.g.
        # Reviewed in Brazil 🇧🇷 on December 31, 2022
        # Rezension aus Deutschland vom 13. Februar 2023
        time_msg = self.comment_time(node.xpath(".//span[@data-hook='review-date']//text()"))
        # Images attached to the review.
        comment_img = node.xpath(
            ".//div[@class='a-section a-spacing-top-mini cr-lightbox-image-thumbnails']/img/@src")
        # Video URLs attached to the review.
        video_url = ';'.join(node.xpath('.//input[contains(@class,"video-url")]/@value'))
        # Link to the reviewer's profile page.
        user_page = node.xpath(".//div[@data-hook='genome-widget']/a/@href")

        # Star rating: take the text before the first "." and, if that is not
        # a single character, the text before the first "," instead.
        rat = node.xpath(".//span[@class='a-icon-alt']//text()")
        rating = rat[0].split(".")[0].replace("stars", "").strip()
        if len(rating) != 1:
            rating = rat[0].split(",")[0].replace("stars", "").strip()

        # Link to the review itself.
        comment_url = node.xpath(".//*[@data-hook='review-title']/@href")
        # ASIN of the reviewed variant, taken from the format-strip link.
        variant_href = node.xpath(".//a[@data-hook='format-strip']/@href")
        variant_match = re.search(r"product-reviews/(.*?)(?:\/|\?)", variant_href[0]) if variant_href else None
        variat_asin = variant_match.group(1) if variant_match else ""
        model_parts = node.xpath(".//a[@data-hook='format-strip']//text()")

        star_list = {
            '1': "one_star",
            '2': "two_star",
            '3': "three_star",
            '4': "four_star",
            '5': "five_star",
        }
        item = {
            "asin": variat_asin or response.meta.get("asin"),
            "parent_asin": response.meta.get("asin"),
            "title": title,
            "content": " ".join(content).strip() if content else "",
            # Verified purchase flag
            "is_vp": str(mini),
            # Variant description such as color, size, style
            "model": "|-|".join(model_parts).strip() if model_parts else "",
            # Star rating
            "rating": rating,
            # Number of helpful votes
            "agree_num": int(review),
            # Number of review images
            "img_num": len(comment_img),
            # Image URLs
            "img_url": ",".join(comment_img),
            # Has video (1 yes, 2 no)
            "is_video": str(1 if video_url else 2),
            # Video URLs
            "video_url": video_url,
            # Review link
            "comment_url": str(comment_url[0]) if comment_url else "",
            # Reviewer name
            "user_name": user_name,
            # Reviewer avatar URL
            "user_img": str(user_img[0]) if user_img else "",
            # Marketplace the review was fetched from
            "country": self.site,
            # Reviewer profile URL
            "user_page": str(user_page[0]) if user_page else "",
            # Influencer-program flag (1 yes, 2 no); not filled in here
            "is_earns_commissions": "",
            # Review date
            "comment_time": time_ch(self.site, time_msg),
            "page": int(response.meta.get("pageNumber")),
            "star": star_list[rating],
        }
        # Falsy values (empty strings, 0) are stored as None.
        item = {k: v or None for k, v in amazon_comment_item(item).items()}
        node_id = node.xpath("./@id")
        item["comment_id"] = str(node_id[0]) if node_id else None
        item["page_state"] = 1 if item["comment_url"] else 2
        return item

    def err_parse(self, failure, **kwargs):
        request = failure.request
        logging.info(f"error______ {failure.getErrorMessage()}")

    def close(self, spider, reason):
        """Handler for ``spider_closed``: log completion and write the seed table back.

        Args:
            spider: Spider passed by the signal (unused).
            reason: Close reason passed by the signal (unused).
        """
        logging.info("spider finish")
        seeds_path = self.seeds_file
        self.seeds.to_excel(seeds_path, index=False)


if __name__ == '__main__':
    # Same as running `scrapy crawl comment_critical_api -a site=us` in a shell.
    cmdline.execute(['scrapy', 'crawl', 'comment_critical_api', '-a', 'site=us'])

# Example shell commands (note: they reference other spiders of this project):
# source activate pyspark
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl amazon_comment_history -a site=us > amazon.log 2>&1 &
# nohup scrapy crawl amazon_comment_all -a site=us > amazon_history_comment_us1.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl amazon_comment_all" |awk '{print $2}' `; do kill -9 $i ; done;
# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
# nohup scrapy crawl amazon_comment -a site=de > amazon_de1.log 2>&1 &



