import re
import json
import queue
import time
import scrapy
import random
import logging
from lxml import etree
import os, sys, platform
from scrapy import cmdline, signals
from scrapy.exceptions import DontCloseSpider
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0]))) # 上级目录
from amazon_spider.db.redis_db import sadd
# Fetch historical Amazon reviews
from amazon_spider.utils.utils import time_ch
from amazon_spider.spiders.yswg_spider import SourceSpider

# time.tzset() resets the C library's time conversion rules from the TZ
# environment variable; it is not available on Windows, so that platform
# only prints a marker instead.
if platform.system() != "Windows":
    time.tzset()
else:
    print("windows")


class AmazonCommentTextSpider(SourceSpider):
    """Scrapy spider that collects Amazon product reviews per ASIN and star filter.

    Requests are posted to the site's ``/hz/reviews-render/ajax/reviews/get/``
    endpoint, parsed with lxml, buffered per ASIN/star bucket and finally
    handed to ``sadd`` under the ``CommentData`` key.
    """
    name = 'amazon_comment_t'
    custom_settings = {
        # curl_cffi proxy settings (disabled)
        # 'PROXY_HOST': 'http-dynamic-S02.xiaoxiangdaili.com',
        # 'PROXY_PORT': 10030,
        # 'PROXY_USER': '******',
        # 'PROXY_PASS': '******',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 10,
        'DOWNLOAD_DELAY': 2,
        # When enabled, Scrapy waits a random interval between 0.5x and 1.5x
        # DOWNLOAD_DELAY between requests to the same site.
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        # NOTE(review): allowed_domains is normally a spider class attribute,
        # not a setting; as a settings key it probably has no effect - confirm.
        'allowed_domains': ['amazon.com'],
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 3,  # number of retries per request
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404, 401],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # invoked in ascending priority order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 480,
            # 'amazon_spider.middlewares.ProxyMiddleware': 450,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # NOTE(review): the next three middlewares share priority 490, so
            # their relative order is not fixed by this mapping - confirm intent.
            'amazon_spider.middleware.aiohttp.AiohttpMiddleware': 490,
            'amazon_spider.middleware.http2.HttpxMiddleware': 490,
            'amazon_spider.middleware.temu.CurlCffiRequests': 490,
            # invoked in descending priority order
            'amazon_spider.middlewares.UpdateCookiesUrl': 530,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        # No item pipeline is active; results are pushed from xpath_res instead.
        'ITEM_PIPELINES': {
            # 'amazon_spider.pipeline.amazon_comment_pip.AmazonCommentSpiderPipeline': 230,
        },
        # Project-specific download handler for both schemes (module name
        # suggests JA3/TLS fingerprint control - confirm).
        'DOWNLOAD_HANDLERS': {
            'http': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
            'https': 'amazon_spider.downloadhandlers.ja3.MyHTTPDownloadHandler',
        },
        # 'DOWNLOAD_HANDLERS': {
        #     'http': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        #     'https': ('amazon_spider.downloadhandlers.curl.FingerprintDownloadHandler'),
        # }
    }

    def __init__(self, site='us'):
        """Set up per-site state, header pool, seed list and login-cookie pool.

        Args:
            site: Marketplace code (e.g. ``'us'``, ``'de'``); it must have a
                truthy entry in ``self.site_url`` (defined outside this file).

        Raises:
            ValueError: If ``self.site_url`` has no entry for ``site``.
        """
        super(AmazonCommentTextSpider, self).__init__()
        self.site = site
        # Column names of a review record; not referenced elsewhere in this file.
        self.cols_list = ['asin', 'parent_asin', 'title', 'content', 'is_vp', 'model', 'rating', 'agree_num', 'img_num', 'img_url', 'is_video', 'video_url', 'comment_url', 'user_name', 'user_img', 'country', 'user_page', 'is_earns_commissions', 'comment_time', 'page', 'md5_unique', 'star']
        if not self.site_url.get(self.site):
            raise ValueError(f"{type(self).__name__} site error")
        # Base URL of the selected marketplace; used to build the review endpoint.
        self.url_ = self.site_url.get(self.site)
        # Inherited helper; called here before the remaining attributes are set.
        self.update_cookies()
        self.comment_headers = None
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s', level=logging.INFO)
        # Pool of optional request headers: one randomly chosen entry is added
        # to the headers of every review request (see spider_idle / parse).
        # NOTE(review): the ^\\^ sequences look like Windows-cmd escaping left
        # over from a copied cURL command - confirm these values are intended.
        self.h_dict = {
            'rtt': '100',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '1',
            'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^112^\\^, ^\\^Google',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '^\\^Windows^\\^',
            # 'sec-ch-ua-platform-version': '^\\^10.0.0^\\^',
            'sec-ch-viewport-width': '1587',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'viewport-width': '1587',
            'device-memory': '8',
            # 'downlink': '10',
            'dpr': '1',
            'ect': '4g',
        }
        # In-memory job list consumed by get_seeds(); each job needs an "asin"
        # key and may carry "star", "error_page" and "history_comment_count".
        self.seeds = [
            {"asin": "B0D9842QHM"},
        ]
        # Candidate values for the cookie named in log_cookie_name; one is
        # picked at random for every request.
        # NOTE(review): the first and sixth entries are identical, which doubles
        # that value's selection weight - confirm whether that is intended.
        # NOTE(review): these look like live session secrets committed to
        # source; consider loading them from configuration instead.
        self.log_cookie = [
            'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
            '0WabUGUu8uSgALAoRpAiD5US1c6Am8syTSOFv@LYMWYou@5TfBXLeQr9N4TP@aYM',
            '9Ixyrh@Iq450FZ@srt2FrAEXZaeHSGqjZmKu8v0FQRIPPcsLEUlMWvFwJpCb@n3N',
            'P71hK2UZXye@VN4ml6PMMTvS9krpXt4IOf?LUCb1p??mtPEVFWsJTBAOiDJsQm31',
            "1oQfwQPnEajiUydZlH6LU6ENAFFpZZNHaux8maTqwG@skcXTK8XYqqhkDZtAgEhf",
            'jPvNIv2B5a1C?WOi6XOoz2kOeCo7Hsd@sBjuYLLWEoc6GUluZQ3yfLXlVUE2VQh2',
            'fMhny8K1Vx?jc1p2x?HKEWkUhbDqRtwZvELLNc9wpQ1CcYqf6bazr4NFmTuYMHt1',
            '8aSRtYCOtxGLVA6JDprF6FIVk?3LKoNStMtDGTZ9s549AdI9BKrVf7aWFTM64TFg',
            'lJpfQtU3gFL0MdP0QB2npl07xPQplgjYMXuzH2KPM4F8z@f0saUFtSKNo1PHkweC',
        ]
        # Marketplace code -> name of the cookie that receives a log_cookie value.
        self.log_cookie_name = {
            "us": 'x-main',
            "de": 'x-acbde',
            "uk": 'x-acbuk',
            "it": 'x-acbit',
            "es": 'x-acbes',
            "fr": 'x-acbfr',
            "mx": 'x-acbmx',
            "ca": 'x-acbca',

            "ae": 'x-acbae',
            "au": 'x-acbau',
            "tr": 'x-acbtr',
            "be": 'x-acbbe',
            "jp": 'x-acbjp',
            "nl": 'x-acbnl',
            "pl": 'x-acbpl',
            "se": 'x-acbse',
        }


    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider via the parent factory and hook up its signal handlers.

        ``close`` is connected to ``spider_closed`` and ``spider_idle`` to the
        ``spider_idle`` signal, in that order.
        """
        instance = super().from_crawler(crawler, *args, **kwargs)
        handlers = (
            (instance.close, signals.spider_closed),
            (instance.spider_idle, signals.spider_idle),
        )
        for receiver, signal in handlers:
            crawler.signals.connect(receiver, signal)
        return instance

    def get_seeds(self):
        # while True:
        #     try:
        #         # seeds = ['{"asin":"B00VPRZC50", "star": "four_star"}']
        #         seeds = zpop(f'{self.site}_day_comment_seed', 5)
        #         # seeds = spop(f'{self.site}_day_comment_seed', 1)
        #         logging.info("获取任务成功")
        #         break
        #     except redis.exceptions.ConnectionError as e:
        #         logging.info(f"获取任务成功失败{e}")
        #         time.sleep(5)
        #         continue
        #     except FunctionTimedOut as e:
        #         logging.info(f"获取任务成功超时 {e}")
        #         continue
        seeds = self.seeds.pop()
        return [seeds]

    def spider_idle(self, spider):
        """Schedule first-page review requests each time the engine goes idle.

        Connected to Scrapy's ``spider_idle`` signal in ``from_crawler``. For
        every job returned by ``get_seeds`` one POST per star filter is handed
        to the engine; each request carries its own ``queue.Queue`` in ``meta``
        (keyed by ``asin + star``) in which ``xpath_res`` buffers parsed pages.

        Args:
            spider: Spider instance delivered with the signal (unused; ``self``
                is used for scheduling).

        Raises:
            DontCloseSpider: When no job is available, after scheduling a
                placeholder request and sleeping 30 seconds.
        """
        logging.debug(f'IDLE------{self.site}------------')
        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        self.update_comment_headers()
        str_time = time.strftime("%H:%M:%S", time.localtime())
        # Refresh cookies when the handler fires inside the 07:59-08:02 window.
        if "07:59:00" <= str_time <= "08:02:00":
            self.update_cookies()
        seeds = self.get_seeds()

        if not seeds:
            # No work: schedule a placeholder request and keep the spider alive.
            # NOTE(review): with the in-memory get_seeds an exhausted list
            # raises IndexError before this branch can run - confirm intent.
            meta = {
                'handle_httpstatus_all': True
            }
            request = scrapy.Request(url="https://www.baidu.com/", callback=self.parse_news_page, errback=self.err_,
                                     dont_filter=True, meta=meta)
            self.crawler.engine.crawl(request, spider=self)
            logging.info('no task sleep 30s')
            # NOTE(review): time.sleep presumably blocks the reactor thread
            # for the whole 30 s - confirm this is acceptable.
            time.sleep(30)
            raise DontCloseSpider()

        # Form fields shared by every request; asin / filterByStar / pageNumber
        # are overwritten per request below.
        data = {
            'sortBy': 'recent',
            'reviewerType': 'all_reviews',
            'pageNumber': '1',
            'shouldAppend': 'undefined',
            'deviceType': 'desktop',
            'canShowIntHeader': 'undefined',
            'reftag': 'cm_cr_arp_d_viewopt_srt',
            'pageSize': '10',
            'scope': 'reviewsAjax',
        }
        # Cookie that receives the login value. The previous fallback was the
        # site code 'us', which produced a cookie literally named "us"; fall
        # back to the US cookie name instead.
        login_cookie_name = self.log_cookie_name.get(self.site, self.log_cookie_name['us'])

        for job in seeds:
            requested_star = job.get("star")
            # A float "star" (e.g. NaN from a tabular source) counts as "not set".
            if requested_star and not isinstance(requested_star, float):
                star_list = [requested_star]
            else:
                star_list = ["one_star", "two_star", "three_star", "four_star", "five_star"]

            for num, star in enumerate(star_list):
                dq = queue.Queue()
                queue_name = job['asin'] + star
                data["asin"] = job['asin']
                data["filterByStar"] = star
                data["pageNumber"] = job.get('error_page', '1')
                meta = {
                    'priority': num,
                    "queue_name": queue_name,
                    "star": star,
                    "asin": job['asin'],
                    "pageNumber": data.get("pageNumber"),
                    "ck": True,
                    # Per-bucket buffer, looked up later via meta[meta["queue_name"]].
                    queue_name: dq,
                    "history_comment_count": job.get('history_comment_count') or 0,
                    'choice_header': True,
                    'error_page': job.get('error_page'),
                    'request_data': {
                        'sortBy': 'recent',
                        'formatType': 'current_format',
                        "filterByStar": star
                    }
                }
                meta = self.random_r(meta)
                print(meta)
                cookies = self.update_site_cookie(self.json_cookies(self.site), self.site)
                self.comment_headers = self.get_comment_headers(self.url_, job['asin'])
                # Add one randomly chosen optional header to vary the request.
                h_key = random.choice(list(self.h_dict))
                self.comment_headers[h_key] = self.h_dict.get(h_key)
                self.comment_headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
                x_main = random.choice(self.log_cookie)
                # NOTE(review): this prints a login cookie value to stdout.
                print(x_main)
                cookies.update({login_cookie_name: x_main})
                # The URL fragment only makes each request URL distinct/readable.
                self.crawler.engine.crawl(
                    scrapy.FormRequest(url=url + f'#star={star}#asin={job["asin"]}#pageNumber={data.get("pageNumber")}',
                                       headers=self.comment_headers, formdata=data, callback=self.parse, meta=meta,
                                       cookies=cookies, dont_filter=True, errback=self.err_parse), self)

    def parse(self, response, **kwargs):
        """Handle the first review page of an ASIN/star bucket.

        Reads the total review count from the response text, yields the result
        of ``xpath_res`` for this page and then requests pages 2..N (at most
        page 10) with ``parse_data`` as callback. When the count cannot be
        found, or is ``"0"``, a single marker item with ``count_max`` set is
        yielded instead.

        Args:
            response: Response of the review AJAX endpoint; ``response.meta``
                must carry the keys set in ``spider_idle``.
            **kwargs: Ignored.

        Yields:
            dict items and ``scrapy.FormRequest`` objects for follow-up pages.
        """
        resp_meta = response.meta
        asin = resp_meta.get("asin")
        star = resp_meta.get("star")
        comment_num = re.findall("(rating|ratings|Gesamtbewertungen|totales|total|totali|totale|总评分), (.*?) (带评论|with|mit|con|avec)", response.text)
        if not comment_num:
            # Count text missing: treated as an anti-bot / unexpected page.
            logging.info(f'页面变狗：{star} {asin}')
            yield {
                "count_max": True,
                "sql_data": (3, asin),
                "asin": asin,
                "comment_count": 0,
                "star": star,
            }
            return

        # repr() turns non-printable separators (e.g. a narrow no-break space)
        # into \uXXXX escapes so they can be stripped along with , . and quotes.
        comment_count = re.sub(r"\\u.{4}", '', repr(comment_num[0][1])).replace(",", "").replace(".", "").replace("'", "")
        url = f"{self.url_}/hz/reviews-render/ajax/reviews/get/ref="
        logging.info(f'{response.meta.get("asin")}历史评论数为： {response.meta.get("history_comment_count")}, 现评论数为：{comment_count}')

        if comment_count == "0":
            logging.info(f'评论数 为0 页面变狗：{asin}')
            yield {
                "count_max": True,
                "sql_data": (3, asin),
                "comment_count": 0,
                "asin": asin,
                "star": star,
            }
            return

        # Undo the JSON-style escaping of the embedded HTML before parsing it.
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace(
            '\\"', '"')
        # A retry of a single failed page only ever covers that one page.
        page_count = 1 if resp_meta.get('error_page') else self.get_page_num(int(comment_count), 10)
        capped_pages = 10 if page_count >= 10 else page_count
        # Keeps the key present even if it was missing from meta.
        resp_meta["pageNumber"] = resp_meta.get('pageNumber')
        resp_meta["page_count"] = capped_pages
        resp_meta["comment_num"] = comment_count
        item = self.xpath_res(r, response)
        item["save_comment_num"] = True
        yield item

        # Cookie that receives the login value. The previous fallback was the
        # site code 'us', which produced a cookie literally named "us"; fall
        # back to the US cookie name instead.
        login_cookie_name = self.log_cookie_name.get(self.site, self.log_cookie_name['us'])

        for page in range(2, capped_pages + 1):
            cookies = self.update_site_cookie(self.json_cookies(self.site), self.site)
            data = {
                'sortBy': 'recent',
                'reviewerType': 'all_reviews',
                'pageNumber': str(page),
                'shouldAppend': 'undefined',
                'deviceType': 'desktop',
                'canShowIntHeader': 'undefined',
                'reftag': 'cm_cr_arp_d_viewopt_srt',
                'pageSize': '10',
                'asin': asin,
                'scope': 'reviewsAjax',
                'filterByStar': resp_meta["star"],
            }
            meta = self.random_r(response.meta)
            meta["pageNumber"] = str(page)
            meta["priority"] = page
            self.comment_headers = self.get_comment_headers(self.url_, data['asin'])
            # Add one randomly chosen optional header to vary the request.
            h_key = random.choice(list(self.h_dict))
            self.comment_headers[h_key] = self.h_dict.get(h_key)
            self.comment_headers['X-Forwarded-For'] = '1.1.1.1,2.2.2.2'
            x_main = random.choice(self.log_cookie)
            # NOTE(review): this prints a login cookie value to stdout.
            print(x_main)
            cookies.update({login_cookie_name: x_main})
            yield scrapy.FormRequest(
                url=url + f'#star={response.meta.get("star")}#asin={data.get("asin")}#pageNumber={str(page)}',
                headers=self.comment_headers, cookies=cookies, formdata=data, callback=self.parse_data,
                errback=self.err_parse, meta=meta, dont_filter=True)

    def parse_data(self, response):
        """解析详情页数据"""
        r = response.text.replace('\\n', '').replace('data-hook=\\"review\\"', 'data-hook="review"').replace(
            '\\"', '"')
        if "@@@@@@@@@@@@@@@@@@@@___" in r:
            msg = self.r_utils(response)
            logging.info(f"页面 @@@@@@@@@@@@@ asin为: {response.meta.get('asin')} {response.meta.get('star')} 爬取页数 {response.meta.get('pageNumber')} 请求： {msg}")
            return None
        item = self.xpath_res(r, response)
        return item

    def comment_time(self, time_msg):
        if self.site == "de":
            time_msg = (time_msg[0].split(".")[0][-2:] + "." + time_msg[0].split(".")[1]).strip()
        elif self.site == "es":
            time_msg = time_msg[0].split("el ")[-1].strip()
        elif self.site == "fr":
            time_msg = time_msg[0].split("le ")[1].strip()
        elif self.site == "it":
            time_msg = time_msg[0].split("il ")[1].strip()
        else:
            if "年" in time_msg[0]:
                from datetime import datetime
                date_object = datetime.strptime(time_msg[0].split(" ")[0], "%Y年%m月%d日")
                time_msg = date_object.strftime("%Y-%m-%d")
            else:
                time_msg = time_msg[0].split("on")[1].strip()
        return time_msg

    def xpath_res(self, r, response):
        """Parse one un-escaped review page and buffer its reviews.

        Every review node becomes a dict; the list for this page is put on the
        bucket queue stored in ``response.meta[response.meta["queue_name"]]``
        and a per-bucket page counter is incremented in the crawler stats. Once
        the counter equals ``meta["page_count"]`` all buffered pages are
        flattened and handed to ``sadd("CommentData", ...)`` as one JSON
        document, and the counter is reset to 0.

        Args:
            r: HTML text of the page with escaping already removed.
            response: The originating response; its ``meta`` supplies
                ``asin``, ``star``, ``queue_name``, ``page_count``,
                ``comment_num`` and ``request_data``.

        Returns:
            dict: ``{"asin": <meta asin>}``.
        """
        stats = self.crawler.stats
        ret = etree.HTML(r)

        # Header line with the rating / review totals for the current filter.
        comment_data = ret.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()')
        if comment_data:
            header_text = re.sub(r"\\u.{4}", '', repr(comment_data[0])).replace(",", "").replace(".", "").replace("'", "")
            comment_count = re.findall(r'(\d+)', header_text)
        else:
            comment_count = []
        # Pad to two entries so a header with fewer than two numbers cannot
        # raise IndexError when the payload is built.
        comment_count = (comment_count + ['0', '0'])[:2]

        # One node per review (normally up to 10 per page).
        datas = ret.xpath("//div[@class='a-section review aok-relative']")
        items = []
        for node in datas:
            comment_url = node.xpath(".//*[@data-hook='review-title']/@href")
            node_id = node.xpath("./@id")
            if (not comment_url) and (not node_id):
                logging.info("not get comment_url filter ------")
                continue

            title_parts = node.xpath(".//*[@data-hook='review-title']/span/text()")
            title = title_parts[0].strip() if title_parts else ""
            # Reviewer name and avatar; guarded so one malformed review cannot
            # abort the whole page.
            name_parts = node.xpath(".//span[@class='a-profile-name']//text()")
            user_name = name_parts[0].strip() if name_parts else ""
            user_img = node.xpath(".//div[@class='a-profile-avatar']/img/@data-src")
            content = [part.strip() for part in node.xpath(".//span[@data-hook='review-body']/span/text()")]

            # Helpful votes: the count is the first word, or the second on "es".
            agree_num = 0
            helpful = node.xpath(".//span[@data-hook='helpful-vote-statement']//text()")
            if helpful:
                tokens = helpful[0].split(" ")
                token_index = 1 if self.site == "es" else 0
                token = tokens[token_index].strip() if len(tokens) > token_index else ""
                if token == "One":
                    agree_num = 1
                else:
                    try:
                        # Drop thousands separators ("1,234" / "1.234").
                        agree_num = int(token.replace(",", "").replace(".", ""))
                    except ValueError:
                        agree_num = 0

            # Verified purchase flag: 1 = yes, 2 = no.
            size_mini = node.xpath(".//span[@class='a-size-mini a-color-state a-text-bold']//text()")
            mini = 1 if size_mini else 2
            # Review date text, e.g.
            #   Reviewed in Brazil on December 31, 2022
            #   Rezension aus Deutschland vom 13. Februar 2023
            time_msg = self.comment_time(node.xpath(".//span[@data-hook='review-date']//text()"))
            # Images and videos attached to the review.
            comment_img = node.xpath(".//div[@class='a-section a-spacing-top-mini cr-lightbox-image-thumbnails']/img/@src")
            video_url = ';'.join(node.xpath('.//input[contains(@class,"video-url")]/@value'))
            # Link to the reviewer's profile page.
            user_page = node.xpath(".//div[@data-hook='genome-widget']/a/@href")

            # Star rating: "5.0 out of 5 stars" -> "5"; texts using a decimal
            # comma are split on the comma instead.
            rat = node.xpath(".//span[@class='a-icon-alt']//text()")
            if rat:
                leading = rat[0].split(".")[0].replace("stars", "").strip()
                rating = leading if len(leading) == 1 else rat[0].split(",")[0].replace("stars", "").strip()
            else:
                rating = ""

            # ASIN of the reviewed variant, taken from the format-strip link.
            variat_href = node.xpath(".//a[@data-hook='format-strip']/@href")
            variat_match = re.search(r"product-reviews/(.*?)(?:\/|\?)", variat_href[0]) if variat_href else None
            variat_asin = variat_match.group(1) if variat_match else ""
            vine = node.xpath(".//span[@class=\"a-color-success a-text-bold\"]//text()")
            model_parts = node.xpath(".//a[@data-hook='format-strip']//text()")

            item = {
                "asin": variat_asin or response.meta.get("asin"),
                "title": title,
                "content": " ".join(content).strip() if content else "",
                # verified purchase ("1" yes / "2" no)
                "is_vp": str(mini),
                # variant description such as colour, size, style
                "model": "|-|".join(model_parts).strip() if model_parts else "",
                "rating": rating,
                "agree_num": agree_num,
                "img_num": len(comment_img),
                "img_url": ",".join(comment_img),
                # has video ("1" yes / "2" no)
                "is_video": str(1 if video_url else 2),
                "video_url": video_url,
                "comment_url": str(comment_url[0]) if comment_url else "",
                "user_name": user_name,
                "user_img": str(user_img[0]) if user_img else "",
                "country": self.site,
                "user_page": str(user_page[0]) if user_page else "",
                "is_earns_commissions": "",
                "comment_time": time_msg,
                "comment_time_format": time_ch(self.site, time_msg),
                'vine_review_flag': '1' if vine else '2',
            }
            # Falsy values (including 0 counts and empty strings) become None.
            item = {k: v or None for k, v in item.items()}
            item["comment_id"] = str(node_id[0]) if node_id else None
            items.append(item)

        msg = self.r_utils(response)
        logging.info(f"成功爬取数据为: {msg} {len(items)} {items}")
        queue_name = response.meta.get("queue_name")
        page_queue = response.meta.get(queue_name)
        stats.inc_value(queue_name)
        page_queue.put(items)
        msgs = {}
        if int(stats.get_value(queue_name)) == int(response.meta.get('page_count')):
            # All expected pages of this bucket arrived: flush them in one go.
            stats.set_value(queue_name, 0)
            logging.info(f"页数爬取成功asin为: {response.meta.get('asin')} {msg} {response.meta.get('star')} 爬取页数 {page_queue.qsize()} 数据长度： {response.meta.get('comment_num')}")
            comment_all = [page_queue.get() for _ in range(page_queue.qsize())]
            flattened_list = [review for page_items in comment_all for review in page_items]
            api_data = {
                # Was hard-coded to "us" although the spider runs per site.
                "site": self.site,
                "search": response.meta.get('request_data'),
                "header": {
                    "reviews": comment_count[1],
                    "ratings": comment_count[0]
                },
                "values": flattened_list
            }
            if comment_all:
                sadd("CommentData", json.dumps(api_data), use_md5=False)
        msgs["asin"] = response.meta.get("asin")
        return msgs

    # def err_parse(self, failure, **kwargs):
    #     stats = self.crawler.stats
    #     stats.set_value(failure.request.meta.get("asin"), 0)
    #     self.header_error_count += 1
    #     item = {
    #         "error_asin": True,
    #         "sql_data": (1, failure.request.meta.get("asin")),
    #         "asin": failure.request.meta.get("asin"),
    #     }
    #     logging.info(f"爬取失败asin:{item.get('asin')}")
    #     yield item

    def err_parse(self, failure, **kwargs):
        """Errback for review requests: log the failure and build a retry payload.

        The payload (``asin`` and ``star`` of the failed request) is serialised
        to JSON, but the Redis re-queue that used to consume it (``zadd`` on
        ``<site>_day_comment_seed`` with reconnect handling) is disabled, so
        nothing is pushed or yielded.
        """
        failed_request = failure.request
        logging.info(f"error______ {failure.getErrorMessage()}, {failure.request.meta.get('asin')}")
        retry_job = {field: failed_request.meta.get(field) for field in ("asin", "star")}
        # Currently unused: kept for the disabled re-queue step.
        k = json.dumps(retry_job)


if __name__ == '__main__':
    # Run this spider for the US marketplace when the file is executed directly.
    cmdline.execute(['scrapy', 'crawl', 'amazon_comment_t', '-a', 'site=us'])

# source activate pyspark
# nohup cd /mnt/hezhe/amazon_spider/amazon_spider scrapy crawl amazon_comment_history -a site=us  > amazon.log 2>&1 &
# nohup scrapy crawl amazon_comment_all -a site=us > amazon_history_comment_us1.log 2>&1 &
# for i in `ps -ef|grep "scrapy crawl amazon_comment_all" |awk '{print $2}' `; do kill -9 $i ; done;
# nohup /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_comment -a site=us > amazon1.log 2>&1 &
# nohup scrapy crawl amazon_comment -a site=de > amazon_de1.log 2>&1 &



