# coding:utf-8
import json
import os, sys
import scrapy
import logging
import random
from scrapy import cmdline
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # 上级目录
from amazon_spider.db.mysql_db import get_con, sql_connect, sql_fetch_rows
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie


class AmazonGetAsinSpider(scrapy.Spider):
    """Crawl Amazon listing pages for seed categories and classify products.

    For every seed row (``bsr_link`` / ``nr_link``) the spider requests pages
    1 and 2 of a best-sellers URL and of a new-releases URL, extracts the
    product ids embedded in each listing page, and then fetches every product
    page to derive two flags (``has_video`` and ``a_plus``).
    """

    name = 'amazon_get_asin'
    # ``allowed_domains`` is a spider attribute, not a Scrapy setting; it was
    # previously placed inside ``custom_settings`` where it had no effect.
    allowed_domains = ['amazon.com']
    custom_settings = {
        'SPIDER_MODULES': ['amazon_spider.spiders_text'],
        'NEWSPIDER_MODULE': 'amazon_spider.spiders_text',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_TIMEOUT': 30,
        # Retry settings
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 10,  # set this to the number of retries wanted
        # 'COOKIES_ENABLED': True,
        # 'COOKIES_DEBUG': False,
        'RETRY_HTTP_CODES': [203, 301, 302, 403, 408, 429, 500, 502, 503, 504, 522, 524, 404],
        # downloader middlewares for spider.
        'DOWNLOADER_MIDDLEWARES': {
            # called in ascending order
            'amazon_spider.middlewares.RandomUserAgentMiddleware': 460,
            'amazon_spider.middlewares.ProxyMiddleware': 450,
            'amazon_spider.middlewares.CookiesZip': 480,
            # 'amazon_spider.middlewares.GetCookieMiddleware': 460,
            # called in descending order
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500
        },
        'ITEM_PIPELINES': {
            'amazon_spider.pipelines.DayAmazonSpiderPipeline': 230,
            # 'amazon_spider.pipelines.AyncMysqlPipeline': 230

        }
    }

    def __init__(self, site='us', *args, **kwargs):
        """Load cookies, DB handles and the seed rows to crawl.

        Args:
            site: Site code handed to ``ReadDb``, ``ReadCookie`` and
                ``sql_connect``. Defaults to ``'us'``.
            *args, **kwargs: Forwarded to ``scrapy.Spider.__init__`` so that
                additional ``-a key=value`` spider arguments do not raise.
        """
        super().__init__(*args, **kwargs)
        self.cols_list = [
            'asin', 'title', 'content', 'is_vp', 'model', 'rating',
            'agree_num', 'img_num', 'img_url', 'is_video', 'video_url',
            'comment_url', 'user_name', 'user_img', 'country', 'user_page',
            'is_earns_commissions', 'comment_time'
        ]
        self.site = site
        self.r_db = ReadDb(self.site)
        self.r_ck = ReadCookie(self.site)
        # Each entry is [row[1], row[0]] of the cookie table. The rest of the
        # spider uses entry[0] as the cookie id and entry[1] as a JSON string
        # that decodes to the cookies passed to the request.
        self.cookie_list = [[row[1], row[0]] for row in self.r_ck.get_cookie().values]
        self.headers = {
            'authority': 'www.amazon.com',
            'accept': 'text/html,*/*',
            'accept-language': '*',
            'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
            'origin': 'https://www.amazon.com',
            'referer': 'https://www.amazon.com/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
        }
        self.conn = get_con()
        self.s = sql_connect(self.site)
        # Seed rows; each row is indexed by "bsr_link" and "nr_link" below.
        self.seeds = sql_fetch_rows("SELECT bsr_link, nr_link FROM category_asin_special_data_sum_copy1;")

    def _build_request(self, url, callback, meta):
        """Return a request for ``url`` that uses a randomly chosen cookie.

        The chosen cookie's id is stored in ``meta["cookie_id"]`` (``meta`` is
        modified in place) and its JSON payload is decoded into the request
        cookies.
        """
        cookie_id, cookie_json = random.choice(self.cookie_list)
        meta["cookie_id"] = cookie_id
        return scrapy.Request(
            url=url,
            cookies=json.loads(cookie_json),
            headers=self.headers,
            callback=callback,
            dont_filter=False,
            meta=meta,
        )

    def start_requests(self):
        """Yield listing-page requests for every seed row.

        Seeds are consumed from ``self.seeds`` in batches of at most 30; the
        remaining queue length is logged before each batch. For each seed,
        pages 1 and 2 of the best-sellers and new-releases listings are
        requested, all handled by :meth:`parse`.
        """
        while True:
            logging.debug(f'队列长度为：{len(self.seeds)}')
            if not self.seeds:
                break
            for _ in range(min(30, len(self.seeds))):
                seed = self.seeds.pop()
                # NOTE(review): the category id is taken from bsr_link for
                # both listing types, including new releases. Confirm that
                # nr_link always carries the same id.
                url_id = seed["bsr_link"].split("/")[-1]
                for page in range(1, 3):
                    bsr_url = f"https://www.amazon.com/Best-Sellers-Home-Kitchen-Decorative-Hanging-Ornaments/zgbs/home-garden/{url_id}/ref=zg_bs_pg_{page}?_encoding=UTF8&pg={page}"
                    yield self._build_request(
                        bsr_url,
                        self.parse,
                        {"bsr_link": seed["bsr_link"], "amazon_proxy": True},
                    )
                    nr_url = f"https://www.amazon.com/gp/new-releases/home-garden/{url_id}/ref=zg_bsnr_pg_{page}?ie=UTF8&pg={page}"
                    yield self._build_request(
                        nr_url,
                        self.parse,
                        {"nr_link": seed["nr_link"], "amazon_proxy": True},
                    )

    def parse(self, response, **kwargs):
        """Extract product ids from a listing page and request each product.

        The ids are read from the JSON stored in the ``data-client-recs-list``
        attribute of the ``p13n-desktop-grid`` div. Pages without that
        attribute, or with undecodable JSON, are logged and skipped instead of
        raising.
        """
        raw = response.xpath(".//div[@class='p13n-desktop-grid']/@data-client-recs-list").get()
        if not raw:
            # .get() returns None when nothing matches; json.loads(None)
            # would raise TypeError.
            self.logger.warning("No product grid found on %s", response.url)
            return
        try:
            products = json.loads(raw)
        except ValueError:
            self.logger.warning("Undecodable product grid JSON on %s", response.url)
            return
        print("商品数量为", len(products), response.url)
        for product in products:
            asin = product.get("id")
            url = f"https://www.amazon.com/gp/product/{asin}?th=1"
            # Work on a copy so the response's own meta is not mutated, and
            # drop the listing request's retry counter so that each product
            # request starts with its full retry budget.
            meta = dict(response.meta)
            meta.pop("retry_times", None)
            meta["asin"] = asin
            meta["data_url"] = url
            yield self._build_request(url, self.parse_data, meta)

    def parse_data(self, response):
        """Classify a product page and print the resulting record.

        ``has_video`` is 1 when a video thumbnail item is present, else 2.
        ``a_plus`` is "1" when the only A+ heading is "Product Description",
        "3" when both the hero-video and carousel premium modules are
        present, and "2" otherwise.

        NOTE(review): the record is only printed. It is neither yielded to
        the item pipelines nor written to the database, even though the
        message says it was stored. Earlier, now removed, commented-out code
        appended it to ``category_asin_special_data_asin_syn_copy1``.
        """
        asin = response.meta.get("asin")
        headings = [text.strip() for text in response.xpath(".//div[@id='aplus']/h2//text()").getall()]
        hero_video = response.xpath("//div[@class='celwidget aplus-module premium-module-8-hero-video aplus-premium']")
        # carousel module
        carousel = response.xpath("//div[@class='celwidget aplus-module premium-module-13-carousel aplus-premium']")
        has_video = 1 if response.xpath("//li[contains(@class, 'a-spacing-small item video')]") else 2
        if headings == ["Product Description"]:
            category = 1
        elif hero_video and carousel:
            category = 3
        else:
            category = 2
        item = {
            "asin": asin,
            "bsr_link": response.meta.get("bsr_link"),
            "nr_link": response.meta.get("nr_link"),
            "has_video": has_video,
            "a_plus": str(category)
        }
        print(response.url)
        print("入库成功", list(item.values()))


if __name__ == '__main__':
    # Running this file directly is the same as invoking
    # `scrapy crawl amazon_get_asin` from the command line.
    cmdline.execute(['scrapy', 'crawl', 'amazon_get_asin'])


# cd /mnt/hezhe/amazon_spider/amazon_spider && nohup scrapy crawl day_amazon  > amazon.log 2>&1 &
# nohup scrapy crawl day_amazon  > amazon1.log 2>&1 &

# for i in `ps -ef|grep "scrapy crawl day_amazon" |awk '{print $2}' `; do kill -9 $i ; done;

# 0 21 * * *  cd /mnt/hezhe/amazon_spider/amazon_spider && /opt/module/anaconda3/envs/pyspark/bin/scrapy crawl amazon_get_asin  > amazon_cookies1.log 2>&1 &


