import copy
import hashlib
import json
import logging
import os
import re
import time

import pandas as pd
import redis as rd
from func_timeout import func_set_timeout
from func_timeout.exceptions import FunctionTimedOut
from playwright.sync_api import Browser, BrowserContext, Page, sync_playwright, TimeoutError, Error
from scrapy import Selector


def redis_settings_from_env(environ=None):
    """Build the Redis connection settings, letting the environment override them.

    :param environ: mapping to read overrides from; defaults to ``os.environ``.
    :return: dict with the keys ``host``, ``port``, ``password`` and ``db``.
    """
    env = os.environ if environ is None else environ
    # NOTE(review): the literal defaults are kept only for backward
    # compatibility; the password should be rotated and supplied through
    # REDIS_PASSWORD instead of living in source control.
    return {
        'host': env.get('REDIS_HOST', '120.79.147.190'),
        'port': int(env.get('REDIS_PORT', 6379)),
        'password': env.get('REDIS_PASSWORD', 'fG7#vT6kQ1pX'),
        'db': int(env.get('REDIS_DB', 10)),
    }


REDIS = redis_settings_from_env()


def singleton(cls, *args, **kw):
    """Turn *cls* into a lazily created, shared instance.

    :param cls: class to instantiate at most once.
    :param args: positional arguments forwarded to ``cls`` on first use.
    :param kw: keyword arguments forwarded to ``cls`` on first use.
    :return: zero-argument callable that always returns the same instance.
    """

    created = {}

    def _get_instance():
        # Build the instance on the first call only; later calls reuse it.
        try:
            return created[cls]
        except KeyError:
            created[cls] = cls(*args, **kw)
            return created[cls]

    return _get_instance


def md5(src: str, algorithm: str = "md5", digits: int = 32) -> str:
    """Return the hex digest of *src* (UTF-8 encoded).

    :param src: original string.
    :param algorithm: any algorithm name accepted by ``hashlib.new``.
    :param digits: 16 returns characters 8..23 of the digest; any other
        value returns the full digest.
    :return: hexadecimal digest string.
    """

    hex_digest = hashlib.new(algorithm, src.encode('utf8')).hexdigest()
    return hex_digest[8:24] if digits == 16 else hex_digest


@singleton
class Redis(object):
    """Process-wide factory for Redis clients configured from ``REDIS``."""

    def __init__(self):
        self.host = REDIS['host']
        self.port = REDIS['port']
        self.db = REDIS['db']
        self.password = REDIS['password']
        # Most recently used pool (attribute kept for backward compatibility).
        self.pool = None
        # One connection pool per database index, created on first use.
        self._pools = {}

    def get_instance(self, db=10):
        """Return a client bound to database *db*.

        The pool for each database is created once and reused, instead of
        building a brand-new ``ConnectionPool`` (and new TCP connections) on
        every call.

        :param db: Redis database index.
        :return: ``redis.Redis`` client backed by the shared pool for *db*.
        """
        pool = self._pools.get(db)
        if pool is None:
            pool = rd.ConnectionPool(
                host=self.host,
                port=self.port,
                db=db,
                password=self.password,
                max_connections=3,
                socket_timeout=5,
                socket_connect_timeout=5,
                retry_on_timeout=True,
            )
            self._pools[db] = pool
        self.pool = pool
        return rd.Redis(connection_pool=pool)


@func_set_timeout(30)
def xadd_db0(key, data):
    """Append *data* as one entry to the stream *key* in database 0.

    :param key: stream key.
    :param data: mapping of field names to values for the new entry.
    :return: True when Redis returned an entry id, False otherwise.
    """
    r = Redis().get_instance(0)
    try:
        # XADD returns the id of the new entry, never the integer 1, so the
        # previous ``added == 1`` comparison was always False.
        entry_id = r.xadd(key, data)
    finally:
        r.close()
    return bool(entry_id)


@func_set_timeout(30)
def xadd_db10(key, data):
    """Append *data* as one entry to the stream *key* in database 10.

    :param key: stream key.
    :param data: mapping of field names to values for the new entry.
    :return: True when Redis returned an entry id, False otherwise.
    """
    r = Redis().get_instance(10)
    try:
        # XADD returns the id of the new entry, never the integer 1, so the
        # previous ``added == 1`` comparison was always False.
        entry_id = r.xadd(key, data)
    finally:
        r.close()
    return bool(entry_id)


def lpop(key) -> "bytes | None":
    """Pop and return the head element of the list *key*.

    :param key: list key.
    :return: the popped value, or None when the list is empty or missing.
    """

    r = Redis().get_instance()
    try:
        return r.lpop(key)
    finally:
        # Close the client like the sibling helpers do.
        r.close()


@func_set_timeout(10)
def lpush(key, value):
    """Push *value* onto the head of the list *key*.

    :param key: list key.
    :param value: value to push.
    :return: True for done, False for not.
    """
    r = Redis().get_instance()
    try:
        # LPUSH returns the list length after the push, so comparing with 1
        # only reported success when the list had been empty.
        new_length = r.lpush(key, value)
    finally:
        r.close()
    return bool(new_length)


class AmazonCommentSpider(object):
    """Playwright-driven spider that collects Amazon product reviews.

    Tasks are popped from a Redis list, crawled per ASIN, and the results are
    written to a Redis stream.
    """
    # Redis list the crawl tasks are popped from (and re-queued to on failure).
    seed_key = 'AsinCommentSpiderList'
    # Redis stream the crawl results are appended to.
    save_key = 'AsinCommentResultStream'

    def __init__(self):
        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
                            level=logging.INFO)
        self.browser: Browser = None
        self.context: BrowserContext = None
        self.page: Page = None
        self.padding_error = 1
        self.seeds = [
            # {"siteName":"us","taskId":"commentSpider_zhoushuyi1@yswg.com.cn_1744079640","asin":[{"save_count":21,"last_save_time":1714462383323,"asin":"B09BKMM38W","last_comment_time":"2024-04-10"},{"asin":"B0DH2C4YF9"},{"save_count":29,"last_save_time":1714462383323,"asin":"B09BKP2Y2X","last_comment_time":"2024-03-15"},{"asin":"B0DH2C6C83"},{"save_count":24,"last_save_time":1714462383323,"asin":"B09BKMRKLW","last_comment_time":"2024-03-25"},{"save_count":26,"last_save_time":1714462383323,"asin":"B09BKN8XPB","last_comment_time":"2024-04-22"}]},
            {"siteName":"us","taskId":"commentSpider_wanghe@yswg.com.cn_1744010345","asin":[{"asin":"B0DR8VH3GF"}]}
        ]
        self.data = None
        self.site = "us"
        #

    def get_seed(self):
        """Pop the next task from the seed list, retrying until Redis answers.

        :return: whatever ``lpop`` returned for ``seed_key``.
        """
        backoff_errors = (rd.exceptions.ConnectionError, rd.exceptions.TimeoutError)
        while True:
            try:
                task = lpop(self.seed_key)
            except FunctionTimedOut as exc:
                # Timeouts are retried immediately.
                logging.info(f"获取任务成功超时 {exc}")
            except backoff_errors as exc:
                # Connection problems are retried after a short pause.
                logging.info(f"获取任务成功失败{exc}")
                time.sleep(5)
            else:
                logging.info("获取任务成功")
                return task

    def if_response(self, r, spider_page, page):
        """Decide whether an intercepted review payload looks blocked.

        Some abnormal pages advertise a review count but carry no review
        items ("Sorry, no reviews match your current selections."), and some
        pages legitimately show one review more than can be fetched
        (reference asin B0BV5W3V6H).

        :param r: HTML text of the intercepted review response.
        :param spider_page: ``(page_number, star)`` pair parsed from the URL.
        :param page: exclusive upper page bound from ``get_page_num``.
        :return: True when the page looks abnormal and should be retried,
            False when it can be parsed as-is.
        """
        sel = Selector(text=r, type="html")
        summary = sel.xpath('.//div[@class="a-row a-spacing-base a-size-base"]//text()').get()
        if summary:
            # repr() exposes non-printable characters as \uXXXX escapes so
            # they can be dropped together with separators and quotes.
            summary = re.sub(r"\\u.{4}", '', repr(summary)).replace(",", "").replace(".", "").replace("'", "")
        else:
            summary = ''
        counts = re.findall(r'(\d+)', summary)

        if not counts:
            logging.info("未解析到评论数 页面正常")
            return False
        logging.info(f"页面判断 评论数： {counts[0]}")

        comment_num = counts[0]
        comment_data_len = len(sel.xpath("//li[@data-hook='review']"))
        if comment_num == "0":
            logging.info("评论数为0 页面正常")
            return False
        if comment_data_len:
            logging.info("评论数不为0，解析到评论 页面正常")
            return False

        # From here on: a non-zero count was shown but no review was parsed.
        current_page = int(spider_page[0])
        # The second number is only present on some pages; guard the index
        # instead of raising IndexError when it is missing.
        if current_page == 1 and len(counts) > 1 and int(counts[1]) == 1:
            logging.info(f"第一页 显示有数据 未解析到评论 参考asin B0C89ZFFSF")
            return False
        if current_page == (int(page) - 1) and int(comment_num) % 10 == 1:
            logging.info(f"最后一页  显示有数据 未解析到评论  参考asin B0BV5W3V6H")
            return False
        logging.info(f"评论数不为0，未解析到评论 页面异常")
        return True

    def print_request_finished(self, request):
        """Capture the body of a finished review-list AJAX request.

        Requests to other URLs, or without a response, leave ``self.data``
        untouched. The body is unescaped and stored in ``self.data``; when it
        cannot be read, ``self.data`` is reset to None.

        :param request: request object passed to the "requestfinished" handler.
        """
        if "https://www.amazon.com/hz/reviews-render/ajax/reviews/get/ref=" not in request.url:
            return
        response = request.response()
        if not response:
            return
        try:
            self.data = (
                response.text()
                .replace('\\n', '')
                .replace('data-hook=\\"review\\"', 'data-hook="review"')
                .replace('\\"', '"')
            )
        except Exception as e:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still propagate, and record why the body was dropped.
            logging.info(f"failed to read review response body: {e}")
            self.data = None

    def get_comment_count(self):
        """Read the numbers shown in the review/rating count banner.

        :return: list of digit strings found in the banner text, or ``['0']``
            when the text is empty or contains no digits.
        """
        banner = self.page.query_selector("div[data-hook='cr-filter-info-review-rating-count']")
        raw_text = banner.text_content()
        if not raw_text:
            return ['0']
        # repr() exposes non-printable characters as \uXXXX escapes, which are
        # removed before the separators and quotes are stripped.
        cleaned = re.sub(r"\\u.{4}", '', repr(raw_text))
        for unwanted in (",", ".", "'"):
            cleaned = cleaned.replace(unwanted, "")
        return re.findall(r'(\d+)', cleaned) or ['0']

    def get_page_num(self, num, size):
        """Return the exclusive upper page bound for *num* reviews.

        :param num: number of reviews shown for the current filter.
        :param size: unused.
        :return: 2 for up to ten reviews, otherwise the number of ten-item
            pages plus one; any result of 10 or more becomes 11.
        """
        if num <= 10:
            return 2
        extra = 2 if num % 10 else 1
        bound = int(num / 10) + extra
        return 11 if bound >= 10 else bound

    def save_redis(self, key, data):
        """Write *data* to the stream *key*, retrying until it succeeds.

        :param key: stream key passed to ``xadd_db0``.
        :param data: record passed to ``xadd_db0``.
        """
        backoff_errors = (rd.exceptions.ConnectionError, rd.exceptions.TimeoutError)
        saved = False
        while not saved:
            try:
                xadd_db0(key, data)
            except FunctionTimedOut as exc:
                # Timeouts are retried immediately.
                logging.info(f"save 超时 {exc}")
            except backoff_errors as exc:
                # Connection problems are retried after a short pause.
                logging.info(f"save 失败{exc}")
                time.sleep(5)
            else:
                logging.info("save 成功")
                saved = True

    def _api_payload(self, seed, asin, header, values=(), search="", include_len=False):
        """Assemble one result record in the shape written to the result stream."""
        values = list(values)
        payload = {
            "taskId": seed.get('taskId'),
            "asin": asin,
            "site": seed.get('siteName'),
            "search": search,
            "header": json.dumps(header),
        }
        if include_len:
            payload["values_len"] = len(values)
        payload["values"] = json.dumps(values)
        return payload

    def _save_and_close(self, api_data):
        """Persist a terminal record, close the page and context, return False."""
        self.save_redis(self.save_key, api_data)
        self.page.close()
        self.context.close()
        return False

    def _page_and_star(self):
        """Return the ``(page_number, star)`` pairs found in the current page URL."""
        return re.findall(r"&pageNumber=(\d+)&filterByStar=(.*?)_star", self.page.url)

    def crawl(self, seed, asin):
        """Crawl the reviews of one ASIN, star filter by star filter.

        For every star filter the intercepted review payloads (see
        ``print_request_finished``) are parsed page by page and one record per
        star is written to the result stream.

        :param seed: task dict; ``taskId`` and ``siteName`` are copied into
            every record.
        :param asin: ASIN whose review pages are visited.
        :return: False when the ASIN (or its current variant) has no reviews
            and a placeholder record was saved; None after a full crawl.
        """
        url = f"https://www.amazon.com/product-reviews/{asin}/"
        self.page.goto(url)
        self.page.wait_for_timeout(4000)
        self.page.on("requestfinished", self.print_request_finished)
        if not self.page.query_selector("div[data-hook='cr-filter-info-review-rating-count']"):
            api_data = self._api_payload(seed, asin, {"totalReviews": "0", "totalRatings": "0"})
            logging.info(f"该asin下没有评论数据 {api_data}")
            return self._save_and_close(api_data)

        comment_total_count = self.get_comment_count()
        total_header = {"totalReviews": comment_total_count[0], "totalRatings": ''}
        if int(comment_total_count[-1]) == 0:
            api_data = self._api_payload(seed, asin, total_header)
            logging.info(f"该asin下没有评论数据 {api_data}")
            return self._save_and_close(api_data)

        # Open the dropdown and, when exactly two options are listed, pick the
        # last one.
        self.page.query_selector("span[id='a-autoid-3-announce']").click()
        self.page.wait_for_timeout(4000)
        format_options = self.page.query_selector_all("ul[class='a-nostyle a-list-link'] > li")
        if len(format_options) == 2:
            format_options[-1].click()
        self.page.wait_for_timeout(8000)
        # Drop whatever payload the filter change produced.
        self.data = None

        comment_total_v_count = self.get_comment_count()
        if int(comment_total_v_count[-1]) == 0:
            api_data = self._api_payload(seed, asin, total_header)
            logging.info(f"单变体下 没有评论数据{api_data}")
            return self._save_and_close(api_data)

        star_header = {
            "reviews": comment_total_v_count[0],
            "ratings": '',
            "totalReviews": comment_total_count[0],
            "totalRatings": '',
        }
        comment_datas = []
        # Dropdown entries still to crawl; a star is pushed back when one of
        # its pages looks blocked so that it is crawled again.
        star_list = [5, 4, 3, 2, 1]

        while star_list:
            star_num = star_list.pop()
            self.page.query_selector("span[id='a-autoid-5-announce']").click()
            self.page.query_selector(f"a[id='star-count-dropdown_{star_num}']").click()
            self.page.wait_for_timeout(8000)
            comment_one_count = self.get_comment_count()
            if int(comment_one_count[-1]) == 0:
                logging.info("该星级 没有评论数据")
                api_data = self._api_payload(seed, asin, total_header)
                logging.info(f"单变体下 没有评论数据{api_data}")
                self.save_redis(self.save_key, api_data)
                continue

            page = self.get_page_num(int(comment_one_count[-1]), 10)
            star_comment = []
            # Star name parsed from the URL during this iteration, if any.
            star_filter = None
            if self.data:
                page_star = self._page_and_star()
                star_filter = page_star[0][1]
                logging.info(f"页数: {page_star[0][0]} 星级： {page_star[0][1]}")
                if self.if_response(self.data, page_star[0], page):
                    star_list.append(star_num)
                    continue
                items = self.xpath_res(self.data, seed, page_star[0][0], asin)
                comment_datas += items
                star_comment += items
                self.data = None
            for p in range(2, page):
                self.page.query_selector("text='Next page'").click()
                self.page.wait_for_timeout(3000)
                if self.data:
                    page_star = self._page_and_star()
                    star_filter = page_star[0][1]
                    logging.info(f"星级: {page_star[0][1]}, 爬取页数：{p}")
                    if self.if_response(self.data, page_star[0], page):
                        # NOTE(review): the star is re-queued once per blocked
                        # page and paging continues; confirm this is intended.
                        star_list.append(star_num)
                        continue
                    items = self.xpath_res(self.data, seed, page_star[0][0], asin)
                    comment_datas += items
                    star_comment += items
                    self.data = None
                    self.page.wait_for_timeout(3000)
            logging.info(f"星级页面显示评论数: {comment_one_count[-1]}, 实际爬取评论数：{len(star_comment)}")

            # Build the filter description for THIS star. Previously the
            # empty branch reused a variable left over from an earlier
            # iteration, or failed when none had been assigned yet.
            if star_filter is None:
                page_star = self._page_and_star()
                star_filter = page_star[0][1] if page_star else ""
            request_data = {
                'sortBy': 'recent',
                'formatType': 'current_format',
                "filterByStar": star_filter
            }
            if star_comment:
                df = pd.DataFrame(star_comment)
                df.drop_duplicates(['asin', 'comment_id'], inplace=True)
                df['agree_num'] = df['agree_num'].fillna("0")
                df['img_num'] = df['img_num'].fillna("0")
                api_item = df.to_dict(orient='records')
                api_data = self._api_payload(
                    seed, asin, star_header,
                    values=api_item,
                    search=json.dumps(request_data),
                    include_len=True,
                )
            else:
                api_data = self._api_payload(
                    seed, asin, star_header,
                    search=json.dumps(request_data),
                )
            logging.info(f"星级数据为：{api_data}")
            self.save_redis(self.save_key, api_data)
            logging.info(f"页面显示评论数为: {comment_total_v_count[-1]}, 实际爬取评论总数为 {len(comment_datas)}")

        logging.info("爬取完成")
        logging.info(f"x-main {[i for i in self.context.cookies() if i['name'] == 'x-main']}")
        # Persist the session cookies for the next browser context.
        self.context.storage_state(path='./login_cookie.json')
        self.page.close()
        self.context.close()

    def change_user(self):
        """Open a fresh browser context and page prepared for crawling.

        The context is restored from ``./login_cookie.json`` and the page gets
        the ``stealth.min.js`` init script plus two in-page overrides.
        """
        context_options = dict(
            locale='en-GB',
            bypass_csp=True,
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            ignore_https_errors=True,
            no_viewport=True,
            storage_state="./login_cookie.json",
        )
        self.context = self.browser.new_context(**context_options)
        self.page = self.context.new_page()

        # Load the stealth script so it runs before every page's own scripts.
        with open('stealth.min.js', 'r') as script_file:
            stealth_js = script_file.read()
        self.page.add_init_script(stealth_js)

        # Mimic properties a regular Chrome exposes.
        fake_chrome_js = '''() =>{ window.chrome = { runtime: {}, }; }'''
        fake_plugins_js = '''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }'''
        for snippet in (fake_chrome_js, fake_plugins_js):
            self.page.evaluate_handle(snippet)

        self.page.locator("body").click()
        self.page.bring_to_front()

    def amazon_login(self):
        """Sign in through the Amazon login form.

        The account is taken from the ``AMAZON_LOGIN_EMAIL`` and
        ``AMAZON_LOGIN_PASSWORD`` environment variables; the previous
        built-in values remain as defaults.
        """
        # NOTE(review): the literal defaults are kept only for backward
        # compatibility; credentials should not live in source control.
        email = os.environ.get("AMAZON_LOGIN_EMAIL", "865582388@qq.com")
        password = os.environ.get("AMAZON_LOGIN_PASSWORD", "wjc_123456789!")

        url = "https://www.amazon.com/product-reviews/B00MQIQXBE/"
        self.page.goto(url)
        self.page.wait_for_timeout(1000)
        self.page.query_selector("input[id='ap_email']").fill(email)
        self.page.query_selector("input[id='continue']").click()
        self.page.wait_for_timeout(3000)
        self.page.query_selector("input[id='ap_password']").fill(password)
        self.page.query_selector("input[id='signInSubmit']").click()
        self.page.wait_for_timeout(3000)
        logging.info(f"url: {self.page.url}")
        if "ap/cvf/request" in self.page.url:
            # A verification challenge is shown; it is only reported here.
            logging.info("需要处理验证码")

    def amazon_login_if(self):
        """Check whether the session is signed in, re-entering the password if asked.

        The password comes from the ``AMAZON_LOGIN_PASSWORD`` environment
        variable; the previous built-in value remains as the default.

        :return: True when a password prompt was answered or the account
            greeting shows a signed-in user, False when the greeting is
            "Hello, sign in".
        """
        # NOTE(review): the literal default is kept only for backward
        # compatibility; credentials should not live in source control.
        password = os.environ.get("AMAZON_LOGIN_PASSWORD", "wjc_123456789!")

        url = "https://www.amazon.com/product-reviews/B00MQIQXBE/"
        self.page.goto(url)
        self.page.wait_for_timeout(1000)
        # Query the password field once and reuse the handle.
        password_input = self.page.query_selector("input[type='password']")
        if password_input:
            password_input.fill(password)
            self.page.query_selector("input[id='signInSubmit']").click()
            self.page.wait_for_timeout(1000)
            return True
        greeting = self.page.query_selector("span[id='nav-link-accountList-nav-line-1']").text_content()
        if greeting == "Hello, sign in":
            logging.info("未登录")
            return False
        logging.info("已经登录")
        return True

    def time_ch(self, site, time_msg):
        """Convert a site-specific review date into ``YYYY-MM-DD``.

        :param site: marketplace code (``"us"``, ``"de"``, ``"uk"``, ...)
            selecting how day, month and year are located in *time_msg*.
        :param time_msg: date text, e.g. ``"December 31, 2022"``.
        :return: formatted date, or ``""`` when the month name is unknown or
            the text is too short to contain the expected parts. A few
            site-specific shortcuts return the text (nearly) unchanged.
        """
        months = {
            # tr
            "Ocak": "01", "Şubat": "02", "Mart": "03", "Nisan": "04",
            "Mayıs": "05", "Haziran": "06", "Temmuz": "07", "Ağustos": "08",
            "Eylül": "09", "Ekim": "10", "Kasım": "11", "Aralık": "12",
            # nl
            "januari": "01", "februari": "02", "maart": "03", "april": "04",
            "mei": "05", "juni": "06", "juli": "07", "augustus": "08",
            "oktober": "10", "november": "11", "december": "12",
            # us / uk (full names)
            "January": "01", "February": "02", "March": "03", "April": "04",
            "May": "05", "June": "06", "July": "07", "August": "08",
            "September": "09", "October": "10", "November": "11", "December": "12",
            # us / uk (abbreviations); Apr, Jul and Sep were missing before
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "Jun": "06",
            "Jul": "07", "Aug": "08", "Sep": "09", "Sept": "09", "Oct": "10",
            "Nov": "11", "Dec": "12",
            # de
            "Januar": "01", "Februar": "02", "März": "03", "Mai": "05",
            "Juni": "06", "Juli": "07", "september": "09", "Oktober": "10",
            "Dezember": "12",
            # es
            "enero": "01", "febrero": "02", "marzo": "03", "abril": "04",
            "mayo": "05", "junio": "06", "julio": "07", "agosto": "08",
            "septiembre": "09", "octubre": "10", "noviembre": "11", "diciembre": "12",
            # fr
            "janvier": "01", "février": "02", "mars": "03", "avril": "04",
            "mai": "05", "juin": "06", "juillet": "07", "août": "08",
            "septembre": "09", "octobre": "10", "novembre": "11", "décembre": "12",
            # it
            "gennaio": "01", "febbraio": "02", "aprile": "04", "maggio": "05",
            "giugno": "06", "luglio": "07", "settembre": "09", "ottobre": "10",
            "dicembre": "12",
        }

        # Dates that are already ISO formatted (comment_time produces them
        # for Chinese date text) are returned unchanged instead of "".
        iso_candidate = time_msg.strip()
        if re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", iso_candidate):
            return iso_candidate
        # Numeric day/month/year with any single separator character.
        if t := re.match("([0-9]{2}).([0-9]{2}).([0-9]{4})", time_msg):
            return f"{t.group(3)}-{t.group(2)}-{t.group(1)}"

        tokens = time_msg.split(" ")
        try:
            if site == "de":
                if "/" in time_msg:
                    return time_msg.strip().replace("/", "-")
                month = months.get(tokens[1].strip())
                day = time_msg.split(".")[0].replace("\xa0", "").strip()
                year = tokens[-1]
            elif site == "uk":
                month = months.get(tokens[1].strip().replace(".", ""))
                day = tokens[0].strip()
                year = tokens[2]
            elif site == "fr":
                month = months.get(tokens[1].strip())
                day = tokens[0].strip()
                year = tokens[2]
            elif site in ("es", "it"):
                if site == "it" and len(time_msg) == 4:
                    return time_msg
                month = months.get(tokens[1].strip())
                day = tokens[0].strip()
                year = tokens[-1]
            elif site == "ca":
                month = months.get(tokens[0].replace(".", "").strip())
                # Drop a trailing comma ("31," in "December 31, 2022") so it
                # does not end up inside the formatted date.
                day = tokens[1].strip().strip(",")
                year = tokens[-1]
            elif site == "mx":
                month = months.get(tokens[1].replace(".", "").strip())
                day = tokens[0].strip()
                year = tokens[-1]
            elif site in ["nl", "au", "be", "tr", "ae"]:
                month = months.get(tokens[1])
                day = time_msg.split(",")[0].split(" ")[0].replace("\xa0", "")
                year = tokens[-1].strip()
            else:
                month = months.get(tokens[0])
                day = time_msg.split(",")[0].split(" ")[-1].replace("\xa0", "")
                year = time_msg.split(",")[-1].strip()
        except IndexError:
            # The text has fewer space-separated parts than this site needs.
            return ""

        if not month:
            return ""
        day = day if len(day) == 2 else "0" + day
        return "{}-{}-{}".format(year, month, day)

    def comment_time(self, time_msg, site):
        """Strip the "Reviewed in <country> on ..." wrapper from a review date.

        :param time_msg: full review-date text from the page.
        :param site: marketplace code selecting the wording to strip.
        :return: the bare date text; Chinese ``YYYY年M月D日`` dates are
            returned as ``YYYY-MM-DD``.
        """
        if site == "de":
            time_msg = (time_msg.split(".")[0][-2:] + "." + time_msg.split(".")[1]).strip()
        elif site == "es":
            time_msg = time_msg.split("el ")[-1].strip()
        elif site == "fr":
            # Take the last piece so a missing marker no longer raises.
            time_msg = time_msg.split("le ")[-1].strip()
        elif site == "it":
            time_msg = time_msg.split("il ")[-1].strip()
        else:
            if "年" in time_msg:
                from datetime import datetime
                date_object = datetime.strptime(time_msg.split(" ")[0], "%Y年%m月%d日")
                time_msg = date_object.strftime("%Y-%m-%d")
            else:
                # Split on the standalone word " on " from the right: a plain
                # split("on") cut inside country names such as "Hong Kong".
                time_msg = time_msg.rsplit(" on ", 1)[-1].strip()
        return time_msg

    def xpath_res(self, r, seed, page, asin):
        """Parse the review items out of one intercepted review payload.

        :param r: HTML text of the review response.
        :param seed: task dict (unused here).
        :param page: page number the payload belongs to.
        :param asin: ASIN being crawled; used when a review names no variant.
        :return: list with one dict per ``li[data-hook='review']`` element.
            Falsy field values are stored as None.
        """
        sel = Selector(text=r, type="html")
        items = []
        for node in sel.xpath("//li[@data-hook='review']"):
            title = (node.xpath(".//h5/a/span//text()").get("").strip()
                     or node.xpath(".//h5//span[@data-hook='review-title']/span/text()").get("").strip())
            # Reviewer name and avatar.
            user_name = node.xpath(".//span[@class='a-profile-name']//text()").get('').strip()
            user_img = node.xpath(".//div[@class='a-profile-avatar']/img/@data-src").get('')
            content = node.xpath(".//span[@data-hook='review-body']/span/text()").getall()

            # Helpful votes: the leading token is a number or the word "One"
            # (second token on the "es" site).
            helpful = node.xpath(".//span[@data-hook='helpful-vote-statement']//text()").get()
            review = 0
            if helpful:
                token_index = 1 if self.site == "es" else 0
                try:
                    token = helpful.split(" ")[token_index].strip()
                    review = 1 if token == "One" else int(token)
                except (IndexError, ValueError):
                    # Unparseable vote text counts as zero votes.
                    review = 0

            # Verified purchase flag (1 yes, 2 no).
            size_mini = node.xpath(".//span[@class='a-size-mini a-color-state a-text-bold']//text()").get()
            mini = 1 if size_mini else 2

            # Review date, e.g. "Reviewed in Brazil 🇧🇷 on December 31, 2022"
            # or "Rezension aus Deutschland vom 13. Februar 2023".
            time_msg = node.xpath(".//span[@data-hook='review-date']//text()").get('')
            time_msg = self.comment_time(time_msg, self.site)

            # Images and videos attached to the review.
            comment_img = node.xpath(".//div[@class='a-section a-spacing-top-mini cr-lightbox-image-thumbnails']/img/@src").getall()
            video_url = ';'.join(node.xpath('.//input[contains(@class,"video-url")]/@value').getall())
            # Link to the reviewer's profile page.
            user_page = node.xpath(".//div[@data-hook='genome-widget']/a/@href").get('')

            # Star rating: "5.0 out of 5 stars" style first, comma-decimal
            # style as fallback. A missing rating no longer aborts the page.
            rat = node.xpath(".//span[@class='a-icon-alt']//text()").get()
            if rat:
                rating = rat.split(".")[0].replace("stars", "").strip()
                if len(rating) != 1:
                    rating = rat.split(",")[0].replace("stars", "").strip()
            else:
                rating = ""

            comment_url = node.xpath(".//*[@data-hook='review-title']/@href").get("")

            # ASIN of the reviewed variant, taken from the format-strip link.
            variat_href = node.xpath(".//a[@data-hook='format-strip']/@href").get()
            variat_matches = re.findall(r"product-reviews/(.*?)(?:\/|\?)", variat_href) if variat_href else []
            variat_asin = variat_matches[0] if variat_matches else ""

            format_texts = node.xpath(".//a[@data-hook='format-strip']//text()").getall()
            vine = node.xpath(".//span[@class=\"a-color-success a-text-bold\"]//text()").get('')
            item = {
                "asin": variat_asin or asin,
                "title": title,
                "content": "".join(content).strip(),
                # verified purchase
                "is_vp": str(mini),
                # variant description such as color, size, style
                "model": "|-|".join(format_texts).strip().replace("What's this?", ""),
                # star rating
                "rating": rating,
                # helpful votes
                "agree_num": review,
                # number of review images
                "img_num": len(comment_img),
                # image URLs
                "img_url": ",".join(comment_img),
                # has video (1 yes, 2 no)
                "is_video": str(1 if video_url else 2),
                # video URLs
                "video_url": video_url,
                # review link
                "comment_url": comment_url,
                # reviewer name
                "user_name": user_name,
                # reviewer avatar URL
                "user_img": user_img,
                # marketplace
                "country": self.site,
                # reviewer profile URL
                "user_page": user_page,
                # influencer-program flag; not populated here
                "is_earns_commissions": "",
                # review date as shown and normalised
                "comment_time": time_msg,
                "comment_time_format": self.time_ch(self.site, time_msg),
                "page": int(page),
                'vine_review_flag': '1' if vine else '2',
            }
            # Falsy values (empty strings, zero counts) are stored as None.
            item = {k: v or None for k, v in item.items()}
            item["comment_id"] = node.xpath("./@id").get()
            item["page_state"] = 1 if item["comment_url"] else 2
            items.append(item)
        return items

    def _close_session(self):
        """Close the current page and context, tolerating missing or dead handles."""
        for handle in (self.page, self.context):
            if handle is None:
                continue
            try:
                handle.close()
            except Error as e:
                logging.info(f"close failed {e}")

    def run(self):
        """Consume tasks forever: pop a seed, crawl each of its ASINs.

        An ASIN whose crawl raises is pushed back to the seed list as a
        single-ASIN task.
        """
        while True:
            seeds = self.get_seed()
            if not seeds:
                logging.info("time sleep 3")
                time.sleep(3)
                continue
            seed = json.loads(seeds)
            if not seed:
                time.sleep(30)
                logging.info('no task sleep 30s')
                continue
            # A seed without an "asin" list is skipped instead of crashing.
            for asin in seed.get('asin') or []:
                try:
                    # Create the browser context and page.
                    self.change_user()
                    if not self.amazon_login_if():
                        # Sign in and persist the fresh cookies.
                        self.amazon_login()
                        self.page.wait_for_timeout(1000)
                        self.context.storage_state(path='./login_cookie.json')
                    self.crawl(seed, asin.get('asin'))
                    time.sleep(3)
                except Exception as e:
                    # Playwright's Error is a subclass of Exception, so one
                    # handler covers both former branches.
                    logging.info(f"playwright error {e}")
                    self._close_session()
                    error_seed = copy.deepcopy(seed)
                    error_seed['asin'] = [asin]
                    try:
                        lpush(self.seed_key, json.dumps(error_seed))
                    except (FunctionTimedOut, rd.exceptions.RedisError) as push_error:
                        # Keep the spider alive when the requeue itself fails.
                        logging.info(f"requeue failed {push_error}")


    def main(self):
        """Launch the browser and run the task loop."""
        headless = False
        logging.info(f"{headless}")
        with sync_playwright() as _playwright:
            self.browser = _playwright.chromium.launch(
                # Use the logged flag so the log line and the launch agree.
                headless=headless,
                executable_path=r"C:\Program Files (x86)\ChatAI Chrome\ChatAI_Chrome.exe",
            )
            self.run()


if __name__ == '__main__':
    # Script entry point: build the spider and start its endless task loop.
    AmazonCommentSpider().main()

