import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.db_connect import BaseUtils
import pandas as pd
from utils.asin_parse import ParseAsinUs
import json
import ast
import gzip
from queue import Queue
import time
from collections import Counter
from utils.parse_search_term_xpath import ParseSearchTermUs
from threading_spider.db_connectivity import connect_db
from datetime import datetime
from dateutil.relativedelta import relativedelta
import traceback


class check_amazon_html():
    def __init__(self, site_name=None):
        """Set up connections, the target period and the result accumulators.

        Args:
            site_name: Marketplace code (the driver at the bottom of this file
                passes 'us', 'de' and 'uk'). It is used to filter the HTML
                tables and is forwarded to the page parsers.

        The period (``self.date_info``, formatted ``YYYY_MM``) selects the
        monthly ``asin_html_*`` / ``search_term_html_*`` tables. After the 12th
        of a month the current month is used; on or before the 12th the
        previous month is used instead.
        """
        self.engine_doris = BaseUtils().doris_connect()
        self.site_name = site_name

        # Take a single timestamp so day, month and year always describe the
        # same instant (no mismatch if the clock crosses midnight mid-way).
        reference_date = datetime.now()
        if reference_date.day <= 12:
            # Fall back to the previous month. relativedelta also moves the
            # year back when the current month is January, so year and month
            # stay consistent (e.g. 5 Jan 2026 -> 2025_12).
            reference_date = reference_date - relativedelta(months=1)
        # The year follows the reference date instead of being pinned to a
        # literal, so table names remain correct after the year changes.
        year = reference_date.year
        month = reference_date.strftime("%m")  # zero-padded, e.g. "07"

        self.month = month
        self.date_info = f'{year}_{month}'
        self.item_queue = Queue()
        # One list of "empty field" names per checked ASIN page.
        self.count_cols_list = []
        self.limit_num = 1000
        # Row counts of the last reads; -1 means "nothing read yet / read failed".
        self.count_num = -1
        self.count_num_asin = -1
        self.count_num_search_term = -1
        # Accumulators filled by check_search_term(), one per parser result list.
        self.zr_all_list = []
        self.sp_all_list = []
        self.sb_all_list = []
        self.ac_all_list = []
        self.bs_all_list = []
        self.er_all_list = []
        self.tr_all_list = []
        self.buy_text_list = []
        self.hr_list = []
        self.sort_all_list = []
        self.st_list = []
        # NOTE(review): always connects with 'us' regardless of site_name;
        # presumably only used for send_mg() - confirm.
        self.db_class = connect_db('us')

    def decompress_bytes(self, input_bytes):
        """Gunzip a payload and return it decoded as UTF-8 text.

        Args:
            input_bytes: Either the gzip-compressed ``bytes`` themselves, or a
                ``str`` containing a Python literal of them (for example
                ``"b'\\x1f\\x8b...'"``), which is converted back with
                ``ast.literal_eval`` before decompression.

        Returns:
            The decompressed content as ``str``.
        """
        if isinstance(input_bytes, str):
            compressed = ast.literal_eval(input_bytes)
        else:
            compressed = input_bytes
        raw = gzip.decompress(compressed)
        return raw.decode('utf-8')

    def check_asin(self, asin_html_list):
        """Parse stored ASIN pages and record which fields came back empty.

        Args:
            asin_html_list: Strings of the form ``"<asin>|-|-|-|-|-|<html>"``,
                where the second part is handed to ``decompress_bytes``.

        Nothing is returned. For every page, the names of the fields considered
        "missing" are appended as one list to ``self.count_cols_list``.
        """
        separator = '|-|-|-|-|-|'
        # Fields that are never reported, even when they look missing.
        ignored_fields = ('three_four_val', 'eight_val', 'package_quantity', 'pattern_name',
                          'min_match_asin_json', 'current_asin', 'five_six_val')
        # Fields for which a value of 0 counts as "missing".
        zero_is_missing = ['weight', 'total_comments', 'rating', 'one_star', 'two_star',
                           'three_star', 'four_star', 'low_star', 'five_star']

        def looks_missing(field, value):
            # The checks run in a fixed order; each one marks a placeholder
            # value that is treated the same as "no data".
            if value is None:
                return True
            if value == -1 and field == 'price':
                return True
            if field == 'buy_box_seller_type' and value == 4:
                return True
            if field == 'page_inventory' and value == 3:
                return True
            return bool(value == 0 and field in zero_is_missing)

        for packed_row in asin_html_list:
            pieces = packed_row.split(separator)
            asin, packed_html = pieces[0], pieces[1]
            asin_html = self.decompress_bytes(packed_html)  # decompress the stored page
            parser = ParseAsinUs(resp=asin_html, asin=asin, month=self.month,
                                 date_info=self.date_info, site_name=self.site_name)
            items = parser.xpath_html()
            new_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # Key order is kept stable: it determines the order in which
            # missing fields are later counted and reported.
            item = {
                'title': items["title"],
                'img_url': items["img_url"],
                'rating': items["rating"],
                'total_comments': items["total_comments"],
                'price': items["price"],
                "rank": items["rank"],
                'category': items["category"],
                'launch_time': items["launch_time"],
                'volume': items["volume"],
                'weight': items["weight"],
                "page_inventory": items["page_inventory"],
                "buy_box_seller_type": items["buy_box_seller_type"],
                # NOTE(review): this value is overwritten further down with
                # items["asin_variation_list"]; the differently spelled source
                # key here looks unintended - confirm against the parser.
                "asin_vartion_list": items["asin_vartion_list"],
                'title_len': items["title_len"],
                'img_num': items["img_num"],
                'img_type': items["img_type"],
                'activity_type': items["activity_type"],
                'one_two_val': items["one_two_val"],
                'three_four_val': items["three_four_val"],
                'eight_val': items["eight_val"],
                'qa_num': items["qa_num"],
                'five_star': items["five_star"],
                'four_star': items["four_star"],
                'three_star': items["three_star"],
                'two_star': items["two_star"],
                'one_star': items["one_star"],
                'low_star': items["low_star"],
                'together_asin': items["together_asin"],
                'brand': items["brand"],
                'ac_name': items["ac_name"],
                'material': items["material"],
                'node_id': items["node_id"],
                'data_type': 1,
                'sp_num': items["sp_num"],
                'describe': items["describe"],
                'date_info': self.date_info.replace('_', '-'),
                'weight_str': items["weight_str"],
                'package_quantity': items['package_quantity'],
                'pattern_name': items['pattern_name'],
                'seller_id': items["seller_id"],
                'variat_num': items['variat_num'],
                'spider_int': 1,
                'site_name': self.site_name,
                'best_sellers_rank': items["best_sellers_rank"],
                'best_sellers_herf': items["best_sellers_herf"],
                'account_url': items["account_url"],
                'account_name': items["account_name"],
                'parentAsin': items["parentAsin"],
                'asinUpdateTime': new_date,
                'follow_sellers': items['sellers_num'],
                'all_best_sellers_herf': items['all_best_sellers_herf'],
                'product_description': items['product_description'],
                'buy_sales': items['buySales'],
                'image_view': items['image_view'],
                'product_json': items['product_json'],
                'product_detail_json': items['productdetail_json'],
                'review_ai_text': items['review_ai_text'],
                'review_label_json': items['review_label_json'],
                'lob_asin_json': items['lob_asin_json'],
                'sp_initial_seen_asins_json': items['sp_initial_seen_asins_json'],
                'sp_4stars_initial_seen_asins_json': items['sp_4stars_initial_seen_asins_json'],
                'sp_delivery_initial_seen_asins_json': items['sp_delivery_initial_seen_asins_json'],
                'compare_similar_asin_json': items['compare_similar_asin_json'],
                'customer_reviews_json': items['customer_reviews_json'],
                'together_asin_json': items['together_asin_json'],
                'min_match_asin_json': items['min_match_asin_json'],
                'seller_json': items['seller_json'],
                'created_time': new_date,
                'current_asin': items['current_asin'],
                'parent_asin': items["parentAsin"],
            }

            # five_six_val is only taken from the parser for these sites.
            is_eu_site = self.site_name in ['uk', 'de', 'fr', 'es', 'it']
            item['five_six_val'] = items['five_six_val'] if is_eu_site else None

            # Variations: JSON text when present, otherwise None.
            variations = items["asin_variation_list"]
            item['variat_list'] = json.dumps(variations) if variations else None
            item['asin_vartion_list'] = items["asin_variation_list"]

            # Images/videos: same convention as the variations.
            media = items["all_img_video_list"]
            item['img_list'] = json.dumps(media) if media else None

            missing_fields = [
                field for field, value in item.items()
                if looks_missing(field, value) and field not in ignored_fields
            ]
            self.count_cols_list.append(missing_fields)

    def check_search_term(self, search_term_html_list):
        """Parse stored search-result pages and accumulate their result lists.

        Args:
            search_term_html_list: Strings of the form
                ``"<search_term>|-|-|-|-|-|<html>"``, where the second part is
                handed to ``decompress_bytes``.

        Nothing is returned. Each parser result (a sequence of ten lists) is
        stored in ``self.st_list`` and its parts are appended to the matching
        ``self.*_all_list`` / ``self.buy_text_list`` / ``self.hr_list``
        accumulators.
        """
        separator = '|-|-|-|-|-|'
        for packed_row in search_term_html_list:
            pieces = packed_row.split(separator)
            keyword, packed_html = pieces[0], pieces[1]
            page_source = self.decompress_bytes(packed_html)  # decompress the stored page
            parser = ParseSearchTermUs(page_source=page_source, driver=None,
                                       search_term=keyword,
                                       page=1, site_name=self.site_name)
            self.st_list.append(parser.run())

            # NOTE(review): this walks *every* result collected so far, on every
            # input row, so earlier pages are added to the accumulators again
            # each time. That looks unintended, but the thresholds used in
            # pare_search_term() may have been tuned against these inflated
            # counts - confirm both together before changing it.
            for parsed in self.st_list:
                (zr_list, sp_list, sb_list, ac_list, bs_list,
                 er_list, tr_list, sort_list, buy_text_list, hr_list) = parsed
                pairs = (
                    (self.zr_all_list, zr_list),
                    (self.sp_all_list, sp_list),
                    (self.sb_all_list, sb_list),
                    (self.ac_all_list, ac_list),
                    (self.bs_all_list, bs_list),
                    (self.er_all_list, er_list),
                    (self.tr_all_list, tr_list),
                    (self.buy_text_list, buy_text_list),
                    (self.hr_list, hr_list),
                    (self.sort_all_list, sort_list),
                )
                for accumulator, rows in pairs:
                    accumulator.extend(rows)

    def pare_search_term(self):
        """Turn the accumulated search-result lists into a dict of anomaly flags.

        Reads the accumulators filled by ``check_search_term`` together with
        ``self.count_num`` and ``self.limit_num``, prints the list sizes and
        intermediate scores, and returns a dict containing one entry per
        detected problem (value ``0`` or a message string). An empty dict means
        nothing was flagged.
        """
        sizes_to_print = (
            ('self.zr_all_list:', self.zr_all_list),
            ('self.sp_all_list:', self.sp_all_list),
            ('self.sb_all_list:', self.sb_all_list),
            ('self.ac_all_list:', self.ac_all_list),
            ('self.bs_all_list:', self.bs_all_list),
            ('self.er_all_list:', self.er_all_list),
            ('self.tr_all_list:', self.tr_all_list),
            ('self.buy_text_list:', self.buy_text_list),
            ('self.hr_list:', self.hr_list),
            ('self.sort_all_list:', self.sort_all_list),
        )
        for label, rows in sizes_to_print:
            print(label, len(rows))

        findings = {}

        # Volume checks on the individual result lists.
        per_page_floor = self.count_num * 12
        if len(self.zr_all_list) < per_page_floor:
            findings['zr'] = 0
        sp_total = len(self.sp_all_list)
        if sp_total == 0:
            findings['sp'] = 0
        elif sp_total < per_page_floor:
            findings['sp'] = '搜索词广告asin发生变化或者标签下广告发生变化'
        if len(self.sb_all_list) < self.limit_num:
            findings['sb'] = 0
        if len(self.ac_all_list) < self.count_num:
            findings['ac'] = 0
        if len(self.bs_all_list) < self.count_num:
            findings['bs'] = 0

        # Weighted score per sb type, keyed by the value found at index 4 of
        # each sb row. Scores are built by repeated addition on purpose so the
        # floating-point results stay exactly as before.
        sb_scores = {1: 0, 2: 0, 3: 0}
        sb_steps = {1: 0.2, 2: 0.2, 3: 0.3}
        for row in self.sb_all_list:
            sb_type = row[4]
            for code, step in sb_steps.items():
                if sb_type == code:
                    sb_scores[code] += step
        print('sbi1:', sb_scores[1])
        print('sbi2:', sb_scores[2])
        print('sbi3:', sb_scores[3])

        sb_floor = self.limit_num + self.count_num
        sb_messages = (
            (1, 'sb1', '没有获取到头部品牌'),
            (2, 'sb2', '没有获取到尾部品牌'),
            (3, 'sb3', '没有获取到视频asin'),
        )
        for code, key, message in sb_messages:
            if sb_scores[code] < sb_floor:
                findings[key] = message

        # Monthly-sales check: rows carrying the "bought in past month" text
        # lower the score, every other row raises it.
        monthly_sales_score = 0
        for row in self.buy_text_list:
            sales_text = row[3]
            if sales_text and 'bought in past month' in sales_text:
                monthly_sales_score -= 0.1
            else:
                monthly_sales_score += 0.1
        if monthly_sales_score >= self.limit_num:
            findings['buy_text'] = '搜索词页面没有获取到月销'

        # Search-volume check: count rows whose second element is <= 1.
        low_volume_rows = sum(1 for row in self.sort_all_list if row[1] <= 1)
        if low_volume_rows >= self.limit_num:
            findings['sort_all'] = '搜索词页面没有获取到搜索量'

        print(self.limit_num + self.count_num, 'sb类型 1 2 3：', sb_scores[1], sb_scores[2], sb_scores[3])
        print('月销：', monthly_sales_score, '搜索量：', low_volume_rows)
        return findings

    def read_db_data(self, str_type=None):
        """Load the newest stored pages for this site from the monthly table.

        Args:
            str_type: ``'asin'`` reads ``asin_html_<date_info>``; any other
                value reads page 1 rows of ``search_term_html_<date_info>``.

        Returns:
            A list of ``"<key>|-|-|-|-|-|<html>"`` strings (key is the asin or
            the search term) built from up to 500 rows with the highest ids.
            On any exception the error is printed, ``self.count_num`` is set to
            -1 and an empty list is returned.

        Side effects: updates ``self.count_num`` and either
        ``self.count_num_asin`` or ``self.count_num_search_term`` with the
        number of rows read, and sleeps for 3 seconds after a successful read.
        """
        wants_asin = str_type == 'asin'
        table_name = (f'asin_html_{self.date_info}' if wants_asin
                      else f'search_term_html_{self.date_info}')
        try:
            max_id_sql = f"SELECT max(id) FROM {table_name} where site_name='{self.site_name}'"
            print(max_id_sql)
            max_id_frame = pd.read_sql(max_id_sql, con=self.engine_doris)
            id_list = list(max_id_frame['max(id)'])
            print('id_list::', id_list)
            # The result is not used by the queries below; the computation is
            # kept so that an unusable max(id) still ends up in the except path.
            id_floor = id_list[0] - 100000

            if wants_asin:
                key_column, page_filter = 'asin', ''
            else:
                key_column, page_filter = 'search_term', ' and page=1'
            rows_sql = (
                f"SELECT {key_column},html FROM {table_name} where  id in "
                f"(SELECT id FROM {table_name} WHERE site_name='{self.site_name}'{page_filter} "
                f"ORDER BY id DESC LIMIT 500)"
            )
            print(rows_sql)
            frame = pd.read_sql(rows_sql, con=self.engine_doris)
            # Join key and html into one string per row.
            data_html_list = list(frame[key_column] + '|-|-|-|-|-|' + frame['html'])

            row_total = len(data_html_list)
            if wants_asin:
                self.count_num_asin = row_total
            else:
                self.count_num_search_term = row_total
            self.count_num = row_total
            print('self.count_num::', self.count_num)
            time.sleep(3)
            return data_html_list
        except Exception as e:
            print(e, f"\n{traceback.format_exc()}")
            self.count_num = -1
            return []

    def run(self):
        """Run both validations for this site and send the summary message.

        Search-term pages are checked first, then ASIN pages. ASIN fields that
        were missing on at least ``self.count_num - 10`` pages are added to the
        report with their count. The report is always sent through
        ``self.db_class.send_mg``, even when it is empty.
        """
        report = {}

        term_pages = self.read_db_data(str_type='search_term')
        if term_pages:
            self.check_search_term(term_pages)
            report = self.pare_search_term()

        asin_pages = self.read_db_data(str_type='asin')
        if asin_pages:
            self.check_asin(asin_pages)
            # Count how often each field name was reported across all pages.
            tally = Counter()
            for missing_fields in self.count_cols_list:
                tally.update(missing_fields)
            # Only fields missing on (nearly) every page are reported.
            report_threshold = self.count_num - 10
            for field, hits in tally.items():
                if hits >= report_threshold:
                    print(f'{field}: {hits}')
                    report[field] = hits

        print('search_term_dict::', report)
        account = 'pengyanbing'
        counts_text = f' 搜索词数据校验 {self.count_num_search_term} 条，asin数据校验 {self.count_num_asin} 条'
        title = self.date_info + ' ' + self.site_name + counts_text
        content = json.dumps(report, ensure_ascii=False)
        self.db_class.send_mg(account, title, content)


if __name__ == '__main__':
    # Validate the stored pages of each marketplace in turn; every run sends
    # its own summary message.
    sites_to_check = ['us', 'de', 'uk']
    for site in sites_to_check:
        checker = check_amazon_html(site_name=site)
        checker.run()
