import sys
import os
import json

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from threading_spider.db_connectivity import connect_db
from utils.db_connect import BaseUtils
import time
import pandas as pd
from queue import Queue


class spider_check(BaseUtils):
    """Data-quality checks for crawled search-term and ASIN-detail tables.

    Each check queries the PostgreSQL engine and reports its findings through
    ``self.db_class.send_mg``.  ``pg_connect`` is not defined in this class;
    it is expected to be provided by ``BaseUtils``.
    """

    # Account that receives every notification sent by this class.
    NOTIFY_ACCOUNT = 'pengyanbing'

    # Year segment of the ASIN-detail table names.
    # NOTE(review): fixed value; it has to be updated (or derived from the
    # current date) once tables for another year are queried - confirm.
    TABLE_YEAR = '2024'

    # (report label, column passed to count(), WHERE condition) for every
    # null-rate query issued by select_none().  The leading space in the last
    # condition keeps the generated SQL text unchanged.
    _NULL_RATE_CHECKS = (
        ('rank', 'id', 'rank is NUll'),
        ('price', 'id', 'price is NUll or price=-1'),
        ('rating', 'id', 'rating is NUll'),
        ('total_comments', 'id', 'total_comments is NUll'),
        ('launch_time', 'id', 'launch_time is NUll'),
        ('brand', 'id', 'brand is NUll'),
        ('nodeid', 'asin', ' node_id is NULL and rank is not null'),
    )

    def __init__(self, site):
        """Store the site code and open the database connections.

        Args:
            site: Site code such as ``'us'``; it is used in table names and
                notification titles and is passed to ``connect_db``.
        """
        super().__init__()
        self.site_name = site
        self.site_pg = site
        self.db_connect()
        self.data_queue = Queue()
        self.asin_detail_list = []

    def db_connect(self):
        """(Re)create the PostgreSQL engine, the site helper and its engine."""
        self.engine_pg = self.pg_connect()
        self.db_class = connect_db(self.site_pg)
        self.engine_site = self.db_class.mysql_engine()

    def _send_notice(self, title, content):
        """Send ``content`` under ``title`` to the notification account."""
        self.db_class.send_mg(self.NOTIFY_ACCOUNT, title, content)

    def _count_rows(self, table, column='id', condition=None):
        """Return ``count(column)`` of ``table`` as a plain ``int``.

        Args:
            table: Table name, interpolated into the SQL text as-is.
            column: Column handed to ``count()``.
            condition: Optional WHERE condition, interpolated as-is.

        The result is read by position, so it does not depend on how the
        database names the aggregate column, and it is converted to ``int``
        so that it stays JSON-serialisable.
        """
        sql = f'select count({column}) from {table}'
        if condition is not None:
            sql += f' where {condition}'
        print(sql)
        df = pd.read_sql(sql, con=self.engine_pg)
        return int(df.iloc[0, 0])

    def check_search_term_count(self, table_name, week):
        """Alert when the imported search-term total is off the 1.5M target.

        A notification is sent when the number of rows of ``table_name`` for
        ``week`` is above 1,510,000 or below 1,450,000; counts in between are
        accepted silently.

        Args:
            table_name: Table holding the imported search terms.
            week: Value compared against the table's ``week`` column.
        """
        count = self._count_rows(table_name, 'id', f' week ={week}')
        print('总数：', count)
        title = self.site_name + '站点 搜索词抓取总数统计'
        if count > 1510000:
            message = '搜索词导入超过150w'
        elif count < 1450000:
            message = '搜索词导入少于150w'
        else:
            return
        print(table_name, message)
        self._send_notice(title, message)

    def check_search_term(self, year_week):
        """Report the row count of every search-term rank table of a week.

        Args:
            year_week: Suffix of the ``us_search_term_rank_<type>_`` tables.
        """
        count_list = []
        for rank_type in ('ac', 'bs', 'er', 'hr', 'sb', 'tr'):
            # NOTE(review): the 'us' prefix is fixed although the title below
            # uses the instance's site - confirm this is intended.
            count_id = self._count_rows(
                f'us_search_term_rank_{rank_type}_{year_week}')
            print(rank_type, count_id)
            count_list.append([rank_type + ' 总数：', count_id])
        title = self.site_name + '站点 搜索词抓取总数统计'
        count_list.append('zr：每周总数最少2亿，sp：每周总数最少4500w，sb：每周总数最少2000w，ac：每周总数最少120w，tr：每周总数最少15w，hr：每周总数最少1200w')
        self._send_notice(title, count_list)

    def check_being_sold(self, table_name):
        """Record and report search terms whose ``quantity_being_sold`` is 0.

        Every matching term is put on ``self.data_queue`` and upserted into
        ``<site>_search_term_no``; afterwards a notification with the number
        of terms is sent.  Nothing is written or sent when no term matches.

        Args:
            table_name: Search-term table to inspect.
        """
        sql_sold = f'select search_term from {table_name} where quantity_being_sold=0'
        df = pd.read_sql(sql_sold, con=self.engine_pg)
        search_terms = list(df['search_term'])
        print(len(search_terms))
        if not search_terms:
            # Executing the INSERT without parameters would leave its
            # placeholder unbound, and there is nothing to report either.
            return
        for term in search_terms:
            self.data_queue.put(term)
        data_list = [(term,) for term in search_terms]
        print(data_list)
        # Upsert into PostgreSQL: update the row if it exists, insert otherwise.
        # NOTE(review): executing a raw SQL string this way relies on the
        # SQLAlchemy 1.x Connection.execute API - confirm the installed version.
        with self.engine_pg.begin() as conn:
            insert_sql = f"INSERT INTO {self.site_name}_search_term_no (search_term) VALUES (%s) ON CONFLICT (search_term) DO UPDATE SET search_term = EXCLUDED.search_term"
            conn.execute(insert_sql, *data_list)
        self.send_ms(table_name + f'搜索词抓取出现 搜索量 =0有 {len(search_terms)}')

    def check_data(self, items, site):
        """Validate one ASIN-detail record and store every suspicious field.

        Checks brand length, weight, volume length, category text, image
        count, title length and price.  One row per finding is appended to
        the ``db_field_verify_config`` table (up to five attempts, creating a
        new engine after each failure).

        Args:
            items: Mapping with the keys ``week``, ``asin``, ``brand``,
                ``weight``, ``volume``, ``category``, ``img_num``, ``title``
                and ``price``.
            site: Site code; ``'us'`` uses a weight limit of 2500, any other
                site a limit of 100000.
        """
        table_name = f'asin_detail_{self.TABLE_YEAR}_{items["week"]}'
        data_list = []

        def flag(key, verify_type, field_name=None):
            # One finding row; the offending value is stored together with
            # the ASIN as a JSON document.
            payload = json.dumps({key: items[key], 'asin': items['asin']})
            data_list.append(['pg_mysql', table_name, field_name or key, verify_type,
                              payload, 1, self.NOTIFY_ACCOUNT, site])

        if items["brand"] and len(items["brand"]) > 100:
            flag('brand', '长度校验')

        if items["weight"]:
            try:
                float_weight = float(items["weight"])
            except (TypeError, ValueError, OverflowError):
                flag('weight', '数值类型校验')
            else:
                weight_limit = 2500 if site == 'us' else 100000
                if float_weight > weight_limit:
                    flag('weight', '数值太大')

        if items['volume'] and len(items['volume']) > 50:
            flag('volume', '体积长度超出', field_name='volume_str')

        if items["category"] and 'Back to results' in items["category"]:
            flag('category', '分类出现Back to results')

        # NOTE(review): the threshold is 10 while the stored label mentions
        # 7 images - confirm which one is intended.
        if items["img_num"] and items["img_num"] > 10:
            flag('img_num', '小图超过7张')

        if items["title"]:
            if len(items["title"]) > 400:
                flag('title', '标题长度大于400')
            elif len(items["title"]) < 2:
                flag('title', '标题长度小于2')

        if items["price"]:
            try:
                price_unparsed = float(items["price"]) == -1
            except (TypeError, ValueError, OverflowError):
                price_unparsed = True
            if price_unparsed:
                flag('price', '价格解析错误')

        if not data_list:
            return
        print('db_field_verify_config 需要检查的数据：', data_list)
        df = pd.DataFrame(data=data_list,
                          columns=['db_type', 'table_name', 'field_name', 'verify_type', 'config_json', 'use_flag',
                                   'msg_usr_list', 'site'])
        for _ in range(5):
            try:
                df.to_sql("db_field_verify_config", con=self.engine_pg, if_exists='append',
                          index=False)
                break
            except Exception as e:
                # Best-effort storage: log, get a new engine and try again.
                print('存储校验字段报错：', e)
                self.engine_pg = self.pg_connect()

    def select_none(self, site, item):
        """Add the row total and per-field null rates of the detail table.

        For ``'us'`` the monthly table ``us_asin_detail_month_<year>_<month>``
        is queried, for every other site the weekly table
        ``<site>_asin_detail_<year>_<week>``.

        Args:
            site: Site code used to pick and name the table.
            item: Dict that receives the results; it is modified in place.

        Returns:
            The same ``item`` with ``week``, ``month``, the total row count
            and one truncated percentage string per checked field.
        """
        # NOTE(review): %W is 00 for the days before the first Monday of a
        # year, so this can yield 0 or -1 - confirm the intended week.
        week = int(time.strftime("%W")) - 1
        month = time.strftime("%m")  # always zero-padded, e.g. '05'
        item['week'] = week
        item['month'] = month
        print('查询周：', week)
        print('查询月：', month)

        if site == 'us':
            table = f'{site}_asin_detail_month_{self.TABLE_YEAR}_{month}'
            total_column = 'asin'
        else:
            table = f'{site}_asin_detail_{self.TABLE_YEAR}_{week}'
            total_column = 'id'

        id_count = self._count_rows(table, total_column)
        print('总入库:', id_count)
        item['总入库:'] = id_count

        for label, column, condition in self._NULL_RATE_CHECKS:
            null_count = self._count_rows(table, column, condition)
            # An empty table has no meaningful rate; report 0 instead of
            # dividing by zero.
            rate = (null_count / id_count) * 100 if id_count else 0
            print(f'{label} 空值率:', rate)
            item[f'{label} 空值率:'] = str(int(rate)) + '%'
        return item

    def send_ms(self, message=None):
        """Send a field-check notification for this site.

        Args:
            message: Text to send.  When omitted, the null-rate report built
                by ``select_none`` is sent as JSON.
        """
        title = f"\u26A0 {self.site_pg}站点 字段校验"
        if message is None:
            item = self.select_none(self.site_pg, {})
            print(item)
            # ensure_ascii=False keeps the Chinese keys readable in the JSON.
            message = json.dumps(item, ensure_ascii=False)
        self._send_notice(title, message)


# if __name__ == '__main__':
#     spider_check('us').send_ms()
    # spider_check('de').send_ms()
    # spider_check('uk').send_ms()
