import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))

from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil


class DimBsrAsinRankHistory_cacl(object):
    """
    Tidy one partition of an ASIN rank history table and drop duplicate rows.

    Depending on ``data_type`` the job works on ``dim_bsr_asin_rank_history``
    ('bsr') or ``dim_nsr_asin_rank_history`` ('nsr'). It reads the
    (site_name, date_info) partition, keeps one row per (asin, category_id)
    pair and passes the result to ``CommonUtil.save_or_update_table`` for the
    same table and partition.
    """

    def __init__(self, site_name, date_info, data_type):
        """
        Store the job parameters and obtain a Spark session.

        :param site_name: value matched against the ``site_name`` column
        :param date_info: value matched against the ``date_info`` column
        :param data_type: 'bsr' or 'nsr'; selects the table processed by run()
        """
        # The Spark application name carries the class name and the partition.
        app_name = f"{type(self).__name__}:{site_name}:{date_info}"
        self.site_name = site_name
        self.date_info = date_info
        self.data_type = data_type
        self.spark = SparkUtil.get_spark_session(app_name)

    def run(self):
        """
        De-duplicate the selected partition and save it back.

        Returns None. When ``data_type`` is neither 'bsr' nor 'nsr' the method
        returns immediately without reading or writing anything.
        """
        # Map each supported data_type to the Hive table it refers to.
        table_by_type = {
            'bsr': "dim_bsr_asin_rank_history",
            'nsr': "dim_nsr_asin_rank_history",
        }
        hive_tb = table_by_type.get(self.data_type)
        if hive_tb is None:
            # NOTE(review): an unsupported data_type is a silent no-op, so the
            # job still looks successful -- confirm this is intended.
            return

        # Data cleaning: load the partition that is going to be rewritten.
        sql = f"""
            select *
            from {hive_tb}
            where site_name = '{self.site_name}'
              and date_info = '{self.date_info}'
    """

        deduped = (
            self.spark.sql(sql)
            # Remove duplicates: one row per (asin, category_id); which of the
            # duplicate rows survives is not controlled here.
            .dropDuplicates(['asin', 'category_id'])
            # Partition count adjustment: collapse to a single partition
            # before saving (presumably to get one output file -- confirm).
            .repartition(1)
        )

        # NOTE(review): the source and target are the same table partition;
        # this relies on how save_or_update_table stages the write -- verify.
        CommonUtil.save_or_update_table(
            spark_session=self.spark,
            hive_tb_name=hive_tb,
            partition_dict={
                "site_name": self.site_name,
                "date_info": self.date_info,
            },
            df_save=deduped,
            drop_exist_tmp_flag=False,
        )


if __name__ == '__main__':
    # Positional arguments, in order: site_name, date_info, data_type.
    # The second argument of get_sys_arg is presumably the fallback used when
    # the position is absent -- confirm against CommonUtil.
    site_name, date_info, data_type = (
        CommonUtil.get_sys_arg(position, None) for position in (1, 2, 3)
    )
    DimBsrAsinRankHistory_cacl(site_name, date_info, data_type).run()
