import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils

from utils.spark_util import SparkUtil
from pyspark.sql import functions as F

"""
头部分类对应亚马逊分类ID
"""


class DimCategoryDescId(object):
    """Build the ``dim_category_desc_id`` Hive table for a single site.

    The job pairs every ``(asin_category_desc, node_id)`` combination found in
    ``dim_cal_asin_history_detail`` with the matching row of
    ``dim_bsr_category_tree`` and stores the result through
    ``CommonUtil.save_or_update_table``.
    """

    def __init__(self, site_name):
        """Create the Spark session and remember the target site.

        :param site_name: value used for the ``site_name`` filter of both
            source queries and as the partition value of the output table.
        """
        app_name = f"{self.__class__.__name__}"
        self.spark = SparkUtil.get_spark_session(app_name)
        self.site_name = site_name
        self.hive_table = 'dim_category_desc_id'

    def _desc_sql(self):
        """Return the query listing category descriptions per node id.

        For each ``(asin_category_desc, node_id)`` group the query keeps the
        asin with the greatest ``(asin_crawl_date, asin)`` struct, i.e. the
        most recently crawled asin (ties broken by the larger asin).
        """
        return f"""
     select asin_category_desc,
            node_id,
            tmp['asin']            as example_asin,
            tmp['asin_crawl_date'] as last_crawl_date
     from (
              select asin_category_desc                 as asin_category_desc,
                     node_id,
                     max(struct(asin_crawl_date, asin)) as tmp
              from dim_cal_asin_history_detail
              where site_name = '{self.site_name}'
                and node_id is not null
                and asin_category_desc is not null
                and asin_category_desc != '无'
                and asin_crawl_date > '2023-01-01'
              group by asin_category_desc, node_id
          ) tmp
"""

    def _tree_sql(self):
        """Return the query yielding one category-tree row per category id.

        Rows are ranked by ``delete_time desc nulls first``, so a row without
        a delete time wins over rows that carry one.
        """
        # The site filter must come from the instance: relying on a
        # module-level ``site_name`` only works when the file is run as a
        # script and raises NameError when the class is imported.
        return f"""
select category_id  as category_id,
	   rel_first_id as category_first_id,
	   category_name,
	   nodes_num
from (
		 select category_id,
				rel_first_id,
				en_name                                                                             as category_name,
				nodes_num,
				row_number() over (partition by category_id order by delete_time desc nulls first ) as row_number
		 from dim_bsr_category_tree
		 where site_name = '{self.site_name}'
	 ) tmp
where row_number = 1
        """

    def run(self):
        """Compute the mapping and write it to ``self.hive_table``.

        Prints the first query for traceability and ``success`` once the
        save call has returned.
        """
        sql = self._desc_sql()
        df_save = self.spark.sql(sql)

        print("=====================sql=====================")
        print(sql)

        tree_df = self.spark.sql(self._tree_sql())

        # Rows with nodes_num == 2 are used as the lookup of first-level
        # categories (presumably depth 2 in the tree is the first real
        # level -- confirm against dim_bsr_category_tree).
        first_df = tree_df.where("nodes_num == 2").select(
            F.col("category_id").alias('first_id'),
            F.col("category_name").alias('first_name'),
        )

        # Attach the first-level category name to every tree node.
        tree_df = tree_df.join(first_df, on=tree_df['category_first_id'].eqNullSafe(first_df['first_id']), how="left").select(
            F.col("category_id"),
            F.col("category_first_id"),
            F.col("first_name").alias("tree_first_name"),
        )

        # The leading segment of the description path is taken as the
        # first-level category name.
        df_save = df_save.withColumn("first_name", F.split(F.col("asin_category_desc"), "›").getItem(0))

        # Full outer join keeps descriptions without a tree node as well as
        # tree nodes that never appeared in a description; the coalesce calls
        # fill each side from the other where one is missing.
        df_save = df_save.join(tree_df, on=df_save['node_id'].eqNullSafe(tree_df['category_id']), how='fullouter') \
            .join(first_df, on=['first_name'], how='left') \
            .select(
            F.col("asin_category_desc"),
            F.coalesce("node_id", tree_df['category_id']).alias("node_id"),
            F.col("example_asin"),
            F.col("last_crawl_date"),
            tree_df['category_id'].alias('category_id'),
            F.coalesce("first_name", tree_df['tree_first_name']).alias("first_name"),
            F.coalesce(tree_df['category_first_id'], first_df['first_id']).alias("category_first_id"),
            F.lit(self.site_name).alias("site_name")
        )

        df_save = df_save.dropDuplicates(['asin_category_desc', 'node_id', 'category_id'])

        partition_dict = {
            "site_name": self.site_name
        }

        # Insert or update.
        CommonUtil.save_or_update_table(
            spark_session=self.spark,
            hive_tb_name=self.hive_table,
            partition_dict=partition_dict,
            df_save=df_save
        )
        print("success")


if __name__ == '__main__':
    # The site comes from the first command-line argument; None is passed as
    # the fallback (presumably returned when the argument is missing --
    # confirm in CommonUtil.get_sys_arg).
    site_name = CommonUtil.get_sys_arg(1, None)
    DimCategoryDescId(site_name).run()
