
"""
   @Author      : HuangJian
   @Description : 搜索词基础指标表:主要集成历史月的搜索量和销量
   @SourceTable :
                  ①ods_st_key
                  ②dim_st_detail

   @SinkTable   : dwt_st_base_report
   @CreateTime  : 2022/03/28 14:55
   @UpdateTime  : 2022/03/28 14:55
"""

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))

from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil, DateTypes
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F


class DwtSTBaseReport(object):
    """Spark job that fills one partition of the ``dwt_st_base_report`` Hive table.

    For a single (site_name, date_type, date_info) combination, rows are read
    from ``dim_st_detail``, left-joined to ``ods_st_key`` on ``search_term``
    to pick up ``st_key``, stamped with audit timestamps and appended to the
    sink table.
    """

    # Spark datetime pattern used for the audit columns. Lower-case ``ss`` is
    # second-of-minute; upper-case ``S`` means fraction-of-second, so the
    # previous ``HH:mm:SS`` pattern wrote fractional digits instead of seconds.
    TIME_FORMAT = 'yyyy-MM-dd HH:mm:ss'

    def __init__(self, site_name, date_type, date_info):
        """Store the partition keys, open a Spark session and clear the target path.

        Args:
            site_name: value used for the ``site_name`` partition and filters.
            date_type: value used for the ``date_type`` partition and filter.
            date_info: value used for the ``date_info`` partition and filter.

        Side effects:
            Calls ``HdfsUtils.delete_hdfs_file`` on the partition directory of
            the sink table. NOTE(review): presumably this makes the append-mode
            write in ``run`` safe to re-run for the same partition - confirm.
        """
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        self.hive_tb = "dwt_st_base_report"
        app_name = f"{self.hive_tb}:{site_name} {date_type} {date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)
        # Number of partitions used for the final write, resolved by the
        # project helper from the site name.
        self.partitions_num = CommonUtil.reset_partitions(site_name, 1)
        hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.hive_tb}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
        print(f"清除hdfs目录中数据:{hdfs_path}")
        HdfsUtils.delete_hdfs_file(hdfs_path)

    def run(self):
        """Query the source tables, add audit columns and append to the Hive table.

        The result is partitioned by ``site_name``, ``date_type`` and
        ``date_info`` and written with ``mode='append'``.
        """
        # dim_st_detail is restricted to the requested partition; ods_st_key is
        # restricted by site only. The left join keeps search terms that have
        # no st_key yet (st_key is then NULL).
        sql = f""" 
        select 
            st_key.st_key,
            dim_st.search_term,
            dim_st.st_rank,
            dim_st.st_search_num as st_volume,
            cast(round(dim_st.st_search_sum) as int) as st_orders,
            cast(substr(dim_st.date_info,0,4) as int) as years,
            dim_st.site_name,
            dim_st.date_type,
            dim_st.date_info
        from (
            select 
                search_term,
                st_rank,
                st_search_num,
                st_search_sum,
                site_name,
                date_type,
                date_info
            from dim_st_detail
            where site_name = '{self.site_name}'
              and date_type = '{self.date_type}'
              and date_info = '{self.date_info}' ) dim_st
        left join (
            select 
                st_key,
                search_term
            from ods_st_key
            where site_name = '{self.site_name}' ) st_key
        on dim_st.search_term = st_key.search_term;
        """
        df_save = self.spark.sql(sqlQuery=sql).repartition(10).cache()
        # Print a small sample of the joined data to the job log.
        df_save.show(10, truncate=True)

        # Both audit columns carry the same formatted "now" string.
        now_str = F.date_format(F.current_timestamp(), self.TIME_FORMAT)
        df_save = df_save.withColumn(
            "created_time", now_str
        ).withColumn(
            "updated_time", now_str
        )
        df_save = df_save.repartition(self.partitions_num)
        partition_by = ["site_name", "date_type", "date_info"]
        print(f"当前存储的表名为：{self.hive_tb},分区为{partition_by}")
        df_save.write.saveAsTable(name=self.hive_tb, format='hive', mode='append', partitionBy=partition_by)
        print("success")


if __name__ == '__main__':
    # Positional command-line arguments, read in order:
    #   1 -> site_name, 2 -> date_type, 3 -> date_info
    # The second argument to get_sys_arg is presumably the fallback value
    # when the argument is absent - verify against CommonUtil.
    site_name, date_type, date_info = [
        CommonUtil.get_sys_arg(position, None) for position in (1, 2, 3)
    ]
    DwtSTBaseReport(site_name, date_type, date_info).run()
