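"""
   @Author      : HuangJian
   @Description : Seller (store) category statistics report: per-seller ASIN counts,
                  new-ASIN counts and share ratios by BSR first-level category
   @SourceTable :
                  ① ods_seller_account_feedback
                  ② ods_seller_asin_account
                  ③ dim_cal_asin_history_detail
                  ④ big_data_selection.dim_bsr_category_tree

   @SinkTable   :
                  ① dwt_fb_category_report
   @CreateTime  : 2023/07/18 17:33
   @UpdateTime  : 2023/07/18 17:33
"""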

import os
import sys
import re
from datetime import datetime
import traceback
from functools import reduce

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录

# Spark imports: window helper for grouped ordering, column functions and types
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, DoubleType
from utils.common_util import CommonUtil, DateTypes
from utils.spark_util import SparkUtil
from yswg_utils.common_udf import udf_new_asin_flag
from utils.hdfs_utils import HdfsUtils


class DwtFbCategoryReport(object):
    """Spark job that fills one partition of ``dwt_fb_category_report``.

    For every seller found in ``ods_seller_account_feedback`` for the requested
    partition, the job counts the seller's ASINs per BSR first-level category,
    counts how many of them the new-ASIN UDF flags, and derives three ratios
    (category share inside the store, new-ASIN share inside the store, and
    share of the category across all known ASINs).

    Usage: ``DwtFbCategoryReport(site_name, date_type, date_info).run()``.
    """

    def __init__(self, site_name, date_type, date_info):
        """Store the partition keys and set up the Spark session.

        Args:
            site_name: value of the ``site_name`` partition column.
            date_type: value of the ``date_type`` partition column.
            date_info: value of the ``date_info`` partition column.
        """
        super().__init__()
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        self.hive_tb = "dwt_fb_category_report"
        self.partition_dict = {
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info
        }
        # HDFS location of the target partition; it is cleared before writing.
        self.hdfs_path = CommonUtil.build_hdfs_path(self.hive_tb, partition_dict=self.partition_dict)

        # Spark session, named after the job and its partition.
        app_name = f"{self.__class__.__name__}:{site_name}:{date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)

        # Calculation day (YYYY-MM-DD per the original comment) for this date_type/date_info.
        self.cal_date = CommonUtil.get_calDay_by_dateInfo(self.spark, self.date_type, self.date_info)

        # Placeholder DataFrame so every df_* attribute exists before read_data()
        # and handle_fb_agg() assign the real ones. One shared placeholder is
        # enough: each attribute is rebound, never mutated in place.
        placeholder = self.spark.sql("select 1+1;")
        self.df_fb_asin_info = placeholder
        self.df_asin_history = placeholder
        self.df_cate_name = placeholder
        self.df_fb_cate_asin_cal = placeholder
        self.df_fb_asin_cal = placeholder
        self.df_bsr_asin_cal = placeholder
        self.df_result_cal = placeholder

        # Wrap the project UDF so it can be applied to columns (returns an int flag).
        self.udf_new_asin_flag = F.udf(udf_new_asin_flag, IntegerType())

    def read_data(self):
        """Load the three input DataFrames used by the aggregation.

        Populates ``df_fb_asin_info`` (seller -> asin pairs, de-duplicated and
        cached), ``df_asin_history`` (asin launch time and first-level category,
        cached) and ``df_cate_name`` (first-level category id -> English name).
        """
        print("==============================获取 原始数据sql==========================================")

        # Sellers of the requested partition, left-joined to their ASINs that
        # were created on or before the calculation day. The partition filter
        # uses the instance attributes, so the class also works when imported
        # (not only when the file is executed as a script).
        sql = f"""select fb.seller_id,
       oa.asin
from (
         select seller_id
         from ods_seller_account_feedback
         where site_name = '{self.site_name}'
           and date_type = '{self.date_type}'
           and date_info = '{self.date_info}') fb
         left join (select seller_id ,asin 
              from ods_seller_asin_account
              where site_name = '{self.site_name}' 
              and date_format(created_at,'yyyy-MM-dd') <= '{self.cal_date}') oa
                   on fb.seller_id = oa.seller_id  """
        print(sql)
        df_seller_asin = self.spark.sql(sqlQuery=sql)
        self.df_fb_asin_info = df_seller_asin.drop_duplicates(['seller_id', 'asin']).cache()

        # ASIN history: launch time plus first-level BSR category id.
        sql = f"""
            select asin,
                   asin_launch_time,
                   category_first_id as bsr_cate_1_id 
              from dim_cal_asin_history_detail 
              where site_name = '{self.site_name}'"""
        print(sql)
        self.df_asin_history = self.spark.sql(sqlQuery=sql).cache()

        # Names of the first-level categories (rows whose parent id is 0).
        sql = f"""
            select category_id as bsr_cate_1_id, en_name as bsr_cate_1_name
            from big_data_selection.dim_bsr_category_tree
            where site_name = '{self.site_name}'
            and category_parent_id = 0
        """
        print(sql)
        self.df_cate_name = self.spark.sql(sqlQuery=sql)

    def run(self):
        """Execute the job: read inputs, aggregate, then write the partition."""
        self.read_data()
        self.handle_fb_agg()
        self.sava_data()

    @staticmethod
    def _ceil_rate(numerator, denominator):
        """Return the column expression numerator/denominator, rounded UP to 4 decimals."""
        return F.ceil((F.col(numerator) / F.col(denominator) * 10000)) / 10000

    def handle_fb_agg(self):
        """Aggregate ASIN counts per seller/category and derive the ratio columns.

        Result is stored in ``df_result_cal`` with the columns
        ``fb_cate_asin_num``, ``fb_cate_new_asin_num``, ``fb_asin_num``,
        ``bsr_asin_num``, ``fb_cate_asin_rate``, ``fb_cate_new_asin_rate`` and
        ``fb_market_rate`` next to ``seller_id`` and ``bsr_cate_1_id``.
        """
        # Attach launch time and category to each seller/asin pair.
        df_detail = self.df_fb_asin_info.join(self.df_asin_history, on='asin', how='left')

        # ASINs without a known category are grouped under the literal '无'.
        df_detail = df_detail.na.fill({'bsr_cate_1_id': '无'})

        # New-ASIN flag computed by the project UDF from the launch time and
        # the calculation day.
        df_detail = df_detail.withColumn(
            "is_asin_new",
            self.udf_new_asin_flag(F.col('asin_launch_time'), F.lit(self.cal_date))
        )

        # Per seller and category: number of ASINs and sum of new-ASIN flags.
        self.df_fb_cate_asin_cal = df_detail.groupby(['seller_id', 'bsr_cate_1_id']).agg(
            F.count("asin").alias("fb_cate_asin_num"),
            F.sum("is_asin_new").alias("fb_cate_new_asin_num"),
        )

        # Per seller: total number of ASINs.
        self.df_fb_asin_cal = self.df_fb_asin_info.groupby(['seller_id']).agg(
            F.count("asin").alias("fb_asin_num"))

        # Per category, over the whole ASIN history: number of ASINs.
        self.df_bsr_asin_cal = self.df_asin_history.groupby(['bsr_cate_1_id']).agg(
            F.count("asin").alias("bsr_asin_num"))

        # Bring the seller total and the category total next to each seller/category row.
        df_joined = self.df_fb_cate_asin_cal \
            .join(self.df_fb_asin_cal, on='seller_id', how='left') \
            .join(self.df_bsr_asin_cal, on='bsr_cate_1_id', how='left')

        # Share of the seller's ASINs that sit in this category.
        df_joined = df_joined.withColumn(
            "fb_cate_asin_rate", self._ceil_rate("fb_cate_asin_num", "fb_asin_num"))

        # New ASINs of this category relative to the seller's TOTAL ASIN count
        # (the denominator is fb_asin_num, not fb_cate_asin_num).
        df_joined = df_joined.withColumn(
            "fb_cate_new_asin_rate", self._ceil_rate("fb_cate_new_asin_num", "fb_asin_num"))

        # Seller's ASINs in this category relative to all ASINs of the category.
        df_joined = df_joined.withColumn(
            "fb_market_rate", self._ceil_rate("fb_cate_asin_num", "bsr_asin_num"))

        self.df_result_cal = df_joined

    def sava_data(self):
        """Attach category names, clear the target HDFS folder and append to Hive.

        The method name keeps its original spelling so existing callers keep working.
        """
        df_save = self.df_result_cal.join(
            self.df_cate_name, on='bsr_cate_1_id', how='left'
        )

        # Categories without a matching name get the literal '无'.
        df_save = df_save.na.fill({'bsr_cate_1_name': '无'})

        # 'ss' is second-of-minute; the previous 'SS' pattern letter means
        # fraction-of-second and produced wrong timestamps.
        now_text = F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:ss')

        df_save = df_save.select(
            F.col('seller_id'),
            F.col('bsr_cate_1_id'),
            F.col('bsr_cate_1_name'),
            F.col('fb_cate_asin_num'),
            F.col('fb_cate_new_asin_num'),
            F.col('fb_asin_num'),
            F.col('bsr_asin_num'),
            F.col('fb_cate_asin_rate'),
            F.col('fb_cate_new_asin_rate'),
            F.col('fb_market_rate'),
            now_text.alias('created_time'),
            now_text.alias('updated_time'),
            F.lit(self.site_name).alias('site_name'),
            F.lit(self.date_type).alias('date_type'),
            F.lit(self.date_info).alias('date_info')
        )

        # Remove the files of the target partition first, so the append below
        # does not duplicate rows from an earlier run.
        print(f"清除hdfs目录中:{self.hdfs_path}")
        HdfsUtils.delete_file_in_folder(self.hdfs_path)

        # Single output partition -> a single file in the Hive partition folder.
        df_save = df_save.repartition(1)
        partition_by = ["site_name", "date_type", "date_info"]
        print(f"当前存储的表名为：{self.hive_tb},分区为{partition_by}")
        df_save.write.saveAsTable(name=self.hive_tb, format='hive', mode='append', partitionBy=partition_by)
        print("success")


if __name__ == '__main__':
    # Command-line arguments 1..3, each defaulting to None:
    #   1 -> site_name, 2 -> date_type,
    #   3 -> date_info: year-week / year-month / year-quarter / year-month-day, e.g. 2022-1
    # NOTE(review): these three names are bound at module level; keep their
    # spelling, other code in this file may read them as globals.
    site_name, date_type, date_info = [CommonUtil.get_sys_arg(position, None) for position in (1, 2, 3)]
    DwtFbCategoryReport(site_name=site_name, date_type=date_type, date_info=date_info).run()
