"""
   @Author      : HuangJian
   @Description : 店铺分类统计表
   @SourceTable :
                  ①dim_fd_asin_info
                  ②dim_cal_asin_history_detail

   @SinkTable   :
                  ①dwt_fd_category_agg
   @CreateTime  : 2022/12/16 17:33
   @UpdateTime  : 2022/12/16 17:33
"""

import os
import sys
import re
import datetime
import traceback
from functools import reduce

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.templates import Templates
# from ..utils.templates import Templates
# 分组排序的udf窗口函数
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, DoubleType
from sqlalchemy import create_engine
import pandas as pd


class DwtFdCategoryAgg(Templates):
    """Seller x first-level BSR category ASIN statistics (sink: ``dwt_fd_category_agg``).

    Sources read in :meth:`read_data`:
        * ``dim_fd_asin_info`` -- seller account / ASIN pairs.
        * ``dim_cal_asin_history_detail`` -- ASIN launch time and ``bsr_cate_1_id``.

    The final DataFrame is left in ``self.df_save`` with the partition columns
    listed in ``self.partitions_by``.

    NOTE(review): Spark session creation, ``reset_partitions``,
    ``get_date_info_tuple`` (which is expected to provide ``self.year``,
    ``self.month`` and ``self.week``) and ``run`` come from ``Templates``,
    which is not visible in this file -- confirm their contracts there.
    """

    def __init__(self, site_name="us", date_type="month", date_info="2022-11"):
        """Store the job parameters, create the Spark session and register the UDFs.

        Args:
            site_name: Site code used to filter the source tables and as a partition value.
            date_type: Period type; ``get_calDay_by_dateInfo`` understands
                ``day``, ``last30day``, ``week`` and ``month``.
            date_info: Period identifier matching ``date_type`` (e.g. ``2022-11``).
        """
        super().__init__()
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        self.db_save = "dwt_fd_category_agg"
        self.spark = self.create_spark_object(
            app_name=f"{self.db_save}: {self.site_name},{self.date_type}, {self.date_info}")

        # Output DataFrame placeholder and partition settings.
        self.df_save = self.spark.sql("select 1+1;")
        self.partitions_by = ['site_name', 'date_type', 'date_info']
        self.reset_partitions(partitions_num=10)

        # Reference date used to decide whether an ASIN counts as a new product.
        self.get_date_info_tuple()
        self.ym = self.year + "_" + str(int(self.month))
        self.cal_day = self.get_calDay_by_dateInfo()

        # Placeholders for the intermediate DataFrames; each one is replaced
        # by read_data() / the handle_* steps.
        self.df_fd_asin_info = self.spark.sql("select 1+1;")
        self.df_asin_history = self.spark.sql("select 1+1;")
        self.df_fd_cate_asin_cal = self.spark.sql("select 1+1;")
        self.df_fd_asin_cal = self.spark.sql("select 1+1;")
        self.df_bsr_asin_cal = self.spark.sql("select 1+1;")
        self.df_result_cal = self.spark.sql("select 1+1;")

        # Register the Python UDFs used in handle_fd_cate_asin_agg().
        self.u_launch_time = self.spark.udf.register("u_launch_time", self.udf_launch_time, IntegerType())
        self.u_days_diff = self.spark.udf.register("u_days_diff", self.udf_days_diff, IntegerType())

    def get_calDay_by_dateInfo(self):
        """Return the reference day (as a string) for the new-product calculation.

        * ``day`` / ``last30day``: ``date_info`` itself.
        * ``week`` / ``month``: the ``date`` of the first ``dim_date_20_to_30``
          row whose ``year_week`` / ``year_month`` equals ``date_info``. As a
          side effect ``self.df_date`` and ``self.date_info_tuple`` are set.
        * any other ``date_type``: ``None`` (a warning is printed); the
          launch-time UDF then reports every ASIN as "unknown".

        Raises:
            ValueError: if no calendar row matches ``date_info`` for a
                week/month run (previously an opaque ``IndexError``).
        """
        if self.date_type in ['day', 'last30day']:
            return str(self.date_info)

        if self.date_type in ['week', 'month']:
            self.df_date = self.spark.sql("select * from dim_date_20_to_30;")
            df = self.df_date.toPandas()
            df_loc = df.loc[df[f'year_{self.date_type}'] == f"{self.date_info}"]
            self.date_info_tuple = tuple(df_loc.date)
            if not self.date_info_tuple:
                raise ValueError(
                    f"dim_date_20_to_30 has no row with year_{self.date_type} = '{self.date_info}'")
            # The first matching row is used as the first day of the week/month.
            # NOTE(review): this relies on the row order returned by the query
            # (there is no ORDER BY) -- confirm the table is stored in date order.
            cal_day = str(self.date_info_tuple[0])
            print("self.cal_day:", cal_day)
            return cal_day

        # Make the implicit "no reference day" outcome visible in the driver log.
        print(f"get_calDay_by_dateInfo: unsupported date_type '{self.date_type}', cal_day is None")
        return None

    @staticmethod
    def udf_launch_time(launch_time, cal_day):
        """Return the number of days from ``launch_time`` to ``cal_day``.

        Both values are expected in ``YYYY-MM-DD`` form. ``999999`` is returned
        as the "unknown" sentinel when ``launch_time`` contains no ``-``, when
        ``cal_day`` is ``None``, or when either date cannot be parsed. A blank
        ``cal_day`` falls back to ``'2022-11-02'``.

        Literals are kept local (instead of class constants) so the function
        stays self-contained when Spark serializes it for the executors.
        """
        unknown_days = 999999

        # Cheap guards first: nothing to parse, so skip the exception path
        # (and its per-row traceback output) entirely.
        if cal_day is None or "-" not in str(launch_time):
            return unknown_days

        try:
            year, month, day = (int(part) for part in str(launch_time).split("-")[:3])
            launch_date = datetime.date(year=year, month=month, day=day)

            reference_text = str(cal_day).strip() or '2022-11-02'
            ref_year, ref_month, ref_day = (int(part) for part in reference_text.split("-")[:3])
            reference_date = datetime.date(year=ref_year, month=ref_month, day=ref_day)

            return (reference_date - launch_date).days
        except Exception as e:
            # Deliberately best-effort: one malformed value must not fail the job.
            print(e, traceback.format_exc())
            print(launch_time, cal_day)
            return unknown_days

    @staticmethod
    def udf_days_diff(days_diff):
        """Map a day difference to the new-product flag.

        Returns:
            ``None`` when the difference is unknown (``None`` or the ``999999``
            sentinel), ``1`` when it is at most 180 days, otherwise ``0``.
        """
        if days_diff is None or days_diff == 999999:
            return None
        return 1 if days_diff <= 180 else 0

    # Data loading
    def read_data(self):
        """Load the seller/ASIN pairs and the ASIN history rows for ``self.site_name``."""
        # NOTE(review): site_name is interpolated directly into the SQL text;
        # it comes from the command line, so keep it restricted to trusted callers.

        # Seller account <-> ASIN relation.
        sql = f"select fd_account_id,asin " \
              f"from dim_fd_asin_info where site_name = '{self.site_name}' "
        self.df_fd_asin_info = self.spark.sql(sqlQuery=sql)
        # show() prints the preview itself and returns None, so it must not be
        # wrapped in print().
        print("self.df_fd_asin_info")
        self.df_fd_asin_info.show(10, truncate=False)

        # ASIN history: launch time and first-level BSR category.
        sql = f"select asin,asin_launch_time,bsr_cate_1_id " \
              f"from dim_cal_asin_history_detail where site_name = '{self.site_name}'"
        self.df_asin_history = self.spark.sql(sqlQuery=sql)
        # Cached because it feeds both the seller join and the per-category count.
        self.df_asin_history = self.df_asin_history.cache()
        print("self.df_asin_history")
        self.df_asin_history.show(10, truncate=False)

    # Data processing
    def handle_data(self):
        """Run the aggregation steps in order and build ``self.df_save``."""
        # ASIN count and new-product count per seller and category.
        self.handle_fd_cate_asin_agg()
        # ASIN count per seller.
        self.handle_fd_asin_agg()
        # ASIN count per BSR category.
        self.handle_bsr_asin_agg()
        # Join the three aggregates and derive the ratios.
        self.handle_result_cal()
        # Final column selection, defaults and partition columns.
        self.handle_column()

    def handle_fd_cate_asin_agg(self):
        """Aggregate ASIN count and new-product count per (seller, BSR category)."""
        joined = self.df_fd_asin_info.join(self.df_asin_history, on='asin', how='left')

        # Days between the launch date and the reference day (999999 = unknown).
        joined = joined.withColumn(
            "days_diff", self.u_launch_time(F.col("asin_launch_time"), F.lit(self.cal_day)))
        # 1 = new product, 0 = not new, null = unknown launch date.
        joined = joined.withColumn("asin_is_new_flag", self.u_days_diff(F.col("days_diff")))

        self.df_fd_cate_asin_cal = joined.groupby(['fd_account_id', 'bsr_cate_1_id']).agg(
            F.count("asin").alias("fd_cate_asin_num"),
            # sum() skips nulls, so ASINs with an unknown launch date are not counted.
            F.sum("asin_is_new_flag").alias("fd_cate_new_asin_num"),
        )

    def handle_fd_asin_agg(self):
        """Count ASINs per seller account (``fd_asin_num``)."""
        self.df_fd_asin_cal = self.df_fd_asin_info.groupby(['fd_account_id']).agg(
            F.count("asin").alias("fd_asin_num"))

    def handle_bsr_asin_agg(self):
        """Count ASINs per first-level BSR category (``bsr_asin_num``)."""
        self.df_bsr_asin_cal = self.df_asin_history.groupby(['bsr_cate_1_id']).agg(
            F.count("asin").alias("bsr_asin_num"))

    @staticmethod
    def _ceil_ratio(numerator, denominator):
        """Return the column expression ``numerator / denominator`` rounded UP to 4 decimals."""
        return F.ceil((F.col(numerator) / F.col(denominator) * 10000)) / 10000

    def handle_result_cal(self):
        """Join the three aggregates and derive the ratio and period columns."""
        self.df_result_cal = self.df_fd_cate_asin_cal \
            .join(self.df_fd_asin_cal, on='fd_account_id', how='left') \
            .join(self.df_bsr_asin_cal, on='bsr_cate_1_id', how='left')

        ratio_columns = (
            # Share of the seller's ASINs that fall in this category.
            ("fd_cate_asin_per", "fd_cate_asin_num", "fd_asin_num"),
            # New products in this category relative to all of the seller's ASINs.
            ("fd_cate_new_asin_per", "fd_cate_new_asin_num", "fd_asin_num"),
            # Seller's ASINs in this category relative to the category total.
            ("fd_market_per", "fd_cate_asin_num", "bsr_asin_num"),
        )
        for target, numerator, denominator in ratio_columns:
            self.df_result_cal = self.df_result_cal.withColumn(
                target, self._ceil_ratio(numerator, denominator))

        # Period columns.
        self.df_result_cal = self.df_result_cal.withColumn("ym", F.lit(self.ym))
        self.df_result_cal = self.df_result_cal.withColumn("week", F.lit(self.week))

    def handle_column(self):
        """Select the output columns, fill null defaults and append reserved/partition columns."""
        self.df_save = self.df_result_cal.select(
            "fd_account_id", "bsr_cate_1_id", "fd_cate_asin_num",
            "fd_cate_new_asin_num", "fd_asin_num", "bsr_asin_num",
            "fd_cate_asin_per", "fd_cate_new_asin_per", "fd_market_per", "ym", "week")

        self.df_save = self.df_save.na.fill(
            {"bsr_cate_1_id": 0, "fd_cate_asin_per": 0, "fd_cate_new_asin_per": 0, "fd_market_per": 0})

        # Reserved columns, appended in the same order as before.
        reserved_defaults = (
            ("re_double_field", 0),
            ("re_string_field", "null"),
            ("re_int_field", 0),
        )
        for prefix, default in reserved_defaults:
            for index in (1, 2, 3):
                self.df_save = self.df_save.withColumn(f"{prefix}{index}", F.lit(default))

        # Partition columns.
        partition_values = (
            ("site_name", self.site_name),
            ("date_type", self.date_type),
            ("date_info", self.date_info),
        )
        for column, value in partition_values:
            self.df_save = self.df_save.withColumn(column, F.lit(value))

if __name__ == '__main__':
    # Expected command line: <site_name> <date_type> <date_info>
    #   site_name: site code, e.g. us
    #   date_type: period type (week / 4_week / month / quarter per the original note;
    #              NOTE(review): get_calDay_by_dateInfo only resolves a reference day
    #              for day / last30day / week / month)
    #   date_info: year-week / year-month / year-quarter, e.g. 2022-1
    if len(sys.argv) < 4:
        # Fail with a readable usage line instead of a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} <site_name> <date_type> <date_info>")
    site_name, date_type, date_info = sys.argv[1:4]
    handle_obj = DwtFdCategoryAgg(site_name=site_name, date_type=date_type, date_info=date_info)
    handle_obj.run()
