import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))

from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DBUtil


class DwdAiAsinAdd(object):

    def __init__(self, site_name="us", date_type="month", date_info="2024-10"):
        """Create the Spark session and derive the comparison periods.

        Args:
            site_name: Site code; used as a filter / partition value in every
                query and write issued by this job.
            date_type: Value of the ``date_type`` partition column to process.
            date_info: Period being processed. It is handed to
                ``CommonUtil.get_month_offset``, so presumably a ``YYYY-MM``
                string - confirm against that helper.
        """
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info

        session_name = f"{type(self).__name__}:{site_name}:{date_type}:{date_info}"
        self.spark = SparkUtil.get_spark_session(session_name)

        # Historical periods used as comparison baselines.
        # Month-over-month: one month back.
        self.date_info_last_month = CommonUtil.get_month_offset(self.date_info, -1)
        # Year-over-year: twelve months back.
        self.date_info_last_year = CommonUtil.get_month_offset(self.date_info, -12)

        # Placeholder DataFrames so that every attribute exists up front; the
        # real frames are assigned later by read_data / save_data.
        placeholder_sql = "select 1+1;"
        self.df_base_asin = self.spark.sql(placeholder_sql)
        self.df_asin_last_month = self.spark.sql(placeholder_sql)
        self.df_ai_asin = self.spark.sql(placeholder_sql)
        self.df_save1 = self.spark.sql(placeholder_sql)
        self.df_save2 = self.spark.sql(placeholder_sql)

    def run(self):
        """Execute the job end to end: read inputs, transform, then persist."""
        pipeline = (self.read_data, self.handle_data, self.save_data)
        for stage in pipeline:
            stage()

    def read_data(self):
        """Load every input DataFrame used by the job.

        Populates:
            self.df_base_asin: current-period rows of ``dwt_flow_asin``
                (``asin_type in (0, 1)`` and ``asin_bought_month >= 50``) with
                disabled categories removed and the last-month / last-year
                ``asin_bought_month`` values attached as
                ``asin_bought_last_month`` / ``asin_bought_last_year``.
            self.df_asin_last_month: ASINs stored in ``dwd_ai_asin_add`` for the
                previous month, each carrying ``asin_is_new_flag = 0``.
            self.df_ai_asin: ASINs already present in ``dim_ai_asin_base`` for
                this site (cached and materialised by the ``count()`` below).
        """
        # Every ASIN-keyed frame is hash-partitioned the same way before joining.
        asin_partitions = 40

        # Select the ASINs the ASIN information base needs, based on the
        # traffic-selection table.
        sql1 = f"""
        select
            asin,
            asin_bought_month,
            category_id,
            asin_category_desc
        from dwt_flow_asin
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info = '{self.date_info}'
          and asin_type in (0, 1)
          and asin_bought_month >= 50
        """
        self.df_base_asin = self.spark.sql(sqlQuery=sql1).repartition(asin_partitions, 'asin')

        # Categories that must be filtered out: every category whose id_path
        # starts with a disabled id_path for the same site.
        sql2 = f"""
        select distinct category_id as category_id from category_full_name a 
        where EXISTS (
            select 1 from category_disable_config b where b.id_path is not null and a.id_path like concat(b.id_path, '%') and a.site = b.site 
        ) and a.site = '{self.site_name}'
        """
        # NOTE(review): the connection is always resolved for "us" while the
        # queries filter on self.site_name - presumably the category tables for
        # all sites live in that one database; confirm.
        conn_info = DBUtil.get_connection_info("mysql", "us")
        df_filter_category_id = SparkUtil.read_jdbc_query(
            session=self.spark,
            url=conn_info["url"],
            pwd=conn_info["pwd"],
            username=conn_info["username"],
            query=sql2
        )

        # Second category filter (by name path) to avoid missing anything.
        sql3 = f"""
        select distinct name_path as asin_category_desc from category_disable_config where site = '{self.site_name}'
        """
        df_filter_category_desc = SparkUtil.read_jdbc_query(
            session=self.spark,
            url=conn_info["url"],
            pwd=conn_info["pwd"],
            username=conn_info["username"],
            query=sql3
        )

        # Historical traffic-selection data (previous month and same month last year).
        sql4 = f"""
        select
            asin,
            asin_bought_month,
            date_info
        from dwt_flow_asin
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info in ('{self.date_info_last_month}', '{self.date_info_last_year}')
        """
        # No cache() here: caching is lazy, and the previous cache()/unpersist()
        # pair ran with no action in between, so no data was ever stored by it.
        df_flow_asin = self.spark.sql(sqlQuery=sql4)
        df_flow_asin_last_month = df_flow_asin.filter(
            f"date_info = '{self.date_info_last_month}'"
        ).withColumnRenamed(
            'asin_bought_month', 'asin_bought_last_month'
        ).drop('date_info').repartition(asin_partitions, 'asin')
        df_flow_asin_last_year = df_flow_asin.filter(
            f"date_info = '{self.date_info_last_year}'"
        ).withColumnRenamed(
            'asin_bought_month', 'asin_bought_last_year'
        ).drop('date_info').repartition(asin_partitions, 'asin')

        # Drop disabled categories (anti joins), then attach the historical
        # sales figures (left joins keep ASINs that have no history).
        self.df_base_asin = self.df_base_asin.join(
            df_filter_category_id, 'category_id', 'left_anti'
        ).join(
            df_filter_category_desc, 'asin_category_desc', 'left_anti'
        ).join(
            df_flow_asin_last_month, 'asin', 'left'
        ).join(
            df_flow_asin_last_year, 'asin', 'left'
        ).drop('category_id', 'asin_category_desc').cache()
        print("ASIN信息库基础数据如下：")
        self.df_base_asin.show(10, truncate=True)

        # ASINs stored for the previous month; used later to decide which
        # ASINs are new (matched rows carry asin_is_new_flag = 0).
        sql5 = f"""
        select asin, 0 as asin_is_new_flag 
        from dwd_ai_asin_add 
        where site_name = '{self.site_name}' 
          and date_type = '{self.date_type}'
          and date_info = '{self.date_info_last_month}'
        """
        self.df_asin_last_month = self.spark.sql(sqlQuery=sql5).repartition(asin_partitions, 'asin')

        # Existing content of the ASIN information base.
        sql6 = f"""
        select asin from dim_ai_asin_base where site_name = '{self.site_name}'
        """
        self.df_ai_asin = self.spark.sql(sqlQuery=sql6).repartition(asin_partitions, 'asin').cache()
        # count() is an action, so it also materialises the cache requested above.
        print(f"ASIN信息库历史数量：{self.df_ai_asin.count()}")

    def handle_data(self):
        """Derive growth ratios and flags on ``self.df_base_asin``.

        Adds, in this order:
            asin_bought_yoy: change of ``asin_bought_month`` versus
                ``asin_bought_last_year``.
            asin_bought_mom: change of ``asin_bought_month`` versus
                ``asin_bought_last_month``.
            asin_bought_month_flag: 1 = rising, 2 = unchanged, 3 = falling,
                0 = default (either month-level value is null).
            asin_is_new_flag: 0 when the ASIN is found in
                ``self.df_asin_last_month``, otherwise 1.
        The resulting frame is marked for caching.
        """
        current = F.col("asin_bought_month")

        def growth_rate(previous_name):
            """Build the ratio column of ``current`` against ``previous_name``.

            Null when both values are null, -1000.0 when only the current value
            is null, 1000.0 when only the previous value is null, otherwise
            (current - previous) / previous rounded to 4 decimals.
            """
            previous = F.col(previous_name)
            both_missing = current.isNull() & previous.isNull()
            ratio = F.round((current - previous) / previous, 4)
            return F.when(
                both_missing, F.lit(None)
            ).when(
                current.isNull(), F.lit(-1000.0000)
            ).when(
                previous.isNull(), F.lit(1000.0000)
            ).otherwise(ratio)

        # Year-over-year and month-over-month change of monthly sales.
        with_ratios = self.df_base_asin.withColumn(
            'asin_bought_yoy', growth_rate("asin_bought_last_year")
        ).withColumn(
            'asin_bought_mom', growth_rate("asin_bought_last_month")
        )

        # Sales trend label: 1 = rising, 2 = unchanged, 3 = falling, 0 = default.
        last_month = F.col("asin_bought_last_month")
        delta = current - last_month
        trend_flag = F.when(
            current.isNull() | last_month.isNull(), F.lit(0)
        ).when(
            delta > 0, F.lit(1)
        ).when(
            delta == 0, F.lit(2)
        ).when(
            delta < 0, F.lit(3)
        ).otherwise(F.lit(0))
        self.df_base_asin = with_ratios.withColumn('asin_bought_month_flag', trend_flag)

        # New-ASIN label: 1 = new, 0 = default. Rows without a match in
        # self.df_asin_last_month come out of the left join with a null flag,
        # which is then filled with 1.
        joined = self.df_base_asin.join(self.df_asin_last_month, 'asin', 'left')
        self.df_base_asin = joined.fillna({'asin_is_new_flag': 1}).cache()

    def save_data(self):
        """Persist the results to Hive.

        Always rewrites the ``dwd_ai_asin_add`` partition for the current
        site / date_type / date_info. When ``date_info >= '2024-10'`` (string
        comparison) it additionally rewrites the ``dim_ai_asin_base`` partition
        of the site with the distinct union of the existing ASINs and the
        ASINs of this run.
        """

        def overwrite_partition(df, hive_tb, partition_dict):
            """Delete the partition folder's files, then append ``df`` to the table."""
            hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
            HdfsUtils.delete_file_in_folder(hdfs_path)
            print(f"正在进行数据存储，当前存储的表名为：{hive_tb}，存储路径：{hdfs_path}")
            # The partition columns are the keys of partition_dict, in order.
            df.write.saveAsTable(
                name=hive_tb, format='hive', mode='append', partitionBy=list(partition_dict)
            )

        # Normalise the output columns.
        metric_columns = (
            "asin",
            "asin_bought_month",
            "asin_bought_last_month",
            "asin_bought_last_year",
            "asin_bought_mom",
            "asin_bought_yoy",
            "asin_bought_month_flag",
            "asin_is_new_flag",
        )
        self.df_save1 = self.df_base_asin.select(
            *[F.col(name) for name in metric_columns],
            F.lit(self.site_name).alias("site_name"),
            F.lit(self.date_type).alias("date_type"),
            F.lit(self.date_info).alias("date_info")
        ).repartition(1)

        # Store dwd_ai_asin_add.
        overwrite_partition(
            self.df_save1,
            "dwd_ai_asin_add",
            {
                "site_name": self.site_name,
                "date_type": self.date_type,
                "date_info": self.date_info,
            },
        )

        if self.date_info >= '2024-10':
            current_asins = self.df_base_asin.select(F.col("asin"))
            all_asins = current_asins.unionByName(self.df_ai_asin)
            self.df_save2 = all_asins.select(
                F.col("asin"),
                F.lit(self.site_name).alias("site_name")
            ).distinct().repartition(1)

            # Store dim_ai_asin_base.
            # NOTE(review): self.df_ai_asin is read from this same table and the
            # partition files are deleted before the write, so the history rows
            # presumably survive only through the cache set up in read_data -
            # confirm this is acceptable if cached data can be lost.
            overwrite_partition(
                self.df_save2,
                "dim_ai_asin_base",
                {
                    "site_name": self.site_name,
                },
            )

        print("success!")


if __name__ == "__main__":
    # Positional command-line arguments: <site_name> <date_type> <date_info>.
    DwdAiAsinAdd(
        site_name=sys.argv[1],
        date_type=sys.argv[2],
        date_info=sys.argv[3],
    ).run()
