import os
import sys

# Put the parent of the launching script's directory on the import path.
# NOTE(review): presumably this is what lets the project-level ``utils``
# package imported below resolve when the file is run directly - confirm
# against the repository layout.
sys.path.append(os.path.dirname(sys.path[0]))

from utils.spark_util import SparkUtil
from pyspark.sql import functions as F, Window
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils


class DwtAiAsinAdd(object):
    """Spark job that builds one partition of the ``dwt_ai_asin_add`` table.

    For the requested ``site_name`` / ``date_type`` / ``date_info`` the job
    reads the ASIN list from ``dwd_ai_asin_add``, left-joins product details
    (``dwt_flow_asin``), raw detail JSON (``ods_asin_detail``) and seller
    details (``dwt_fb_base_report``), derives ``is_ascending_flag`` from the
    monthly flags of the most recent ``MONTH_WINDOW`` months, and appends the
    result to the Hive table ``dwt_ai_asin_add``.
    """

    # Number of months (current month included) that must all carry
    # ``asin_bought_month_flag = 1`` for an ASIN to be marked as ascending.
    MONTH_WINDOW = 6

    def __init__(self, site_name="us", date_type="month", date_info="2024-10"):
        """Create the Spark session and initialise the job state.

        Args:
            site_name: Value of the ``site_name`` partition to process.
            date_type: Value of the ``date_type`` partition to process.
            date_info: Value of the ``date_info`` partition to process; it is
                also the anchor passed to ``CommonUtil.get_month_offset``.
        """
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)

        # The current period plus the preceding ones, newest first
        # (offsets 0, -1, ..., -(MONTH_WINDOW - 1)).
        self.last_6_month = [
            CommonUtil.get_month_offset(self.date_info, -offset)
            for offset in range(self.MONTH_WINDOW)
        ]

        # Every DataFrame attribute starts out as a dummy DataFrame so it
        # always has the right type. DataFrames are immutable, so a single
        # placeholder can be shared instead of analysing the same query
        # once per attribute.
        placeholder = self.spark.sql("select 1+1;")
        self.df_base_asin = placeholder
        self.df_flow_asin_detail = placeholder
        self.df_fb_info = placeholder
        self.df_ods_asin_detail = placeholder
        self.df_ai_asin_detail = placeholder
        self.df_asin_bought_flag = placeholder
        self.df_save = placeholder

    @staticmethod
    def _preview(title, df):
        """Print ``title`` followed by the first 10 rows of ``df``."""
        print(title)
        df.show(10, truncate=True)

    def run(self):
        """Execute the job: read inputs, derive flags, write the result."""
        self.read_data()
        self.handle_data()
        self.save_data()

    def read_data(self):
        """Load all inputs and assemble ``df_ai_asin_detail``.

        Populates ``df_base_asin``, ``df_flow_asin_detail``, ``df_fb_info``,
        ``df_ods_asin_detail``, ``df_ai_asin_detail`` and
        ``df_asin_bought_flag``. Each input is cached for its preview and the
        input caches are released once the join has been defined.
        """
        # Base ASIN list for the requested partition.
        sql1 = f"""
        select 
            asin, 
            asin_bought_month,
            asin_bought_mom,
            asin_bought_yoy,
            asin_bought_month_flag,
            asin_is_new_flag
        from dwd_ai_asin_add
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info = '{self.date_info}'
        """
        self.df_base_asin = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin').cache()
        self._preview("ASIN信息库基础数据如下：", self.df_base_asin)

        # Product details from the traffic-selection table.
        sql2 = f"""
        select
            asin,
            asin_weight,
            asin_category_desc,
            asin_img_url,
            asin_title,
            asin_brand_name,
            account_name,
            asin_buy_box_seller_type,
            asin_launch_time,
            asin_img_num,
            case when variation_num > 0 then 1 else 0 end as variation_flag,
            variation_num,
            asin_ao_val,
            category_first_id,
            category_id,
            parent_asin,
            first_category_rank,
            asin_price,
            asin_rating,
            asin_total_comments,
            asin_launch_time_type,
            asin_describe
        from dwt_flow_asin
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info = '{self.date_info}'
          and asin_type in (0, 1)
          and asin_bought_month >= 50
        """
        self.df_flow_asin_detail = self.spark.sql(sqlQuery=sql2).repartition(40, 'asin').cache()
        self._preview("流量选品详情数据如下：", self.df_flow_asin_detail)

        # Seller (store) details; one arbitrary row is kept per account_name
        # so the later join cannot multiply ASIN rows.
        sql3 = f"""
        select
            account_name,
            seller_id,
            fb_country_name,
            business_addr
        from dwt_fb_base_report
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info = '{self.date_info}'
        """
        self.df_fb_info = self.spark.sql(sqlQuery=sql3).dropDuplicates(['account_name']).cache()
        self._preview("店铺详情数据如下：", self.df_fb_info)

        # Raw detail JSON (review_json_list etc.); only the most recently
        # updated row per ASIN is kept, rows without updated_at sort last.
        sql4 = f"""
        select
            asin,
            review_json_list,
            product_json,
            product_detail_json,
            updated_at
        from ods_asin_detail
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info = '{self.date_info}'
        """
        df_ods_raw = self.spark.sql(sqlQuery=sql4)
        latest_first = Window.partitionBy(['asin']).orderBy(
            df_ods_raw.updated_at.desc_nulls_last()
        )
        self.df_ods_asin_detail = df_ods_raw.withColumn(
            'rank', F.row_number().over(window=latest_first)
        ).filter('rank = 1').drop('rank', 'updated_at').repartition(40, 'asin').cache()
        self._preview("ods详情数据如下：", self.df_ods_asin_detail)

        # Enrich the base ASINs. All joins are left joins so every base ASIN
        # survives even when a detail source has no matching row.
        # The joined frame is consumed exactly once (by save_data), so it is
        # deliberately not cached: caching it would only add storage overhead
        # to the single write job.
        self.df_ai_asin_detail = self.df_base_asin.join(
            self.df_flow_asin_detail, 'asin', 'left'
        ).join(
            self.df_ods_asin_detail, 'asin', 'left'
        ).join(
            self.df_fb_info, 'account_name', 'left'
        )
        # Release the caches that only served the previews above.
        self.df_base_asin.unpersist()
        self.df_flow_asin_detail.unpersist()
        self.df_fb_info.unpersist()
        self.df_ods_asin_detail.unpersist()

        # Monthly sales flag from dwd_ai_asin_add for every month in
        # last_6_month.
        sql5 = f"""
        select 
            asin, 
            asin_bought_month_flag 
        from dwd_ai_asin_add
        where site_name = '{self.site_name}'
          and date_type = '{self.date_type}'
          and date_info in ({CommonUtil.list_to_insql(self.last_6_month)})
        """
        self.df_asin_bought_flag = self.spark.sql(sqlQuery=sql5).repartition(40, 'asin').cache()
        self._preview("dwd_ai_asin_add月销标识数据如下：", self.df_asin_bought_flag)

    def handle_data(self):
        """Derive ``is_ascending_flag`` per ASIN from the monthly flags.

        An ASIN is flagged ``1`` when the number of rows with
        ``asin_bought_month_flag = 1`` equals the number of months that were
        queried, i.e. sales were flagged as rising in every month of the
        window; otherwise ``0``.
        NOTE(review): this assumes at most one row per ASIN and month in
        ``dwd_ai_asin_add`` - confirm against the table's uniqueness rules.
        """
        # Tie the threshold to the months actually queried rather than to a
        # second hard-coded literal.
        window_size = len(self.last_6_month)
        self.df_asin_bought_flag = self.df_asin_bought_flag.groupBy('asin').agg(
            F.sum(F.when(F.col('asin_bought_month_flag') == 1, 1).otherwise(0)).alias('sum_flag')
        ).withColumn(
            'is_ascending_flag', F.when(F.col('sum_flag') == window_size, 1).otherwise(0)
        )

    def save_data(self):
        """Project the output columns and append them to ``dwt_ai_asin_add``.

        The files under the target partition path are removed through
        ``HdfsUtils.delete_file_in_folder`` before the append, so re-running
        the job for the same partition is presumably meant to replace the
        previous output rather than duplicate it.
        """
        # Column standardisation. Note the category renames: the source
        # ``category_first_id`` is published as ``category_id`` and the source
        # ``category_id`` as ``category_current_id``.
        # The result feeds a single write, so it is not cached.
        self.df_save = self.df_ai_asin_detail.join(
            self.df_asin_bought_flag, 'asin', 'left'
        ).select(
            F.col("asin"),
            F.col("asin_weight").alias("weight"),
            F.col("asin_bought_month").alias("bought_month"),
            F.col("asin_category_desc").alias("category"),
            F.col("asin_img_url").alias("img"),
            F.col("asin_title").alias("title"),
            F.col("asin_brand_name").alias("brand"),
            F.col("account_name"),
            F.col("business_addr").alias("account_addr"),
            F.col("asin_buy_box_seller_type").alias("buy_box_seller_type"),
            F.col("asin_launch_time").alias("launch_time"),
            F.col("asin_img_num").alias("img_num"),
            F.col("variation_flag"),
            F.col("variation_num"),
            F.col("asin_ao_val").alias("ao_val"),
            F.col("category_first_id").alias("category_id"),
            F.col("category_id").alias("category_current_id"),
            F.col("parent_asin"),
            F.col("first_category_rank").alias("bsr_rank"),
            F.col("asin_price").alias("price"),
            F.col("asin_rating").alias("rating"),
            F.col("asin_total_comments").alias("total_comments"),
            F.col("seller_id"),
            F.col("fb_country_name"),
            F.col("review_json_list"),
            F.col("asin_launch_time_type").alias("launch_time_type"),
            F.col("asin_describe").alias("describe"),
            F.col("product_json"),
            F.col("product_detail_json"),
            F.col("asin_bought_mom").alias("bought_month_mom"),
            F.col("asin_bought_yoy").alias("bought_month_yoy"),
            F.col("asin_is_new_flag").alias("is_new_flag"),
            F.col("is_ascending_flag"),
            F.lit(self.site_name).alias("site_name"),
            F.lit(self.date_type).alias("date_type"),
            F.lit(self.date_info).alias("date_info")
        ).repartition(100)

        # Persist the result into the Hive partition for this run.
        partition_by = ["site_name", "date_type", "date_info"]
        hive_tb = "dwt_ai_asin_add"
        hdfs_path = CommonUtil.build_hdfs_path(
            hive_tb,
            partition_dict={
                "site_name": self.site_name,
                "date_type": self.date_type,
                "date_info": self.date_info,
            }
        )
        HdfsUtils.delete_file_in_folder(hdfs_path)
        print(f"正在进行数据存储，当前存储的表名为：{hive_tb}，存储路径：{hdfs_path}")
        self.df_save.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)

        print("success!")


if __name__ == "__main__":
    # Expected invocation: <script> <site_name> <date_type> <date_info>.
    # Fail with a readable usage message instead of a bare IndexError when
    # arguments are missing; extra arguments are ignored as before.
    if len(sys.argv) < 4:
        sys.exit(f"Usage: {sys.argv[0]} <site_name> <date_type> <date_info>")
    site_name, date_type, date_info = sys.argv[1:4]
    handle_obj = DwtAiAsinAdd(site_name=site_name, date_type=date_type, date_info=date_info)
    handle_obj.run()
