"""
   @Author      : wangrui
   @Description : Top-3 ASIN details per search term
   @SourceTable :
                  dim_st_detail
                  dim_cal_asin_history_detail
                  ods_st_key
                  dim_date_20_to_30
                  dwt_brand_st_top3_asin (previous period partition)

   @SinkTable   : dwt_brand_st_top3_asin
   @CreateTime  : 2023/02/21 16:55
   @UpdateTime  : 2023/02/21 16:55
"""

import os
import sys
import re
from functools import reduce

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.templates import Templates
# from ..utils.templates import Templates
# Window functions for grouped sorting / ranking
from pyspark.sql.window import Window
from pyspark.sql import functions as F
import datetime
from pyspark.sql.types import StringType, IntegerType, DoubleType
from sqlalchemy import create_engine
import pandas as pd


class DwtBrandStTop3Asin(Templates):

    def __init__(self, site_name="us", date_type="week", date_info="2022-1"):
        """Create the Spark session, resolve period dates and set up placeholders.

        Args:
            site_name: Site identifier; every source query filters on it.
            date_type: Period granularity. The repartition count is tuned for
                ``week``, ``month``, ``last30day`` and ``quarter``; any other
                value leaves the partition count untouched.
            date_info: Period key matching ``date_type``, e.g. ``"2022-1"``.
        """
        super().__init__()
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        self.db_save = "dwt_brand_st_top3_asin"
        app_name = f"{self.db_save}: {self.site_name},{self.date_type}, {self.date_info}"
        self.spark = self.create_spark_object(app_name=app_name)

        # Output frame and calendar frame start as throwaway placeholders.
        self.df_save = self.spark.sql("select 1+1;")
        self.df_date = self.spark.sql("select 1+1;")
        # Both lookups query dim_date_20_to_30, so they must run after
        # self.spark exists; they also overwrite self.df_date.
        self.previous_date = self.udf_get_previous_date()
        self.last_date = self.udf_get_last_day()

        self.partitions_by = ['site_name', 'date_type', 'date_info']
        partitions_by_date_type = {"week": 5, "month": 10, "last30day": 10, "quarter": 20}
        partitions_num = partitions_by_date_type.get(self.date_type)
        if partitions_num is not None:
            self.reset_partitions(partitions_num=partitions_num)

        # Placeholder frames; read_data() replaces each of them.
        for frame_attr in ("df_st_detail", "df_asin_history", "df_st_key", "df_previous_st_top3"):
            setattr(self, frame_attr, self.spark.sql("select 1+1;"))

        # Register the comma-list clean-up function as a Spark UDF.
        self.u_format_string = self.spark.udf.register(
            'u_format_string', self.udf_get_format_string, StringType())

    def udf_get_previous_date(self):
        self.df_date = self.spark.sql(f"select * from dim_date_20_to_30;")
        df = self.df_date.toPandas()
        if self.date_type == 'week':
            df_today = df.loc[(df.year_week == f'{self.date_info}') & (df.week_day == 1)]
            date_id = list(df_today.id)[0]
            previous_date_id = int(date_id) - 4
            df_loc = df.loc[df.id == previous_date_id]
            previous_week = list(df_loc.year_week)[0]
            return previous_week
        elif self.date_type == 'month':
            df_loc = df.loc[(df.year_month == f'{self.date_info}') & (df.day == 1)]
            date_id = list(df_loc.id)[0]
            previous_date_id = int(date_id) - 1
            df_loc = df.loc[df.id == previous_date_id]
            previous_month = list(df_loc.year_month)[0]
            return previous_month

    def udf_get_last_day(self):
        self.df_date = self.spark.sql(f"select * from dim_date_20_to_30;")
        df = self.df_date.toPandas()
        if self.date_type == 'week':
            df_loc = df.loc[(df.year_week == f'{self.date_info}') & (df.week_day == 7)]
            last_day_of_week = list(df_loc.date)[0]
            return last_day_of_week
        elif self.date_type == 'month':
            df_loc = df.loc[(df.year_month == f'{self.date_info}') & (df.day == 28)]
            last_day_of_month = list(df_loc.date)[0]
            return last_day_of_month

    @staticmethod
    def udf_get_format_string(original_string):
        source_string = str(original_string)
        source_string = source_string.strip(',')
        result_string = source_string.replace(',,', ',')
        return result_string

    def read_data(self):
        # 1.读取dim_st_detail
        sql = f"select " \
              f"search_term, " \
              f"st_rank, " \
              f"st_asin1, " \
              f"st_asin2, " \
              f"st_asin3, " \
              f"st_click_share_sum, " \
              f"st_conversion_share_sum, " \
              f"st_quantity_being_sold, " \
              f"st_search_num " \
              f"from dim_st_detail " \
              f"where site_name = '{self.site_name}' and date_type = '{self.date_type}' and date_info = '{self.date_info}'"
        print("sql:" + sql)
        self.df_st_detail = self.spark.sql(sqlQuery=sql).cache()
        self.df_st_detail = self.df_st_detail.withColumnRenamed("st_click_share_sum", "click_percentage")
        self.df_st_detail = self.df_st_detail.withColumnRenamed("st_conversion_share_sum", "conversion_percentage")
        # self.df_st_detail.show(10, truncate=False)

        # 2.读取dim_cal_asin_history_detail
        sql = f"select " \
              f"asin, " \
              f"asin_launch_time " \
              f"from dim_cal_asin_history_detail " \
              f"where site_name = '{self.site_name}'"
        print("sql:" + sql)
        self.df_asin_history = self.spark.sql(sqlQuery=sql)
        # self.df_asin_history.show(10, truncate=False)

        # 3.读取ods_st_key
        sql = f"select " \
              f"st_key, " \
              f"search_term " \
              f"from ods_st_key " \
              f"where site_name = '{self.site_name}'"
        print("sql:" + sql)
        self.df_st_key = self.spark.sql(sqlQuery=sql).cache()
        # self.df_st_key.show(10, truncate=False)

        # 4.读取上一个时间分区的st_top3
        sql = f"select " \
              f"search_term_id, " \
              f"rank, " \
              f"opportunity_index " \
              f"from dwt_brand_st_top3_asin " \
              f"where site_name = '{self.site_name}' and date_type = '{self.date_type}' and date_info = '{self.previous_date}'"
        print("sql:" + sql)
        self.df_previous_st_top3 = self.spark.sql(sqlQuery=sql).cache()
        self.df_previous_st_top3 = self.df_previous_st_top3.withColumnRenamed("rank", "pre_rank")
        self.df_previous_st_top3 = self.df_previous_st_top3.withColumnRenamed("opportunity_index",
                                                                              "pre_opportunity_index")
        # self.df_previous_st_top3.show(10, truncate=False)

    def get_st_index_change(self):
        """Attach search-term ids and period-over-period change metrics.

        Inner-joins the search-term key mapping (``st_key`` becomes
        ``search_term_id``), left-joins the previous period's rank and
        opportunity index, then derives:

        * ``opportunity_index``: ``st_search_num / st_quantity_being_sold``
        * ``rank_rise``: ``(st_rank - pre_rank) / pre_rank``
        * ``opportunity_index_rise``: relative change versus the previous
          opportunity index

        Each metric stays null when its guarding input is null. The helper
        columns are dropped afterwards and ``st_rank`` is renamed to ``rank``.
        """
        enriched = (
            self.df_st_detail
            .join(self.df_st_key, on=['search_term'], how='inner')
            .withColumnRenamed("st_key", "search_term_id")
            .join(self.df_previous_st_top3, on=["search_term_id"], how='left')
        )

        opportunity_index = F.when(
            F.col("st_quantity_being_sold").isNotNull(),
            F.col("st_search_num") / F.col("st_quantity_being_sold"),
        ).otherwise(F.lit(None))
        enriched = enriched.withColumn("opportunity_index", opportunity_index)

        rank_rise = F.when(
            F.col("pre_rank").isNotNull(),
            (F.col("st_rank") - F.col("pre_rank")) / F.col("pre_rank"),
        ).otherwise(F.lit(None))
        enriched = enriched.withColumn("rank_rise", rank_rise)

        # Must come after opportunity_index has been added to the frame.
        opportunity_index_rise = F.when(
            F.col("pre_opportunity_index").isNotNull(),
            (F.col("opportunity_index") - F.col("pre_opportunity_index")) / F.col("pre_opportunity_index"),
        )
        enriched = enriched.withColumn("opportunity_index_rise", opportunity_index_rise)

        enriched = enriched.drop(
            "st_quantity_being_sold", "st_search_num", "pre_rank", "pre_opportunity_index")
        self.df_st_detail = enriched.withColumnRenamed("st_rank", "rank")

    def get_new_asin(self):
        """Add ``new_asin``: the top-3 ASINs launched in the last 180 days.

        For each of ``st_asin1`` .. ``st_asin3`` the launch time is looked up
        in ``df_asin_history``; an ASIN counts as new when its
        ``asin_launch_time`` is on or after ``self.last_date`` minus 180 days.
        The new ASINs of a search term are joined with commas, cleaned by the
        registered ``u_format_string`` UDF and left-joined back onto
        ``df_st_detail`` by ``search_term_id``.

        ``self.last_date`` must be a ``%Y-%m-%d`` string.
        """
        period_end = datetime.datetime.strptime(self.last_date, '%Y-%m-%d')
        new_asin_cutoff = period_end - datetime.timedelta(days=180)

        # One frame per position: search_term_id, asinN, launch time, 0/1 flag.
        flagged_frames = []
        for position in (1, 2, 3):
            source_column = f"st_asin{position}"
            is_new_flag = F.when(
                F.col("asin_launch_time") >= new_asin_cutoff, F.lit(1)
            ).otherwise(F.lit(0))
            flagged = (
                self.df_st_detail
                .select("search_term_id", source_column)
                .withColumnRenamed(source_column, "asin")
                .join(self.df_asin_history, on=['asin'], how='left')
                .withColumn(f"is_asin{position}_new_asin", is_new_flag)
                .withColumnRenamed("asin", f"asin{position}")
            )
            flagged_frames.append(flagged)

        first, second, third = flagged_frames
        combined = first.join(
            second, on=['search_term_id'], how='left'
        ).join(
            third, on=['search_term_id'], how='left'
        )

        # Non-new positions contribute '' so the raw list may contain empty
        # entries; the UDF removes them.
        list_parts = [
            F.when(F.col(f"is_asin{position}_new_asin") == 1, F.col(f"asin{position}")).otherwise(F.lit(''))
            for position in (1, 2, 3)
        ]
        combined = combined.withColumn("new_asin_list", F.concat_ws(",", *list_parts))
        combined = combined.withColumn("new_asin", self.u_format_string(F.col("new_asin_list")))
        new_asin_by_term = combined.select("search_term_id", "new_asin")

        self.df_st_detail = self.df_st_detail.join(
            new_asin_by_term, on=['search_term_id'], how='left'
        )

    def get_top_one_count(self):
        """Add ``top_one_asin_counts``: how many search terms each #1 ASIN leads.

        Rows whose ``st_asin1`` is null, empty or the literal text ``'null'``
        are excluded from the count; after the left join those rows carry a
        null count.
        """
        top_one_counts = (
            self.df_st_detail
            .filter("st_asin1 not in ('','null')")
            .filter("st_asin1 is not null")
            .select("search_term_id", "st_asin1")
            .groupby(['st_asin1'])
            .agg(F.count("search_term_id").alias("top_one_asin_counts"))
            # Kept from the original: the aggregate has no search_term_id
            # column, so this drop does not remove anything.
            .drop("search_term_id")
        )
        self.df_st_detail = self.df_st_detail.join(
            top_one_counts, on=['st_asin1'], how='left'
        )

    def get_top_3_asin_sum(self):
        """Add ``top_1_asin_sum`` .. ``top_3_asin_sum`` appearance counts.

        Counts, per ASIN, how many (search term, position) slots it occupies
        across ``st_asin1``, ``st_asin2`` and ``st_asin3`` (null, empty and
        literal ``'null'`` values excluded), then attaches that count for the
        ASIN found at each position.

        NOTE(review): the position columns ``st_asin1`` .. ``st_asin3`` are
        renamed to ``asin`` for the join and then dropped, so they are absent
        from ``df_st_detail`` afterwards — confirm this is intended.
        """
        positions = (1, 2, 3)

        # Stack the three position columns into one (search_term_id, asin) frame.
        per_position = []
        for position in positions:
            source_column = f"st_asin{position}"
            per_position.append(
                self.df_st_detail
                .filter(f"{source_column} not in ('','null')")
                .filter(f"{source_column} is not null")
                .select("search_term_id", source_column)
                .withColumnRenamed(source_column, "asin")
            )
        top1, top2, top3 = per_position
        stacked = top1.unionByName(top2).unionByName(top3)
        asin_counts = stacked.groupby(['asin']).agg(
            F.count("search_term_id").alias("asin_count")
        )

        # Look up the count for the ASIN at each position in turn.
        detail = self.df_st_detail
        for position in positions:
            detail = (
                detail
                .withColumnRenamed(f"st_asin{position}", "asin")
                .join(asin_counts, on=["asin"], how='left')
                .withColumnRenamed("asin_count", f"top_{position}_asin_sum")
                .drop("asin")
            )
        self.df_st_detail = detail

    def handle_data_group(self):
        """Build ``df_save`` from ``df_st_detail`` plus audit and partition columns.

        Prints the first 10 rows (this triggers a Spark action), then appends,
        in order: ``created_time`` / ``updated_time`` (current timestamp as
        ``yyyy-MM-dd HH:mm:ss``), three reserved string fields set to the text
        ``"null"``, three reserved int fields set to ``0``, and the partition
        columns ``site_name``, ``date_type``, ``date_info``.
        """
        self.df_save = self.df_st_detail
        self.df_save.show(10, truncate=False)

        # 'ss' is second-of-minute; the former 'SS' is fraction-of-second in
        # Spark datetime patterns and produced wrong timestamps.
        now_text = F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:ss')
        self.df_save = self.df_save.withColumn("created_time", now_text). \
            withColumn("updated_time", now_text)

        # Reserved columns, filled with fixed placeholder values.
        for suffix in (1, 2, 3):
            self.df_save = self.df_save.withColumn(f"re_string_field{suffix}", F.lit("null"))
        for suffix in (1, 2, 3):
            self.df_save = self.df_save.withColumn(f"re_int_field{suffix}", F.lit(0))

        # Partition columns (see self.partitions_by).
        self.df_save = self.df_save.withColumn("site_name", F.lit(self.site_name))
        self.df_save = self.df_save.withColumn("date_type", F.lit(self.date_type))
        self.df_save = self.df_save.withColumn("date_info", F.lit(self.date_info))

    def handle_data(self):
        self.get_st_index_change()
        self.get_new_asin()
        self.get_top_one_count()
        self.get_top_3_asin_sum()
        self.handle_data_group()


if __name__ == '__main__':
    # Positional command-line arguments:
    #   1. site_name
    #   2. date_type - week / 4_week / month / quarter per the original note
    #      (NOTE(review): the class also sizes partitions for "last30day")
    #   3. date_info - year-week / year-month / year-quarter, e.g. 2022-1
    site_name, date_type, date_info = (sys.argv[position] for position in (1, 2, 3))
    handle_obj = DwtBrandStTop3Asin(site_name=site_name, date_type=date_type, date_info=date_info)
    handle_obj.run()
