import os
import sys
import re

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType


class DimBsAsinInfo(Templates):
    """Build the ``dim_asin_bs_info`` dataset for one site and period.

    The job reads per-ASIN category ids from ``dim_asin_detail`` and the raw
    "best sellers rank" text from ``ods_bs_category_asin_detail``, parses the
    first-level category rank and the current category rank out of that text,
    and left-joins the parsed ranks onto the ASIN rows.  The result is stored
    in ``self.df_save`` together with the partition columns ``site_name``,
    ``date_type`` and ``date_info``.

    Session creation, partition handling and persistence come from
    ``Templates``, which is not defined in this file.
    """

    def __init__(self, site_name='us', date_type="month", date_info='2022-1'):
        """Store the job parameters and prepare the Spark session and parsing rules.

        Args:
            site_name: Site code; must be a key of ``pattern1_dict`` /
                ``pattern_current_dict`` ("us", "uk", "de", "es", "fr", "it").
            date_type: Period type, e.g. "week", "4_week", "month", "quarter".
            date_info: Period identifier, e.g. "2022-1".
        """
        super().__init__()
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        self.db_save = 'dim_asin_bs_info'
        # Create the Spark session; create_spark_object is inherited.
        self.spark = self.create_spark_object(
            app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
        # Placeholder DataFrames; read_data() / handle_data() replace them.
        self.df_save = self.spark.sql("select 1+1;")
        self.df_asin_node_id = self.spark.sql("select 1+1;")
        self.df_bs_asin_detail = self.spark.sql("select 1+1;")
        self.df_bs_category = self.spark.sql("select 1+1;")
        # Return type of the parsing UDF: a struct with two nullable integer fields.
        schema = StructType([
            StructField('asin_bs_cate_1_rank', IntegerType(), True),
            StructField('asin_bs_cate_current_rank', IntegerType(), True),
        ])
        self.u_parse_bs_category = F.udf(self.udf_parse_bs_category, schema)
        # Per-site regex for the first-level category rank.  The patterns are
        # lower-cased because they are applied to lower-cased rank text.  Raw
        # strings keep ``\d`` from being treated as a string escape sequence.
        self.pattern1_dict = {
            "us": r"(\d+).*?See Top 100 in ".lower(),
            "uk": r"(\d+).*?See Top 100 in ".lower(),
            "de": r"(\d+).*?Siehe Top 100 in ".lower(),
            "es": r"(\d+).*?Ver el Top 100 en ".lower(),
            "fr": r"(\d+).*?Voir les 100 premiers en ".lower(),
            "it": r"(\d+).*?Visualizza i Top 100 nella categoria ".lower(),
        }
        # Per-site regex for "<rank> in <category>"; the last match in the
        # text is taken as the current category rank.
        self.pattern_current_dict = {
            "us": r"(\d+) in ",
            "uk": r"(\d+) in ",
            "de": r"(\d+) in ",
            "es": r"(\d+) en ",
            "fr": r"(\d+) en ",
            "it": r"(\d+) in ",
        }
        self.partitions_by = ['site_name', 'date_type', 'date_info']
        self.reset_partitions(partitions_num=20)
        # Inherited helper; presumably populates self.year_week_tuple, which
        # read_data() uses -- confirm against Templates.
        self.get_year_week_tuple()

    @staticmethod
    def udf_parse_bs_category(asin_bs_sellers_rank_lower, cate_1_pattern1, cate_1_pattern2, cate_current_pattern):
        """Parse the first-level and current category ranks from rank text.

        Args:
            asin_bs_sellers_rank_lower: Lower-cased "best sellers rank" text,
                or None when the source value is NULL.
            cate_1_pattern1: Regex whose first group captures the first-level
                category rank.
            cate_1_pattern2: Alternation of first-level category names.
                Currently unused; kept so the UDF call signature is unchanged.
            cate_current_pattern: Regex whose first group captures a rank
                followed by the site's "in"/"en" word.

        Returns:
            A tuple ``(asin_bs_cate_1_rank, asin_bs_cate_current_rank)``; each
            element is an int, or None when nothing matched.
        """
        # A NULL rank string reaches the UDF as None; return an all-null
        # struct instead of failing the task with an AttributeError.
        if asin_bs_sellers_rank_lower is None:
            return None, None
        # Strip thousands separators so "1,234" parses as 1234.
        rank_text = asin_bs_sellers_rank_lower.replace(",", "")
        cate_1_matches = re.findall(cate_1_pattern1, rank_text)
        cate_current_matches = re.findall(cate_current_pattern, rank_text)
        # First match -> first-level rank; last match -> current category rank.
        asin_bs_cate_1_rank = int(cate_1_matches[0]) if cate_1_matches else None
        asin_bs_cate_current_rank = int(cate_current_matches[-1]) if cate_current_matches else None
        return asin_bs_cate_1_rank, asin_bs_cate_current_rank

    @staticmethod
    def _keep_latest_per_asin(df):
        """Return *df* reduced to one row per asin: the one with the greatest date_info."""
        window = Window.partitionBy(['asin']).orderBy(df.date_info.desc())
        return (
            df.withColumn("row_number", F.row_number().over(window=window))
            .filter("row_number=1")
            .drop("row_number")
        )

    def read_data(self):
        """Load the three source tables into cached DataFrames and preview each."""
        # 1. dim_asin_detail: category ids and rank per asin for this period.
        sql = f"select asin, category_id as asin_bs_cate_current_id, category_first_id as asin_bs_cate_1_id, asin_rank, date_info from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info ='{self.date_info}';"
        print("sql:", sql)
        self.df_asin_node_id = self.spark.sql(sqlQuery=sql).cache()
        self.df_asin_node_id.show(10, truncate=False)

        # 2. ods_bs_category: category names with their depth (nodes_num).
        sql = f"select nodes_num, en_name from ods_bs_category where site_name='{self.site_name}';"
        print("sql:", sql)
        self.df_bs_category = self.spark.sql(sqlQuery=sql).cache()
        self.df_bs_category.show(10, truncate=False)

        # 3. ods_bs_category_asin_detail: raw rank text for the period's weeks.
        # Use the instance attributes here: bare ``date_type`` / ``date_info``
        # exist only as globals when this file is run as a script.
        # NOTE(review): the period comparisons below are lexicographic string
        # comparisons and assume zero-padded identifiers -- confirm.
        # NOTE(review): year_week_tuple is interpolated with its Python repr;
        # its exact type is set by Templates -- confirm it renders as valid SQL.
        if max(self.year_week_tuple) <= '2022-42' and self.date_type == 'month':
            params = " date_info <= '2022-42'"
        else:
            params = f" date_info in {self.year_week_tuple}"
        sql = f"select asin, best_sellers_rank as asin_bs_sellers_rank, date_info from ods_bs_category_asin_detail " \
              f"where site_name='{self.site_name}' and date_type='week' and {params};"

        # For "us" from 2023-10 on, month / month_week periods are read
        # directly by their own date_type and date_info.
        if self.site_name == 'us' and self.date_type in ['month', 'month_week'] and self.date_info >= '2023-10':
            sql = f"select asin, best_sellers_rank as asin_bs_sellers_rank, date_info from ods_bs_category_asin_detail " \
                  f"where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}';"

        print("sql:", sql)
        self.df_bs_asin_detail = self.spark.sql(sqlQuery=sql).cache()
        self.df_bs_asin_detail = self.df_bs_asin_detail.drop_duplicates(['asin', 'date_info'])
        self.df_bs_asin_detail.show(10, truncate=False)

    def handle_df_asin_node_id(self):
        """Keep only the most recent row (by date_info) for each asin."""
        self.df_asin_node_id = self._keep_latest_per_asin(self.df_asin_node_id)

    def handle_df_bs_asin_detail(self):
        """Parse ranks from the latest rank text per asin and build ``self.df_save``."""
        # Keep each asin's most recent rank text; date_info is dropped so it
        # does not collide with the date_info column of df_asin_node_id.
        self.df_bs_asin_detail = self._keep_latest_per_asin(self.df_bs_asin_detail).drop("date_info")

        # The regex patterns are lower-case, so match against lower-cased text.
        self.df_bs_asin_detail = self.df_bs_asin_detail.withColumn("asin_bs_sellers_rank_lower", F.lower("asin_bs_sellers_rank"))

        # First-level matching rules.  Rows with nodes_num == 2 are used as
        # the first-level categories; their names form cate_1_pattern2, which
        # is passed to the UDF but currently not used by it.
        df_cate_1 = self.df_bs_category.filter("nodes_num==2").toPandas()
        pattern1_list = df_cate_1.en_name.to_numpy()
        cate_1_pattern1 = self.pattern1_dict[self.site_name]
        cate_1_pattern2 = "|".join(pattern1_list)

        print(f"site_name: {self.site_name}, cate_1_pattern1: {cate_1_pattern1}")
        print(f"site_name: {self.site_name}, cate_1_pattern2: {cate_1_pattern2}")
        # Current-category matching rule.
        cate_current_pattern = self.pattern_current_dict[self.site_name]
        print(f"site_name: {self.site_name}, cate_current_pattern: {cate_current_pattern}")

        # Parse both ranks into one struct column, then flatten it.
        self.df_bs_asin_detail = self.df_bs_asin_detail.withColumn(
            'asin_bs_cate_ranks', self.u_parse_bs_category('asin_bs_sellers_rank_lower', F.lit(cate_1_pattern1), F.lit(cate_1_pattern2), F.lit(cate_current_pattern))
        )
        self.df_bs_asin_detail = self.df_bs_asin_detail \
            .withColumn('asin_bs_cate_1_rank', self.df_bs_asin_detail.asin_bs_cate_ranks.getField('asin_bs_cate_1_rank')) \
            .withColumn('asin_bs_cate_current_rank', self.df_bs_asin_detail.asin_bs_cate_ranks.getField('asin_bs_cate_current_rank')) \
            .drop('asin_bs_cate_ranks')

        # Left join keeps every asin from dim_asin_detail, with or without rank text.
        # (Filling a missing first-level rank from asin_rank was removed on 2023-09-26.)
        self.df_save = self.df_asin_node_id.join(
            self.df_bs_asin_detail, 'asin', how='left'
        )

        self.df_save.show(20)
        self.df_save = self.df_save.drop("asin_bs_sellers_rank_lower", "asin_rank")

    def handle_data(self):
        """Run both transformation steps and stamp the partition columns on ``df_save``."""
        self.handle_df_asin_node_id()
        self.handle_df_bs_asin_detail()
        self.df_save = self.df_save.withColumn("site_name", F.lit(self.site_name))
        self.df_save = self.df_save.withColumn("date_type", F.lit(self.date_type))
        self.df_save = self.df_save.withColumn("date_info", F.lit(self.date_info))

if __name__ == '__main__':
    # Command-line entry point: expects three positional arguments.
    site_name = sys.argv[1]  # arg 1: site
    date_type = sys.argv[2]  # arg 2: period type: week/4_week/month/quarter
    date_info = sys.argv[3]  # arg 3: year-week / year-month / year-quarter, e.g. 2022-1
    handle_obj = DimBsAsinInfo(site_name=site_name, date_type=date_type, date_info=date_info)
    # run() is not defined in this file; presumably inherited from Templates.
    handle_obj.run()