Commit e8a5c1e8 by chenyuanjie

每日利润率清洗

parent 40fcc4ea
......@@ -20,6 +20,7 @@ class DimAsinProfitRateInfo(object):
self.df_asin_profit = self.spark.sql(f"select 1+1;")
self.df_asin_profit_history = self.spark.sql(f"select 1+1;")
self.df_keepa_asin = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def run(self):
......@@ -36,9 +37,20 @@ class DimAsinProfitRateInfo(object):
self.df_asin_profit = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').cache()
self.df_asin_profit_history = self.df_asin_profit.filter(f"date_info < '{self.date_info}'").cache()
# 读取keepa数据
sql = f"""
select asin, package_length, package_width, package_height, weight
from dim_keepa_asin_info where site_name = '{self.site_name}';
"""
self.df_keepa_asin = self.spark.sql(sqlQuery=sql).repartition(40, 'asin')
def handle_data(self):
# 因为keepa数据存在更新的情况,保留与keepa最新数据所对应的数据行
self.df_asin_profit = self.df_asin_profit.join(
self.df_keepa_asin, on=['asin', 'package_length', 'package_width', 'package_height', 'weight'], how='inner'
)
# 去重
window = Window.partitionBy(['asin', 'price', 'package_length', 'package_width', 'package_height', 'weight']).orderBy(
window = Window.partitionBy(['asin', 'price']).orderBy(
self.df_asin_profit.updated_time.desc_nulls_last()
)
self.df_asin_profit = self.df_asin_profit.withColumn(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment