Commit 906830e1 by chenyuanjie

fix

parent a9b8d60d
......@@ -19,7 +19,6 @@ class DimAsinProfitRateInfo(object):
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}: {self.site_name} {self.date_info}")
self.df_asin_profit = self.spark.sql(f"select 1+1;")
self.df_asin_profit_history = self.spark.sql(f"select 1+1;")
self.df_keepa_asin = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
......@@ -35,7 +34,6 @@ class DimAsinProfitRateInfo(object):
from dim_asin_profit_rate_info where site_name = '{self.site_name}';
"""
self.df_asin_profit = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').cache()
self.df_asin_profit_history = self.df_asin_profit.filter(f"date_info < '{self.date_info}'").cache()
# 读取keepa数据
sql = f"""
......@@ -65,10 +63,8 @@ class DimAsinProfitRateInfo(object):
).repartition(10).cache()
new_count = self.df_save.count()
old_count = self.df_asin_profit_history.count()
print(f"历史数据量:{old_count}")
print(f"最新数据量:{new_count}")
if new_count >= old_count:
hive_tb = "dim_asin_profit_rate_info"
partition_dict = {
"site_name": self.site_name,
......@@ -79,7 +75,18 @@ class DimAsinProfitRateInfo(object):
HdfsUtils.delete_hdfs_file(hdfs_path)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb},存储路径:{hdfs_path}")
self.df_save.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
print("success!")
# 验证实际写入数量,确保写入成功后再删除历史分区
written_count = self.spark.sql(f"""
select count(1) as cnt from {hive_tb}
where site_name='{self.site_name}' and date_info='{self.date_info}'
""").collect()[0]['cnt']
print(f"实际写入数量:{written_count},预期:{new_count}")
if written_count != new_count:
raise RuntimeError(
f"写入数量校验失败!实际写入 {written_count} != 预期 {new_count},终止删除历史分区,请人工检查。"
)
print("写入校验通过!")
print(f"正在删除历史分区数据")
self.spark.sql(f"""
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment