Commit e37eeca5 by chenyuanjie

月流程更新parent_asin_latest表

parent cc57a012
...@@ -474,38 +474,32 @@ class DimAsinDetail(object): ...@@ -474,38 +474,32 @@ class DimAsinDetail(object):
# 处理parent_asin下最新变体信息 # 处理parent_asin下最新变体信息
def handle_latest_variation_info(self): def handle_latest_variation_info(self):
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06': if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
max_report_sql = f""" df_asin_variat = self.df_asin_detail.filter("parent_asin is not null").select("parent_asin", "variat_list_change", "created_time")
SELECT MAX(date_info) as table_date_info FROM {self.doris_db}.{self.parent_asin_latest_detail_table} latest_asin_window = Window.partitionBy('parent_asin').orderBy(
""" F.desc_nulls_last("created_time")
df_date_info = DorisHelper.spark_import_with_sql(self.spark, query=max_report_sql) )
table_date_info = df_date_info.take(1)[0]['table_date_info'] df_asin_variat = df_asin_variat.withColumn("p_rank", F.row_number().over(window=latest_asin_window))
print("doris中记录最新的日期为:", table_date_info) df_asin_variat = df_asin_variat.filter("p_rank = 1").drop("p_rank")
if self.date_info >= table_date_info: df_asin_variat = df_asin_variat.filter(F.size("variat_list_change") > 0). \
df_asin_variat = self.df_asin_detail.filter("parent_asin is not null").select("parent_asin", "variat_list_change", "created_time") select("parent_asin", "created_time", F.explode("variat_list_change").alias("variant_attribute")). \
latest_asin_window = Window.partitionBy('parent_asin').orderBy( select("parent_asin", "created_time", F.col("variant_attribute")[0].alias("asin"),
F.desc_nulls_last("created_time") F.col("variant_attribute")[1].alias("color"), F.col("variant_attribute")[3].alias("size"),
) F.col("variant_attribute")[5].alias("style"))
df_asin_variat = df_asin_variat.withColumn("p_rank", F.row_number().over(window=latest_asin_window)) df_asin_variat_agg = df_asin_variat.groupby(['parent_asin']).agg(
df_asin_variat = df_asin_variat.filter("p_rank = 1").drop("p_rank") F.first("created_time").alias("asin_crawl_date"),
df_asin_variat =df_asin_variat.filter(F.size("variat_list_change") > 0). \ F.concat_ws(',', F.collect_list("asin")).alias("variation_info"),
select("parent_asin", "created_time", F.explode("variat_list_change").alias("variant_attribute")). \ F.to_json(F.collect_list(F.struct(F.col("color"), F.col("size"), F.col("style")))).alias("attr_info")
select("parent_asin", "created_time", F.col("variant_attribute")[0].alias("asin"), )
F.col("variant_attribute")[1].alias("color"), F.col("variant_attribute")[3].alias("size"), print("导出父ASIN最新变体信息到doris:")
F.col("variant_attribute")[5].alias("style")) df_doris = df_asin_variat_agg.select(
df_asin_variat_agg = df_asin_variat.groupby(['parent_asin']).agg( "parent_asin",
F.first("created_time").alias("asin_crawl_date"), F.lit(self.date_info).alias("date_info"),
F.concat_ws(',', F.collect_list("asin")).alias("variation_info"), # Doris 新表 asin_crawl_date 是 DATETIME,需 string → timestamp 显式转
F.to_json(F.collect_list(F.struct(F.col("color"), F.col("size"), F.col("style")))).alias("attr_info") F.to_timestamp(F.col("asin_crawl_date")).alias("asin_crawl_date"),
) "variation_info", "attr_info",
print("导出父ASIN最新变体信息到doris:") F.current_timestamp().alias("updated_at"))
df_doris = df_asin_variat_agg.select( table_columns = "parent_asin, date_info, asin_crawl_date, variation_info, attr_info, updated_at"
"parent_asin", F.lit(self.date_info).alias("date_info"), "asin_crawl_date", "variation_info", "attr_info", DorisHelper.spark_export_with_columns(df_save=df_doris, db_name=self.doris_db, table_name=self.parent_asin_latest_detail_table, table_columns=table_columns)
F.current_timestamp().alias("updated_at"))
table_columns = "parent_asin, date_info, asin_crawl_date, variation_info, attr_info, updated_at"
DorisHelper.spark_export_with_columns(df_save=df_doris, db_name=self.doris_db, table_name=self.parent_asin_latest_detail_table, table_columns=table_columns)
else:
print("不用导出旧数据到doris中")
pass
else: else:
pass pass
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment