Commit bef91688 by chenyuanjie

新品asin规则迭代

parent b0597344
...@@ -71,6 +71,7 @@ class DimAsinDetail(object): ...@@ -71,6 +71,7 @@ class DimAsinDetail(object):
self.df_self_asin = self.spark.sql(f"select 1+1;") self.df_self_asin = self.spark.sql(f"select 1+1;")
self.df_asin_category = self.spark.sql(f"select 1+1;") self.df_asin_category = self.spark.sql(f"select 1+1;")
self.df_asin_variat = self.spark.sql(f"select 1+1;") self.df_asin_variat = self.spark.sql(f"select 1+1;")
self.df_keepa_tracking = self.spark.sql(f"select 1+1;")
# 调用公用udf函数 # 调用公用udf函数
self.udf_new_asin_flag = F.udf(udf_new_asin_flag, IntegerType()) self.udf_new_asin_flag = F.udf(udf_new_asin_flag, IntegerType())
self.handle_string_num_value = F.udf(myUDF, StringType()) self.handle_string_num_value = F.udf(myUDF, StringType())
...@@ -215,6 +216,17 @@ class DimAsinDetail(object): ...@@ -215,6 +216,17 @@ class DimAsinDetail(object):
"category_first_name", F.lower("category_first_name") "category_first_name", F.lower("category_first_name")
).repartition(100).persist(StorageLevel.DISK_ONLY) ).repartition(100).persist(StorageLevel.DISK_ONLY)
self.df_asin_category.show(10, truncate=False) self.df_asin_category.show(10, truncate=False)
print("9. 获取keepa追踪时间")
sql = f"""
select asin,
date_format(from_unixtime((cast(tracking_since as bigint) + 21564000) * 60), 'yyyy-MM-dd') as keepa_tracking_since
from dim_keepa_asin_info
where site_name='{self.site_name}'
and tracking_since is not null
"""
print(sql)
self.df_keepa_tracking = self.spark.sql(sqlQuery=sql).repartition(100).persist(StorageLevel.DISK_ONLY)
self.df_keepa_tracking.show(10, truncate=False)
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info < '2024-06': if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info < '2024-06':
sql = f""" sql = f"""
SELECT asin, parent_asin, color as asin_color, `size` as asin_size, style as asin_style, SELECT asin, parent_asin, color as asin_color, `size` as asin_size, style as asin_style,
...@@ -414,12 +426,15 @@ class DimAsinDetail(object): ...@@ -414,12 +426,15 @@ class DimAsinDetail(object):
).otherwise(F.lit(None)) ).otherwise(F.lit(None))
) )
self.df_asin_keep_date.unpersist() self.df_asin_keep_date.unpersist()
# 关联 keepa 追踪时间
self.df_asin_detail = self.df_asin_detail.join(self.df_keepa_tracking, on='asin', how='left')
self.df_keepa_tracking.unpersist()
# 处理asin各类型信息 # 处理asin各类型信息
def handle_asin_flag(self): def handle_asin_flag(self):
# 生成is_asin_new字段(是否asin新品标记) # 生成is_asin_new字段(是否asin新品标记)
self.df_asin_detail = self.df_asin_detail.withColumn( self.df_asin_detail = self.df_asin_detail.withColumn(
"asin_is_new", self.udf_new_asin_flag(F.col('asin_launch_time'), F.lit(self.cal_date)))\ "asin_is_new", self.udf_new_asin_flag(F.greatest(F.col('asin_launch_time'), F.col('keepa_tracking_since')), F.lit(self.cal_date)))\
.withColumn("asin_is_aadd", F.expr(f"""CASE WHEN INSTR(asin_img_type, '3') > 0 THEN 1 ELSE 0 END"""))\ .withColumn("asin_is_aadd", F.expr(f"""CASE WHEN INSTR(asin_img_type, '3') > 0 THEN 1 ELSE 0 END"""))\
.withColumn("asin_is_video", F.expr(f"""CASE WHEN INSTR(asin_img_type, '2') > 0 THEN 1 ELSE 0 END"""))\ .withColumn("asin_is_video", F.expr(f"""CASE WHEN INSTR(asin_img_type, '2') > 0 THEN 1 ELSE 0 END"""))\
.withColumn("asin_is_picture", F.expr(f"""CASE WHEN INSTR(asin_img_type, '1') > 0 THEN 1 ELSE 0 END"""))\ .withColumn("asin_is_picture", F.expr(f"""CASE WHEN INSTR(asin_img_type, '1') > 0 THEN 1 ELSE 0 END"""))\
...@@ -554,7 +569,7 @@ class DimAsinDetail(object): ...@@ -554,7 +569,7 @@ class DimAsinDetail(object):
"asin_bought_month", "asin_length", "asin_width", "asin_height", "asin_is_self", "asin_bought_month", "asin_length", "asin_width", "asin_height", "asin_is_self",
"customer_reviews_json", "img_list", "variat_list", "customer_reviews_json", "img_list", "variat_list",
F.round("asin_fbm_price", 2).alias("asin_fbm_price"), F.round("asin_fbm_price", 2).alias("asin_fbm_price"),
"current_asin", "amazon_label", "current_asin", "amazon_label", "keepa_tracking_since",
F.lit(self.site_name).alias('site_name'), F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'), F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')).persist(StorageLevel.MEMORY_ONLY) F.lit(self.date_info).alias('date_info')).persist(StorageLevel.MEMORY_ONLY)
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment