Commit 0bcdbf7c by chenyuanjie

流量选品-打包数量解析迭代

parent 5bf88f84
......@@ -159,7 +159,8 @@ class DimAsinDetail(object):
REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type,
customer_reviews_json, parent_asin, img_list, created_at as created_time, updated_at as updated_time,
updated_at as dt, variat_num as variation_num, fbm_delivery_price as asin_fbm_price,
get_json_object(product_json, '$.Color') as product_json_color
get_json_object(product_json, '$.Color') as product_json_color,
get_json_object(product_json, '$.Number of Items') as product_json_number_of_items
from ods_asin_detail where site_name='{self.site_name}' {self.date_sql}"""
print(sql)
self.df_asin_detail = self.spark.sql(sqlQuery=sql)
......@@ -348,15 +349,23 @@ class DimAsinDetail(object):
).withColumn(
"variat_package_quantity_is_abnormal", self.df_asin_detail.variat_parse.getField("is_package_quantity_abnormal")
).drop("title_parse", "variat_parse", "variat_attribute")
# Number of Items: extracted directly from product_json; a failed cast (dirty data) automatically becomes null
self.df_asin_detail = self.df_asin_detail.withColumn(
"package_quantity", F.expr(""" CASE
WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity
WHEN title_package_quantity is not null THEN title_package_quantity
"number_of_items", F.col("product_json_number_of_items").cast("int")
).drop("product_json_number_of_items")
# Priority: Number of Items > attribute field > title parsing > default 1
self.df_asin_detail = self.df_asin_detail.withColumn(
"package_quantity", F.expr("""CASE
WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN number_of_items
WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity
WHEN title_package_quantity IS NOT NULL THEN title_package_quantity
ELSE 1 END""")).withColumn(
"is_package_quantity_abnormal", F.expr("""CASE
WHEN title_package_quantity is null and variat_package_quantity is not null THEN variat_package_quantity_is_abnormal
WHEN title_package_quantity is not null THEN title_package_quantity_is_abnormal
ELSE 2 END""")).drop("title_package_quantity", "variat_package_quantity", "title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal")
WHEN number_of_items IS NOT NULL AND number_of_items > 0 THEN 0
WHEN variat_package_quantity IS NOT NULL THEN variat_package_quantity_is_abnormal
WHEN title_package_quantity IS NOT NULL THEN title_package_quantity_is_abnormal
ELSE 2 END""")).drop("number_of_items", "title_package_quantity", "variat_package_quantity",
"title_package_quantity_is_abnormal", "variat_package_quantity_is_abnormal")
self.df_asin_detail = self.df_asin_detail.join(self.df_user_package_num, on=['asin', 'asin_title'], how='left')
self.df_asin_detail = self.df_asin_detail.withColumn(
"package_quantity", F.coalesce(F.col("user_package_num"), F.col("package_quantity"))).withColumn(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment