Commit 2691d9d0 by chenyuanjie

店铺数据解析店铺评分

parent 47359d84
...@@ -110,6 +110,7 @@ class DwtFbBaseReport(object): ...@@ -110,6 +110,7 @@ class DwtFbBaseReport(object):
count_lifetime as count_lifetime_num, count_lifetime as count_lifetime_num,
country_name as fb_country_name, country_name as fb_country_name,
seller_address, seller_address,
seller_rating,
date_format(updated_at, 'yyyy-MM-dd HH:mm:ss') as fb_crawl_date date_format(updated_at, 'yyyy-MM-dd HH:mm:ss') as fb_crawl_date
from ods_seller_account_feedback from ods_seller_account_feedback
where site_name = '{self.site_name}' where site_name = '{self.site_name}'
...@@ -134,6 +135,23 @@ class DwtFbBaseReport(object): ...@@ -134,6 +135,23 @@ class DwtFbBaseReport(object):
""" """
self.df_fb_feedback = self.spark.sql(sqlQuery=sql) self.df_fb_feedback = self.spark.sql(sqlQuery=sql)
self.df_fb_feedback = self.df_fb_feedback.drop_duplicates(['seller_id']).cache() self.df_fb_feedback = self.df_fb_feedback.drop_duplicates(['seller_id']).cache()
# 解析seller_rating: 格式为"30天|-|90天|-|1年|-|历史",切分后转double,-1改为0
_rating_split = F.split(F.col("seller_rating"), r"\|-\|")
self.df_fb_feedback = self.df_fb_feedback \
.withColumn("rating_30_day_num",
F.when(_rating_split.getItem(0).cast(DoubleType()) == -1.0, F.lit(0.0))
.otherwise(_rating_split.getItem(0).cast(DoubleType()))) \
.withColumn("rating_90_day_num",
F.when(_rating_split.getItem(1).cast(DoubleType()) == -1.0, F.lit(0.0))
.otherwise(_rating_split.getItem(1).cast(DoubleType()))) \
.withColumn("rating_1_year_num",
F.when(_rating_split.getItem(2).cast(DoubleType()) == -1.0, F.lit(0.0))
.otherwise(_rating_split.getItem(2).cast(DoubleType()))) \
.withColumn("rating_lifetime_num",
F.when(_rating_split.getItem(3).cast(DoubleType()) == -1.0, F.lit(0.0))
.otherwise(_rating_split.getItem(3).cast(DoubleType())))
print(sql) print(sql)
# 获取我们内部的店铺与asin的数据库(从搜索词抓下来,店铺与asin的关系表) # 获取我们内部的店铺与asin的数据库(从搜索词抓下来,店铺与asin的关系表)
...@@ -406,6 +424,10 @@ class DwtFbBaseReport(object): ...@@ -406,6 +424,10 @@ class DwtFbBaseReport(object):
F.lit(None).alias('usr_mask_progress'), F.lit(None).alias('usr_mask_progress'),
F.col('business_name'), F.col('business_name'),
F.col('business_addr'), F.col('business_addr'),
F.col('rating_30_day_num'),
F.col('rating_90_day_num'),
F.col('rating_1_year_num'),
F.col('rating_lifetime_num'),
F.lit(self.site_name).alias('site_name'), F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'), F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info') F.lit(self.date_info).alias('date_info')
......
...@@ -88,7 +88,11 @@ if __name__ == '__main__': ...@@ -88,7 +88,11 @@ if __name__ == '__main__':
"usr_mask_type", "usr_mask_type",
"usr_mask_progress", "usr_mask_progress",
"business_name", "business_name",
"business_addr" "business_addr",
"rating_30_day_num",
"rating_90_day_num",
"rating_1_year_num",
"rating_lifetime_num"
], ],
partition_dict={ partition_dict={
"site_name": site_name, "site_name": site_name,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment