Commit 9a02c65a by wangjing

no message

parent 68c22cae
...@@ -98,6 +98,10 @@ class DwtStDetailWeek(object): ...@@ -98,6 +98,10 @@ class DwtStDetailWeek(object):
'dt_rank', F.row_number().over(window=window) 'dt_rank', F.row_number().over(window=window)
).filter('dt_rank=1').drop('dt_rank', 'updated_time').cache() ).filter('dt_rank=1').drop('dt_rank', 'updated_time').cache()
# 对数据列清洗 有些是\0的数据
for col in ['search_term', 'asin1', 'asin2', 'asin3', 'product_title1', 'product_title2', 'product_title3', 'brand1', 'brand2', 'brand3', 'category1', 'category2', 'category3']:
self.df_st_detail = self.df_st_detail.withColumn(col, F.regexp_replace(F.col(col), '\x00', ''))
self.df_st_detail_last_week = self.df_st_detail.filter(f"date_info = '{self.date_info_last_week}'") self.df_st_detail_last_week = self.df_st_detail.filter(f"date_info = '{self.date_info_last_week}'")
for col in self.cols: for col in self.cols:
self.df_st_detail_last_week = self.df_st_detail_last_week.withColumnRenamed( self.df_st_detail_last_week = self.df_st_detail_last_week.withColumnRenamed(
......
...@@ -103,7 +103,7 @@ if __name__ == '__main__': ...@@ -103,7 +103,7 @@ if __name__ == '__main__':
"site_name": site_name, "site_name": site_name,
"date_type": date_type, "date_type": date_type,
"date_info": date_info "date_info": date_info
} },num_mappers=2
) )
client = SSHUtil.get_ssh_client() client = SSHUtil.get_ssh_client()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment