Commit 145fe90b by chenyuanjie

Code for the "related traffic" (关联流量) metrics

parent 20ea475b
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # add the parent directory to the import path
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from utils.spark_util import SparkUtil
from pyspark.sql.types import *
from utils.common_util import CommonUtil
from pyspark.sql import DataFrame
from utils.hdfs_utils import HdfsUtils


class DwtAsinRelatedTraffic(object):

    def __init__(self, site_name, date_type, date_info):
        super().__init__()
        self.site_name = site_name
        self.date_type = date_type
        self.date_info = date_info
        self.hive_tb = 'dwt_asin_related_traffic'
        self.partition_dict = {
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info
        }
        self.hdfs_path = CommonUtil.build_hdfs_path(self.hive_tb, partition_dict=self.partition_dict)
        app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
        self.spark = SparkUtil.get_spark_session(app_name)
        self.partitions_by = ['site_name', 'date_type', 'date_info']
        # placeholder DataFrames, populated in read_data()/handle_data()
        self.df_dim_asin_related_traffic = self.spark.sql("select 1+1;")
        self.df_save = self.spark.sql("select 1+1;")
        # traffic type -> numeric index used to encode related_type
        self.col_num_index = {
            "four_star_above": 1,
            "brand_recommendation": 2,
            "similar_items": 3,
            "look_and_look": 4,
            "look_also_look": 5,
            "look_but_bought": 6,
            "bundle_bought": 7,
            "combination_bought": 8,
            "more_relevant": 9,
            "bought_and_bought": 10,
            "product_adv": 11,
            "brand_adv": 12
        }
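        # For example (added note): an ASIN whose similar_items column holds
        # two ASINs and whose product_adv column holds one is encoded as
        # related_type = "3,3,11" -- one index per related ASIN, so
        # related_asin and related_type stay position-aligned.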

    def read_data(self):
        print("Reading traffic data from dim_asin_related_traffic")
        sql = f"""
            select
                asin,
                four_star_above,
                brand_recommendation,
                similar_items,
                look_and_look,
                look_also_look,
                look_but_bought,
                bundle_bought,
                combination_bought,
                more_relevant,
                bought_and_bought,
                product_adv,
                brand_adv
            from dim_asin_related_traffic
            where site_name = '{self.site_name}'
              and date_type = '{self.date_type}'
              and date_info = '{self.date_info}';
        """
        self.df_dim_asin_related_traffic = self.spark.sql(sqlQuery=sql).cache()
        print("dim_asin_related_traffic data:")
        self.df_dim_asin_related_traffic.show(10, True)

    # metric calculation
    def handle_data(self):
        cols = [col for col in self.df_dim_asin_related_traffic.columns if col != 'asin']
        # related-ASIN count per traffic type; F.size() returns -1 for NULL, so map that to 0
        for col in cols:
            self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
                f"{col}_cnt", F.size(F.split(F.col(col), ","))
            ).withColumn(
                f"{col}_cnt", F.when(F.col(f"{col}_cnt") == -1, 0).otherwise(F.col(f"{col}_cnt"))
            )
        # total related-ASIN count; rows with no related traffic at all are flagged with -1
        all_cnt_cols = [F.col(f"{col}_cnt") for col in cols]
        self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
            "total_cnt", sum(all_cnt_cols)
        ).withColumn(
            "total_cnt", F.when(F.col('total_cnt') == 0, -1).otherwise(F.col('total_cnt'))
        )
        # share of each traffic type
        for col in cols:
            self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
                f"{col}_rat", F.round(F.col(f"{col}_cnt") / F.col("total_cnt"), 4)
            )
        # paid / free traffic shares
        pay_cols = ["four_star_above", "product_adv", "brand_adv"]
        free_cols = [col for col in cols if col not in pay_cols]
        self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
            "pay_cnt", sum([F.col(f"{col}_cnt") for col in pay_cols])
        ).withColumn(
            "pay_rat", F.round(F.col("pay_cnt") / F.col("total_cnt"), 4)
        ).withColumn(
            "free_cnt", sum([F.col(f"{col}_cnt") for col in free_cols])
        ).withColumn(
            "free_rat", F.round(F.col("free_cnt") / F.col("total_cnt"), 4)
        ).fillna({
            "pay_cnt": 0,
            "pay_rat": 0,
            "free_cnt": 0,
            "free_rat": 0
        })
        # field consolidation
        # concatenate the related ASINs of all types (concat_ws skips NULL columns)
        self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
            "related_asin", F.concat_ws(",", *[F.col(c) for c in cols])
        )
        # per the index map, build a label column with as many indices as there are related ASINs
        for col in cols:
            num = self.col_num_index[col]
            self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
                f"{col}_num",
                F.when(
                    F.col(col).isNull(), F.lit(None)
                ).otherwise(
                    F.concat_ws(",", F.array_repeat(F.lit(num), F.size(F.split(F.col(col), ","))))
                )
            )
        # concatenate all label columns
        num_cols = [f"{col}_num" for col in cols]
        self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
            "related_type", F.concat_ws(",", *[F.col(c) for c in num_cols])
        )
        # consolidate the counts and shares of the 12 traffic types
        self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
            "related_type_num", F.concat_ws(",", *[F.col(f"{c}_cnt") for c in cols])
        ).withColumn(
            "related_type_rate", F.concat_ws(",", *[F.col(f"{c}_rat") for c in cols])
        )

    # persist the results
    def save_data(self):
        self.df_save = self.df_dim_asin_related_traffic.select(
            'asin',
            'related_asin',
            'related_type',
            'related_type_num',
            'related_type_rate',
            'free_cnt',
            'free_rat',
            'pay_cnt',
            'pay_rat',
            'total_cnt',
            F.lit(self.site_name).alias('site_name'),
            F.lit(self.date_type).alias('date_type'),
            F.lit(self.date_info).alias('date_info')
        )
        print(f"Clearing HDFS directory: {self.hdfs_path}")
        HdfsUtils.delete_file_in_folder(self.hdfs_path)
        print(f"Saving to table {self.hive_tb}, partitioned by {self.partitions_by}")
        self.df_save.repartition(40).write.saveAsTable(name=self.hive_tb, format='hive', mode='append', partitionBy=self.partitions_by)
        print("success")

    def run(self):
        self.read_data()    # read source data
        self.handle_data()  # metric calculation
        self.save_data()    # persist the results


if __name__ == '__main__':
    site_name = sys.argv[1]
    date_type = sys.argv[2]
    date_info = sys.argv[3]
    handle_obj = DwtAsinRelatedTraffic(site_name=site_name, date_type=date_type, date_info=date_info)
    handle_obj.run()
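# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original commit): a minimal local run
# of the encoding used in handle_data(), assuming only a local SparkSession
# and three of the twelve traffic-type columns. It shows that each related
# ASIN gets the index of the column it came from, and that concat_ws drops
# NULL columns on both sides, keeping the two CSV lists position-aligned.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").appName("related-traffic-demo").getOrCreate()
demo = spark.createDataFrame(
    [("B000TEST01", "B0A1,B0A2", None, "B0C1")],  # toy row, hypothetical ASINs
    ["asin", "similar_items", "look_and_look", "product_adv"],
)
col_num_index = {"similar_items": 3, "look_and_look": 4, "product_adv": 11}
for c in col_num_index:
    # one index per related ASIN; NULL columns stay NULL
    demo = demo.withColumn(
        f"{c}_num",
        F.when(F.col(c).isNull(), F.lit(None)).otherwise(
            F.concat_ws(",", F.array_repeat(F.lit(str(col_num_index[c])), F.size(F.split(F.col(c), ","))))
        ),
    )
demo = demo.withColumn(
    "related_asin", F.concat_ws(",", "similar_items", "look_and_look", "product_adv")
).withColumn(
    "related_type", F.concat_ws(",", "similar_items_num", "look_and_look_num", "product_adv_num")
)
demo.select("asin", "related_asin", "related_type").show(truncate=False)
# -> related_asin = "B0A1,B0A2,B0C1", related_type = "3,3,11"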
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.db_util import DBUtil
if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    print(f"Arguments: {sys.argv}")
    # CommonUtil.judge_is_work_hours(site_name=site_name, date_type=date_type, date_info=date_info,
    #                                principal='chenyuanjie',
    #                                priority=2, export_tools_type=1, belonging_to_process=f'新ABA流程_{date_type}')
    db_type = 'postgresql_cluster'
    engine = DBUtil.get_db_engine(db_type, site_name)
    dt = str(date_info).replace("-", "_")
    export_tb = f"{site_name}_asin_related_{dt}"
    export_cols = [
        'asin',
        'related_asin',
        'related_type',
        'related_type_num',
        'related_type_rate',
        'free_cnt',
        'free_rat',
        'pay_cnt',
        'pay_rat',
        'total_cnt'
    ]
sql = f"""
ALTER TABLE {export_tb} ALTER COLUMN related_asin TYPE text;
ALTER TABLE {export_tb} ALTER COLUMN related_type TYPE text;
ALTER TABLE {export_tb} ALTER COLUMN related_type_num TYPE text;
ALTER TABLE {export_tb} ALTER COLUMN related_type_rate TYPE text;
"""
DBUtil.engine_exec_sql(engine, sql)
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    sh = CommonUtil.build_export_sh(
        site_name=site_name,
        db_type=db_type,
        hive_tb="dwt_asin_related_traffic",
        export_tb=export_tb,
        col=export_cols,
        partition_dict=partition_dict
    )
    client = SSHUtil.get_ssh_client()
    SSHUtil.exec_command_async(client, sh, ignore_err=False)
    client.close()
sql = f"""
ALTER TABLE {export_tb}
ALTER COLUMN related_asin TYPE VARCHAR(20)[]
USING string_to_array(related_asin, ',');
ALTER TABLE {export_tb}
ALTER COLUMN related_type TYPE INTEGER[]
USING string_to_array(related_type, ',')::int[];
ALTER TABLE {export_tb}
ALTER COLUMN related_type_num TYPE INTEGER[]
USING string_to_array(related_type_num, ',')::int[];
ALTER TABLE {export_tb}
ALTER COLUMN related_type_rate TYPE numeric(10,4)[]
USING string_to_array(related_type_rate, ',')::numeric(10,4)[];
"""
DBUtil.engine_exec_sql(engine, sql)
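# ---------------------------------------------------------------------------
# Note (added for clarity, not in the original commit): the export runs in two
# phases. The four list-like columns are first widened to text so the
# comma-separated strings coming out of Hive load without a cast; after the
# import they are converted to real PostgreSQL arrays. The conversion itself
# is plain PostgreSQL, e.g.:
#
#   SELECT string_to_array('3,3,11', ',')::int[];               -- {3,3,11}
#   SELECT string_to_array('0.25,0.75', ',')::numeric(10,4)[];  -- {0.2500,0.7500}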