Commit d4397441 by hejiangming

词频分类代码

parent b6374899
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil, DateTypes
from utils.db_util import DBUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
    # Export one monthly partition of the Hive table dws_aba_word_freq_cate
    # into PostgreSQL. Steps:
    #   1) make sure the Hive partition contains files,
    #   2) build the PG table names,
    #   3) create the monthly child partition on the yearly master table,
    #   4) create and truncate a standalone "_copy" staging table,
    #   5) temporarily change attr_dim to VARCHAR(500) so Sqoop can write it,
    #   6) run the Sqoop export over SSH into the staging table,
    #   7) change attr_dim back to VARCHAR[],
    #   8) exchange the staging table with the real child partition,
    #   9) drop the staging table.
    #
    # Positional arguments: site_name, date_type, date_info; the last
    # argument may be the literal "test" to target the test database.
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    # The last command-line argument is the "test" flag.
    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
    print(f"执行参数为{sys.argv}")

    # Explicit check instead of `assert`, which is stripped when Python runs
    # with -O. The exception type and message are kept as they were.
    if date_type != DateTypes.month.name:
        raise AssertionError("本脚本仅支持 month 类型导出")

    if test_flag == 'test':
        db_type = 'postgresql_test'
        print("导出到测试库中")
    else:
        # Work-hours check (same as the other export scripts of the new ABA
        # flow, per the original author): skip outside working hours or when
        # the principal is off duty.
        CommonUtil.judge_is_work_hours(
            site_name=site_name,
            date_type=date_type,
            date_info=date_info,
            principal='hejiangming',
            priority=2,
            export_tools_type=1,
            belonging_to_process=f'分类词频_{date_type}',
        )
        db_type = 'postgresql_cluster'
        print("导出到PG集群中")

    # 1) Verify the Hive partition has data files, so an empty partition is
    #    never exported and the PG data is not wiped.
    hive_partition_path = (
        "/home/big_data_selection/dws/dws_aba_word_freq_cate/"
        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
    )
    hive_files = HdfsUtils.read_list(hive_partition_path)
    if not hive_files:
        print(f"[ERROR] Hive 分区无数据文件,路径:{hive_partition_path},跳过导出!")
        sys.exit(1)
    print(f"Hive 分区文件数:{len(hive_files)},路径:{hive_partition_path},继续导出")

    engine = DBUtil.get_db_engine(db_type, site_name)

    # 2) Build the table names.
    #    master table (created by the DBA from DDL, PARTITION BY RANGE(date_info)):
    #        us_aba_word_freq_cate_2026
    #    child partition table: us_aba_word_freq_cate_2026_04
    #    copy (staging) table:  us_aba_word_freq_cate_2026_04_copy
    suffix = str(date_info).replace("-", "_")
    year_str = CommonUtil.safeIndex(date_info.split("-"), 0, None)
    next_val = CommonUtil.get_next_val(date_type, date_info)
    export_base_tb = f"{site_name}_aba_word_freq_cate"
    export_master_tb = f"{export_base_tb}_{year_str}"
    export_table = f"{export_base_tb}_{suffix}"
    export_tb_copy = f"{export_table}_copy"

    # 3) Create this month's child partition on the master table. It is only
    #    really created on the first run of the month; reruns are idempotent
    #    thanks to IF NOT EXISTS.
    sql_create_partition = f"""
        create table if not exists {export_table} partition of {export_master_tb}
        for values from ('{date_info}') to ('{next_val}');
    """
    DBUtil.engine_exec_sql(engine, sql_create_partition)

    # 4) Create the copy table (same structure as the child partition via
    #    LIKE ... INCLUDING ALL) and empty it. The copy table is a standalone
    #    regular table: Sqoop writes here first, and the partition exchange at
    #    the end swaps it in, avoiding a window with no data.
    sql_copy = f"""
        create table if not exists {export_tb_copy}
        (
            like {export_table} including all
        );
        truncate table {export_tb_copy};
    """
    DBUtil.engine_exec_sql(engine, sql_copy)

    # 5) ALTER the copy table's attr_dim column from VARCHAR[] to VARCHAR(500).
    #    Reason (per the original author): Sqoop cannot write into PG array
    #    types, so the column is temporarily a plain VARCHAR and Sqoop writes
    #    the Hive-side comma-separated string as-is; it is ALTERed back to
    #    VARCHAR[] before the exchange.
    sql_alter_to_varchar = f"""
        ALTER TABLE {export_tb_copy} ALTER COLUMN attr_dim TYPE VARCHAR(500);
    """
    DBUtil.engine_exec_sql(engine, sql_alter_to_varchar)

    # 6) Build the Sqoop export script. Column order follows the Hive
    #    dws_aba_word_freq_cate schema; site_name/date_type are encoded in the
    #    table name and not exported as columns, while date_info is exported
    #    as a regular column.
    export_cols = [
        "scope",
        "category_id",
        "category_level",
        "base_word",
        "display_word",
        "word_freq",
        "word_heat",
        "relate_st_num",
        "min_rank",
        "new_st_num",
        "word_heat_last_year",
        "word_heat_change_rate",
        "word_heat_last_month",
        "word_heat_rate_of_change",
        "yoy_up_ratio",
        "mom_up_ratio",
        "word_heat_change_last_1_month",
        "word_heat_change_1_month_ago",
        "word_heat_change_2_month_ago",
        "word_heat_change_3_month_ago",
        "word_heat_change_4_month_ago",
        "word_heat_change_5_month_ago",
        "top_aba_example",
        "brand_word_flag",
        "new_word_flag",
        "attr_dim",
        "word_cn",
        "heat_trend",
        "created_time",
        "updated_time",
        "date_info",
    ]
    sh = CommonUtil.build_export_sh(
        site_name=site_name,
        db_type=db_type,
        hive_tb="dws_aba_word_freq_cate",
        export_tb=export_tb_copy,
        col=export_cols,
        partition_dict={
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info,
        },
    )
    client = SSHUtil.get_ssh_client()
    try:
        SSHUtil.exec_command_async(client, sh, ignore_err=False)
    finally:
        # Always release the SSH connection, even when the export command
        # raises; previously a failure skipped close().
        client.close()

    # 7) After Sqoop finishes, ALTER attr_dim on the copy table back to
    #    VARCHAR[]. USING string_to_array(...) splits the comma-separated
    #    string into an array. Per the original author, words with no
    #    dictionary match were filled with "-1" upstream (PySpark), which
    #    becomes {-1} after conversion, matching the Java-side placeholder
    #    convention.
    sql_alter_back = f"""
        ALTER TABLE {export_tb_copy}
        ALTER COLUMN attr_dim TYPE VARCHAR[]
        USING string_to_array(attr_dim, ',')::varchar[];
    """
    DBUtil.engine_exec_sql(engine, sql_alter_back)

    # 8) Partition exchange: the copy table replaces the real child partition
    #    with no window in which the data is missing.
    DBUtil.exchange_pg_part_tb(
        engine,
        source_tb_name=export_tb_copy,
        part_master_tb=export_master_tb,
        part_target_tb=export_table,
        cp_index_flag=False,
        part_val={"from": [date_info], "to": [next_val]},
    )

    # 9) Drop the copy table. Per the original author, after the exchange it
    #    holds the old data and is not worth keeping.
    DBUtil.engine_exec_sql(engine, f"drop table if exists {export_tb_copy};")
    print(f"==================表 {export_table} 导出完成==================================")
    print("success")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment