Commit ff9987d0 by hejiangming

no message

parent e22e3efe
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil, DateTypes
from utils.db_util import DBUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
    # Script entry point: export one monthly partition of the Hive table
    # dws_aba_word_heat into a range-partitioned PostgreSQL table via Sqoop.
    #
    # CLI arguments (positional):
    #   1. site_name
    #   2. date_type  - must equal DateTypes.month.name
    #   3. date_info  - e.g. "2026-05", judging by the table-name examples below
    #   last argument - optional literal "test" to target the test database
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    # The last CLI argument doubles as the optional "test" switch.
    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
    print(f"执行参数为{sys.argv}")

    # Validate with explicit raises rather than `assert`: assertions are
    # stripped under `python -O`, which would let a bad date_type through.
    if not site_name or not date_info:
        # Fail early with a clear message instead of an AttributeError on
        # `date_info.split` further down.
        raise ValueError("usage: <script> site_name date_type date_info [test]")
    if date_type != DateTypes.month.name:
        raise ValueError("本脚本仅支持 month 类型导出")

    if test_flag == 'test':
        db_type = 'postgresql_test'
        print("导出到测试库中")
    else:
        # Work-hours gate, same call as the other new-ABA export scripts.
        # Per the original author's note it skips the export outside working
        # hours or when the owner is off duty; the actual behaviour lives in
        # CommonUtil.judge_is_work_hours and is not visible here.
        CommonUtil.judge_is_work_hours(
            site_name=site_name, date_type=date_type, date_info=date_info,
            principal='hejiangming',
            priority=2,
            export_tools_type=1,
            belonging_to_process=f'新ABA流程词频热度_{date_type}'
        )
        db_type = 'postgresql_cluster'
        print("导出到PG集群中")

    # 1) Make sure the Hive partition actually contains files; exporting an
    #    empty partition would replace the PG partition with nothing.
    hive_partition_path = (
        "/home/big_data_selection/dws/dws_aba_word_heat/"
        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
    )
    hive_files = HdfsUtils.read_list(hive_partition_path)
    if not hive_files:
        print(f"[ERROR] Hive 分区无数据文件,路径:{hive_partition_path},跳过导出!")
        sys.exit(1)
    print(f"Hive 分区文件数:{len(hive_files)},路径:{hive_partition_path},继续导出")

    engine = DBUtil.get_db_engine(db_type, site_name)

    # 2) Build the table names. For site "us" and date_info "2026-05":
    #      master table (created by the DBA, PARTITION BY RANGE): us_aba_word_heat_2026
    #      child partition table:                                us_aba_word_heat_2026_05
    #      staging ("copy") table:                               us_aba_word_heat_2026_05_copy
    #    NOTE(review): the original comment described the child table as
    #    "us_aba_word_heat_month_2026_05" (with a "month" segment), which is
    #    not what this code builds - confirm the intended naming with the DBA.
    suffix = str(date_info).replace("-", "_")
    year_str = CommonUtil.safeIndex(date_info.split("-"), 0, None)
    next_val = CommonUtil.get_next_val(date_type, date_info)
    export_base_tb = f"{site_name}_aba_word_heat"
    export_master_tb = f"{export_base_tb}_{year_str}"
    export_table = f"{export_base_tb}_{suffix}"
    export_tb_copy = f"{export_table}_copy"

    # 3) Create this month's child partition on the master table.
    #    `if not exists` makes re-runs for the same month a no-op.
    sql_create_partition = f"""
        create table if not exists {export_table} partition of {export_master_tb}
        for values from ('{date_info}') to ('{next_val}');
    """
    DBUtil.engine_exec_sql(engine, sql_create_partition)

    # 4) Create the staging table with the child partition's structure
    #    (`like ... including all`) and empty it. Sqoop writes into this
    #    standalone table; it is swapped in for the real partition at the end
    #    so readers never see a half-loaded partition.
    sql_copy = f"""
        create table if not exists {export_tb_copy}
        (
            like {export_table} including all
        );
        truncate table {export_tb_copy};
    """
    DBUtil.engine_exec_sql(engine, sql_copy)

    # 5) Temporarily change theme_ch_list from VARCHAR[] to VARCHAR(200).
    #    Per the original author, Sqoop cannot write PG array columns, so the
    #    comma-separated string from Hive is stored as plain text and
    #    converted back to an array before the swap (step 7).
    sql_alter_to_varchar = f"""
        ALTER TABLE {export_tb_copy} ALTER COLUMN theme_ch_list TYPE VARCHAR(200);
    """
    DBUtil.engine_exec_sql(engine, sql_alter_to_varchar)

    # 6) Build and run the Sqoop export. The column order is meant to match
    #    the Hive dws_aba_word_heat schema.
    export_cols = [
        "word",
        "word_heat",
        "word_heat_change_rate",
        "theme_ch_list",
        "created_time",
        "updated_time",
        "date_info",
    ]
    sh = CommonUtil.build_export_sh(
        site_name=site_name,
        db_type=db_type,
        hive_tb="dws_aba_word_heat",
        export_tb=export_tb_copy,
        col=export_cols,
        partition_dict={
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info,
        },
    )
    client = SSHUtil.get_ssh_client()
    try:
        SSHUtil.exec_command_async(client, sh, ignore_err=False)
    finally:
        # Always release the SSH connection, even when the export fails.
        client.close()

    # 7) Convert theme_ch_list back to VARCHAR[] on the staging table.
    #    string_to_array splits the comma-separated text into array elements.
    #    Per the original author, unmatched words are filled with "-1"
    #    upstream in PySpark, so they end up as the array {-1}.
    sql_alter_back = f"""
        ALTER TABLE {export_tb_copy}
        ALTER COLUMN theme_ch_list TYPE VARCHAR[]
        USING string_to_array(theme_ch_list, ',')::varchar[];
    """
    DBUtil.engine_exec_sql(engine, sql_alter_back)

    # 8) Swap the staging table in as the real child partition.
    DBUtil.exchange_pg_part_tb(
        engine,
        source_tb_name=export_tb_copy,
        part_master_tb=export_master_tb,
        part_target_tb=export_table,
        cp_index_flag=False,
        part_val={"from": [date_info], "to": [next_val]}
    )

    # 9) Drop the staging table. Per the original author it holds the
    #    previous partition's data after the swap and is no longer needed.
    DBUtil.engine_exec_sql(engine, f"drop table if exists {export_tb_copy};")

    # 10) TODO(hejiangming): register completion in the workflow_everyday
    #     table (the marker downstream consumers watch for). Currently
    #     disabled; kept verbatim for reference, modelled on
    #     sqoop_export/dwt_aba_last365.py.
    # if test_flag != 'test':
    #     mysql_engine = DBUtil.get_db_engine("mysql", "us")
    #     with mysql_engine.connect() as connection:
    #         sql = f"""
    #             replace into workflow_everyday (
    #                 site_name, report_date, status, status_val, table_name, date_type, page, is_end, remark, export_db_type
    #             )
    #             values (
    #                 '{site_name}', '{date_info}', '导出pg完成', 14,
    #                 '{export_master_tb}', 'month', 'AbaWordHeat', '是',
    #                 'ABA词频热度月表', 'postgresql_cluster'
    #             );
    #         """
    #         print("================================更新 workflow_everyday================================")
    #         print(sql)
    #         connection.execute(sql)

    print(f"==================表 {export_table} 导出完成==================================")
    print("success")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.db_util import DBUtil
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil, DateTypes
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
    # Script entry point: export the Hive table dws_aba_word_heat_last365
    # (rolling-year word heat) into the fixed PostgreSQL table
    # us_aba_word_heat_last_365_day via Sqoop and a staging-table swap.
    #
    # CLI arguments (positional):
    #   1. site_name  - must be "us"
    #   2. date_type  - must equal DateTypes.month.name
    #   3. date_info  - the partition value to export
    #   last argument - optional literal "test" to target the test database
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    # The last CLI argument doubles as the optional "test" switch.
    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
    print(f"执行参数为{sys.argv}")

    # Validate with explicit raises rather than `assert`: assertions are
    # stripped under `python -O`, which would silently disable these guards.
    if site_name != 'us':
        raise ValueError("本表仅 us 站点导出")
    if date_type != DateTypes.month.name:
        raise ValueError("本脚本仅支持 month 类型(基准月口径)")
    if not date_info:
        raise ValueError("usage: <script> site_name date_type date_info [test]")

    if test_flag == 'test':
        db_type = 'postgresql_test'
        print("导出到测试库中")
    else:
        # Work-hours gate, same call as the other new-ABA export scripts;
        # its behaviour lives in CommonUtil.judge_is_work_hours.
        CommonUtil.judge_is_work_hours(
            site_name=site_name, date_type=date_type, date_info=date_info,
            principal='hejiangming',
            priority=2,
            export_tools_type=1,
            belonging_to_process=f'新ABA流程年词频热度_{date_type}'
        )
        db_type = "postgresql_cluster"
        print("导出到PG集群中")

    # 1) Make sure the Hive partition actually contains files before
    #    touching PostgreSQL.
    hive_partition_path = (
        "/home/big_data_selection/dws/dws_aba_word_heat_last365/"
        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
    )
    hive_files = HdfsUtils.read_list(hive_partition_path)
    if not hive_files:
        print(f"[ERROR] Hive 分区无数据文件,路径:{hive_partition_path},跳过导出!")
        sys.exit(1)
    print(f"Hive 分区文件数:{len(hive_files)},路径:{hive_partition_path},继续导出")

    # 2) Table names: the yearly table is fixed (no year suffix).
    export_tb_target = "us_aba_word_heat_last_365_day"  # live table
    export_tb_copy = f"{export_tb_target}_copy"          # staging table
    engine = DBUtil.get_db_engine(db_type, site_name)

    # 3) Recreate the staging table and relax the array column to VARCHAR.
    #    `including comments` (not `including all`) copies column definitions
    #    and comments only, leaving out indexes and constraints; the original
    #    author chose this to avoid primary-key conflicts, as in
    #    dwt_aba_last365. theme_ch_list becomes VARCHAR(200) because, per the
    #    original author, Sqoop cannot write PG array columns.
    with engine.connect() as connection:
        sql = f"""
            drop table if exists {export_tb_copy};
            create table if not exists {export_tb_copy}
            (
                like {export_tb_target} including comments
            );
            ALTER TABLE {export_tb_copy} ALTER COLUMN theme_ch_list TYPE VARCHAR(200);
        """
        print("================================执行 SQL================================")
        print(sql)
        connection.execute(sql)

    # 4) Sqoop export into the staging table. The column order is meant to
    #    match the Hive table schema, with the partition column date_info last.
    sh = CommonUtil.build_export_sh(
        site_name=site_name,
        db_type=db_type,
        hive_tb="dws_aba_word_heat_last365",
        export_tb=export_tb_copy,
        col=[
            "word",
            "word_heat",
            "word_heat_change_rate",
            "theme_ch_list",
            "created_time",
            "updated_time",
            "date_info",
        ],
        partition_dict={
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info,
        },
    )
    client = SSHUtil.get_ssh_client()
    try:
        SSHUtil.exec_command_async(client, sh, ignore_err=False)
    finally:
        # Always release the SSH connection, even when the export fails.
        client.close()

    # 5) Convert theme_ch_list back to VARCHAR[] on the STAGING table, before
    #    the swap. Doing it on the live table after the swap (as this script
    #    originally did) leaves a window in which the live column has the
    #    wrong type, and a failed conversion would leave the live table
    #    broken. string_to_array splits the comma-separated text into array
    #    elements; per the original author, unmatched words are filled with
    #    "-1" upstream in PySpark, so they end up as the array {-1}.
    with engine.connect() as connection:
        sql = f"""
            ALTER TABLE {export_tb_copy}
            ALTER COLUMN theme_ch_list TYPE VARCHAR[]
            USING string_to_array(theme_ch_list, ',')::varchar[];
        """
        print("================================执行 SQL================================")
        print(sql)
        connection.execute(sql)

    # 6) Swap the staging table with the live table. Per the original author,
    #    DBUtil.exchange_tb swaps by renaming and cp_index_flag=True copies
    #    the indexes onto the new live table (same as dwt_aba_last365).
    DBUtil.exchange_tb(
        engine,
        source_tb_name=export_tb_copy,
        target_tb_name=export_tb_target,
        cp_index_flag=True
    )

    # 7) TODO(hejiangming): register completion in the workflow_everyday
    #    table (the marker downstream consumers watch for). Currently
    #    disabled; kept verbatim for reference, modelled on
    #    sqoop_export/dwt_aba_last365.py.
    # if test_flag != 'test':
    #     mysql_engine = DBUtil.get_db_engine("mysql", "us")
    #     with mysql_engine.connect() as connection:
    #         sql = f"""
    #             replace into workflow_everyday (
    #                 site_name, report_date, status, status_val, table_name, date_type, page, is_end, remark, export_db_type
    #             )
    #             values (
    #                 '{site_name}', '{date_info}', '导出pg完成', 14,
    #                 '{export_tb_target}', '365_day', 'AbaWordHeatYear', '是',
    #                 'ABA词频热度年表(最近12月,每月更新)', 'postgresql_cluster'
    #             );
    #         """
    #         print("================================更新 workflow_everyday================================")
    #         print(sql)
    #         connection.execute(sql)

    print("success")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment