no message

ff9987d0 · hejiangming · e22e3efe · ff9987d0 · ff9987d0 · ff9987d0
Commit ff9987d0 authored Jun 11, 2026 by hejiangming
4 changed files
--- a/Pyspark_job/dws/dws_aba_word_heat.py
+++ b/Pyspark_job/dws/dws_aba_word_heat.py
--- a/Pyspark_job/dws/dws_aba_word_heat_last365.py
+++ b/Pyspark_job/dws/dws_aba_word_heat_last365.py
--- a/Pyspark_job/sqoop_export/dws_aba_word_heat.py
+++ b/Pyspark_job/sqoop_export/dws_aba_word_heat.py
+import os
+import sys
+
+sys.path.append(os.path.dirname(sys.path[0]))
+from utils.ssh_util import SSHUtil
+from utils.common_util import CommonUtil, DateTypes
+from utils.db_util import DBUtil
+from utils.hdfs_utils import HdfsUtils
+
+if __name__ == '__main__':
+    # Export one monthly partition of the Hive table dws_aba_word_heat into a
+    # PostgreSQL range-partitioned table via Sqoop, staging through a "_copy"
+    # table that is swapped in at the end.
+    #
+    # CLI arguments: <site_name> <date_type> <date_info> [... test]
+    site_name = CommonUtil.get_sys_arg(1, None)
+    date_type = CommonUtil.get_sys_arg(2, None)
+    date_info = CommonUtil.get_sys_arg(3, None)
+    # The last CLI argument doubles as the optional "test" flag.
+    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
+    print(f"执行参数为{sys.argv}")
+
+    # Explicit raise instead of `assert`: assert statements are removed when
+    # Python runs with -O, which would silently disable this guard.
+    if date_type != DateTypes.month.name:
+        raise ValueError("本脚本仅支持 month 类型导出")
+
+    if test_flag == 'test':
+        db_type = 'postgresql_test'
+        print("导出到测试库中")
+    else:
+        # Work-hours check, same call as the other new-ABA export scripts.
+        # Per the original author's note it skips the export outside working
+        # hours or when the owner is off duty (helper not visible here).
+        CommonUtil.judge_is_work_hours(
+            site_name=site_name, date_type=date_type, date_info=date_info,
+            principal='hejiangming',
+            priority=2,
+            export_tools_type=1,
+            belonging_to_process=f'新ABA流程词频热度_{date_type}'
+        )
+        db_type = 'postgresql_cluster'
+        print("导出到PG集群中")
+
+    # 1) Make sure the Hive partition has data files, so an empty partition
+    #    can never replace the data already in PostgreSQL.
+    hive_partition_path = (
+        f"/home/big_data_selection/dws/dws_aba_word_heat/"
+        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
+    )
+    hive_files = HdfsUtils.read_list(hive_partition_path)
+    if not hive_files:
+        print(f"[ERROR] Hive 分区无数据文件，路径：{hive_partition_path}，跳过导出！")
+        sys.exit(1)
+    print(f"Hive 分区文件数：{len(hive_files)}，路径：{hive_partition_path}，继续导出")
+
+    engine = DBUtil.get_db_engine(db_type, site_name)
+
+    # 2) Build the table names. For site "us" and date_info "2026-05":
+    #    master table (PARTITION BY RANGE, pre-created): us_aba_word_heat_2026
+    #    child partition:                                us_aba_word_heat_2026_05
+    #    copy (staging) table:                           us_aba_word_heat_2026_05_copy
+    # NOTE(review): the original comment named the child partition
+    # "us_aba_word_heat_month_2026_05", which the code below does not produce;
+    # confirm which naming the DBA expects.
+    suffix = str(date_info).replace("-", "_")
+    year_str = CommonUtil.safeIndex(date_info.split("-"), 0, None)
+    next_val = CommonUtil.get_next_val(date_type, date_info)
+
+    export_base_tb = f"{site_name}_aba_word_heat"
+    export_master_tb = f"{export_base_tb}_{year_str}"
+    export_table = f"{export_base_tb}_{suffix}"
+    export_tb_copy = f"{export_table}_copy"
+
+    # 3) Create this month's child partition on the master table.
+    #    `if not exists` makes re-runs a no-op.
+    sql_create_partition = f"""
+        create table if not exists {export_table} partition of {export_master_tb}
+        for values from ('{date_info}') to ('{next_val}');
+    """
+    DBUtil.engine_exec_sql(engine, sql_create_partition)
+
+    # 4) Create the copy table with the child partition's structure
+    #    (like ... including all) and empty it. Sqoop writes into this
+    #    standalone table; it replaces the real partition only at the end.
+    sql_copy = f"""
+        create table if not exists {export_tb_copy}
+        (
+            like {export_table} including all
+        );
+        truncate table {export_tb_copy};
+    """
+    DBUtil.engine_exec_sql(engine, sql_copy)
+
+    # 5) Temporarily change theme_ch_list on the copy table from VARCHAR[]
+    #    to VARCHAR(200). Per the original note, Sqoop cannot write into a
+    #    PostgreSQL array column, so the comma-joined string from Hive is
+    #    loaded as plain text and converted back before the swap.
+    sql_alter_to_varchar = f"""
+        ALTER TABLE {export_tb_copy} ALTER COLUMN theme_ch_list TYPE VARCHAR(200);
+    """
+    DBUtil.engine_exec_sql(engine, sql_alter_to_varchar)
+
+    # 6) Build the Sqoop export script. The column order is meant to match
+    #    the Hive dws_aba_word_heat schema.
+    export_cols = [
+        "word",
+        "word_heat",
+        "word_heat_change_rate",
+        "theme_ch_list",
+        "created_time",
+        "updated_time",
+        "date_info"
+    ]
+
+    sh = CommonUtil.build_export_sh(
+        site_name=site_name,
+        db_type=db_type,
+        hive_tb="dws_aba_word_heat",
+        export_tb=export_tb_copy,
+        col=export_cols,
+        partition_dict={
+            "site_name": site_name,
+            "date_type": date_type,
+            "date_info": date_info
+        }
+    )
+
+    client = SSHUtil.get_ssh_client()
+    try:
+        SSHUtil.exec_command_async(client, sh, ignore_err=False)
+    finally:
+        # Always release the SSH connection, even when the export fails.
+        client.close()
+
+    # 7) After Sqoop finishes, convert theme_ch_list on the copy table back
+    #    to VARCHAR[]; string_to_array splits the comma-joined string.
+    #    Per the original note, words without a dictionary match arrive as
+    #    "-1" from PySpark and therefore become {-1}.
+    sql_alter_back = f"""
+        ALTER TABLE {export_tb_copy}
+        ALTER COLUMN theme_ch_list TYPE VARCHAR[]
+        USING string_to_array(theme_ch_list, ',')::varchar[];
+    """
+    DBUtil.engine_exec_sql(engine, sql_alter_back)
+
+    # 8) Partition exchange: the copy table replaces the real child partition.
+    DBUtil.exchange_pg_part_tb(
+        engine,
+        source_tb_name=export_tb_copy,
+        part_master_tb=export_master_tb,
+        part_target_tb=export_table,
+        cp_index_flag=False,
+        part_val={"from": [date_info], "to": [next_val]}
+    )
+
+    # 9) Drop the copy table. Per the original note it holds the previous
+    #    data after the exchange (depends on exchange_pg_part_tb - verify).
+    DBUtil.engine_exec_sql(engine, f"drop table if exists {export_tb_copy};")
+
+    # 10) Update the workflow_everyday table (export-finished marker) -
+    #     currently disabled. Modelled on sqoop_export/dwt_aba_last365.py;
+    #     page='AbaWordHeat', date_type='month', table_name is the master table.
+    # if test_flag != 'test':
+    #     mysql_engine = DBUtil.get_db_engine("mysql", "us")
+    #     with mysql_engine.connect() as connection:
+    #         sql = f"""
+    #             replace into workflow_everyday (
+    #                 site_name, report_date, status, status_val, table_name, date_type, page, is_end, remark, export_db_type
+    #             )
+    #             values (
+    #                 '{site_name}', '{date_info}', '导出pg完成', 14,
+    #                 '{export_master_tb}', 'month', 'AbaWordHeat', '是',
+    #                 'ABA词频热度月表', 'postgresql_cluster'
+    #             );
+    #         """
+    #         print("================================更新 workflow_everyday================================")
+    #         print(sql)
+    #         connection.execute(sql)
+
+    print(f"==================表 {export_table} 导出完成==================================")
+    print("success")
--- a/Pyspark_job/sqoop_export/dws_aba_word_heat_last365.py
+++ b/Pyspark_job/sqoop_export/dws_aba_word_heat_last365.py
+import os
+import sys
+
+sys.path.append(os.path.dirname(sys.path[0]))
+from utils.db_util import DBUtil
+from utils.ssh_util import SSHUtil
+from utils.common_util import CommonUtil, DateTypes
+from utils.hdfs_utils import HdfsUtils
+
+if __name__ == '__main__':
+    # Export the Hive table dws_aba_word_heat_last365 (us site only) into the
+    # PostgreSQL table us_aba_word_heat_last_365_day via Sqoop, staging
+    # through a "_copy" table that is swapped with the live table at the end.
+    #
+    # CLI arguments: <site_name> <date_type> <date_info> [... test]
+    site_name = CommonUtil.get_sys_arg(1, None)
+    date_type = CommonUtil.get_sys_arg(2, None)
+    date_info = CommonUtil.get_sys_arg(3, None)
+    # The last CLI argument doubles as the optional "test" flag.
+    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
+    print(f"执行参数为{sys.argv}")
+
+    # Explicit raises instead of `assert`: assert statements are removed when
+    # Python runs with -O, which would silently disable these guards.
+    if site_name != 'us':
+        raise ValueError("本表仅 us 站点导出")
+    if date_type != DateTypes.month.name:
+        raise ValueError("本脚本仅支持 month 类型（基准月口径）")
+
+    if test_flag == 'test':
+        db_type = 'postgresql_test'
+        print("导出到测试库中")
+    else:
+        # Work-hours check, same call as the other new-ABA export scripts.
+        CommonUtil.judge_is_work_hours(
+            site_name=site_name, date_type=date_type, date_info=date_info,
+            principal='hejiangming',
+            priority=2,
+            export_tools_type=1,
+            belonging_to_process=f'新ABA流程年词频热度_{date_type}'
+        )
+        db_type = "postgresql_cluster"
+        print("导出到PG集群中")
+
+    # 1) Make sure the Hive partition has data files before touching PostgreSQL.
+    hive_partition_path = (
+        f"/home/big_data_selection/dws/dws_aba_word_heat_last365/"
+        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
+    )
+    hive_files = HdfsUtils.read_list(hive_partition_path)
+    if not hive_files:
+        print(f"[ERROR] Hive 分区无数据文件，路径：{hive_partition_path}，跳过导出！")
+        sys.exit(1)
+    print(f"Hive 分区文件数：{len(hive_files)}，路径：{hive_partition_path}，继续导出")
+
+    # 2) Table names: the yearly table is fixed and has no year suffix.
+    export_tb_target = "us_aba_word_heat_last_365_day"   # live table
+    export_tb_copy = f"{export_tb_target}_copy"          # staging copy table
+
+    engine = DBUtil.get_db_engine(db_type, site_name)
+
+    # 3) Recreate the copy table (drop + create like ... including comments)
+    #    and change the array column to VARCHAR so Sqoop can write into it.
+    #    `including comments` rather than `including all`: column definitions
+    #    and comments are copied, indexes and constraints are not (same
+    #    approach as dwt_aba_last365, per the original note).
+    with engine.connect() as connection:
+        sql = f"""
+            drop table if exists {export_tb_copy};
+            create table if not exists {export_tb_copy}
+            (
+                like {export_tb_target} including comments
+            );
+            ALTER TABLE {export_tb_copy} ALTER COLUMN theme_ch_list TYPE VARCHAR(200);
+        """
+        print("================================执行 SQL================================")
+        print(sql)
+        connection.execute(sql)
+
+    # 4) Sqoop export. The column order is meant to match the Hive table
+    #    schema, with the partition column date_info last.
+    sh = CommonUtil.build_export_sh(
+        site_name=site_name,
+        db_type=db_type,
+        hive_tb="dws_aba_word_heat_last365",
+        export_tb=export_tb_copy,
+        col=[
+            "word",
+            "word_heat",
+            "word_heat_change_rate",
+            "theme_ch_list",
+            "created_time",
+            "updated_time",
+            "date_info"
+        ],
+        partition_dict={
+            "site_name": site_name,
+            "date_type": date_type,
+            "date_info": date_info
+        }
+    )
+
+    client = SSHUtil.get_ssh_client()
+    try:
+        SSHUtil.exec_command_async(client, sh, ignore_err=False)
+    finally:
+        # Always release the SSH connection, even when the export fails.
+        client.close()
+
+    # 5) Convert theme_ch_list on the COPY table back to VARCHAR[] before the
+    #    swap; string_to_array splits the comma-joined string. Doing it here
+    #    (as the monthly export script does) means the live table never
+    #    exposes the temporary VARCHAR(200) type, and a failed conversion
+    #    leaves the live table untouched.
+    #    Per the original note, words without a dictionary match arrive as
+    #    "-1" from PySpark and therefore become {-1}.
+    with engine.connect() as connection:
+        sql = f"""
+            ALTER TABLE {export_tb_copy}
+            ALTER COLUMN theme_ch_list TYPE VARCHAR[]
+            USING string_to_array(theme_ch_list, ',')::varchar[];
+        """
+        print("================================执行 SQL================================")
+        print(sql)
+        connection.execute(sql)
+
+    # 6) Swap the copy table with the live table. Per the original note,
+    #    DBUtil.exchange_tb swaps by rename, and cp_index_flag=True copies the
+    #    indexes onto the new live table (helper not visible here).
+    DBUtil.exchange_tb(
+        engine,
+        source_tb_name=export_tb_copy,
+        target_tb_name=export_tb_target,
+        cp_index_flag=True
+    )
+
+    # 7) Update the workflow_everyday table (export-finished marker) -
+    #    currently disabled. Modelled on sqoop_export/dwt_aba_last365.py.
+    # if test_flag != 'test':
+    #     mysql_engine = DBUtil.get_db_engine("mysql", "us")
+    #     with mysql_engine.connect() as connection:
+    #         sql = f"""
+    #             replace into workflow_everyday (
+    #                 site_name, report_date, status, status_val, table_name, date_type, page, is_end, remark, export_db_type
+    #             )
+    #             values (
+    #                 '{site_name}', '{date_info}', '导出pg完成', 14,
+    #                 '{export_tb_target}', '365_day', 'AbaWordHeatYear', '是',
+    #                 'ABA词频热度年表(最近12月,每月更新)', 'postgresql_cluster'
+    #             );
+    #         """
+    #         print("================================更新 workflow_everyday================================")
+    #         print(sql)
+    #         connection.execute(sql)
+
+    print("success")