词频分类代码

d4397441 · hejiangming · b6374899 · d4397441 · d4397441
Commit d4397441 authored Jun 17, 2026 by hejiangming
Expand all Show whitespace changes
Inline Side-by-side

Showing with 164 additions and 0 deletions

dws_aba_word_freq_cate.py Pyspark_job/dws/dws_aba_word_freq_cate.py +0 -0

dws_aba_word_freq_cate.py Pyspark_job/sqoop_export/dws_aba_word_freq_cate.py +164 -0

No files found.
--- a/Pyspark_job/dws/dws_aba_word_freq_cate.py
+++ b/Pyspark_job/dws/dws_aba_word_freq_cate.py
--- a/Pyspark_job/sqoop_export/dws_aba_word_freq_cate.py
+++ b/Pyspark_job/sqoop_export/dws_aba_word_freq_cate.py
+import os
+import sys
+
+sys.path.append(os.path.dirname(sys.path[0]))
+from utils.ssh_util import SSHUtil
+from utils.common_util import CommonUtil, DateTypes
+from utils.db_util import DBUtil
+from utils.hdfs_utils import HdfsUtils
+
+if __name__ == '__main__':
+    site_name = CommonUtil.get_sys_arg(1, None)
+    date_type = CommonUtil.get_sys_arg(2, None)
+    date_info = CommonUtil.get_sys_arg(3, None)
+    # 最后一个参数为 test 标志
+    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
+    print(f"执行参数为{sys.argv}")
+
+    assert date_type == DateTypes.month.name, "本脚本仅支持 month 类型导出"
+
+    if test_flag == 'test':
+        db_type = 'postgresql_test'
+        print("导出到测试库中")
+    else:
+        # 工时校验（与新 ABA 流程其它导出脚本一致）：非工作时段或负责人不在班则跳过
+        CommonUtil.judge_is_work_hours(
+            site_name=site_name, date_type=date_type, date_info=date_info,
+            principal='hejiangming',
+            priority=2,
+            export_tools_type=1,
+            belonging_to_process=f'分类词频_{date_type}'
+        )
+        db_type = 'postgresql_cluster'
+        print("导出到PG集群中")
+
+    # 1) 校验 Hive 分区有数据，避免空分区导出后 PG 数据被清空
+    hive_partition_path = (
+        f"/home/big_data_selection/dws/dws_aba_word_freq_cate/"
+        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
+    )
+    hive_files = HdfsUtils.read_list(hive_partition_path)
+    if not hive_files:
+        print(f"[ERROR] Hive 分区无数据文件，路径：{hive_partition_path}，跳过导出！")
+        sys.exit(1)
+    print(f"Hive 分区文件数：{len(hive_files)}，路径：{hive_partition_path}，继续导出")
+
+    engine = DBUtil.get_db_engine(db_type, site_name)
+
+    # 2) 表名拼装
+    # master 表（DBA 按 DDL 建好，PARTITION BY RANGE(date_info)）：us_aba_word_freq_cate_2026
+    # 子分区表：us_aba_word_freq_cate_2026_04
+    # copy 表：us_aba_word_freq_cate_2026_04_copy
+    suffix = str(date_info).replace("-", "_")
+    year_str = CommonUtil.safeIndex(date_info.split("-"), 0, None)
+    next_val = CommonUtil.get_next_val(date_type, date_info)
+
+    export_base_tb = f"{site_name}_aba_word_freq_cate"
+    export_master_tb = f"{export_base_tb}_{year_str}"
+    export_table = f"{export_base_tb}_{suffix}"
+    export_tb_copy = f"{export_table}_copy"
+
+    # 3) 在 master 表上建当月子分区（首次跑当月才真正创建，重跑幂等）
+    sql_create_partition = f"""
+        create table if not exists {export_table} partition of {export_master_tb}
+        for values from ('{date_info}') to ('{next_val}');
+    """
+    DBUtil.engine_exec_sql(engine, sql_create_partition)
+
+    # 4) 创建 copy 表（继承子分区结构 like ... including all），并清空
+    # copy 表是独立普通表，Sqoop 先写到这里，最后通过分区交换替换正式子分区，避免空窗期
+    sql_copy = f"""
+        create table if not exists {export_tb_copy}
+        (
+            like {export_table} including all
+        );
+        truncate table {export_tb_copy};
+    """
+    DBUtil.engine_exec_sql(engine, sql_copy)
+
+    # 5) ALTER copy 表的 attr_dim 列类型 VARCHAR[] → VARCHAR(500)
+    # 原因：Sqoop 不支持向 PG 数组类型写数据，先临时改普通 VARCHAR，
+    # 让 Sqoop 把 Hive 端 "材质,颜色" 逗号串原样写进来；交换前再 ALTER 回 VARCHAR[]
+    sql_alter_to_varchar = f"""
+        ALTER TABLE {export_tb_copy} ALTER COLUMN attr_dim TYPE VARCHAR(500);
+    """
+    DBUtil.engine_exec_sql(engine, sql_alter_to_varchar)
+
+    # 6) 拼装 Sqoop 导出脚本（字段顺序与 Hive dws_aba_word_freq_cate schema 一致；
+    #    site_name/date_type 编进表名不导出列，date_info 作为普通列导出）
+    export_cols = [
+        "scope",
+        "category_id",
+        "category_level",
+        "base_word",
+        "display_word",
+        "word_freq",
+        "word_heat",
+        "relate_st_num",
+        "min_rank",
+        "new_st_num",
+        "word_heat_last_year",
+        "word_heat_change_rate",
+        "word_heat_last_month",
+        "word_heat_rate_of_change",
+        "yoy_up_ratio",
+        "mom_up_ratio",
+        "word_heat_change_last_1_month",
+        "word_heat_change_1_month_ago",
+        "word_heat_change_2_month_ago",
+        "word_heat_change_3_month_ago",
+        "word_heat_change_4_month_ago",
+        "word_heat_change_5_month_ago",
+        "top_aba_example",
+        "brand_word_flag",
+        "new_word_flag",
+        "attr_dim",
+        "word_cn",
+        "heat_trend",
+        "created_time",
+        "updated_time",
+        "date_info",
+    ]
+
+    sh = CommonUtil.build_export_sh(
+        site_name=site_name,
+        db_type=db_type,
+        hive_tb="dws_aba_word_freq_cate",
+        export_tb=export_tb_copy,
+        col=export_cols,
+        partition_dict={
+            "site_name": site_name,
+            "date_type": date_type,
+            "date_info": date_info
+        }
+    )
+
+    client = SSHUtil.get_ssh_client()
+    SSHUtil.exec_command_async(client, sh, ignore_err=False)
+    client.close()
+
+    # 7) Sqoop 写完后，ALTER copy 表的 attr_dim 回 VARCHAR[]
+    # USING string_to_array(...) 把逗号串 "材质,颜色" 拆成数组 {材质,颜色}
+    # 词典无匹配的词 PySpark 已填 "-1"，转换后是 {-1}，与 Java 占位约定一致
+    sql_alter_back = f"""
+        ALTER TABLE {export_tb_copy}
+        ALTER COLUMN attr_dim TYPE VARCHAR[]
+        USING string_to_array(attr_dim, ',')::varchar[];
+    """
+    DBUtil.engine_exec_sql(engine, sql_alter_back)
+
+    # 8) 分区交换：copy 表替换正式子分区，无空窗期
+    DBUtil.exchange_pg_part_tb(
+        engine,
+        source_tb_name=export_tb_copy,
+        part_master_tb=export_master_tb,
+        part_target_tb=export_table,
+        cp_index_flag=False,
+        part_val={"from": [date_info], "to": [next_val]}
+    )
+
+    # 9) 删除 copy 表（交换后 copy 表里是旧数据，留着没意义）
+    DBUtil.engine_exec_sql(engine, f"drop table if exists {export_tb_copy};")
+
+    print(f"==================表 {export_table} 导出完成==================================")
+    print("success")