Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
d4397441
Commit
d4397441
authored
Jun 17, 2026
by
hejiangming
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
词频分类代码
parent
b6374899
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
164 additions
and
0 deletions
+164
-0
dws_aba_word_freq_cate.py
Pyspark_job/dws/dws_aba_word_freq_cate.py
+0
-0
dws_aba_word_freq_cate.py
Pyspark_job/sqoop_export/dws_aba_word_freq_cate.py
+164
-0
No files found.
Pyspark_job/dws/dws_aba_word_freq_cate.py
0 → 100644
View file @
d4397441
This diff is collapsed.
Click to expand it.
Pyspark_job/sqoop_export/dws_aba_word_freq_cate.py
0 → 100644
View file @
d4397441
import
os
import
sys
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
from
utils.ssh_util
import
SSHUtil
from
utils.common_util
import
CommonUtil
,
DateTypes
from
utils.db_util
import
DBUtil
from
utils.hdfs_utils
import
HdfsUtils
if __name__ == '__main__':
    # Export one monthly partition of the Hive table dws_aba_word_freq_cate
    # into a PostgreSQL range-partitioned table, using Sqoop and a staging
    # ("copy") table that is swapped in at the end.
    #
    # Command-line arguments (read through CommonUtil.get_sys_arg):
    #   1    site_name
    #   2    date_type  -- must equal DateTypes.month.name
    #   3    date_info  -- "-"-separated; the first piece is used as the year
    #   last optional 'test' flag; when present the export targets the test DB
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    # The last argument is the test flag.
    test_flag = CommonUtil.get_sys_arg(len(sys.argv) - 1, None)
    print(f"执行参数为{sys.argv}")

    # Explicit raise instead of `assert`: assertions are removed when Python
    # runs with -O, which would silently let unsupported date types through.
    if date_type != DateTypes.month.name:
        raise ValueError("本脚本仅支持 month 类型导出")
    # Fail early on missing arguments; otherwise the literal "None" would be
    # baked into the HDFS path and the PostgreSQL table names below.
    if not site_name or not date_info:
        raise ValueError("缺少必要参数:site_name / date_info")

    if test_flag == 'test':
        db_type = 'postgresql_test'
        print("导出到测试库中")
    else:
        # Work-hours check (same as the other export scripts of the new ABA
        # flow): skip outside working hours or when the owner is off duty.
        CommonUtil.judge_is_work_hours(
            site_name=site_name,
            date_type=date_type,
            date_info=date_info,
            principal='hejiangming',
            priority=2,
            export_tools_type=1,
            belonging_to_process=f'分类词频_{date_type}',
        )
        db_type = 'postgresql_cluster'
        print("导出到PG集群中")

    # 1) Make sure the Hive partition has data files, so that an empty
    #    partition never replaces the existing PG data.
    hive_partition_path = (
        f"/home/big_data_selection/dws/dws_aba_word_freq_cate/"
        f"site_name={site_name}/date_type={date_type}/date_info={date_info}"
    )
    hive_files = HdfsUtils.read_list(hive_partition_path)
    if not hive_files:
        print(f"[ERROR] Hive 分区无数据文件,路径:{hive_partition_path},跳过导出!")
        sys.exit(1)
    print(f"Hive 分区文件数:{len(hive_files)},路径:{hive_partition_path},继续导出")

    engine = DBUtil.get_db_engine(db_type, site_name)

    # 2) Build the table names.
    #    master table (created by the DBA from the DDL,
    #    PARTITION BY RANGE(date_info)): us_aba_word_freq_cate_2026
    #    child partition:                us_aba_word_freq_cate_2026_04
    #    copy (staging) table:           us_aba_word_freq_cate_2026_04_copy
    suffix = str(date_info).replace("-", "_")
    year_str = CommonUtil.safeIndex(date_info.split("-"), 0, None)
    next_val = CommonUtil.get_next_val(date_type, date_info)
    export_base_tb = f"{site_name}_aba_word_freq_cate"
    export_master_tb = f"{export_base_tb}_{year_str}"
    export_table = f"{export_base_tb}_{suffix}"
    export_tb_copy = f"{export_table}_copy"

    # 3) Create this month's child partition on the master table.
    #    `if not exists` makes re-runs a no-op.
    sql_create_partition = f"""
        create table if not exists {export_table} partition of {export_master_tb}
        for values from ('{date_info}') to ('{next_val}');
    """
    DBUtil.engine_exec_sql(engine, sql_create_partition)

    # 4) Create the copy table (same structure as the child partition via
    #    `like ... including all`) and empty it. The copy table is a plain
    #    standalone table: Sqoop writes here first, and the partition exchange
    #    at the end swaps it in for the live partition to avoid a data gap.
    sql_copy = f"""
        create table if not exists {export_tb_copy}
        (
            like {export_table} including all
        );
        truncate table {export_tb_copy};
    """
    DBUtil.engine_exec_sql(engine, sql_copy)

    # 5) Change attr_dim on the copy table from VARCHAR[] to VARCHAR(500).
    #    Reason (per the original author): Sqoop cannot write into PG array
    #    columns, so the column is temporarily a plain VARCHAR that receives
    #    the comma-joined string from Hive; it is converted back before the
    #    exchange.
    sql_alter_to_varchar = f"""
        ALTER TABLE {export_tb_copy} ALTER COLUMN attr_dim TYPE VARCHAR(500);
    """
    DBUtil.engine_exec_sql(engine, sql_alter_to_varchar)

    # 6) Build the Sqoop export script. Column order follows the Hive
    #    dws_aba_word_freq_cate schema; site_name/date_type are encoded in the
    #    table name and not exported, date_info is exported as a normal column.
    export_cols = [
        "scope",
        "category_id",
        "category_level",
        "base_word",
        "display_word",
        "word_freq",
        "word_heat",
        "relate_st_num",
        "min_rank",
        "new_st_num",
        "word_heat_last_year",
        "word_heat_change_rate",
        "word_heat_last_month",
        "word_heat_rate_of_change",
        "yoy_up_ratio",
        "mom_up_ratio",
        "word_heat_change_last_1_month",
        "word_heat_change_1_month_ago",
        "word_heat_change_2_month_ago",
        "word_heat_change_3_month_ago",
        "word_heat_change_4_month_ago",
        "word_heat_change_5_month_ago",
        "top_aba_example",
        "brand_word_flag",
        "new_word_flag",
        "attr_dim",
        "word_cn",
        "heat_trend",
        "created_time",
        "updated_time",
        "date_info",
    ]
    sh = CommonUtil.build_export_sh(
        site_name=site_name,
        db_type=db_type,
        hive_tb="dws_aba_word_freq_cate",
        export_tb=export_tb_copy,
        col=export_cols,
        partition_dict={
            "site_name": site_name,
            "date_type": date_type,
            "date_info": date_info,
        },
    )

    client = SSHUtil.get_ssh_client()
    try:
        SSHUtil.exec_command_async(client, sh, ignore_err=False)
    finally:
        # Always release the SSH connection, even when the export fails.
        client.close()

    # 7) After Sqoop finishes, convert attr_dim on the copy table back to
    #    VARCHAR[]. string_to_array(...) splits the comma-joined string into
    #    an array; per the original author, words without a dictionary match
    #    were filled with "-1" upstream and therefore become {-1}.
    sql_alter_back = f"""
        ALTER TABLE {export_tb_copy}
        ALTER COLUMN attr_dim TYPE VARCHAR[]
        USING string_to_array(attr_dim, ',')::varchar[];
    """
    DBUtil.engine_exec_sql(engine, sql_alter_back)

    # 8) Partition exchange: the copy table replaces the live child partition.
    DBUtil.exchange_pg_part_tb(
        engine,
        source_tb_name=export_tb_copy,
        part_master_tb=export_master_tb,
        part_target_tb=export_table,
        cp_index_flag=False,
        part_val={"from": [date_info], "to": [next_val]},
    )

    # 9) Drop the copy table (after the exchange it holds the old data).
    DBUtil.engine_exec_sql(engine, f"drop table if exists {export_tb_copy};")

    print(f"==================表 {export_table} 导出完成==================================")
    print("success")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment