Amazon-Selection-Data (abel_cjy)

Commit 1b5a6368, authored Oct 11, 2025 by chenyuanjie
Refactor sqoop import scripts: hide database connection information
Parent: 4d168d08
Showing 19 changed files with 657 additions and 1270 deletions (+657, -1270).
Pyspark_job/sqoop_import/ods_asin_detail.py                 +33   -154
Pyspark_job/sqoop_import/ods_asin_err_state.py              +15   -24
Pyspark_job/sqoop_import/ods_bs_category.py                 +40   -56
Pyspark_job/sqoop_import/ods_bs_category_asin_detail.py     +29   -75
Pyspark_job/sqoop_import/ods_bs_category_top100_asin.py     +45   -48
Pyspark_job/sqoop_import/ods_bsr_end.py                     +27   -43
Pyspark_job/sqoop_import/ods_new_releases_top100_asin.py    +45   -52
Pyspark_job/sqoop_import/ods_one_category_report.py         +34   -175
Pyspark_job/sqoop_import/ods_other_search_term_data.py      +41   -98
Pyspark_job/sqoop_import/ods_search_term_type.py            +33   -80
Pyspark_job/sqoop_import/ods_self_asin.py                   +24   -43
Pyspark_job/sqoop_import/ods_self_asin_detail.py            +67   -74
Pyspark_job/sqoop_import/ods_self_asin_related_traffic.py   +25   -20
Pyspark_job/sqoop_import/ods_seller_account_feedback.py     +22   -34
Pyspark_job/sqoop_import/ods_seller_account_syn.py          +30   -58
Pyspark_job/sqoop_import/ods_seller_asin_account.py         +38   -61
Pyspark_job/sqoop_import/ods_seller_asin_product.py         +48   -66
Pyspark_job/sqoop_import/ods_st_quantity_being_sold.py      +40   -67
Pyspark_job/sqoop_import/ods_theme.py                       +21   -42
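Every script in this commit is rewritten around the same pattern. The removed code checked the source schema with CommonUtil.check_schema_before_import, generated a sqoop shell script with CommonUtil.build_import_sh (with the JDBC connection resolved per db_type in each job), deleted the target HDFS path, ran the script over SSH via SSHUtil, then repaired metadata and row counts with CommonUtil.after_import / check_import_sync_num. The new code replaces all of that with two calls: engine = get_remote_engine(site_name=..., db_type=...) and engine.sqoop_raw_import(query=..., hive_table=..., hdfs_path=..., partitions=..., m=..., split_by=...). The utils/secure_db_client module that provides get_remote_engine is not part of this diff, so the following is only a minimal sketch of what such a wrapper could look like: the call signatures are taken from the scripts below, while the credential lookup, the environment-variable names and the subprocess call are invented for illustration.

# Sketch only: utils/secure_db_client is not included in this commit, so this is an
# assumption reconstructed from the call sites in the scripts below.
import os
import subprocess


def _lookup_connection(site_name, db_type):
    # Hypothetical stand-in for the real credential source: read from the environment
    # so that no JDBC URL, username or password ever appears in a job script.
    prefix = f"{site_name}_{db_type}".upper()
    return {
        "jdbc_url": os.environ[f"{prefix}_JDBC_URL"],
        "username": os.environ[f"{prefix}_USER"],
        "password_file": os.environ[f"{prefix}_PASSWORD_FILE"],
    }


class RemoteEngine:
    def __init__(self, site_name, db_type):
        self.site_name = site_name
        self.db_type = db_type
        self._conn = _lookup_connection(site_name, db_type)

    def read_sql(self, sql):
        # Used by ods_one_category_report.py; a real client would run the query through
        # the secured connection and return a result object with .scalar(). Omitted here.
        raise NotImplementedError

    def sqoop_raw_import(self, query, hive_table, hdfs_path, partitions, m=1, split_by=None):
        # Build and run a sqoop free-form-query import; `query` must contain $CONDITIONS.
        cmd = [
            "sqoop", "import",
            "--connect", self._conn["jdbc_url"],
            "--username", self._conn["username"],
            "--password-file", self._conn["password_file"],
            "--query", query,
            "--target-dir", hdfs_path,
            "--num-mappers", str(m),
        ]
        if split_by:
            cmd += ["--split-by", split_by]
        subprocess.run(cmd, check=True)
        # hive_table / partitions would then be used to register the partition and repair
        # metadata (what CommonUtil.after_import did in the removed code); omitted here.


def get_remote_engine(site_name, db_type):
    return RemoteEngine(site_name, db_type)

The point of the indirection matches the commit message: the job scripts keep only site_name and db_type, and the connection details stay wherever the client resolves them.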
Pyspark_job/sqoop_import/ods_asin_detail.py  (view file @ 1b5a6368)

"""
@Author      : HuangJian
@Description : ASIN detail table (weekly)
@SourceTable : us_asin_detail_2023_18
@SinkTable   : ods_asin_detail
@CreateTime  : 2022/05/18 14:55
@UpdateTime  : 2022/05/18 14:55
"""
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    ...
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
    db_type = 'postgresql_14'
    import_table = f"{site_name}_asin_detail_month_{d1}_{d2}"
    hive_table = "ods_asin_detail"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    cols = "id, asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory, " \
           "category, volume, weight, rank, launch_time, created_time as created_at, updated_time as updated_at, " \
           "category_state, img_num, img_type, activity_type, one_two_val, three_four_val, five_six_val, eight_val, " \
           "qa_num, one_star, two_star, three_star, four_star, five_star, low_star, together_asin, brand, ac_name, " \
           "material, node_id, data_type, sp_num, describe, weight_str, package_quantity, pattern_name, follow_sellers, " \
           "product_description, buy_sales, image_view, spider_int, lob_asin_json, seller_json, customer_reviews_json, " \
           "product_json, product_detail_json, review_ai_text, review_label_json, sp_initial_seen_asins_json, " \
           "sp_4stars_initial_seen_asins_json, sp_delivery_initial_seen_asins_json, compare_similar_asin_json, " \
           "together_asin_json, min_match_asin_json, variat_num, current_asin, img_list, variat_list, parent_asin, " \
           "bundles_this_asins_json, video_m3u8_url, result_list_json, bundle_asin_component_json"

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=f"SELECT {cols} FROM {import_table} WHERE 1=1 and $CONDITIONS",
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=50,
        split_by='id'
    )
    pass
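The scripts keep calling CommonUtil.build_hdfs_path(hive_table, partition_dict=...) unchanged. Its implementation is not shown in this diff; judging from how the result is handed to sqoop as the target directory, it evidently maps the partition dict onto Hive-style key=value path segments. A minimal sketch of that idea follows; the base directory is a made-up placeholder, not the project's real warehouse path.

def build_hdfs_path(hive_table, partition_dict, base_dir="/user/hive/warehouse/selection.db"):
    # base_dir is a placeholder; the real base path lives inside CommonUtil.
    segments = [f"{key}={value}" for key, value in partition_dict.items()]
    return "/".join([base_dir, hive_table] + segments)


print(build_hdfs_path("ods_asin_detail",
                      {"site_name": "us", "date_type": "month", "date_info": "2025-09"}))
# /user/hive/warehouse/selection.db/ods_asin_detail/site_name=us/date_type=month/date_info=2025-09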
Pyspark_job/sqoop_import/ods_asin_err_state.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.db_util import DbTypes
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    db_type = DbTypes.postgresql.name
    import_tb = f"{site_name}_all_syn_st_asin"
    query = f"""
    select asin,
           state,
    ...
    where state = 4
    and \$CONDITIONS
    """
    hive_tb = "ods_asin_err_state"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_bs_category.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    db_type = "mysql"
    import_tb = f"{site_name}_bs_category"
    query = f"""
    select
        id,
        p_id,
        ch_name,
        en_name,
        nodes_num,
        path,
        is_show,
        one_category_id,
        and_en_name,
        leaf_node,
        delete_time,
        full_name,
        category_id,
        category_parent_id,
        category_first_id,
        category_state,
        redirect_flag,
        redirect_first_id,
        created_at,
        updated_at
    from {import_tb}
    where 1 = 1
    and \$CONDITIONS
    """

    hive_tb = "ods_bs_category"
    partition_dict = {
        "site_name": site_name,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_bs_category_asin_detail.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    ...
    assert site_name is not None, "site_name 不能为空!"
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
    db_type = 'postgresql_14'
    import_tb = f"{site_name}_bs_category_asin_detail_month_{d1}_{d2}"
    query = f"""
    select
        id, asin, null as week, best_sellers_rank, created_time as created_at, updated_time as updated_at, last_herf, all_best_sellers_href
    from {import_tb}
    where 1=1
    and \$CONDITIONS
    """

    hive_tb = "ods_bs_category_asin_detail"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=50,
        split_by='id'
    )
    pass
Pyspark_job/sqoop_import/ods_bs_category_top100_asin.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_info = CommonUtil.get_sys_arg(2, None)
    assert site_name is not None, "sitename 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    db_type = "mysql"
    if date_info == 'all':
        query = f"""
        select
            id,
            asin,
            cate_1_id,
            cate_current_id,
            bsr_rank,
            rating,
            total_comments,
            created_at as updated_at,
            date_info,
            category_id
        from {site_name}_bs_category_top100_asin
        where 1 = 1
        and \$CONDITIONS
        """
    else:
        query = f"""
        select
            id,
            asin,
            cate_1_id,
            cate_current_id,
            bsr_rank,
            rating,
            total_comments,
            created_at as updated_at,
            date_info,
            category_id
        from {site_name}_bs_category_top100_asin
        where 1 = 1
        and date_info = '{date_info}'
        and \$CONDITIONS
        """

    hive_tb = "ods_bs_category_top100_asin"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_bsr_end.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    db_type = "mysql"
    import_tb = f"{site_name}_bsr_end"
    query = f"""
    select
        id,
        rank,
        bsr_name,
        created_at,
        updated_at,
        category_id
    from {import_tb}
    where 1 = 1
    and \$CONDITIONS
    """

    hive_tb = "ods_bsr_end"
    partition_dict = {
        "site_name": site_name,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_new_releases_top100_asin.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_info = CommonUtil.get_sys_arg(2, None)
    assert site_name is not None, "sitename 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    db_type = "mysql"
    if date_info == 'all':
        query = f"""
        select
            id,
            asin,
            cate_1_id,
            cate_current_id,
            bsr_rank,
            rating,
            total_comments,
            created_at as updated_at,
            date_info,
            category_id
        from {site_name}_new_releases_top100_asin
        where 1 = 1
        and \$CONDITIONS
        """
    else:
        query = f"""
        select
            id,
            asin,
            cate_1_id,
            cate_current_id,
            bsr_rank,
            rating,
            total_comments,
            created_at as updated_at,
            date_info,
            category_id
        from {site_name}_new_releases_top100_asin
        where 1 = 1
        and date_info = '{date_info}'
        and \$CONDITIONS
        """

    hive_tb = "ods_new_releases_top100_asin"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_one_category_report.py  (view file @ 1b5a6368)

(Besides the common refactor, the commit deletes this file's local vertify_data() helper, which used SparkUtil, DBUtil and the hive_field_verify_config table to run per-field threshold checks after import and send warnings over WeChat.)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    ...
    assert site_name is not None, "site_name 不能为空!"
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"
    assert date_type in ('week', 'month', 'month_week'), "入参date_type类型存在问题,请检查!"

    db_type = "mysql"
    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    # This is a monthly table, so when a week period is passed in, look up the month that the week belongs to.
    if (date_type == 'week') and (date_info is not None):
        sql = f"""select `year_month` from date_20_to_30 where `year_week`='{date_info}' and week_day = 1 """
        result = engine.read_sql(sql=sql)
        year_month = result.scalar()
        print(f"当前传入的周期为周维度,date_type:{date_type},date_info:{date_info};对应转换月为:{year_month}")
        date_type = 'month'
        date_info = year_month

    year, month = date_info.split("-")
    import_tb = f"{site_name}_one_category_report"
    query = f"""
    select
        id,
        cate_1_id,
        name,
        rank,
        orders,
        orders_day,
        `year_month`,
        week,
        created_at,
        updated_at,
        category_id
    from {import_tb}
    where `year_month` = '{year}_{int(month)}'
    and \$CONDITIONS
    """

    hive_tb = "ods_one_category_report"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
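ods_one_category_report.py is the only script in the commit that also reads from the source database through the new client (to map a week to its month) instead of only exporting. A hedged usage illustration with made-up values follows: the date_20_to_30 calendar table, read_sql and scalar() come from the code above, while the concrete week and the returned month are invented.

# Illustration only; '2025-37' and '2025-09' are example values, not project data.
engine = get_remote_engine(site_name="us", db_type="mysql")
sql = "select `year_month` from date_20_to_30 where `year_week`='2025-37' and week_day = 1 "
year_month = engine.read_sql(sql=sql).scalar()  # e.g. '2025-09'; the job then runs as a month partition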
Pyspark_job/sqoop_import/ods_other_search_term_data.py  (view file @ 1b5a6368)

"""
@Author      : HuangJian
@Description : Per-site store ASIN detail table - monthly crawl
@SourceTable : us_other_search_term_data_2023_18
@SinkTable   : ods_other_search_term_data
@CreateTime  : 2022/05/23 09:55
@UpdateTime  : 2022/05/23 09:55
"""
import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    ...
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    db_type = 'postgresql_14'
    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
    import_table = f"{site_name}_other_search_term_month_{d1}_{d2}"
    sql_query = f"""
    select
        id,
        search_term,
        asin,
        page,
        buy_data,
        label,
        created_time,
        updated_time,
        asin_brand
    from {import_table}
    where 1=1
    and \$CONDITIONS
    """

    if site_name == 'us':
        map_num = 20
    else:
        map_num = 5

    hive_table = "ods_other_search_term_data"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=sql_query,
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=map_num,
        split_by='id'
    )
    pass
Pyspark_job/sqoop_import/ods_search_term_type.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    ...
        print("uk站点已无ac类型词,退出执行!")
        sys.exit(0)

    if st_type in ["zr", "sp"]:
        cols = "search_term,asin,page,page_row,created_time,updated_time,id"
    elif st_type in ["sb", "tr"]:
    ...
    else:
        cols = "search_term,asin,page,created_time,updated_time,id"

    db_type = 'postgresql_14'
    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
    import_tb = f"{site_name}_search_term_rank_{st_type}_month_{d1}_{d2}"

    query = f"""
    select {cols} from {import_tb} where 1 = 1 and \$CONDITIONS
    """
    print(f"当前链接的数据库为:{db_type},同步的表为:{import_tb}")

    hive_tb = f"ods_search_term_{st_type}"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    if st_type in ['zr']:
        if site_name == "us":
            map_num = 40
        else:
            map_num = 15
    elif st_type in ["sb", "sp"]:
        if site_name == "us":
            map_num = 6
        else:
            map_num = 2
    else:
        map_num = 1

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=map_num,
        split_by='id'
    )
    pass
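The m / split_by values chosen above map onto sqoop's --num-mappers and --split-by options: with more than one mapper, sqoop replaces the mandatory $CONDITIONS token in the free-form query with a per-mapper range predicate on the split column. A rough, self-contained illustration of that substitution follows; the table name and the id range are example values, and real sqoop derives the bounds from MIN/MAX of the split column rather than taking them as arguments.

QUERY = ("select search_term, asin, page, page_row, created_time, updated_time, id "
         "from us_search_term_rank_zr_month_2025_09 where 1 = 1 and $CONDITIONS")


def mapper_queries(query, split_by, lower, upper, mappers):
    # Mimics how sqoop fans one free-form query out to `mappers` bounded queries.
    step = max(1, (upper - lower + mappers) // mappers)
    for i in range(mappers):
        lo = lower + i * step
        hi = min(lo + step, upper + 1)
        yield query.replace("$CONDITIONS", f"{split_by} >= {lo} AND {split_by} < {hi}")


for q in mapper_queries(QUERY, "id", 1, 1_000_000, 4):
    print(q)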
Pyspark_job/sqoop_import/ods_self_asin.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    query = f"""
    select
        id,
        asin,
        created_at as created_time,
        updated_at as updated_time
    from {site_name}_self_asin
    where 1 = 1
    and \$CONDITIONS
    """

    db_type = "mysql"
    hive_tb = "ods_self_asin"
    partition_dict = {
        "site_name": site_name,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_self_asin_detail.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.db_util import DbTypes
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
    assert site_name is not None, "site_name 不能为空!"
    assert date_info is not None, "date_info 不能为空!"

    year = CommonUtil.reformat_date(date_info, "%Y-%m-%d", "%Y", )

    query = f"""
    select
        asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory,
        category, volume, weight, rank, launch_time, video_url, add_url, material, created_at, img_num,
        img_type, qa_num, brand, ac_name, node_id, sp_num, mpn, online_time, describe, one_star, two_star,
        three_star, four_star, five_star, low_star, asin_type, is_coupon, search_category, weight_str,
        account_name, other_seller_name, account_id
    from {site_name}_self_asin_detail_{year}
    where 1 = 1
    and site = '{site_name}'
    and bsr_date_info = '{date_info}'
    and date_info >= '{date_info}'
    and \$CONDITIONS
    """

    hive_tb = "ods_self_asin_detail"
    db_type = DbTypes.postgresql.name
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_tb,
        hdfs_path=hdfs_path,
        partitions=partition_dict
    )
    pass
Pyspark_job/sqoop_import/ods_self_asin_related_traffic.py  (view file @ 1b5a6368)

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"

    db_type = 'mysql'
    import_table = f"{site_name}_self_asin_detail"
    sql_query = f"""
    select
        id,
    ...
    and \$CONDITIONS
    """

    hive_table = "ods_self_asin_related_traffic"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    if site_name == 'us':
        map_num = 25
    else:
        map_num = 1

    engine = get_remote_engine(
        site_name=site_name,
        db_type=db_type
    )

    engine.sqoop_raw_import(
        query=sql_query,
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        m=map_num,
        split_by='id'
    )
    pass
Pyspark_job/sqoop_import/ods_seller_account_feedback.py
View file @ 1b5a6368
...
@@ -2,11 +2,10 @@ import os
 import sys
 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
 from utils.common_util import CommonUtil
 from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
...
@@ -16,17 +15,6 @@ if __name__ == '__main__':
     assert date_type is not None, "date_type 不能为空!"
     assert date_info is not None, "date_info 不能为空!"
-    hive_table = f"ods_seller_account_feedback"
-    partition_dict = {
-        "site_name": site_name,
-        "date_type": date_type,
-        "date_info": date_info
-    }
-    # verify the target HDFS path
-    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
     suffix = str(date_info).replace("-", "_")
     import_table = f"{site_name}_seller_account_feedback_{suffix}"
     if date_type == DateTypes.month.name and date_info >= '2023-08':
...
@@ -51,24 +39,24 @@ if __name__ == '__main__':
         and \$CONDITIONS
     """
-    # schema and data validation
-    CommonUtil.check_schema_before_import(db_type=db_type,
-                                          site_name=site_name,
-                                          query=sql_query,
-                                          hive_tb_name=hive_table,
-                                          msg_usr=['chenyuanjie'])
-    # generate the import script
-    import_sh = CommonUtil.build_import_sh(site_name=site_name,
-                                           db_type=db_type,
-                                           query=sql_query,
-                                           hdfs_path=hdfs_path)
-    # delete the existing HDFS data before importing
-    HdfsUtils.delete_hdfs_file(hdfs_path)
-    # create an SSH client object to run the command
-    client = SSHUtil.get_ssh_client()
-    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-    # create the lzo index and repair metadata
-    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-    # close the connection
-    client.close()
+    hive_table = "ods_seller_account_feedback"
+    partition_dict = {
+        "site_name": site_name,
+        "date_type": date_type,
+        "date_info": date_info
+    }
+    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+    engine.sqoop_raw_import(
+        query=sql_query,
+        hive_table=hive_table,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict
+    )
+    pass
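The feedback import above reads a month-suffixed source table and writes to a three-level partition. The date handling is pure string manipulation, so it can be illustrated standalone; a small sketch, assuming month-style arguments such as "2024-05" (the sample values below are illustrative only).

# sketch of the date handling used above, with illustrative sample values
site_name = "us"
date_type = "month"
date_info = "2024-05"

suffix = str(date_info).replace("-", "_")                       # "2024_05"
import_table = f"{site_name}_seller_account_feedback_{suffix}"  # us_seller_account_feedback_2024_05

partition_dict = {
    "site_name": site_name,
    "date_type": date_type,
    "date_info": date_info,
}
print(import_table, partition_dict)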
Pyspark_job/sqoop_import/ods_seller_account_syn.py
View file @ 1b5a6368
-"""
-@Author : HuangJian
-@Description : full per-site mapping of store (account) name to seller id -- takes a single site as the parameter
-@SourceTable : us_seller_account_feedback
-@SinkTable : ods_seller_account_feedback
-@CreateTime : 2022/05/19 14:55
-@UpdateTime : 2022/05/19 14:55
-"""
 import os
 import sys
 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
-from utils.db_util import DBUtil
-from utils.common_util import CommonUtil
-from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
-from utils.spark_util import SparkUtil
+from utils.common_util import CommonUtil
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
     assert site_name is not None, "site_name 不能为空!"
-    hive_table = f"ods_seller_account_syn"
+    db_type = 'mysql'
+    import_table = f"{site_name}_seller_account_syn_distinct"
+    sql_query = f"""
+        select
+        id,
+        account_name,
+        url,
+        state,
+        created_at,
+        updated_at,
+        seller_id
+        from {import_table}
+        where 1=1
+        and \$CONDITIONS
+    """
+    hive_table = "ods_seller_account_syn"
     partition_dict = {
         "site_name": site_name
     }
-    # verify the target HDFS path
     hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
-    import_table = f"{site_name}_seller_account_syn_distinct"
-    db_type = 'mysql'
-    sql_query = f"""
-        select
-        id,
-        account_name,
-        url,
-        state,
-        created_at,
-        updated_at,
-        seller_id
-        from {import_table}
-        where 1=1
-        and \$CONDITIONS
-    """
-    # schema and data validation
-    CommonUtil.check_schema_before_import(db_type=db_type,
-                                          site_name=site_name,
-                                          query=sql_query,
-                                          hive_tb_name=hive_table,
-                                          msg_usr=['chenyuanjie'])
-    # generate the import script
-    import_sh = CommonUtil.build_import_sh(site_name=site_name,
-                                           db_type=db_type,
-                                           query=sql_query,
-                                           hdfs_path=hdfs_path)
-    # delete the existing HDFS data before importing
-    HdfsUtils.delete_hdfs_file(hdfs_path)
-    # create an SSH client object to run the command
-    client = SSHUtil.get_ssh_client()
-    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-    # create the lzo index and repair metadata
-    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-    # close the connection
-    client.close()
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+    engine.sqoop_raw_import(
+        query=sql_query,
+        hive_table=hive_table,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict
+    )
+    pass
Pyspark_job/sqoop_import/ods_seller_asin_account.py
View file @ 1b5a6368
-"""
-@Author : HuangJian
-@Description : full per-site mapping of store (account) name to ASIN -- takes a single site as the parameter
-@SourceTable : us_seller_account_feedback
-@SinkTable : ods_seller_account_feedback
-@CreateTime : 2022/05/19 14:55
-@UpdateTime : 2022/05/19 14:55
-"""
 import os
 import sys
 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
-from utils.db_util import DBUtil
-from utils.common_util import CommonUtil
-from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
-from utils.spark_util import SparkUtil
+from utils.common_util import CommonUtil
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
     assert site_name is not None, "site_name 不能为空!"
-    hive_table = f"ods_seller_asin_account"
+    db_type = 'mysql'
+    import_table = f"{site_name}_seller_asin_account"
+    sql_query = f"""
+        select
+        id,
+        account_name,
+        asin,
+        created_at,
+        updated_at,
+        seller_id
+        from {import_table}
+        where 1=1
+        and \$CONDITIONS
+    """
+    hive_table = "ods_seller_asin_account"
     partition_dict = {
         "site_name": site_name
     }
-    # verify the target HDFS path
     hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
-    import_table = f"{site_name}_seller_asin_account"
-    db_type = 'mysql'
-    sql_query = f"""
-        select
-        id,
-        account_name,
-        asin,
-        created_at,
-        updated_at,
-        seller_id
-        from {import_table}
-        where 1=1
-        and \$CONDITIONS
-    """
-    # schema and data validation
-    CommonUtil.check_schema_before_import(db_type=db_type,
-                                          site_name=site_name,
-                                          query=sql_query,
-                                          hive_tb_name=hive_table,
-                                          msg_usr=['chenyuanjie'])
-    # generate the import script
-    import_sh = CommonUtil.build_import_sh(site_name=site_name,
-                                           db_type=db_type,
-                                           query=sql_query,
-                                           hdfs_path=hdfs_path,
-                                           map_num=10,
-                                           key='id')
-    # delete the existing HDFS data before importing
-    HdfsUtils.delete_hdfs_file(hdfs_path)
-    # create an SSH client object to run the command
-    client = SSHUtil.get_ssh_client()
-    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-    # create the lzo index and repair metadata
-    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-    # close the connection
-    client.close()
+    if site_name == 'us':
+        map_num = 100
+    else:
+        map_num = 40
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+    engine.sqoop_raw_import(
+        query=sql_query,
+        hive_table=hive_table,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict,
+        m=map_num,
+        split_by='id'
+    )
+    pass
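This file also replaces the fixed sqoop parallelism (map_num=10) with a per-site value, 100 mappers for us and 40 for all other sites, passed as m= together with split_by='id'. A hedged sketch of the same choice expressed as a table-driven lookup; the SITE_MAP_NUM dict and resolve_map_num helper are illustrative only and do not exist in this repo.

# illustrative alternative to the if/else used above
SITE_MAP_NUM = {"us": 100}   # hypothetical lookup; the default covers every other site
DEFAULT_MAP_NUM = 40

def resolve_map_num(site_name: str) -> int:
    """Return the sqoop parallelism (-m) to use for a given site."""
    return SITE_MAP_NUM.get(site_name, DEFAULT_MAP_NUM)

assert resolve_map_num("us") == 100
assert resolve_map_num("de") == 40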
Pyspark_job/sqoop_import/ods_seller_asin_product.py
View file @ 1b5a6368
-"""
-@Author : HuangJian
-@Description : per-site store ASIN detail table -- crawled monthly
-@SourceTable : us_asin_detail_product_2023
-@SinkTable : ods_asin_detail_product
-@CreateTime : 2022/05/19 14:55
-@UpdateTime : 2022/05/19 14:55
-"""
 import os
 import sys
 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
-from utils.db_util import DBUtil
 from utils.common_util import CommonUtil
 from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
-from utils.spark_util import SparkUtil
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
...
@@ -29,21 +17,8 @@ if __name__ == '__main__':
     # this table is now synced monthly, so validate that date_type is month
     assert date_type == DateTypes.month.name, "date_type类型不对,应为month"
-    hive_table = f"ods_asin_detail_product"
-    partition_dict = {
-        "site_name": site_name,
-        "date_type": date_type,
-        "date_info": date_info
-    }
-    # verify the target HDFS path
-    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
-    # split the date
     suffix = str(date_info).replace("-", "_")
     import_table = f"{site_name}_seller_asin_product_{suffix}"
-    # db_type = 'postgresql'
     if date_type == DateTypes.month.name and date_info >= '2023-08':
         db_type = 'postgresql_14'
     else:
...
@@ -51,44 +26,51 @@ if __name__ == '__main__':
     print("当前链接的数据库为:", db_type)
     sql_query = f"""
         select
         id,
         null as account_id,
         asin,
         title,
         img_url,
         price,
         rating,
         total_comments,
         null as week,
         row_num,
         created_at,
         updated_at,
         null as month,
         seller_id
         from {import_table}
         where 1=1
         and \$CONDITIONS
     """
-    # schema and data validation
-    CommonUtil.check_schema_before_import(db_type=db_type,
-                                          site_name=site_name,
-                                          query=sql_query,
-                                          hive_tb_name=hive_table,
-                                          msg_usr=['chenyuanjie'])
-    # generate the import script
-    import_sh = CommonUtil.build_import_sh(site_name=site_name,
-                                           db_type=db_type,
-                                           query=sql_query,
-                                           hdfs_path=hdfs_path)
-    # delete the existing HDFS data before importing
-    HdfsUtils.delete_hdfs_file(hdfs_path)
-    # create an SSH client object to run the command
-    client = SSHUtil.get_ssh_client()
-    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-    # create the lzo index and repair metadata
-    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-    # close the connection
-    client.close()
+    hive_table = "ods_asin_detail_product"
+    partition_dict = {
+        "site_name": site_name,
+        "date_type": date_type,
+        "date_info": date_info
+    }
+    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+    if site_name == 'us':
+        map_num = 8
+    else:
+        map_num = 3
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+    engine.sqoop_raw_import(
+        query=sql_query,
+        hive_table=hive_table,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict,
+        m=map_num,
+        split_by='id'
+    )
+    pass
Pyspark_job/sqoop_import/ods_st_quantity_being_sold.py
View file @ 1b5a6368
 import os
 import sys
 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
-from utils.common_util import CommonUtil
-from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
+from utils.common_util import CommonUtil
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
...
@@ -17,75 +14,51 @@ if __name__ == '__main__':
     assert date_type is not None, "date_type 不能为空!"
     assert date_info is not None, "date_info 不能为空!"
-    hive_table = f"ods_st_quantity_being_sold"
+    db_type = 'postgresql_14'
+    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
+    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
+    import_table = f"{site_name}_brand_analytics_month_{d1}_{d2}"
+    sql_query = f"""
+        select
+        id,
+        search_term,
+        quantity_being_sold,
+        date_info as date_flag,
+        created_time,
+        updated_time,
+        quantity_being_sold_str,
+        result_count,
+        departments
+        from {import_table}
+        where 1=1
+        and \$CONDITIONS
+    """
+    hive_table = "ods_st_quantity_being_sold"
     partition_dict = {
         "site_name": site_name,
         "date_type": date_type,
         "date_info": date_info
     }
-    # verify the target HDFS path
     hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
-    # split the date
-    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-    if date_type == DateTypes.week.name:
-        d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-        if site_name == 'us' and date_info >= '2023-18':
-            db_type = 'postgresql'
-            if date_info >= '2023-34':
-                db_type = 'postgresql_14'
-        else:
-            db_type = 'postgresql_14'
-        import_table = f"{site_name}_brand_analytics_{d1}_{d2}"
-    if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
-        if site_name in ['us', 'uk', 'de']:
-            db_type = 'postgresql_14'
-            # pg partition suffixes are zero-padded, e.g. 01, 02, 03
-            d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-            import_table = f"{site_name}_brand_analytics_month_{d1}_{d2}"
-        else:
-            print(f"其他站点{date_type}数据暂未明确,请检查是否dateType传输有误")
-            exit()
-    sql_query = f"""
-        select
-        id,
-        search_term,
-        quantity_being_sold,
-        date_info as date_flag,
-        created_time,
-        updated_time,
-        quantity_being_sold_str,
-        result_count,
-        departments
-        from {import_table}
-        where 1=1
-        and \$CONDITIONS
-    """
-    # schema and data validation
-    if site_name not in ('fr', 'it', 'es'):
-        CommonUtil.check_schema_before_import(db_type=db_type,
-                                              site_name=site_name,
-                                              query=sql_query,
-                                              hive_tb_name=hive_table,
-                                              msg_usr=['fangxingjun', 'chenyuanjie'])
-    # generate the import script
-    import_sh = CommonUtil.build_import_sh(site_name=site_name,
-                                           db_type=db_type,
-                                           query=sql_query,
-                                           hdfs_path=hdfs_path)
-    # delete the existing HDFS data before importing
-    HdfsUtils.delete_hdfs_file(hdfs_path)
-    # create an SSH client object to run the command
-    client = SSHUtil.get_ssh_client()
-    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-    # create the lzo index and repair metadata
-    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-    # close the connection
-    client.close()
+    if site_name == 'us':
+        map_num = 4
+    else:
+        map_num = 1
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+    engine.sqoop_raw_import(
+        query=sql_query,
+        hive_table=hive_table,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict,
+        m=map_num,
+        split_by='id'
+    )
+    pass
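The old version of this script chose the Postgres alias from date_type, date_info and site before building import_table; the new version fixes db_type = 'postgresql_14' and always reads the month table. A hedged sketch that restates the old selection rules as a standalone function, in case the week-level branch is ever needed again; the function name resolve_db_type is illustrative and raising ValueError stands in for the original print-and-exit behaviour.

# illustrative re-statement of the branching the pre-refactor script performed inline
def resolve_db_type(site_name: str, date_type: str, date_info: str) -> str:
    """Pick the Postgres alias the way the old ods_st_quantity_being_sold.py did."""
    if date_type == "week":
        if site_name == "us" and date_info >= "2023-18":
            # us weeks moved to postgresql_14 starting with week 34 of 2023
            return "postgresql_14" if date_info >= "2023-34" else "postgresql"
        return "postgresql_14"
    if date_type in ("month", "month_week"):
        if site_name in ["us", "uk", "de"]:
            return "postgresql_14"
        raise ValueError(f"no data source defined for site={site_name}, date_type={date_type}")
    raise ValueError(f"unsupported date_type: {date_type}")

assert resolve_db_type("us", "week", "2023-20") == "postgresql"
assert resolve_db_type("us", "week", "2023-40") == "postgresql_14"
assert resolve_db_type("uk", "month", "2024-01") == "postgresql_14"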
Pyspark_job/sqoop_import/ods_theme.py
View file @ 1b5a6368
...
@@ -2,62 +2,41 @@ import os
 import sys
 sys.path.append(os.path.dirname(sys.path[0]))
-from utils.ssh_util import SSHUtil
 from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

 if __name__ == '__main__':
     site_name = CommonUtil.get_sys_arg(1, None)
     assert site_name is not None, "site_name 不能为空!"
-    hive_tb = "ods_theme"
     db_type = "mysql"
-    partition_dict = {
-        "site_name": site_name
-    }
-    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-    print(f"hdfs_path is {hdfs_path}")
     import_tb = f"{site_name}_theme"
-    cols = "id,theme_type_en,theme_type_ch,theme_en,theme_ch,created_at,updated_at"
+    cols = "id, theme_type_en, theme_type_ch, theme_en, theme_ch, created_at, updated_at"
     query = f"""
         select
         {cols}
         from {import_tb}
         where 1 = 1
         and \$CONDITIONS
     """
-    empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
-                                                                   site_name=site_name,
-                                                                   query=query,
-                                                                   hive_tb_name=hive_tb,
-                                                                   msg_usr=['chenyuanjie']
-                                                                   )
-    assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
-    if not empty_flag:
-        sh = CommonUtil.build_import_sh(site_name=site_name,
-                                        db_type=db_type,
-                                        query=query,
-                                        hdfs_path=hdfs_path)
-        # delete before importing
-        HdfsUtils.delete_hdfs_file(hdfs_path)
-        client = SSHUtil.get_ssh_client()
-        SSHUtil.exec_command_async(client, sh, ignore_err=False)
-        CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-        client.close()
-        # post-import check -- verify data consistency
-        CommonUtil.check_import_sync_num(db_type=db_type,
-                                         partition_dict=partition_dict,
-                                         import_query=query,
-                                         hive_tb_name=hive_tb,
-                                         msg_usr=['chenyuanjie'])
+    hive_tb = "ods_theme"
+    partition_dict = {
+        "site_name": site_name
+    }
+    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+    engine = get_remote_engine(
+        site_name=site_name,
+        db_type=db_type
+    )
+    engine.sqoop_raw_import(
+        query=query,
+        hive_table=hive_tb,
+        hdfs_path=hdfs_path,
+        partitions=partition_dict
+    )
     pass
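ods_theme.py previously ran CommonUtil.check_schema_before_import before the import and CommonUtil.check_import_sync_num after it; the refactored version drops both around sqoop_raw_import. If that safety net is still wanted, a hedged sketch of how it could be re-attached at the tail of the script, reusing the helper signatures visible in the old code; whether those helpers remain usable alongside the new engine is an assumption, and the variables (db_type, query, hive_tb, hdfs_path, partition_dict) are the ones already defined earlier in the script.

# sketch only: re-adding the old pre/post checks around the new engine call,
# assuming CommonUtil.check_schema_before_import / check_import_sync_num still exist
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
                                                               site_name=site_name,
                                                               query=query,
                                                               hive_tb_name=hive_tb,
                                                               msg_usr=['chenyuanjie'])
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"

if not empty_flag:
    engine = get_remote_engine(site_name=site_name, db_type=db_type)
    engine.sqoop_raw_import(query=query,
                            hive_table=hive_tb,
                            hdfs_path=hdfs_path,
                            partitions=partition_dict)
    # post-import check: compare source and Hive row counts
    CommonUtil.check_import_sync_num(db_type=db_type,
                                     partition_dict=partition_dict,
                                     import_query=query,
                                     hive_tb_name=hive_tb,
                                     msg_usr=['chenyuanjie'])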