abel_cjy / Amazon-Selection-Data · Commits

Commit 057dbb84
Authored Oct 14, 2025 by fangxingjun
Parents: 0feb4e27, e92a6ecf

    Merge branch 'developer' of 47.106.101.75:abel_cjy/Amazon-Selection-Data into developer

Showing 25 changed files with 655 additions and 1110 deletions (+655 / -1110)
    Pyspark_job/dim/dim_asin_bs_info.py                          +84   -34
    Pyspark_job/dwt/dwt_st_theme_agg.py                          +13   -28
    Pyspark_job/export_es/es_flow_asin.py                        +9    -1
    Pyspark_job/my_kafka/kafka_flow_asin_detail.py               +10   -2
    Pyspark_job/sqoop_import/ods_asin_detail.py                  +29   -150
    Pyspark_job/sqoop_import/ods_asin_err_state.py               +12   -21
    Pyspark_job/sqoop_import/ods_bs_category.py                  +16   -32
    Pyspark_job/sqoop_import/ods_bs_category_asin_detail.py      +22   -68
    Pyspark_job/sqoop_import/ods_bs_category_top100_asin.py      +25   -28
    Pyspark_job/sqoop_import/ods_bsr_end.py                      +15   -31
    Pyspark_job/sqoop_import/ods_new_releases_top100_asin.py     +24   -31
    Pyspark_job/sqoop_import/ods_one_category_report.py          +32   -173
    Pyspark_job/sqoop_import/ods_other_search_term_data.py       +23   -80
    Pyspark_job/sqoop_import/ods_search_term_type.py             +26   -73
    Pyspark_job/sqoop_import/ods_self_asin.py                    +16   -35
    Pyspark_job/sqoop_import/ods_self_asin_detail.py             +23   -30
    Pyspark_job/sqoop_import/ods_self_asin_related_traffic.py    +25   -20
    Pyspark_job/sqoop_import/ods_seller_account_feedback.py      +20   -32
    Pyspark_job/sqoop_import/ods_seller_account_syn.py           +19   -47
    Pyspark_job/sqoop_import/ods_seller_asin_account.py          +25   -48
    Pyspark_job/sqoop_import/ods_seller_asin_product.py          +27   -45
    Pyspark_job/sqoop_import/ods_st_quantity_being_sold.py       +28   -55
    Pyspark_job/sqoop_import/ods_theme.py                        +16   -37
    Pyspark_job/utils/es_util.py                                 +76   -6
    Pyspark_job/yswg_utils/common_udf.py                         +40   -3
Pyspark_job/dim/dim_asin_bs_info.py (view file @ 057dbb84)
    This diff is collapsed.
Pyspark_job/dwt/dwt_st_theme_agg.py (view file @ 057dbb84)

@@ -40,7 +40,7 @@ class DwtStThemeAgg(object):
        self.u_theme_pattern = F.udf(udf_ele_mattch, StringType())
        self.u_theme_contain_judge = F.udf(self.udf_theme_contain_judge, IntegerType())
        self.u_judge_twin_words = F.udf(self.udf_judge_twin_words, IntegerType())
-       self.u_filter_sec_pattern_words = F.udf(self.udf_filter_sec_pattern_words, IntegerType())
+       self.u_filter_pattern_words = F.udf(self.udf_filter_pattern_words, IntegerType())
        # global dataframe initialization
        self.df_st_base = self.spark.sql(f"select 1+1;")

@@ -180,8 +180,7 @@ class DwtStThemeAgg(object):
        return F.udf(udf_filter_blacklist, IntegerType())

    @staticmethod
-   def udf_filter_sec_pattern_words(st_word, pattern_list):
-       # flag special-case secondary words so they can be filtered later
+   def udf_filter_pattern_words(st_word, pattern_list):
        filter_flag = 0
        theme_list = ['combination', 'size']
        if pattern_list:

@@ -191,7 +190,7 @@ class DwtStThemeAgg(object):
            # special-case matching for secondary words of the form "number + month/months" and "number+t + boys/girls"
            date_pattern = re.compile(r"(\d+(?:\.\d+)?) +(month|months)\b", flags=re.IGNORECASE)
            numt_pattern = re.compile(r"((?:\d+)t)(?: +)(boys|girls|boy|girl)\b", flags=re.IGNORECASE)
-           other_pattern = re.compile(r"\b(women|men|man|woman|for|cute|fashion|kids?|adults?|girls?|boys?)\b", flags=re.IGNORECASE)
+           other_pattern = re.compile(r"\b(womens?|mens?|mans?|womans?|fors?|cutes?|fashions?|kids?|adults?|girls?|boys?)\b", flags=re.IGNORECASE)
            if re.search(date_pattern, st_word):
                return 1
            if re.search(numt_pattern, st_word):

@@ -350,8 +349,6 @@ class DwtStThemeAgg(object):
        self.read_data()
        # normalize template words
        self.handle_base_pattern_data()
-       # handle secondary words separately
-       self.handle_sec_st()
        # match primary/secondary template words against search terms and store the intermediate result
        self.handle_st_filter_table()
        # compute metrics for each template word, pattern_type=0

@@ -399,23 +396,18 @@ class DwtStThemeAgg(object):
            'st_blacklist_flag',
            self.filter_blacklist_words(pd_match_blacklist)("search_term")
        ).filter('st_blacklist_flag != 1').cache()

-   # handle secondary words
-   def handle_sec_st(self):
-       self.df_sec_words = self.df_base_filter_date.filter('st_word_num = 2')
-       self.df_sec_words = self.df_sec_words.join(
+   def handle_st_filter_table(self):
+       # filter special words
+       self.df_base_filter_date = self.df_base_filter_date.join(
            self.df_theme, on=['search_term'], how='left'
-       )
-       self.df_sec_words = self.df_sec_words.withColumn(
-           "filter_flag",
-           self.u_filter_sec_pattern_words(F.col("search_term"), F.col("pattern_list"))
-       )
-       # drop rows flagged as 1
-       self.df_sec_words = self.df_sec_words.filter("filter_flag != 1")
-       self.df_sec_words = self.df_sec_words.select('search_term', 'st_word_num', 'st_bsr_cate_1_id_new', 'st_bsr_cate_current_id_new', 'rank', 'rank_change_rate', 'rank_rate_of_change')
+       ).withColumn(
+           "filter_flag",
+           self.u_filter_pattern_words(F.col("search_term"), F.col("pattern_list"))
+       ).filter("filter_flag != 1").select(
+           'search_term', 'st_word_num', 'st_bsr_cate_1_id_new', 'st_bsr_cate_current_id_new', 'rank', 'rank_change_rate', 'rank_rate_of_change'
+       ).cache()

-   def handle_st_filter_table(self):
        df_st_filter_base = self.df_st_base.select(
            F.col('st_key'),
            F.col('search_term'),

@@ -425,12 +417,6 @@ class DwtStThemeAgg(object):
            F.lit(self.date_info).alias('date_info')
        ).cache()
-       # merge the processed secondary words with the primary words
-       df_one_word = self.df_base_filter_date.filter('st_word_num = 1').select('search_term', 'st_word_num', 'st_bsr_cate_1_id_new', 'st_bsr_cate_current_id_new', 'rank', 'rank_change_rate', 'rank_rate_of_change')
-       self.df_base_filter_date = self.df_sec_words.unionByName(df_one_word).cache()
        pattern_words = self.df_base_filter_date.select('search_term')
        # convert the data to a pandas dataframe
        dict_df = pattern_words.toPandas()

@@ -461,7 +447,6 @@ class DwtStThemeAgg(object):
            df_list.append(df_union_filter)
        for i in range(0, len(df_list), batch_size):
            print(f"current word_batches iteration: f{word_batches.index(word_batch)}, index of the df being written: {i + 1}")
-           tmp_df = []
            tmp_df = df_list[i:i + batch_size]
            result_df = self.udf_unionAll(*tmp_df)
            result_df = result_df.repartition(1)
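Beyond the rename, the only behavioural change in udf_filter_pattern_words is the widened other_pattern. A minimal standalone check (not repository code; both regexes are simply copied from the hunk above) shows what the plural-aware alternatives now catch:

    import re

    # both patterns copied verbatim from the diff above
    old_pattern = re.compile(r"\b(women|men|man|woman|for|cute|fashion|kids?|adults?|girls?|boys?)\b", flags=re.IGNORECASE)
    new_pattern = re.compile(r"\b(womens?|mens?|mans?|womans?|fors?|cutes?|fashions?|kids?|adults?|girls?|boys?)\b", flags=re.IGNORECASE)

    for term in ["womens shoes", "cutes gift", "kids toys"]:
        print(term, bool(old_pattern.search(term)), bool(new_pattern.search(term)))
    # "womens shoes" and "cutes gift" only match the new pattern; "kids toys" matches
    # both, because "kids?" was already plural-aware in the old pattern.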
Pyspark_job/export_es/es_flow_asin.py (view file @ 057dbb84)

@@ -48,7 +48,11 @@ class EsStDetail(TemplatesMysql):
        self.record_table_name_field = f'{self.site_name}_flow_asin_last_month' if self.date_type == 'month' else f'{self.site_name}_flow_asin_last30day'
        # elasticsearch config
        self.client = EsUtils.get_es_client()
-       self.es_options = EsUtils.get_es_options(self.es_index_name)
+       # enrich-policy config, used to update the usr_mask_type field
+       self.policy_name1 = "user_mask_asin_policy"
+       self.policy_name2 = "user_mask_category_policy"
+       self.pipeline_id = "user_asin_mask_enrich_pipeline"
+       self.es_options = EsUtils.get_es_options(self.es_index_name, self.pipeline_id)
        self.es_body = EsUtils.get_es_body()
        # a production export must also be written to the export-record table

@@ -105,6 +109,10 @@ class EsStDetail(TemplatesMysql):
    def es_prepare(self):
        print("Currently connected ES nodes: " + str(EsUtils.__es_ip__))
        EsUtils.create_index(self.es_index_name, self.client, self.es_body)
+       # apply the enrich policies and pipeline
+       EsUtils.user_enrich_pipeline(self.client, self.pipeline_id, self.policy_name1, self.policy_name2)
+       self.client.enrich.execute_policy(name=self.policy_name1)
+       self.client.enrich.execute_policy(name=self.policy_name2)
        if self.date_type != 'month':
            if not EsUtils.exist_index_alias(self.alias_name, self.client):
                EsUtils.create_index_alias(self.es_index_name, self.alias_name, self.client)
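For context, the preparation order this export now follows is: create the index, register the ingest pipeline that references the two enrich policies, then execute the policies so their lookup indices are current. EsUtils.user_enrich_pipeline is an internal helper, so the sketch below uses the stock elasticsearch-py client directly; the host, index name and the enrich source/target fields are placeholders, not values confirmed by this commit.

    from elasticsearch import Elasticsearch

    client = Elasticsearch("http://localhost:9200")   # placeholder host

    index_name = "us_flow_asin_demo"                  # placeholder index
    if not client.indices.exists(index=index_name):
        client.indices.create(index=index_name)

    # register the ingest pipeline that applies the two enrich policies
    # (field/target_field values are assumptions for illustration only)
    client.ingest.put_pipeline(
        id="user_asin_mask_enrich_pipeline",
        processors=[
            {"enrich": {"policy_name": "user_mask_asin_policy", "field": "asin", "target_field": "usr_mask_asin"}},
            {"enrich": {"policy_name": "user_mask_category_policy", "field": "category_id", "target_field": "usr_mask_category"}},
        ],
    )

    # refresh the enrich lookup indices before writing through the pipeline
    client.enrich.execute_policy(name="user_mask_asin_policy")
    client.enrich.execute_policy(name="user_mask_category_policy")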
Pyspark_job/my_kafka/kafka_flow_asin_detail.py (view file @ 057dbb84)

@@ -50,7 +50,11 @@ class KafkaFlowAsinDetail(Templates):
        self.es_index_name = f"{self.topic_name}_test" if self.test_flag == 'test' else f"{self.topic_name}"
        self.es_index_alias_name = f"{self.site_name}_st_detail_last_4_week_test" if self.test_flag == 'test' else f"{self.site_name}_st_detail_last_4_week"
        self.es_index_body = EsUtils.get_es_body()
-       self.es_options = EsUtils.get_es_options(self.es_index_name)
+       # enrich-policy config, used to update the usr_mask_type field
+       self.policy_name1 = "user_mask_asin_policy"
+       self.policy_name2 = "user_mask_category_policy"
+       self.pipeline_id = "user_asin_mask_enrich_pipeline"
+       self.es_options = EsUtils.get_es_options(self.es_index_name, self.pipeline_id)
        self.db_save = 'kafka_flow_asin_detail'
        self.app_name = self.get_app_name()
        print(f"Task name: {self.app_name}")

@@ -235,7 +239,7 @@ class KafkaFlowAsinDetail(Templates):
        cate_1_pattern = self.pattern1_dict[self.site_name]
        df = df.withColumn("asin_bs_sellers_rank_lower", F.lower("best_sellers_rank"))
        df = df.withColumn("asin_bs", self.u_parse_bs_category(
-           "asin_bs_sellers_rank_lower", "best_sellers_herf", "all_best_sellers_herf", F.lit(cate_current_pattern), F.lit(cate_1_pattern)))
+           "asin_bs_sellers_rank_lower", "best_sellers_herf", "all_best_sellers_herf", F.lit(cate_current_pattern), F.lit(cate_1_pattern), "node_id"))
        df = df.withColumn("asin_bs_cate_1_id", df.asin_bs.getField("asin_bs_cate_1_id")) \
            .withColumn("asin_bs_cate_current_id", df.asin_bs.getField("asin_bs_cate_current_id")) \
            .withColumn("asin_bs_cate_1_rank", df.asin_bs.getField("asin_bs_cate_1_rank")) \

@@ -828,6 +832,10 @@ class KafkaFlowAsinDetail(Templates):
        # create the corresponding es index
        EsUtils.create_index(self.es_index_name, self.client, self.es_index_body)
        print("Index name: ", self.es_index_name)
+       # apply the enrich policies and pipeline
+       self.client.enrich.execute_policy(name=self.policy_name1)
+       self.client.enrich.execute_policy(name=self.policy_name2)
+       EsUtils.user_enrich_pipeline(self.client, self.pipeline_id, self.policy_name1, self.policy_name2)
        if not EsUtils.exist_index_alias(self.es_index_alias_name, self.client):
            EsUtils.create_index_alias(self.es_index_name, self.es_index_alias_name, self.client)
        else:
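The u_parse_bs_category change above adds one more input column ("node_id") to a struct-returning UDF whose fields are then unpacked with getField(). A self-contained toy version of that pattern is sketched below; parse_bs and its parsing logic are stand-ins for illustration, not the repository's UDF:

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.types import StructType, StructField, StringType

    spark = SparkSession.builder.master("local[1]").appName("getfield-demo").getOrCreate()

    schema = StructType([
        StructField("asin_bs_cate_1_id", StringType()),
        StructField("asin_bs_cate_current_id", StringType()),
    ])

    @F.udf(returnType=schema)
    def parse_bs(rank_text, node_id):
        # toy parsing logic only; the real UDF parses best-sellers-rank strings
        if not node_id:
            return (None, None)
        parts = node_id.split("|")
        return (parts[0], parts[-1])

    df = spark.createDataFrame([("#1 in Toys", "165793011|23445101")], ["best_sellers_rank", "node_id"])
    df = df.withColumn("asin_bs", parse_bs("best_sellers_rank", "node_id"))
    df = df.withColumn("asin_bs_cate_1_id", df.asin_bs.getField("asin_bs_cate_1_id")) \
           .withColumn("asin_bs_cate_current_id", df.asin_bs.getField("asin_bs_cate_current_id"))
    df.show(truncate=False)
    spark.stop()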
Pyspark_job/sqoop_import/ods_asin_detail.py (view file @ 057dbb84)

"""
@Author : HuangJian
@Description : ASIN detail table - weekly
@SourceTable : us_asin_detail_2023_18
@SinkTable : ods_asin_detail
@CreateTime : 2022/05/18 14:55
@UpdateTime : 2022/05/18 14:55
"""

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)

@@ -24,151 +14,40 @@ if __name__ == '__main__':
    assert date_type is not None, "date_type cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"

-   hive_table = f"ods_asin_detail"
+   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
+   d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
+   db_type = 'postgresql_14'
+   import_table = f"{site_name}_asin_detail_month_{d1}_{d2}"
+   hive_table = "ods_asin_detail"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info
    }
    # validate the target table path
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")

-   # split the date
-   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-   if date_type == DateTypes.week.name:
-       # pg weekly partitions are zero-padded, e.g. 01, 02, 03
-       d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-       # choose the db connection here
-       if site_name == 'us' and date_info >= '2023-26':
-           db_type = 'postgresql'
-           if date_info >= '2023-34':
-               db_type = 'postgresql_14'
-           date_col = "launch_time,created_time as created_at,updated_time as updated_at"
-           new_col = ',describe'
-       else:
-           db_type = 'postgresql_14'
-           date_col = "launch_time,created_time as created_at,updated_time as updated_at"
-           new_col = ',describe'
-       print(f"db_type used for the sync: {db_type}")
-       # distinguish the columns added over time
-       # week 18 added weight_str
-       if date_info >= '2023-18':
-           new_col += ',weight_str'
-       # week 21 added package_quantity, pattern_name
-       if date_info >= '2023-21':
-           new_col += ',package_quantity,pattern_name'
-       # week 49 added follow_sellers
-       if date_info >= '2023-49':
-           new_col += ',follow_sellers'
-       # week 51 added product_description, buy_sales
-       if date_info >= '2023-51':
-           new_col += ',product_description,buy_sales'
-       # week 2024-02 added image_view
-       if date_info >= '2024-02':
-           new_col += ',image_view'
-       # # week 2024-05 added product_json, product_detail_json, review_ai_text, review_label_json
-       # if date_info >= '2024-05':
-       #     new_col += ',product_json,product_detail_json,review_ai_text,review_label_json'
-       import_table = f"{site_name}_asin_detail_{d1}_{d2}"
-   if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
-       db_type = 'postgresql_14'
-       date_col = "launch_time, created_time as created_at, updated_time as updated_at"
-       new_col = "describe, weight_str, package_quantity, pattern_name, follow_sellers, product_description, buy_sales, image_view, spider_int, " \
-                 "lob_asin_json, seller_json, customer_reviews_json, product_json, product_detail_json, review_ai_text, review_label_json, sp_initial_seen_asins_json, " \
-                 "sp_4stars_initial_seen_asins_json, sp_delivery_initial_seen_asins_json, compare_similar_asin_json, together_asin_json, min_match_asin_json, " \
-                 "variat_num, current_asin, img_list, variat_list, parent_asin, bundles_this_asins_json, video_m3u8_url, result_list_json, bundle_asin_component_json"
-       d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-       import_table = f"{site_name}_asin_detail_month_{d1}_{d2}"
-   sql_query = f"""
-       select
-           id, asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type,
-           page_inventory, category, volume, weight, rank, {date_col}, category_state, img_num, img_type,
-           activity_type, one_two_val, three_four_val, five_six_val, eight_val, qa_num, one_star, two_star,
-           three_star, four_star, five_star, low_star, together_asin, brand, ac_name, material, node_id,
-           data_type, sp_num, {new_col}
-       from {import_table}
-       where 1=1
-       and \$CONDITIONS
-   """
-   # schema and data validation
-   CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query, hive_tb_name=hive_table, msg_usr=['chenyuanjie'], partition_dict=partition_dict)
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path, map_num=50, key='id')
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
-   # post-import check -- data consistency
-   if date_type != 'month_week':
-       CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=sql_query, hive_tb_name=hive_table, msg_usr=['chenyuanjie'])
-   # post-import validation -- threshold warning on key fields
-   CommonUtil.check_fields_and_warning(hive_tb_name=hive_table, partition_dict=partition_dict)
+   cols = "id, asin, img_url, title, title_len, price, rating, total_comments, buy_box_seller_type, page_inventory, " \
+          "category, volume, weight, rank, launch_time, created_time as created_at, updated_time as updated_at, " \
+          "category_state, img_num, img_type, activity_type, one_two_val, three_four_val, five_six_val, eight_val, " \
+          "qa_num, one_star, two_star, three_star, four_star, five_star, low_star, together_asin, brand, ac_name, " \
+          "material, node_id, data_type, sp_num, describe, weight_str, package_quantity, pattern_name, follow_sellers, " \
+          "product_description, buy_sales, image_view, spider_int, lob_asin_json, seller_json, customer_reviews_json, " \
+          "product_json, product_detail_json, review_ai_text, review_label_json, sp_initial_seen_asins_json, " \
+          "sp_4stars_initial_seen_asins_json, sp_delivery_initial_seen_asins_json, compare_similar_asin_json, " \
+          "together_asin_json, min_match_asin_json, variat_num, current_asin, img_list, variat_list, parent_asin, " \
+          "bundles_this_asins_json, video_m3u8_url, result_list_json, bundle_asin_component_json, review_json_list"
+   engine = get_remote_engine(
+       site_name=site_name,
+       db_type=db_type
+   )
+   engine.sqoop_raw_import(
+       query=f"SELECT {cols} FROM {import_table} WHERE 1=1 and $CONDITIONS",
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=50,
+       split_by='id'
+   )
    pass
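Every sqoop_import script in this commit replaces the CommonUtil.build_import_sh / SSHUtil / HdfsUtils flow with get_remote_engine(...).sqoop_raw_import(...). The wrapper itself is not part of this diff, so the following is only a hypothetical sketch of what such a call has to assemble; the JDBC URL, credentials and partition handling are assumptions, not repository facts:

    import subprocess

    # Hypothetical sketch only: secure_db_client.sqoop_raw_import() is not shown in this
    # commit. The connection string and credentials below are placeholders.
    def sqoop_raw_import(query, hive_table, hdfs_path, partitions, m=1, split_by=None):
        cmd = [
            "sqoop", "import",
            "--connect", "jdbc:postgresql://db-host:5432/selection",  # placeholder connection
            "--username", "selection_user",                           # placeholder credentials
            "--query", query,          # the query must keep sqoop's $CONDITIONS marker
            "--target-dir", hdfs_path,
            "--delete-target-dir",     # mirrors the old HdfsUtils.delete_hdfs_file() step
            "-m", str(m),
        ]
        if split_by:
            cmd += ["--split-by", split_by]
        # hive_table/partitions would drive the downstream partition-repair step,
        # which the old flow did via CommonUtil.after_import()
        print(f"importing into {hive_table}, partitions={partitions}")
        subprocess.run(cmd, check=True)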
Pyspark_job/sqoop_import/ods_asin_err_state.py (view file @ 057dbb84)

@@ -2,16 +2,17 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DbTypes
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"
-   import_tb = f"{site_name}_all_syn_st_asin"
    db_type = DbTypes.postgresql.name
+   import_tb = f"{site_name}_all_syn_st_asin"
    query = f"""
        select asin,
               state,

@@ -20,32 +21,22 @@ if __name__ == '__main__':
        where state = 4
        and \$CONDITIONS
    """
    hive_tb = "ods_asin_err_state"
    partition_dict = {
        "site_name": site_name
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")
-   db_type = DbTypes.postgresql.name
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['wujicang'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-       client.close()
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_bs_category.py (view file @ 057dbb84)

@@ -2,22 +2,16 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"

-   hive_tb = "ods_bs_category"
    db_type = "mysql"
    import_tb = f"{site_name}_bs_category"
-   partition_dict = {
-       "site_name": site_name,
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
    query = f"""
        select
            id,

@@ -43,34 +37,24 @@ if __name__ == '__main__':
        from {import_tb}
        where 1 = 1
        and \$CONDITIONS
    """
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-       client.close()
-       # post-import check -- data consistency
-       CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
+   hive_tb = "ods_bs_category"
+   partition_dict = {
+       "site_name": site_name,
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_bs_category_asin_detail.py (view file @ 057dbb84)

@@ -2,9 +2,9 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
-from utils.common_util import CommonUtil, DateTypes
-from utils.hdfs_utils import HdfsUtils
+from utils.common_util import CommonUtil
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)

@@ -13,85 +13,39 @@ if __name__ == '__main__':
    assert site_name is not None, "site_name cannot be empty!"
    assert date_type is not None, "date_type cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"

-   year, week = date_info.split("-")
-   hive_tb = "ods_bs_category_asin_detail"
-   partition_dict = {
-       "site_name": site_name,
-       "date_type": date_type,
-       "date_info": date_info,
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
-   if date_type == DateTypes.week.name:
-       if site_name == "us":
-           if date_info >= '2023-18':
-               db_type = "postgresql"
-               if date_info >= '2023-34':
-                   db_type = 'postgresql_14'
-               import_tb = f"{site_name}_bs_category_asin_detail_{year}_{week}"
-               cols = f"id,asin,{week} as week,best_sellers_rank,created_time as created_at,updated_time as updated_at,last_herf,all_best_sellers_href"
-               params = "1 = 1"
-       else:
-           db_type = "mysql"
-           import_tb = f"{site_name}_bs_category_asin_detail"
-           cols = "id,asin,week,best_sellers_rank,created_at,updated_at,last_herf"
-           params = f"week = {int(week)} and DATE_FORMAT(created_at,'%Y') = {year}"
-   if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
-       # split the date
-       d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-       if site_name in ['us', 'uk', 'de']:
-           db_type = 'postgresql_14'
-           # pg partitions are zero-padded, e.g. 01, 02, 03
-           d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-           cols = f"id,asin,null as week,best_sellers_rank,created_time as created_at,updated_time as updated_at,last_herf,all_best_sellers_href"
-           import_tb = f"{site_name}_bs_category_asin_detail_month_{d1}_{d2}"
-           params = f" 1=1 "
-       else:
-           print(f"{date_type} data for other sites is not defined yet; please check whether dateType was passed correctly")
-           exit()
+   # split the date
+   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
+   d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
+   db_type = 'postgresql_14'
+   import_tb = f"{site_name}_bs_category_asin_detail_month_{d1}_{d2}"
    query = f"""
        select
-           {cols}
+           id, asin, null as week, best_sellers_rank, created_time as created_at, updated_time as updated_at, last_herf, all_best_sellers_href
        from {import_tb}
-       where {params}
+       where 1=1
        and \$CONDITIONS
    """
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-       client.close()
-       # post-import check -- data consistency
-       CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
+   hive_tb = "ods_bs_category_asin_detail"
+   partition_dict = {
+       "site_name": site_name,
+       "date_type": date_type,
+       "date_info": date_info,
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=50,
+       split_by='id'
+   )
    pass
Pyspark_job/sqoop_import/ods_bs_category_top100_asin.py (view file @ 057dbb84)

@@ -2,25 +2,21 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_info = CommonUtil.get_sys_arg(2, None)
    assert site_name is not None, "sitename cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"
-   hive_tb = "ods_bs_category_top100_asin"
-   partition_dict = {
-       "site_name": site_name
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
    db_type = "mysql"
    if date_info == 'all':
        query = f"""
-           select id,
+           select
+               id,
                asin,
                cate_1_id,
                cate_current_id,

@@ -33,11 +29,11 @@ if __name__ == '__main__':
            from {site_name}_bs_category_top100_asin
            where 1 = 1
            and \$CONDITIONS
        """
-       pass
    else:
        query = f"""
-           select id,
+           select
+               id,
                asin,
                cate_1_id,
                cate_current_id,

@@ -51,23 +47,24 @@ if __name__ == '__main__':
            where 1 = 1
            and date_info = '{date_info}'
            and \$CONDITIONS
        """
-       pass
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['wujicang'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
+   hive_tb = "ods_bs_category_top100_asin"
+   partition_dict = {
+       "site_name": site_name
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_bsr_end.py (view file @ 057dbb84)

@@ -2,22 +2,16 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"

-   hive_tb = "ods_bsr_end"
    db_type = "mysql"
    import_tb = f"{site_name}_bsr_end"
-   partition_dict = {
-       "site_name": site_name,
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
    query = f"""
        select
            id,

@@ -31,32 +25,22 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-       client.close()
-       # post-import check -- data consistency
-       CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
+   hive_tb = "ods_bsr_end"
+   partition_dict = {
+       "site_name": site_name,
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_new_releases_top100_asin.py (view file @ 057dbb84)

@@ -2,24 +2,21 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_info = CommonUtil.get_sys_arg(2, None)
    assert site_name is not None, "sitename cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"
-   hive_tb = "ods_new_releases_top100_asin"
-   partition_dict = {
-       "site_name": site_name
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
-   db_type = "mysql"
    if date_info == 'all':
        query = f"""
-           select id,
+           select
+               id,
                asin,
                cate_1_id,
                cate_current_id,

@@ -32,11 +29,11 @@ if __name__ == '__main__':
            from {site_name}_new_releases_top100_asin
            where 1 = 1
            and \$CONDITIONS
        """
-       pass
    else:
        query = f"""
-           select id,
+           select
+               id,
                asin,
                cate_1_id,
                cate_current_id,

@@ -50,28 +47,24 @@ if __name__ == '__main__':
            where 1 = 1
            and date_info = '{date_info}'
            and \$CONDITIONS
        """
-       pass
+   print("================================sql====================================")
+   print(query)
+   db_type = "mysql"
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['wujicang'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
+   hive_tb = "ods_new_releases_top100_asin"
+   partition_dict = {
+       "site_name": site_name
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_one_category_report.py (view file @ 057dbb84)
    This diff is collapsed.
Pyspark_job/sqoop_import/ods_other_search_term_data.py (view file @ 057dbb84)

"""
@Author : HuangJian
@Description : per-site seller ASIN detail table -- monthly crawl
@SourceTable : us_other_search_term_data_2023_18
@SinkTable : ods_other_search_term_data
@CreateTime : 2022/05/23 09:55
@UpdateTime : 2022/05/23 09:55
"""

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)

@@ -25,46 +14,10 @@ if __name__ == '__main__':
    assert date_type is not None, "date_type cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"

-   hive_table = f"ods_other_search_term_data"
-   partition_dict = {
-       "site_name": site_name,
-       "date_type": date_type,
-       "date_info": date_info
-   }
-   # validate the target table path
-   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
-   # split the date
-   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-   db_type = ''
-   if date_type == DateTypes.week.name:
-       d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-       if site_name == 'us' and date_info >= '2023-18':
-           db_type = 'postgresql'
-           if date_info >= '2023-34':
-               db_type = 'postgresql_14'
-           # pg weekly partitions are zero-padded, e.g. 01, 02, 03
-           import_table = f"{site_name}_other_search_term_{d1}_{d2}"
-       else:
-           db_type = 'postgresql_14'
-           import_table = f"{site_name}_other_search_term_{d1}_{d2}"
-   if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
-       if site_name in ['us', 'uk', 'de']:
-           db_type = 'postgresql_14'
-           # pg partitions are zero-padded, e.g. 01, 02, 03
-           d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-           d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-           import_table = f"{site_name}_other_search_term_month_{d1}_{d2}"
-       else:
-           print(f"{date_type} data for other sites is not defined yet; please check whether dateType was passed correctly")
-           exit()
-   assert db_type != '', "db_type was not resolved, please check!"
+   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
+   d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
+   db_type = 'postgresql_14'
+   import_table = f"{site_name}_other_search_term_month_{d1}_{d2}"
    sql_query = f"""
        select
            id,

@@ -81,40 +34,31 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   # schema and data validation
-   if site_name not in ('fr', 'it', 'es'):
-       CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query, hive_tb_name=hive_table, msg_usr=['fangxingjun', 'pengyanbing', 'chenyuanjie'], partition_dict=partition_dict)
    if site_name == 'us':
        map_num = 20
    else:
        map_num = 5
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path, map_num=map_num, key='id')
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
-   # post-import check -- row-count consistency of the synced data
-   CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=sql_query, hive_tb_name=hive_table, msg_usr=['fangxingjun', 'pengyanbing', 'chenyuanjie'])
\ No newline at end of file
+   hive_table = "ods_other_search_term_data"
+   partition_dict = {
+       "site_name": site_name,
+       "date_type": date_type,
+       "date_info": date_info
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=sql_query,
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=map_num,
+       split_by='id'
+   )
+   pass
Pyspark_job/sqoop_import/ods_search_term_type.py (view file @ 057dbb84)

@@ -2,10 +2,9 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)

@@ -21,8 +20,6 @@ if __name__ == '__main__':
        print("The uk site no longer has ac-type terms; exiting!")
        sys.exit(0)

-   hive_tb = f"ods_search_term_{st_type}"
    if st_type in ["zr", "sp"]:
        cols = "search_term,asin,page,page_row,created_time,updated_time,id"
    elif st_type in ["sb", "tr"]:

@@ -30,92 +27,48 @@ if __name__ == '__main__':
    else:
        cols = "search_term,asin,page,created_time,updated_time,id"

-   # split the date
-   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-   if date_type == DateTypes.week.name:
-       d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-       if site_name == 'us' and date_info >= '2023-18':
-           db_type = 'postgresql'
-           # pg weekly partitions are zero-padded, e.g. 01, 02, 03
-           if date_info >= '2023-34':
-               db_type = 'postgresql_14'
-           import_tb = f"{site_name}_search_term_rank_{st_type}_{d1}_{d2}"
-       else:
-           db_type = 'postgresql_14'
-           import_tb = f"{site_name}_search_term_rank_{st_type}_{d1}_{d2}"
-   if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
-       if site_name in ['us', 'uk', 'de']:
-           db_type = 'postgresql_14'
-           # pg partitions are zero-padded, e.g. 01, 02, 03
-           d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
-           d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
-           import_tb = f"{site_name}_search_term_rank_{st_type}_month_{d1}_{d2}"
-       else:
-           print(f"{date_type} data for other sites is not defined yet; please check whether dateType was passed correctly")
-           exit()
-   query = f"""
-       select {cols}
-       from {import_tb}
-       where 1 = 1
-       and \$CONDITIONS
-   """
+   d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
+   d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
+   db_type = 'postgresql_14'
+   import_tb = f"{site_name}_search_term_rank_{st_type}_month_{d1}_{d2}"
+   query = f"""
+       select {cols} from {import_tb} where 1 = 1 and \$CONDITIONS
+   """
+   print(f"Connected database: {db_type}, table being synced: {import_tb}")
+   hive_tb = f"ods_search_term_{st_type}"
    partition_dict = {
        "site_name": site_name,
        "date_type": date_type,
        "date_info": date_info,
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")

-   if st_type in ['er', 'tr']:
-       empty_flag = False
-       print(f"st_type is {st_type}, which is exempt from the check; skipping it!")
+   if st_type == "zr":
+       if site_name == "us":
+           map_num = 40
+       else:
+           map_num = 15
+   elif st_type in ["sb", "sp"]:
+       if site_name == "us":
+           map_num = 6
+       else:
+           map_num = 2
    else:
-       empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['fangxingjun', 'pengyanbing', 'chenyuanjie'], partition_dict=partition_dict)
-       assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       # zr has a large data volume, so sync it with multiple processes
-       if st_type in ['zr']:
-           sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path, map_num=10, key='id')
-       else:
-           sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-       client.close()
-       # post-import check -- data consistency
-       if date_type != 'month_week':
-           CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=query, hive_tb_name=hive_tb, msg_usr=['fangxingjun', 'pengyanbing', 'chenyuanjie'])
+       map_num = 1
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=map_num,
+       split_by='id'
+   )
+   pass
Pyspark_job/sqoop_import/ods_self_asin.py (view file @ 057dbb84)

@@ -2,24 +2,15 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"
-   hive_tb = "ods_self_asin"
-   partition_dict = {
-       "site_name": site_name,
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
+   db_type = "mysql"
    query = f"""
        select
            id,

@@ -31,32 +22,22 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   db_type = "mysql"
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
-       client.close()
-       # post-import check -- data consistency
-       CommonUtil.check_import_sync_num(db_type=db_type, partition_dict=partition_dict, import_query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie'])
+   hive_tb = "ods_self_asin"
+   partition_dict = {
+       "site_name": site_name,
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_self_asin_detail.py (view file @ 057dbb84)

@@ -2,27 +2,20 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DbTypes
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    date_type = CommonUtil.get_sys_arg(2, None)
    date_info = CommonUtil.get_sys_arg(3, None)
-   assert site_name is not None, "sitename cannot be empty!"
+   assert site_name is not None, "site_name cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"
-   year = CommonUtil.reformat_date(date_info, "%Y-%m-%d", "%Y", )
-   hive_tb = "ods_self_asin_detail"
-   partition_dict = {
-       "site_name": site_name,
-       "date_type": date_type,
-       "date_info": date_info,
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
    db_type = DbTypes.postgresql.name
+   year = CommonUtil.reformat_date(date_info, "%Y-%m-%d", "%Y", )
    query = f"""
        select
            asin,

@@ -72,26 +65,26 @@ if __name__ == '__main__':
        and bsr_date_info = '{date_info}'
        and date_info >= '{date_info}'
        and \$CONDITIONS
    """
-   print("sql ======================================================")
-   print(query)
-   db_type = DbTypes.postgresql.name
-   empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=query, hive_tb_name=hive_tb, msg_usr=['wujicang'])
-   assert check_flag, f"Schema check failed for hive table {hive_tb}! Please check whether the query is correct!"
-   if not empty_flag:
-       sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=query, hdfs_path=hdfs_path)
-       # delete before importing
-       HdfsUtils.delete_hdfs_file(hdfs_path)
-       client = SSHUtil.get_ssh_client()
-       SSHUtil.exec_command_async(client, sh, ignore_err=False)
-       CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
+   hive_tb = "ods_self_asin_detail"
+   partition_dict = {
+       "site_name": site_name,
+       "date_type": date_type,
+       "date_info": date_info,
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=query,
+       hive_table=hive_tb,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_self_asin_related_traffic.py (view file @ 057dbb84)

@@ -3,22 +3,15 @@ import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"
-   hive_table = "ods_self_asin_related_traffic"
-   partition_dict = {
-       "site_name": site_name
-   }
-   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
    db_type = 'mysql'
    import_table = f"{site_name}_self_asin_detail"
    sql_query = f"""
        select
            id,

@@ -37,17 +30,29 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path, map_num=25, key='id')
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
+   hive_table = "ods_self_asin_related_traffic"
+   partition_dict = {
+       "site_name": site_name
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+   if site_name == 'us':
+       map_num = 25
+   else:
+       map_num = 1
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=sql_query,
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=map_num,
+       split_by='id'
+   )
    pass
Pyspark_job/sqoop_import/ods_seller_account_feedback.py (view file @ 057dbb84)

@@ -2,11 +2,10 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
-from utils.hdfs_utils import HdfsUtils
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)

@@ -16,17 +15,6 @@ if __name__ == '__main__':
    assert date_type is not None, "date_type cannot be empty!"
    assert date_info is not None, "date_info cannot be empty!"

-   hive_table = f"ods_seller_account_feedback"
-   partition_dict = {
-       "site_name": site_name,
-       "date_type": date_type,
-       "date_info": date_info
-   }
-   # validate the target table path
-   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
    suffix = str(date_info).replace("-", "_")
    import_table = f"{site_name}_seller_account_feedback_{suffix}"
    if date_type == DateTypes.month.name and date_info >= '2023-08':

@@ -51,24 +39,24 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   # schema and data validation
-   CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query, hive_tb_name=hive_table, msg_usr=['chenyuanjie'])
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path)
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
+   hive_table = "ods_seller_account_feedback"
+   partition_dict = {
+       "site_name": site_name,
+       "date_type": date_type,
+       "date_info": date_info
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=sql_query,
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_seller_account_syn.py (view file @ 057dbb84)

"""
@Author : HuangJian
@Description : full mapping of seller name to seller id for each site -- takes a single site argument
@SourceTable : us_seller_account_feedback
@SinkTable : ods_seller_account_feedback
@CreateTime : 2022/05/19 14:55
@UpdateTime : 2022/05/19 14:55
"""

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"

-   hive_table = f"ods_seller_account_syn"
-   partition_dict = {
-       "site_name": site_name
-   }
-   # validate the target table path
-   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
-   import_table = f"{site_name}_seller_account_syn_distinct"
    db_type = 'mysql'
+   import_table = f"{site_name}_seller_account_syn_distinct"
    sql_query = f"""
        select
            id,

@@ -52,24 +26,22 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   # schema and data validation
-   CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query, hive_tb_name=hive_table, msg_usr=['chenyuanjie'])
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path)
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
+   hive_table = "ods_seller_account_syn"
+   partition_dict = {
+       "site_name": site_name
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=sql_query,
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict
+   )
    pass
Pyspark_job/sqoop_import/ods_seller_asin_account.py (view file @ 057dbb84)

"""
@Author : HuangJian
@Description : full mapping of seller name to ASIN for each site -- takes a single site argument
@SourceTable : us_seller_account_feedback
@SinkTable : ods_seller_account_feedback
@CreateTime : 2022/05/19 14:55
@UpdateTime : 2022/05/19 14:55
"""

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
+from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name cannot be empty!"

-   hive_table = f"ods_seller_asin_account"
-   partition_dict = {
-       "site_name": site_name
-   }
-   # validate the target table path
-   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
-   import_table = f"{site_name}_seller_asin_account"
    db_type = 'mysql'
+   import_table = f"{site_name}_seller_asin_account"
    sql_query = f"""
        select
            id,

@@ -51,26 +25,29 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   # schema and data validation
-   CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query, hive_tb_name=hive_table, msg_usr=['chenyuanjie'])
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path, map_num=10, key='id')
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
+   hive_table = "ods_seller_asin_account"
+   partition_dict = {
+       "site_name": site_name
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+   if site_name == 'us':
+       map_num = 100
+   else:
+       map_num = 40
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=sql_query,
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=map_num,
+       split_by='id'
+   )
    pass
Pyspark_job/sqoop_import/ods_seller_asin_product.py (view file @ 057dbb84)

"""
@Author : HuangJian
@Description : per-site seller ASIN detail table -- monthly crawl
@SourceTable : us_asin_detail_product_2023
@SinkTable : ods_asin_detail_product
@CreateTime : 2022/05/19 14:55
@UpdateTime : 2022/05/19 14:55
"""

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)

@@ -29,21 +17,8 @@ if __name__ == '__main__':
    # this table is now synced monthly, so add a month date_type check
    assert date_type == DateTypes.month.name, "wrong date_type, should be month"

-   hive_table = f"ods_asin_detail_product"
-   partition_dict = {
-       "site_name": site_name,
-       "date_type": date_type,
-       "date_info": date_info
-   }
-   # validate the target table path
-   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
-   print(f"hdfs_path is {hdfs_path}")
-   # split the date
    suffix = str(date_info).replace("-", "_")
    import_table = f"{site_name}_seller_asin_product_{suffix}"
    # db_type = 'postgresql'
    if date_type == DateTypes.month.name and date_info >= '2023-08':
        db_type = 'postgresql_14'
    else:

@@ -71,24 +46,31 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
-   # schema and data validation
-   CommonUtil.check_schema_before_import(db_type=db_type, site_name=site_name, query=sql_query, hive_tb_name=hive_table, msg_usr=['chenyuanjie'])
-   # generate the export script
-   import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type, query=sql_query, hdfs_path=hdfs_path)
-   # delete the original hdfs data before importing
-   HdfsUtils.delete_hdfs_file(hdfs_path)
-   # create an ssh client object -- used to run cmd commands
-   client = SSHUtil.get_ssh_client()
-   SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
-   # build the lzo index and repair metadata
-   CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
-   # close the connection
-   client.close()
+   hive_table = "ods_asin_detail_product"
+   partition_dict = {
+       "site_name": site_name,
+       "date_type": date_type,
+       "date_info": date_info
+   }
+   hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
+   if site_name == 'us':
+       map_num = 8
+   else:
+       map_num = 3
+   engine = get_remote_engine(site_name=site_name, db_type=db_type)
+   engine.sqoop_raw_import(
+       query=sql_query,
+       hive_table=hive_table,
+       hdfs_path=hdfs_path,
+       partitions=partition_dict,
+       m=map_num,
+       split_by='id'
+   )
    pass
Pyspark_job/sqoop_import/ods_st_quantity_being_sold.py View file @ 057dbb84

import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.common_util import DateTypes
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
...
@@ -17,40 +14,10 @@ if __name__ == '__main__':
    assert date_type is not None, "date_type 不能为空!"
    assert date_info is not None, "date_info 不能为空!"
    hive_table = f"ods_st_quantity_being_sold"
    partition_dict = {"site_name": site_name, "date_type": date_type, "date_info": date_info}
    # 落表路径校验
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")
    # 日期拆分
    d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
    if date_type == DateTypes.week.name:
        d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
        if site_name == 'us' and date_info >= '2023-18':
            db_type = 'postgresql'
            if date_info >= '2023-34':
                db_type = 'postgresql_14'
        else:
            db_type = 'postgresql_14'
        import_table = f"{site_name}_brand_analytics_{d1}_{d2}"
    if date_type == DateTypes.month.name or date_type == DateTypes.month_week.name:
        if site_name in ['us', 'uk', 'de']:
            db_type = 'postgresql_14'
            # pg的分区单位数是带0,如01、02、03
            d1, d2 = CommonUtil.split_month_week_date(date_type, date_info)
            d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'
            import_table = f"{site_name}_brand_analytics_month_{d1}_{d2}"
        else:
            print(f"其他站点{date_type}数据暂未明确,请检查是否dateType传输有误")
            exit()
    sql_query = f"""
        select
            id,
...
@@ -67,25 +34,31 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
    # 进行schema和数据校验
    if site_name not in ('fr', 'it', 'es'):
        CommonUtil.check_schema_before_import(db_type=db_type,
                                              hive_table="ods_st_quantity_being_sold"
    partition_dict = {"site_name": site_name, "date_type": date_type, "date_info": date_info}
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)
    if site_name == 'us':
        map_num = 4
    else:
        map_num = 1
    engine = get_remote_engine(
        site_name=site_name,
        query=sql_query, hive_tb_name=hive_table, msg_usr=['fangxingjun', 'chenyuanjie'])
        db_type=db_type
    )
    # 生成导出脚本
    import_sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type,
    engine.sqoop_raw_import(query=sql_query, hdfs_path=hdfs_path)
    # 导入前先删除原始hdfs数据
    HdfsUtils.delete_hdfs_file(hdfs_path)
    # 创建ssh Client对象--用于执行cmd命令
    client = SSHUtil.get_ssh_client()
    SSHUtil.exec_command_async(client, import_sh, ignore_err=False)
    # 创建lzo索引和修复元数据
    CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_table)
    # 关闭链接
    client.close()
    hive_table=hive_table, hdfs_path=hdfs_path, partitions=partition_dict, m=map_num, split_by='id')
    pass
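The week/month branching above only decides which PG partition table to read. A compact restatement of that selection logic, assuming CommonUtil.split_month_week_date returns the year and the week or month number as strings (this helper is not project code, only an illustration):

# Hedged restatement of the import_table selection above; not project code.
def brand_analytics_table(site_name: str, date_type: str, d1: str, d2: str) -> str:
    d2 = f'0{d2}' if int(d2) < 10 else f'{d2}'  # pg partition suffixes are zero-padded: 01, 02, ...
    if date_type == 'week':
        return f"{site_name}_brand_analytics_{d1}_{d2}"
    if date_type in ('month', 'month_week'):
        return f"{site_name}_brand_analytics_month_{d1}_{d2}"
    raise ValueError(f"unsupported date_type: {date_type}")

# brand_analytics_table('us', 'month', '2023', '8') -> 'us_brand_analytics_month_2023_08'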
Pyspark_job/sqoop_import/ods_theme.py View file @ 057dbb84

...
@@ -2,29 +2,17 @@ import os
import sys

sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.secure_db_client import get_remote_engine

if __name__ == '__main__':
    site_name = CommonUtil.get_sys_arg(1, None)
    assert site_name is not None, "site_name 不能为空!"
    hive_tb = "ods_theme"
    db_type = "mysql"
    partition_dict = {"site_name": site_name}
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
    print(f"hdfs_path is {hdfs_path}")
    import_tb = f"{site_name}_theme"
    cols = "id,theme_type_en,theme_type_ch,theme_en,theme_ch,created_at,updated_at"
    cols = "id, theme_type_en, theme_type_ch, theme_en, theme_ch, created_at, updated_at"
    query = f"""
        select
            {cols}
...
@@ -33,31 +21,22 @@ if __name__ == '__main__':
        and \$CONDITIONS
    """
    empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
                                                                   hive_tb="ods_theme"
    partition_dict = {"site_name": site_name}
    hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
    engine = get_remote_engine(
        site_name=site_name,
        query=query, hive_tb_name=hive_tb, msg_usr=['chenyuanjie']
        db_type=db_type
    )
    assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
    if not empty_flag:
        sh = CommonUtil.build_import_sh(site_name=site_name, db_type=db_type,
        engine.sqoop_raw_import(query=query, hdfs_path=hdfs_path)
        # 导入前先删除
        HdfsUtils.delete_hdfs_file(hdfs_path)
        client = SSHUtil.get_ssh_client()
        SSHUtil.exec_command_async(client, sh, ignore_err=False)
        CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
        client.close()
        # 导入后检测--检测数据一致性
        CommonUtil.check_import_sync_num(db_type=db_type,
                                         partition_dict=partition_dict,
                                         import_query=query,
                                         hive_tb_name=hive_tb,
                                         msg_usr=['chenyuanjie'])
    hive_table=hive_tb, hdfs_path=hdfs_path, partitions=partition_dict)
    pass
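Every query in these scripts ends with `and \$CONDITIONS` because Sqoop's free-form --query import requires a literal $CONDITIONS token, which Sqoop replaces with its own split predicates; the backslash only stops the shell from expanding it when the import command is built. A sketch of how the ods_theme query assembles, where the where-clause scaffold is assumed and only the select list and the $CONDITIONS tail are visible in the hunk:

# Illustrative only; the collapsed middle of the real query is not shown in the diff.
site_name = "us"
import_tb = f"{site_name}_theme"
cols = "id, theme_type_en, theme_type_ch, theme_en, theme_ch, created_at, updated_at"
query = rf"""
    select
        {cols}
    from {import_tb}
    where 1 = 1
    and \$CONDITIONS
"""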
Pyspark_job/utils/es_util.py View file @ 057dbb84

# author : wangrui
# data : 2023/3/9 15:50
from elasticsearch import Elasticsearch
...
@@ -27,7 +25,7 @@ class EsUtils(object):
    # 获取elasticsearch相关配置
    @staticmethod
    def get_es_options(es_index_name):
    def get_es_options(es_index_name, pipeline_id):
        return {
            "es.nodes": EsUtils.__es_ip__,
            "es.port": EsUtils.__es_port__,
...
@@ -40,7 +38,8 @@ class EsUtils(object):
            "es.batch.size.entries": "5000",
            "es.nodes.wan.only": "false",
            "es.batch.write.concurrency": "30",
            "es.write.operation": "upsert"
            "es.write.operation": "index",
            "es.ingest.pipeline": f"{pipeline_id}"
        }

    # 获取elasticsearch中索引配置信息
...
@@ -487,7 +486,6 @@ class EsUtils(object):
        index_name_list = list(alias_info.keys())
        return index_name_list

    # 删除索引别名
    @staticmethod
    def delete_index_alias(alias_name, client):
...
@@ -500,7 +498,79 @@ class EsUtils(object):
        else:
            print("索引别名不存在!")

    @staticmethod
    def user_enrich_pipeline(client, pipeline_id, policy_name1, policy_name2):
        pipeline_body = {
            "description": "asin flow user mask pipeline",
            "processors": [
                {
                    "enrich": {
                        "policy_name": f"{policy_name1}",
                        "field": "asin",
                        "target_field": "policy_add_1",
                        "max_matches": 1,
                        "ignore_missing": True
                    }
                },
                {
                    "enrich": {
                        "policy_name": f"{policy_name2}",
                        "field": "category_id",
                        "target_field": "policy_add_2",
                        "max_matches": 1,
                        "ignore_missing": True
                    }
                },
                {
                    "set": {
                        "field": "usr_mask_type",
                        "value": "{{policy_add_1.usr_mask_type}}",
                        "ignore_empty_value": True
                    }
                },
                {
                    "set": {
                        "field": "usr_mask_progress",
                        "value": "{{policy_add_1.usr_mask_progress}}",
                        "ignore_empty_value": True
                    }
                },
                {
                    "set": {
                        "field": "package_quantity",
                        "value": "{{policy_add_1.package_quantity}}",
                        "ignore_empty_value": True
                    }
                },
                {
                    "set": {
                        "field": "usr_mask_type",
                        "value": "{{policy_add_2.usr_mask_type}}",
                        "ignore_empty_value": True
                    }
                },
                {
                    "remove": {
                        "field": "policy_add_1",
                        "ignore_missing": True
                    }
                },
                {
                    "remove": {
                        "field": "policy_add_2",
                        "ignore_missing": True
                    }
                },
                {
                    "convert": {
                        "field": "package_quantity",
                        "type": "integer",
                        "ignore_missing": True
                    }
                }
            ]
        }
        client.ingest.put_pipeline(id=pipeline_id, body=pipeline_body)


if __name__ == '__main__':
    pass
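get_es_options() now takes a pipeline_id and switches the write mode from upsert to index, so every document is routed through the ingest pipeline that user_enrich_pipeline() builds from two enrich policies. A hedged usage sketch follows; the client construction, policy, pipeline and index names are placeholders, the utils.es_util import path is assumed from the other imports in this merge, and the commented Spark write line assumes the elasticsearch-hadoop connector used by the export jobs. Note that the two enrich policies must already exist and have been executed in Elasticsearch before the pipeline can resolve them.

# Usage sketch only; names are placeholders, not values taken from this repository.
from elasticsearch import Elasticsearch

from utils.es_util import EsUtils  # assumed module path, matching the other utils imports

client = Elasticsearch("http://es-host.example:9200")  # stand-in for the project's client factory
pipeline_id = "us_flow_asin_user_mask"                  # hypothetical pipeline name

# Build the ingest pipeline from two pre-created enrich policies (names are hypothetical).
EsUtils.user_enrich_pipeline(client, pipeline_id,
                             policy_name1="us_user_mask_asin_policy",
                             policy_name2="us_user_mask_category_policy")

# Documents written with these options are indexed (not upserted) and pass through
# the enrich/set/remove/convert processors defined in the pipeline.
es_options = EsUtils.get_es_options("us_flow_asin", pipeline_id)
# df.write.format("org.elasticsearch.spark.sql").options(**es_options).mode("append").save("us_flow_asin")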
Pyspark_job/yswg_utils/common_udf.py View file @ 057dbb84

...
@@ -680,13 +680,14 @@ def udf_extract_weight_format(weight_str: str):
# 分类提取-返回: 一级/当前分类id+一级/当前分类排名
# 参考dim_asin_bs_info.py使用
def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_sellers_href, cate_current_pattern,
                          cate_1_pattern):
                          cate_1_pattern, node_id):
    """
    asin_bs_sellers_rank_lower: 底部分类字符串
    last_herf: 最后一级分类链接
    all_best_sellers_href: 所有分类链接
    cate_current_pattern: 当前分类排名匹配规则
    cate_1_pattern: 一级分类排名匹配规则
    node_id: 页面头部抓取分类id
    """
    # if (site_name == 'us' and date_type in ['month', 'month_week'] and date_info >= '2023-11') or (site_name != 'us' and date_type in ['week'] and date_info >= '2023-41'):
...
@@ -711,7 +712,43 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
            break
    # 2. 解析一级和当前 分类 + 排名
    # 2.1 提取分类
    # 2.1 先检查 node_id 是否在 href_list 中
    cate_1_id, cate_current_id, cate_1_rank, cate_current_rank = None, None, None, None
    if node_id and len(href_list) > 1:
        node_id_str = str(node_id)
        matched_idx = None
        for i, href in enumerate(href_list):
            if node_id_str in href:  # 判断node_id是否在url中出现
                matched_idx = i
                break
        if matched_idx is not None:
            # 提取对应分类ID
            cate_current_id = re.findall('bestsellers/(.*)/ref', href_list[matched_idx])
            cate_current_id = cate_current_id[0].split("/")[-1] if cate_current_id else None
            # 一级分类还是取第一个
            cate_1_id = re.findall('bestsellers/(.*)/ref', href_list[0])
            cate_1_id = cate_1_id[0].split("/")[0] if cate_1_id else None
            # 解析排名
            if asin_bs_sellers_rank_lower is not None:
                asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
            else:
                asin_bs_sellers_rank_lower2 = ''
            rank_list = re.findall(cate_current_pattern, asin_bs_sellers_rank_lower2)
            rank_list = [int(rank) for rank in rank_list]
            # 如果 rank_list 长度和 href_list 对齐,则取对应位置的排名
            if matched_idx < len(rank_list):
                cate_current_rank = rank_list[matched_idx]
            # 一级分类排名
            if rank_list and cate_1_pattern in asin_bs_sellers_rank_lower:
                cate_1_rank = rank_list[0]
            return cate_1_id, cate_current_id, cate_1_rank, cate_current_rank
    # 2.2 提取分类
    if href_list:
        if len(href_list) == 1:
            cate_list = re.findall('bestsellers/(.*)/ref', href_list[0])
...
@@ -735,7 +772,7 @@ def udf_parse_bs_category(asin_bs_sellers_rank_lower, last_herf, all_best_seller
    else:
        cate_1_id, cate_current_id = None, None
    # 2.2 提取排名
    # 2.3 提取排名
    if asin_bs_sellers_rank_lower is not None:
        asin_bs_sellers_rank_lower2 = asin_bs_sellers_rank_lower.replace(".", "").replace(",", "").replace(" 100 ", "")
    else:
...
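The new node_id branch in udf_parse_bs_category short-circuits the old "take the first/last href" heuristic: when the node id scraped from the page head appears in one of the best-seller links, that link decides the current category id, and the matching position picks the rank from the parsed rank list. A worked example under invented inputs; the real cate_current_pattern is supplied by the caller, so the regex below is only a stand-in:

# Worked example with invented inputs; r'#(\d+) in' stands in for cate_current_pattern.
import re

href_list = [
    "https://www.amazon.com/gp/bestsellers/kitchen/ref=pd_zg_ts_kitchen",
    "https://www.amazon.com/gp/bestsellers/kitchen/13838391/ref=pd_zg_hrsr_kitchen",
]
node_id = 13838391
rank_text = "#2,345 in kitchen & dining (see top 100) #12 in cake pans"

matched_idx = next((i for i, href in enumerate(href_list) if str(node_id) in href), None)        # -> 1
cate_current_id = re.findall('bestsellers/(.*)/ref', href_list[matched_idx])[0].split("/")[-1]   # '13838391'
cate_1_id = re.findall('bestsellers/(.*)/ref', href_list[0])[0].split("/")[0]                    # 'kitchen'

rank_list = [int(r) for r in re.findall(r'#(\d+) in', rank_text.replace(",", ""))]               # [2345, 12]
cate_current_rank = rank_list[matched_idx] if matched_idx < len(rank_list) else None             # 12
cate_1_rank = rank_list[0] if rank_list else None                                                # 2345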