Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
0ed9c4bd
Commit
0ed9c4bd
authored
Mar 31, 2026
by
wangjing
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
no message
parent
417b6578
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
49 additions
and
35 deletions
+49
-35
ods_bs_category_top100_asin.py
Pyspark_job/sqoop_import/ods_bs_category_top100_asin.py
+5
-3
ods_bs_top100_asin.py
Pyspark_job/sqoop_import/ods_bs_top100_asin.py
+23
-15
ods_new_releases_top100_asin.py
Pyspark_job/sqoop_import/ods_new_releases_top100_asin.py
+5
-4
ods_nsr_top100_asin.py
Pyspark_job/sqoop_import/ods_nsr_top100_asin.py
+16
-13
No files found.
Pyspark_job/sqoop_import/ods_bs_category_top100_asin.py
View file @
0ed9c4bd
...
...
@@ -12,7 +12,9 @@ if __name__ == '__main__':
assert
site_name
is
not
None
,
"sitename 不能为空!"
assert
date_info
is
not
None
,
"date_info 不能为空!"
db_type
=
"mysql"
year
=
date_info
.
split
(
"-"
)[
0
]
# db_type = "mysql"
db_type
=
"postgresql_14"
if
date_info
==
'all'
:
query
=
f
"""
select
...
...
@@ -26,7 +28,7 @@ if __name__ == '__main__':
created_at as updated_at,
date_info,
category_id
from {site_name}_bs_
category_top100_asin
from {site_name}_bs_
top100_asin_{year}
where 1 = 1
and
\
$CONDITIONS
"""
...
...
@@ -43,7 +45,7 @@ if __name__ == '__main__':
created_at as updated_at,
date_info,
category_id
from {site_name}_bs_
category_top100_asin
from {site_name}_bs_
top100_asin_{year}
where 1 = 1
and date_info = '{date_info}'
and
\
$CONDITIONS
...
...
Pyspark_job/sqoop_import/ods_bs_top100_asin.py
View file @
0ed9c4bd
...
...
@@ -8,11 +8,15 @@ from utils.secure_db_client import get_remote_engine
if
__name__
==
'__main__'
:
site_name
=
CommonUtil
.
get_sys_arg
(
1
,
None
)
date_info
=
CommonUtil
.
get_sys_arg
(
2
,
None
)
assert
site_name
is
not
None
,
"sitename 不能为空!"
date_type
=
CommonUtil
.
get_sys_arg
(
2
,
None
)
date_info
=
CommonUtil
.
get_sys_arg
(
3
,
None
)
assert
site_name
is
not
None
,
"site_name 不能为空!"
assert
date_type
is
not
None
,
"date_type 不能为空!"
assert
date_info
is
not
None
,
"date_info 不能为空!"
db_type
=
"mysql"
year
=
date_info
.
split
(
"-"
)[
0
]
# db_type = "mysql"
db_type
=
"postgresql_14"
if
date_info
==
'all'
:
query
=
f
"""
select
...
...
@@ -20,13 +24,14 @@ if __name__ == '__main__':
asin,
cate_1_id,
cate_current_id,
category_id,
bsr_rank,
price,
rating,
total_comments,
created_at as updated_at,
date_info,
category_id
from {site_name}_bs_category_top100_asin
created_at as created_at,
updated_at as updated_at
from {site_name}_bs_top100_asin_{year}
where 1 = 1
and
\
$CONDITIONS
"""
...
...
@@ -37,23 +42,26 @@ if __name__ == '__main__':
asin,
cate_1_id,
cate_current_id,
category_id,
bsr_rank,
price,
rating,
total_comments,
created_at as updated_at,
date_info,
category_id
from {site_name}_bs_category_top100_asin
created_at as created_at,
updated_at as updated_at
from {site_name}_bs_top100_asin_{year}
where 1 = 1
and date_info = '{date_info}'
and
\
$CONDITIONS
"""
hive_tb
=
"ods_bs_
category_
top100_asin"
hive_tb
=
"ods_bs_top100_asin"
partition_dict
=
{
"site_name"
:
site_name
"site_name"
:
site_name
,
"date_type"
:
date_type
,
"date_info"
:
date_info
,
}
hdfs_path
=
CommonUtil
.
build_hdfs_path
(
hive_tb
,
partition_dict
=
partition_dict
)
#
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict=partition_dict)
engine
=
get_remote_engine
(
site_name
=
site_name
,
...
...
@@ -63,7 +71,7 @@ if __name__ == '__main__':
engine
.
sqoop_raw_import
(
query
=
query
,
hive_table
=
hive_tb
,
hdfs_path
=
hdfs_path
,
#
hdfs_path=hdfs_path,
partitions
=
partition_dict
)
...
...
Pyspark_job/sqoop_import/ods_new_releases_top100_asin.py
View file @
0ed9c4bd
...
...
@@ -11,8 +11,9 @@ if __name__ == '__main__':
date_info
=
CommonUtil
.
get_sys_arg
(
2
,
None
)
assert
site_name
is
not
None
,
"sitename 不能为空!"
assert
date_info
is
not
None
,
"date_info 不能为空!"
db_type
=
"mysql"
year
=
date_info
.
split
(
"-"
)[
0
]
# db_type = "mysql"
db_type
=
"postgresql_14"
if
date_info
==
'all'
:
query
=
f
"""
select
...
...
@@ -26,7 +27,7 @@ if __name__ == '__main__':
created_at as updated_at,
date_info,
category_id
from {site_name}_n
ew_releases_top100_asin
from {site_name}_n
sr_top100_asin_{year}
where 1 = 1
and
\
$CONDITIONS
"""
...
...
@@ -43,7 +44,7 @@ if __name__ == '__main__':
created_at as updated_at,
date_info,
category_id
from {site_name}_n
ew_releases_top100_asin
from {site_name}_n
sr_top100_asin_{year}
where 1 = 1
and date_info = '{date_info}'
and
\
$CONDITIONS
...
...
Pyspark_job/sqoop_import/ods_nsr_top100_asin.py
View file @
0ed9c4bd
...
...
@@ -10,11 +10,12 @@ if __name__ == '__main__':
site_name
=
CommonUtil
.
get_sys_arg
(
1
,
None
)
date_type
=
CommonUtil
.
get_sys_arg
(
2
,
None
)
date_info
=
CommonUtil
.
get_sys_arg
(
3
,
None
)
assert
site_name
is
not
None
,
"sitename 不能为空!"
assert
date_type
is
not
None
,
"
sitenam
e 不能为空!"
assert
site_name
is
not
None
,
"site
_
name 不能为空!"
assert
date_type
is
not
None
,
"
date_typ
e 不能为空!"
assert
date_info
is
not
None
,
"date_info 不能为空!"
db_type
=
"mysql"
year
=
date_info
.
split
(
"-"
)[
0
]
# db_type = "mysql"
db_type
=
"postgresql_14"
if
date_info
==
'all'
:
query
=
f
"""
select
...
...
@@ -22,13 +23,14 @@ if __name__ == '__main__':
asin,
cate_1_id,
cate_current_id,
category_id,
bsr_rank,
price,
rating,
total_comments,
created_at as updated_at,
date_info,
category_id
from {site_name}_bs_category_top100_asin
created_at as created_at,
updated_at as updated_at
from {site_name}_nsr_top100_asin_{year}
where 1 = 1
and
\
$CONDITIONS
"""
...
...
@@ -39,19 +41,20 @@ if __name__ == '__main__':
asin,
cate_1_id,
cate_current_id,
category_id,
bsr_rank,
price,
rating,
total_comments,
created_at as updated_at,
date_info,
category_id
from {site_name}_bs_category_top100_asin
created_at as created_at,
updated_at as updated_at
from {site_name}_nsr_top100_asin_{year}
where 1 = 1
and date_info = '{date_info}'
and
\
$CONDITIONS
"""
hive_tb
=
"ods_
bs_category
_top100_asin"
hive_tb
=
"ods_
nsr
_top100_asin"
partition_dict
=
{
"site_name"
:
site_name
,
"date_type"
:
date_type
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment