Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
da83d3ac
Commit
da83d3ac
authored
Mar 10, 2026
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
keepa数据过滤脏数据
parent
25966c8b
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
8 additions
and
3 deletions
+8
-3
dim_keepa_asin_info.py
Pyspark_job/dim/dim_keepa_asin_info.py
+8
-3
No files found.
Pyspark_job/dim/dim_keepa_asin_info.py
View file @
da83d3ac
...
...
@@ -48,6 +48,8 @@ class DimKeepaAsinInfo(object):
F
.
get_json_object
(
"last_detail"
,
"$.listedSince"
)
.
cast
(
"int"
)
.
alias
(
"listed_since"
),
F
.
get_json_object
(
"last_detail"
,
"$.releaseDate"
)
.
cast
(
"int"
)
.
alias
(
"release_date"
),
F
.
get_json_object
(
"last_detail"
,
"$.trackingSince"
)
.
cast
(
"int"
)
.
alias
(
"tracking_since"
),
F
.
get_json_object
(
"last_detail"
,
"$.productType"
)
.
cast
(
"int"
)
.
alias
(
"product_type"
),
F
.
get_json_object
(
"last_detail"
,
"$.title"
)
.
alias
(
"title"
),
F
.
col
(
'updated_time'
)
)
.
withColumn
(
'weight'
,
F
.
greatest
(
F
.
col
(
"package_weight"
),
F
.
col
(
"item_weight"
))
...
...
@@ -75,9 +77,12 @@ class DimKeepaAsinInfo(object):
F
.
col
(
'updated_time'
),
F
.
col
(
'listed_since'
),
F
.
col
(
'release_date'
),
F
.
col
(
'tracking_since'
)
F
.
col
(
'tracking_since'
),
F
.
col
(
'product_type'
),
F
.
col
(
'title'
)
)
.
filter
(
(
F
.
col
(
"package_length"
)
.
isNotNull
())
&
(
F
.
col
(
"package_width"
)
.
isNotNull
())
&
(
F
.
col
(
"package_height"
)
.
isNotNull
())
&
(
F
.
col
(
"weight"
)
.
isNotNull
())
# 过滤脏数据:productType in (3,4,5) 且 title 为空的异常数据不做保留
~
(
F
.
col
(
"product_type"
)
.
isin
(
3
,
4
,
5
)
&
F
.
col
(
"title"
)
.
isNull
())
)
.
cache
()
self
.
df_to_doris
=
self
.
df_keepa_asin
.
select
(
'asin'
,
'package_length'
,
'package_width'
,
'package_height'
,
'package_weight'
,
'item_weight'
,
'weight'
,
...
...
@@ -86,7 +91,7 @@ class DimKeepaAsinInfo(object):
# 读取历史数据
sql
=
f
"""
select asin, package_length, package_width, package_height, package_weight, item_weight, weight, keepa_launch_time, updated_time, listed_since, release_date, tracking_since
select asin, package_length, package_width, package_height, package_weight, item_weight, weight, keepa_launch_time, updated_time, listed_since, release_date, tracking_since
, product_type, title
from dim_keepa_asin_info where site_name = '{self.site_name}';
"""
self
.
df_keepa_asin_history
=
self
.
spark
.
sql
(
sqlQuery
=
sql
)
.
repartition
(
40
,
'asin'
)
.
cache
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment