Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
7edc66cc
Commit
7edc66cc
authored
May 20, 2026
by
fangxingjun
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'developer' of 47.106.101.75:abel_cjy/Amazon-Selection-Data into developer
parents
e56a458e
5655a77a
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
226 additions
and
160 deletions
+226
-160
dim_asin_profit_rate_info.py
Pyspark_job/dim/dim_asin_profit_rate_info.py
+0
-0
dim_keepa_asin_info.py
Pyspark_job/dim/dim_keepa_asin_info.py
+16
-3
dwt_flow_keepa_asin.py
Pyspark_job/dwt/dwt_flow_keepa_asin.py
+0
-151
export_keepa_asin_del.py
Pyspark_job/script/export_keepa_asin_del.py
+22
-4
export_need_profit_rate.py
Pyspark_job/script/export_need_profit_rate.py
+136
-0
ods_asin_profit_rate.py
Pyspark_job/sqoop_import/ods_asin_profit_rate.py
+50
-0
ods_keepa_asin_detail.py
Pyspark_job/sqoop_import/ods_keepa_asin_detail.py
+2
-2
No files found.
Pyspark_job/dim/dim_asin_profit_rate_info.py
View file @
7edc66cc
This diff is collapsed.
Click to expand it.
Pyspark_job/dim/dim_keepa_asin_info.py
View file @
7edc66cc
"""
author: CT
description: Keepa 数据聚合 — 一站式 Hive → Hive + Doris
步骤:
1) 读 Hive ods_keepa_asin_detail 当日分区,解析 last_detail JSON 各字段
派生 keepa_launch_time = min(listed_since, tracking_since) 转 yyyy-MM-dd HH:mm:ss
weight 字段已废弃置 NULL(Doris 端不再保留)
2) 与 Hive 历史 dim_keepa_asin_info union 按 asin 去重保留 updated_time 最新
3) 写入 Hive dim_keepa_asin_info(当日分区)+ 删除所有 date_info < 今日的历史分区
4) 当日新数据(不含历史)写入 Doris dwd.dwd_keepa_asin_detail
Doris UNIQUE KEY(site_name, asin) + sequence_col=updated_time 自动取最新
执行示例: spark-submit dim_keepa_asin_info.py us 2026-05-15
"""
import
os
import
os
import
sys
import
sys
...
@@ -85,14 +98,14 @@ class DimKeepaAsinInfo(object):
...
@@ -85,14 +98,14 @@ class DimKeepaAsinInfo(object):
# 过滤脏数据:productType in (3,4,5) 且 title 为空的异常数据不做保留
# 过滤脏数据:productType in (3,4,5) 且 title 为空的异常数据不做保留
~
(
F
.
col
(
"product_type"
)
.
isin
(
3
,
4
,
5
)
&
F
.
col
(
"title"
)
.
isNull
())
~
(
F
.
col
(
"product_type"
)
.
isin
(
3
,
4
,
5
)
&
F
.
col
(
"title"
)
.
isNull
())
)
.
cache
()
)
.
cache
()
# 写入 Doris 需带 site_name 分区字段,并把 keepa_launch_time 转为 DATETIME(DDL 类型已改)
# 写入 Doris 需带 site_name 分区字段,并把 keepa_launch_time
/ updated_time
转为 DATETIME(DDL 类型已改)
# weight 字段在 Doris dwd_keepa_asin_detail 已废弃,select 不带
# weight 字段在 Doris dwd_keepa_asin_detail 已废弃,select 不带
self
.
df_to_doris
=
self
.
df_keepa_asin
.
select
(
self
.
df_to_doris
=
self
.
df_keepa_asin
.
select
(
F
.
lit
(
self
.
site_name
)
.
alias
(
'site_name'
),
F
.
lit
(
self
.
site_name
)
.
alias
(
'site_name'
),
'asin'
,
'package_length'
,
'package_width'
,
'package_height'
,
'package_weight'
,
'item_weight'
,
'asin'
,
'package_length'
,
'package_width'
,
'package_height'
,
'package_weight'
,
'item_weight'
,
'listed_since'
,
'release_date'
,
'tracking_since'
,
'listed_since'
,
'release_date'
,
'tracking_since'
,
F
.
to_timestamp
(
F
.
col
(
'keepa_launch_time'
))
.
alias
(
'keepa_launch_time'
),
F
.
to_timestamp
(
F
.
col
(
'keepa_launch_time'
))
.
alias
(
'keepa_launch_time'
),
'updated_time'
F
.
to_timestamp
(
F
.
col
(
'updated_time'
))
.
alias
(
'updated_time'
)
)
)
# 读取历史数据
# 读取历史数据
...
@@ -117,7 +130,7 @@ class DimKeepaAsinInfo(object):
...
@@ -117,7 +130,7 @@ class DimKeepaAsinInfo(object):
"site_name"
,
F
.
lit
(
self
.
site_name
)
"site_name"
,
F
.
lit
(
self
.
site_name
)
)
.
withColumn
(
)
.
withColumn
(
"date_info"
,
F
.
lit
(
self
.
date_info
)
"date_info"
,
F
.
lit
(
self
.
date_info
)
)
.
repartition
(
5
0
)
.
cache
()
)
.
repartition
(
4
0
)
.
cache
()
new_count
=
self
.
df_save
.
count
()
new_count
=
self
.
df_save
.
count
()
old_count
=
self
.
df_keepa_asin_history
.
count
()
old_count
=
self
.
df_keepa_asin_history
.
count
()
hive_tb
=
"dim_keepa_asin_info"
hive_tb
=
"dim_keepa_asin_info"
...
...
Pyspark_job/dwt/dwt_flow_keepa_asin.py
deleted
100644 → 0
View file @
e56a458e
import
os
import
sys
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
from
utils.spark_util
import
SparkUtil
from
utils.es_util
import
EsUtils
from
pyspark.sql
import
functions
as
F
,
Window
from
utils.hdfs_utils
import
HdfsUtils
from
utils.common_util
import
CommonUtil
from
datetime
import
datetime
,
timedelta
class DwtFlowKeepaAsin(object):
    """Build the list of (asin, price) pairs that still need a profit-rate run.

    The job unions monthly rows from Hive ``dwt_flow_asin`` with rows read from
    the Elasticsearch index ``{site_name}_flow_asin_30day``, attaches category
    names and keepa package dimensions, removes combinations that already exist
    in ``dim_asin_profit_rate_info`` or ``dwt_flow_keepa_asin``, and appends the
    remainder to the Hive table ``dwt_flow_keepa_asin``.
    """

    def __init__(self, site_name, date_info):
        """Store the run parameters and open the Spark session.

        Args:
            site_name: Site code; used as a partition value and as the prefix of
                the Elasticsearch index name.
            date_info: Partition value for the tables read and written here.
                NOTE(review): format is not validated in this class.
        """
        self.site_name = site_name
        self.date_info = date_info
        self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}: {self.site_name} {self.date_info}")
        # DataFrames are assigned by read_data() / handle_data(). They start as
        # None instead of dummy "select 1+1" queries so that constructing the
        # object does not plan six meaningless Spark queries.
        self.df_flow_asin = None
        self.df_category_id = None
        self.df_keepa_asin = None
        self.df_calc_asin = None
        self.df_export_asin = None
        self.df_save = None

    def run(self):
        """Execute the pipeline: read inputs, compute the export set, save it."""
        self.read_data()
        self.handle_data()
        self.save_data()

    def read_data(self):
        """Load every input DataFrame used by handle_data()."""
        # Monthly flow-selection ASINs with a positive price.
        sql = f"""
        select asin, asin_price as price, category_first_id, date_info as source_month
        from dwt_flow_asin
        where site_name = '{self.site_name}'
          and date_type = 'month'
          and date_info >= '2025-05'
          and asin_price is not null
          and asin_price > 0
        """
        df_flow_asin_month = self.spark.sql(sqlQuery=sql) \
            .withColumn('price', F.round(F.col('price'), 2).cast('decimal(10,2)'))

        # ES rows crawled within the last 30 days (relative to wall-clock time,
        # not date_info) that have price > 0 and no
        # profit_rate_extra.ocean_profit field.
        days_30_ago = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d 00:00:00")
        es_read_options = {
            "es.nodes": EsUtils.__es_ip__,
            "es.port": EsUtils.__es_port__,
            "es.net.http.auth.user": EsUtils.__es_user__,
            "es.net.http.auth.pass": EsUtils.__es_passwd__,
            "es.nodes.wan.only": "false",
            "es.mapping.date.rich": "false",
            "es.scroll.size": "2000",
            "es.read.field.include": "asin,price,category_first_id,asin_crawl_date",
            "es.query": f'{{"query":{{"bool":{{"must":[{{"range":{{"price":{{"gt":0}}}}}},{{"range":{{"asin_crawl_date":{{"gte":"{days_30_ago}"}}}}}}],"must_not":{{"exists":{{"field":"profit_rate_extra.ocean_profit"}}}}}}}}}}'
        }
        df_flow_asin_30day = self.spark.read.format("org.elasticsearch.spark.sql") \
            .options(**es_read_options) \
            .load(f"{self.site_name}_flow_asin_30day") \
            .withColumn('price', F.round(F.col('price'), 2).cast('decimal(10,2)')) \
            .withColumn('source_month', F.date_format(F.col('asin_crawl_date'), 'yyyy-MM')) \
            .select('asin', 'price', 'category_first_id', 'source_month')

        # Merge both sources and keep, per (asin, price), the row with the
        # latest source_month. union() is positional, so both sides must keep
        # the column order asin, price, category_first_id, source_month.
        self.df_flow_asin = df_flow_asin_month.union(df_flow_asin_30day).repartition(40, 'asin')
        window = Window.partitionBy(['asin', 'price']).orderBy(F.col('source_month').desc_nulls_last())
        self.df_flow_asin = self.df_flow_asin.withColumn(
            'rank', F.row_number().over(window=window)
        ).filter('rank = 1').drop('rank').cache()

        # Category names keyed by category_first_id.
        sql = f"""
        select category_first_id, en_name as category from dim_bsr_category_tree where site_name = '{self.site_name}' and nodes_num = 2
        """
        self.df_category_id = self.spark.sql(sqlQuery=sql).cache()

        # Keepa package data; only rows whose dimensions and weight are all
        # positive are usable.
        sql = f"""
        select asin, package_length, package_width, package_height, item_weight as weight from dim_keepa_asin_info where site_name = '{self.site_name}' and date_info = '{self.date_info}'
        """
        self.df_keepa_asin = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').filter(
            (F.col("package_length") > 0) & (F.col("package_width") > 0) & (F.col("package_height") > 0) & (F.col("weight") > 0)
        ).cache()

        # Combinations whose profit rate has already been calculated.
        sql = f"""
        select asin, price, package_length, package_width, package_height, weight from dim_asin_profit_rate_info where site_name = '{self.site_name}' and date_info = '{self.date_info}'
        """
        self.df_calc_asin = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').cache()

        # Combinations already exported earlier, to avoid recalculating them.
        sql = f"""
        select asin, price, package_length, package_width, package_height, weight from dwt_flow_keepa_asin where site_name = '{self.site_name}'
        """
        self.df_export_asin = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').cache()

    def handle_data(self):
        """Join the inputs and build the DataFrame to persist in ``df_save``."""
        # Columns that identify one profit-rate calculation.
        calc_key = ['asin', 'price', 'package_length', 'package_width', 'package_height', 'weight']
        self.df_save = self.df_flow_asin.join(
            self.df_category_id, on='category_first_id', how='left'
        ).join(
            self.df_keepa_asin, on='asin', how='inner'
        ).join(
            self.df_calc_asin, on=list(calc_key), how='left_anti'
        ).join(
            self.df_export_asin, on=list(calc_key), how='left_anti'
        ).cache()

        # Drop the cache markers on the inputs; only df_save is used from here.
        self.df_flow_asin.unpersist()
        self.df_category_id.unpersist()
        self.df_keepa_asin.unpersist()
        self.df_calc_asin.unpersist()
        self.df_export_asin.unpersist()

        # part_key spreads rows over 50 buckets numbered from start_key.
        # NOTE(review): the window has no partitionBy, so Spark evaluates it on
        # a single partition; left unchanged to keep the bucket sizes equal.
        start_key = 1
        self.df_save = self.df_save.withColumn(
            'part_key', F.ntile(50).over(Window.orderBy(F.rand())) + (start_key - 1)
        ).select(
            F.col('asin'),
            F.col('price'),
            F.col('category'),
            F.col('package_length'),
            F.col('package_width'),
            F.col('package_height'),
            F.col('weight'),
            F.col('part_key'),
            F.col('source_month'),
            F.lit(self.site_name).alias('site_name'),
            F.lit(self.date_info).alias('date_info')
        ).repartition(10)

    def save_data(self):
        """Append ``df_save`` to Hive ``dwt_flow_keepa_asin`` for this partition."""
        hive_tb = "dwt_flow_keepa_asin"
        partition_dict = {
            "site_name": self.site_name,
            "date_info": self.date_info
        }
        partition_by = list(partition_dict.keys())
        hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict)
        # Presumably clears files left in the partition directory by an earlier
        # run so the append below does not duplicate rows — confirm in HdfsUtils.
        HdfsUtils.delete_file_in_folder(hdfs_path)
        print(f"正在进行数据存储,当前存储的表名为:{hive_tb},存储路径:{hdfs_path}")
        self.df_save.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
        print("success")
if __name__ == "__main__":
    # Both positional arguments are mandatory; exit with a usage hint instead
    # of an IndexError traceback when either one is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: spark-submit dwt_flow_keepa_asin.py <site_name> <date_info>")
    site_name = sys.argv[1]
    date_info = sys.argv[2]
    handle_obj = DwtFlowKeepaAsin(site_name, date_info)
    handle_obj.run()
Pyspark_job/script/export_keepa_asin_del.py
View file @
7edc66cc
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
author: CT
author: CT
description: 从 Hive dim_keepa_asin_info 读取 keepa 数据,过滤 updated_time 超过 3 个月的 asin,
description: 从 Hive dim_keepa_asin_info 读取 keepa 数据,过滤 updated_time 超过 3 个月的 asin,
附带当前月份字段 month(yyyy-MM),导出到 PG us_asin_profit_keepa_add,触发爬虫重新抓取 Keepa
附带当前月份字段 month(yyyy-MM),导出到 PG us_asin_profit_keepa_add,触发爬虫重新抓取 Keepa
导出前 LEFT ANTI 剔除 PG 表中已存在的 asin,避免重复触发
执行示例: spark-submit export_keepa_asin_del.py us
执行示例: spark-submit export_keepa_asin_del.py us
"""
"""
import
os
import
os
...
@@ -28,22 +29,39 @@ if __name__ == '__main__':
...
@@ -28,22 +29,39 @@ if __name__ == '__main__':
WHERE site_name = '{site_name}'
WHERE site_name = '{site_name}'
"""
"""
print
(
f
"sql=
\n
{sql}"
)
print
(
f
"sql=
\n
{sql}"
)
df_all
=
spark
.
sql
(
sqlQuery
=
sql
)
df_all
=
spark
.
sql
(
sqlQuery
=
sql
)
.
cache
()
print
(
f
"全量读取 keepa 数据:{df_all.count()}"
)
# 2. Spark 端过滤超过 3 个月的 asin(数据读取后处理,不在 Hive SQL 中算)
# 2. Spark 端过滤超过 3 个月的 asin(数据读取后处理,不在 Hive SQL 中算)
three_months_ago
=
(
datetime
.
now
()
-
relativedelta
(
months
=
3
))
.
strftime
(
'
%
Y-
%
m-
%
d
%
H:
%
M:
%
S'
)
three_months_ago
=
(
datetime
.
now
()
-
relativedelta
(
months
=
3
))
.
strftime
(
'
%
Y-
%
m-
%
d
%
H:
%
M:
%
S'
)
current_month
=
datetime
.
now
()
.
strftime
(
'
%
Y-
%
m'
)
current_month
=
datetime
.
now
()
.
strftime
(
'
%
Y-
%
m'
)
print
(
f
"过滤阈值 updated_time < {three_months_ago};附加 month={current_month}"
)
print
(
f
"过滤阈值 updated_time < {three_months_ago};附加 month={current_month}"
)
con_info
=
DBUtil
.
get_connection_info
(
db_type
=
'postgresql_cluster'
,
site_name
=
site_name
)
table_name
=
'us_asin_profit_keepa_add'
# 3. 读取 PG 已导出 asin 集合,用于 LEFT ANTI 剔除(避免重复触发爬虫)
# PG 端只取 asin 列全量,Spark 端 dropDuplicates 去重
df_pg_existing
=
spark
.
read
.
format
(
"jdbc"
)
\
.
option
(
"url"
,
con_info
[
"url"
])
\
.
option
(
"dbtable"
,
table_name
)
\
.
option
(
"user"
,
con_info
[
"username"
])
\
.
option
(
"password"
,
con_info
[
"pwd"
])
\
.
load
()
\
.
select
(
'asin'
)
\
.
dropDuplicates
([
'asin'
])
.
cache
()
print
(
f
"PG 已导出 asin 数量:{df_pg_existing.count():,}"
)
# 4. 过滤 3 个月以前 + LEFT ANTI 剔除已导出
df_need_export
=
df_all
.
filter
(
F
.
col
(
'updated_time'
)
<
F
.
lit
(
three_months_ago
))
\
df_need_export
=
df_all
.
filter
(
F
.
col
(
'updated_time'
)
<
F
.
lit
(
three_months_ago
))
\
.
select
(
.
select
(
F
.
col
(
'asin'
),
F
.
col
(
'asin'
),
F
.
lit
(
current_month
)
.
alias
(
'month'
),
F
.
lit
(
current_month
)
.
alias
(
'month'
),
)
.
cache
()
)
\
.
join
(
df_pg_existing
,
on
=
'asin'
,
how
=
'left_anti'
)
\
.
cache
()
print
(
f
"导出数据量:{df_need_export.count():,}"
)
print
(
f
"导出数据量:{df_need_export.count():,}"
)
con_info
=
DBUtil
.
get_connection_info
(
db_type
=
'postgresql_cluster'
,
site_name
=
site_name
)
table_name
=
'us_asin_profit_keepa_add'
df_need_export
.
write
.
format
(
"jdbc"
)
\
df_need_export
.
write
.
format
(
"jdbc"
)
\
.
option
(
"url"
,
con_info
[
"url"
])
\
.
option
(
"url"
,
con_info
[
"url"
])
\
.
option
(
"dbtable"
,
table_name
)
\
.
option
(
"dbtable"
,
table_name
)
\
...
...
Pyspark_job/script/export_need_profit_rate.py
0 → 100644
View file @
7edc66cc
"""
author: CT
description: 导出待计算利润率的 ASIN
1) Hive dwt_flow_asin 月维度读取 date_info >= '2025-05' 的所有 ASIN:
asin / price / category_first_id / asin_crawl_date
2) Doris dwt.{site}_flow_asin_30day 读取所有相关 ASIN:
asin / price / category_first_id / asin_crawl_date
3) union 后按 (asin, price) 去重保留 asin_crawl_date 最新
4) LEFT JOIN 分类、INNER JOIN keepa 增量
keepa 表已整合为单分区快照,按 updated_time >= last_date_info 筛增量
5) keepa 关联到的 ASIN 全部导出 PG {site}_asin_profit_rate_calc 重新计算利润率
执行示例: spark-submit export_need_profit_rate.py us 2026-05-15
"""
import
os
import
sys
from
datetime
import
datetime
,
timedelta
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
from
pyspark.sql
import
functions
as
F
,
Window
from
utils.spark_util
import
SparkUtil
from
utils.db_util
import
DBUtil
from
utils.DorisHelper
import
DorisHelper
START_MONTH
=
'2025-05'
class ExportNeedProfitRate(object):
    """Export ASINs whose profit rate must be (re)calculated to PostgreSQL.

    Flow ASINs from Hive ``dwt_flow_asin`` and Doris
    ``dwt.{site_name}_flow_asin_30day`` are unioned and de-duplicated per
    (asin, price), enriched with the category name, inner-joined with keepa
    rows updated since the previous day, and appended to the PostgreSQL table
    ``{site_name}_asin_profit_rate_calc``.
    """

    def __init__(self, site_name, date_info):
        """Store run parameters and open the Spark session.

        Args:
            site_name: Site code used in table names and partition filters.
            date_info: Calculation date as ``yyyy-MM-dd``.

        Raises:
            ValueError: If ``date_info`` is not in ``yyyy-MM-dd`` format.
        """
        self.site_name = site_name
        self.date_info = date_info  # calculation date, yyyy-MM-dd
        # Lower bound of the keepa increment filter: date_info minus one day
        # (e.g. date_info=2026-05-13 -> last_date_info=2026-05-12).
        self.last_date_info = self._previous_day(date_info)
        self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}: {self.site_name} {self.date_info}")

    @staticmethod
    def _previous_day(date_info):
        """Return the day before ``date_info`` (both formatted ``yyyy-MM-dd``)."""
        current_day = datetime.strptime(date_info, "%Y-%m-%d").date()
        return (current_day - timedelta(days=1)).strftime("%Y-%m-%d")

    def run(self):
        """Build the export DataFrame and write it to PostgreSQL."""
        df_export = self.build_export_df()
        try:
            self.write_to_pg(df_export)
        finally:
            # build_export_df() returns a cached DataFrame; release it whether
            # or not the write succeeded.
            df_export.unpersist()

    def build_export_df(self):
        """Assemble the rows to export.

        Returns:
            A cached DataFrame with the columns asin, price, category,
            package_length, package_width, package_height, weight, part_key,
            source_month and asin_crawl_date.
        """
        # 1. Hive dwt_flow_asin, monthly granularity, every month from
        #    START_MONTH onwards.
        sql_dwt = f"""
        SELECT asin,
               asin_price AS price,
               category_first_id,
               asin_crawl_date
        FROM dwt_flow_asin
        WHERE site_name = '{self.site_name}'
          AND date_type = 'month'
          AND date_info >= '{START_MONTH}'
          AND asin_price > 0
        """
        print(f"sql_dwt =\n{sql_dwt}")
        df_dwt = self.spark.sql(sqlQuery=sql_dwt) \
            .withColumn('price', F.round(F.col('price'), 2).cast('decimal(20,2)')) \
            .withColumn('asin_crawl_date', F.to_timestamp(F.col('asin_crawl_date')))

        # 2. Doris dwt.{site}_flow_asin_30day, all relevant ASINs.
        #    spark_import_with_connector does not push WHERE down, so the
        #    price > 0 filter is applied on the Spark side.
        table_identifier = f"dwt.{self.site_name}_flow_asin_30day"
        read_fields = "asin,price,category_first_id,asin_crawl_date"
        df_doris = DorisHelper.spark_import_with_connector(self.spark, table_identifier, read_fields) \
            .filter(F.col('price') > 0) \
            .withColumn('price', F.round(F.col('price'), 2).cast('decimal(20,2)')) \
            .withColumn('asin_crawl_date', F.col('asin_crawl_date').cast('timestamp')) \
            .select('asin', 'price', 'category_first_id', 'asin_crawl_date')

        # 3. Union, then keep the row with the latest asin_crawl_date for each
        #    (asin, price); rows with a NULL crawl date sort last.
        df_flow = df_dwt.unionByName(df_doris).repartition(40, 'asin', 'price')
        window = Window.partitionBy('asin', 'price').orderBy(F.col('asin_crawl_date').desc_nulls_last())
        df_flow = df_flow.withColumn('rk', F.row_number().over(window)) \
            .filter('rk = 1') \
            .drop('rk') \
            .cache()

        # 4. Category name, LEFT JOINed on category_first_id.
        sql_cate = f"""
        SELECT category_first_id, en_name AS category
        FROM dim_bsr_category_tree
        WHERE site_name = '{self.site_name}' AND nodes_num = 2
        """
        df_cate = self.spark.sql(sqlQuery=sql_cate)

        # 5. Keepa increment, INNER JOINed on asin.
        #    The keepa table is a single-partition snapshot, so the increment
        #    is selected with updated_time >= last_date_info. Only rows whose
        #    dimensions and weight are all positive are kept.
        sql_keepa = f"""
        SELECT asin, package_length, package_width, package_height, item_weight AS weight
        FROM dim_keepa_asin_info
        WHERE site_name = '{self.site_name}' AND updated_time >= '{self.last_date_info}'
        """
        df_keepa = self.spark.sql(sqlQuery=sql_keepa) \
            .filter((F.col('package_length') > 0) & (F.col('package_width') > 0) & (F.col('package_height') > 0) & (F.col('weight') > 0)) \
            .repartition(40, 'asin')

        # part_key assigns every row to one of 50 equally sized random buckets.
        # NOTE(review): the window has no partitionBy, so Spark evaluates it on
        # a single partition; kept as-is to preserve the equal bucket sizes.
        df_result = df_flow \
            .join(df_cate, on='category_first_id', how='left') \
            .join(df_keepa, on='asin', how='inner') \
            .withColumn('source_month', F.date_format(F.col('asin_crawl_date'), 'yyyy-MM')) \
            .withColumn('part_key', F.ntile(50).over(Window.orderBy(F.rand()))) \
            .select(
                'asin', 'price', 'category',
                'package_length', 'package_width', 'package_height', 'weight',
                'part_key', 'source_month', 'asin_crawl_date',
            ).cache()

        # count() materialises the cache before the intermediate one is dropped.
        count = df_result.count()
        print(f"待计算利润率数据量:{count:,}")
        df_result.show(10, truncate=False)
        df_flow.unpersist()
        return df_result

    def write_to_pg(self, df_export):
        """Append ``df_export`` to PostgreSQL ``{site_name}_asin_profit_rate_calc``.

        Args:
            df_export: DataFrame produced by build_export_df().
        """
        con_info = DBUtil.get_connection_info(db_type='postgresql_cluster', site_name=self.site_name)
        table_name = f"{self.site_name}_asin_profit_rate_calc"
        print(f"导出到 PG {table_name}")
        df_export.write.format("jdbc") \
            .option("url", con_info["url"]) \
            .option("dbtable", table_name) \
            .option("user", con_info["username"]) \
            .option("password", con_info["pwd"]) \
            .mode("append") \
            .save()
        print("success")
if __name__ == "__main__":
    # Both positional arguments are mandatory; exit with a usage hint instead
    # of an IndexError traceback when either one is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: spark-submit export_need_profit_rate.py <site_name> <date_info, yyyy-MM-dd>")
    site_name = sys.argv[1]
    date_info = sys.argv[2]
    ExportNeedProfitRate(site_name, date_info).run()
Pyspark_job/sqoop_import/ods_asin_profit_rate.py
0 → 100644
View file @
7edc66cc
"""
author: CT
description: sqoop 从 PG {site_name}_asin_profit_rate_calc 增量拉取利润率数据
写入 Hive ods_asin_profit_rate 分区 (site_name, date_info)
增量时间窗:[date_info-1天 00:00:00, 当天运行时刻的 00:00:00)
执行示例: python ods_asin_profit_rate.py us 2026-05-15
"""
import
os
import
sys
from
datetime
import
datetime
,
timedelta
sys
.
path
.
append
(
os
.
path
.
dirname
(
sys
.
path
[
0
]))
from
utils.common_util
import
CommonUtil
from
utils.secure_db_client
import
get_remote_engine
if __name__ == '__main__':
    # Incrementally import profit-rate rows from PostgreSQL
    # {site_name}_asin_profit_rate_calc into the Hive table
    # ods_asin_profit_rate, partitioned by (site_name, date_info).
    site_name = CommonUtil.get_sys_arg(1, None)
    date_info = CommonUtil.get_sys_arg(2, None)
    # Explicit checks instead of assert: assert statements are removed when
    # Python runs with -O, which would let a missing argument through.
    if site_name is None:
        raise ValueError("site_name 不能为空!")
    if date_info is None:
        raise ValueError("date_info 不能为空!")

    # Lower bound of the increment window: the day before date_info.
    # strptime raises ValueError when date_info is not yyyy-MM-dd.
    last_date_info = (datetime.strptime(date_info, "%Y-%m-%d").date() - timedelta(days=1)).strftime("%Y-%m-%d")
    # Upper bound: 00:00:00 of the day the program runs, so rows PostgreSQL is
    # still writing today are not pulled.
    upper_bound = datetime.now().strftime("%Y-%m-%d") + " 00:00:00"

    hive_table = "ods_asin_profit_rate"
    import_table = f"{site_name}_asin_profit_rate_calc"
    partition_dict = {
        "site_name": site_name,
        "date_info": date_info
    }
    hdfs_path = CommonUtil.build_hdfs_path(hive_table, partition_dict=partition_dict)

    engine = get_remote_engine(
        site_name=site_name,
        db_type='postgresql_cluster'
    )

    # Half-open window [last_date_info, upper_bound). The escaped $CONDITIONS
    # token is emitted literally as \$CONDITIONS for sqoop.
    query = f"""
    SELECT asin, price, category, ocean_profit, air_profit,
           package_length, package_width, package_height, weight,
           updated_time, asin_crawl_date
    FROM {import_table}
    WHERE updated_time >= '{last_date_info}' AND updated_time < '{upper_bound}' AND \\$CONDITIONS
    """
    print(f"sqoop query:\n{query}")

    engine.sqoop_raw_import(
        query=query,
        hive_table=hive_table,
        hdfs_path=hdfs_path,
        partitions=partition_dict,
        check_count=True,
    )
    print("success!")
Pyspark_job/sqoop_import/ods_keepa_asin_detail.py
View file @
7edc66cc
...
@@ -33,9 +33,9 @@ if __name__ == '__main__':
...
@@ -33,9 +33,9 @@ if __name__ == '__main__':
db_type
=
db_type
db_type
=
db_type
)
)
# 增量区间:[last_date_info, 程序运行
时刻
)
# 增量区间:[last_date_info, 程序运行
当天 00:00:00
)
# 下限:last_date_info(前一日),间隔几天补跑时传更早 date_info 自动覆盖区间
# 下限:last_date_info(前一日),间隔几天补跑时传更早 date_info 自动覆盖区间
# 上限:upper_bound(程序运行
时刻),半开区间避免拉到 PG 正在写入的最新数据,且不固定为一天
# 上限:upper_bound(程序运行
当天 00:00:00),半开区间避免拉到 PG 当日正在写入的数据
query
=
f
"""
query
=
f
"""
SELECT asin, last_detail::text as last_detail, update_at FROM {import_table}
SELECT asin, last_detail::text as last_detail, update_at FROM {import_table}
WHERE update_at >= '{last_date_info}' AND update_at < '{upper_bound}' AND last_detail is not null AND
\
$CONDITIONS
WHERE update_at >= '{last_date_info}' AND update_at < '{upper_bound}' AND last_detail is not null AND
\
$CONDITIONS
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment