Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
0f05969c
Commit
0f05969c
authored
Mar 25, 2026
by
chenyuanjie
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix
parent
d6bfe7bf
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
41 additions
and
32 deletions
+41
-32
es_asin_profit_rate.py
Pyspark_job/export_es/es_asin_profit_rate.py
+41
-32
No files found.
Pyspark_job/export_es/es_asin_profit_rate.py
View file @
0f05969c
...
@@ -286,7 +286,10 @@ class EsAsinProfitRate(object):
...
@@ -286,7 +286,10 @@ class EsAsinProfitRate(object):
# ------------------------------------------------------------------ #
# ------------------------------------------------------------------ #
def
get_es_read_options
(
self
,
include_crawl_date
=
False
):
def
get_es_read_options
(
self
,
include_crawl_date
=
False
):
# es.mapping.date.rich=false:date字段以原始字符串返回,避免转为timestamp带来的精度问题
# es.mapping.date.rich=false:date字段以原始字符串返回,避免转为timestamp带来的精度问题
fields
=
"asin,price,profit_key,profit_rate_extra.ocean_profit,profit_rate_extra.air_profit,tracking_since,tracking_since_type"
# profit_rate_extra 不从ES读取:月度索引所有文档初始值为{},ES-Hadoop推断为空struct,
# 写回时F.when/otherwise两分支类型不一致导致struct字段被丢弃,序列化为{}
# 改为直接从Hive新值构建struct写入,彻底绕开空struct的schema推断问题
fields
=
"asin,profit_key,tracking_since,tracking_since_type"
if
include_crawl_date
:
if
include_crawl_date
:
fields
+=
",asin_crawl_date"
fields
+=
",asin_crawl_date"
return
{
return
{
...
@@ -332,7 +335,7 @@ class EsAsinProfitRate(object):
...
@@ -332,7 +335,7 @@ class EsAsinProfitRate(object):
CommonUtil
.
send_wx_msg
([
'chenyuanjie'
],
'
\u26A0
ES索引更新失败'
,
f
'失败索引:{index_name}'
)
CommonUtil
.
send_wx_msg
([
'chenyuanjie'
],
'
\u26A0
ES索引更新失败'
,
f
'失败索引:{index_name}'
)
# ------------------------------------------------------------------ #
# ------------------------------------------------------------------ #
# 核心更新逻辑:单次读 +
单次写
# 核心更新逻辑:单次读 +
分类写入
# ------------------------------------------------------------------ #
# ------------------------------------------------------------------ #
def
update_index_combined
(
self
,
index_name
,
base_date
=
None
):
def
update_index_combined
(
self
,
index_name
,
base_date
=
None
):
"""
"""
...
@@ -346,21 +349,20 @@ class EsAsinProfitRate(object):
...
@@ -346,21 +349,20 @@ class EsAsinProfitRate(object):
print
(
f
"开始处理索引:{index_name},base_date={base_date or 'asin_crawl_date(动态)'}"
)
print
(
f
"开始处理索引:{index_name},base_date={base_date or 'asin_crawl_date(动态)'}"
)
print
(
f
"{'='*60}"
)
print
(
f
"{'='*60}"
)
# 1. 读取ES现有数据,date字段以原始字符串返回,profit_rate_extra以StructType读回
# 1. 读取ES现有数据,date字段以原始字符串返回
# profit_rate_extra 不读取(见 get_es_read_options 注释)
# asin_crawl_date 仅 30day 动态模式需要,按需读取
# asin_crawl_date 仅 30day 动态模式需要,按需读取
# cache:30day索引步骤
9
需复用 df_es.select('asin'),避免二次全量读取
# cache:30day索引步骤
8
需复用 df_es.select('asin'),避免二次全量读取
df_es
=
self
.
spark
.
read
.
format
(
"org.elasticsearch.spark.sql"
)
\
df_es
=
self
.
spark
.
read
.
format
(
"org.elasticsearch.spark.sql"
)
\
.
options
(
**
self
.
get_es_read_options
(
include_crawl_date
=
(
base_date
is
None
)))
\
.
options
(
**
self
.
get_es_read_options
(
include_crawl_date
=
(
base_date
is
None
)))
\
.
load
(
index_name
)
\
.
load
(
index_name
)
\
.
withColumn
(
'price'
,
F
.
round
(
F
.
col
(
'price'
),
2
))
\
.
withColumn
(
'tracking_since_type'
,
F
.
col
(
'tracking_since_type'
)
.
cast
(
'int'
))
\
.
withColumn
(
'tracking_since_type'
,
F
.
col
(
'tracking_since_type'
)
.
cast
(
'int'
))
\
.
repartition
(
40
,
'asin'
)
\
.
repartition
(
40
,
'asin'
)
\
.
cache
()
.
cache
()
# 2. 准备利润率增量(
重命名避免列冲突
)
# 2. 准备利润率增量(
直接用 profit_key 关联,无需读取 ES price 字段
)
df_profit
=
self
.
df_asin_profit_rate
.
select
(
df_profit
=
self
.
df_asin_profit_rate
.
select
(
'asin'
,
'price'
,
'profit_key'
,
F
.
col
(
'profit_key'
)
.
alias
(
'new_profit_key'
),
F
.
col
(
'ocean_profit'
)
.
cast
(
'float'
)
.
alias
(
'new_ocean_profit'
),
F
.
col
(
'ocean_profit'
)
.
cast
(
'float'
)
.
alias
(
'new_ocean_profit'
),
F
.
col
(
'air_profit'
)
.
cast
(
'float'
)
.
alias
(
'new_air_profit'
)
F
.
col
(
'air_profit'
)
.
cast
(
'float'
)
.
alias
(
'new_air_profit'
)
)
)
...
@@ -372,30 +374,15 @@ class EsAsinProfitRate(object):
...
@@ -372,30 +374,15 @@ class EsAsinProfitRate(object):
)
)
# 4. left join 两个增量数据集
# 4. left join 两个增量数据集
# profit_key 格式统一(asin_price,已 round),ES 与 Hive 两侧直接对齐,无需借助 price 列
df
=
df_es
\
df
=
df_es
\
.
join
(
df_profit
,
on
=
[
'asin'
,
'price'
]
,
how
=
'left'
)
\
.
join
(
df_profit
,
on
=
'profit_key'
,
how
=
'left'
)
\
.
join
(
df_keepa
,
on
=
'asin'
,
how
=
'left'
)
.
join
(
df_keepa
,
on
=
'asin'
,
how
=
'left'
)
# 5. 过滤:至少一方有增量数据,减少无效写入
# 5. 过滤:至少一方有增量数据,减少无效写入
df
=
df
.
filter
(
F
.
col
(
'new_profit_key'
)
.
isNotNull
()
|
F
.
col
(
'kp_tracking_since'
)
.
isNotNull
())
df
=
df
.
filter
(
F
.
col
(
'new_ocean_profit'
)
.
isNotNull
()
|
F
.
col
(
'new_air_profit'
)
.
isNotNull
()
|
F
.
col
(
'kp_tracking_since'
)
.
isNotNull
())
# 6. 利润率字段合并:有新数据用新的,否则保留ES旧值
# 直接访问子字段构建固定 schema 的 struct,避免不同索引 profit_rate_extra 子字段数不同导致 schema 冲突
df
=
df
.
withColumn
(
'profit_key'
,
F
.
when
(
F
.
col
(
'new_profit_key'
)
.
isNotNull
(),
F
.
col
(
'new_profit_key'
))
.
otherwise
(
F
.
col
(
'profit_key'
))
)
.
withColumn
(
'profit_rate_extra'
,
F
.
struct
(
F
.
when
(
F
.
col
(
'new_ocean_profit'
)
.
isNotNull
(),
F
.
col
(
'new_ocean_profit'
))
.
otherwise
(
F
.
col
(
'profit_rate_extra.ocean_profit'
)
.
cast
(
'float'
))
.
alias
(
'ocean_profit'
),
F
.
when
(
F
.
col
(
'new_air_profit'
)
.
isNotNull
(),
F
.
col
(
'new_air_profit'
))
.
otherwise
(
F
.
col
(
'profit_rate_extra.air_profit'
)
.
cast
(
'float'
))
.
alias
(
'air_profit'
)
)
)
#
7
. keepa字段合并:有新数据用新的,否则保留ES旧值
#
6
. keepa字段合并:有新数据用新的,否则保留ES旧值
# 将 kp_tracking_since(keepa原始分钟值)转为日期字符串
# 将 kp_tracking_since(keepa原始分钟值)转为日期字符串
df
=
df
.
withColumn
(
'tracking_since'
,
df
=
df
.
withColumn
(
'tracking_since'
,
F
.
when
(
F
.
when
(
...
@@ -434,14 +421,36 @@ class EsAsinProfitRate(object):
...
@@ -434,14 +421,36 @@ class EsAsinProfitRate(object):
)
.
otherwise
(
F
.
col
(
'tracking_since_type'
))
)
.
otherwise
(
F
.
col
(
'tracking_since_type'
))
)
)
# 8. 写回索引(tracking_since_type 写回 ES short 类型,需显式 cast)
# 7. 写回索引(tracking_since_type 写回 ES short 类型,需显式 cast)
df_update
=
df
.
select
(
# 拆为两次写入:
'asin'
,
'profit_key'
,
'profit_rate_extra'
,
'tracking_since'
,
# 7a. 有利润率数据的行 → profit_rate_extra struct 从新值直接构建,彻底绕开空struct schema推断问题
# 7b. 仅有 keepa 数据的行 → 不写 profit_rate_extra,避免 null 覆盖已有利润率值
df
=
df
.
cache
()
df_profit_update
=
df
.
filter
(
F
.
col
(
'new_ocean_profit'
)
.
isNotNull
()
|
F
.
col
(
'new_air_profit'
)
.
isNotNull
()
)
.
select
(
'asin'
,
'profit_key'
,
F
.
struct
(
F
.
col
(
'new_ocean_profit'
)
.
cast
(
'float'
)
.
alias
(
'ocean_profit'
),
F
.
col
(
'new_air_profit'
)
.
cast
(
'float'
)
.
alias
(
'air_profit'
)
)
.
alias
(
'profit_rate_extra'
),
'tracking_since'
,
F
.
col
(
'tracking_since_type'
)
.
cast
(
'short'
)
F
.
col
(
'tracking_since_type'
)
.
cast
(
'short'
)
)
)
self
.
write_combined_update
(
df_update
,
index_name
)
self
.
write_combined_update
(
df_profit_update
,
index_name
)
df_keepa_update
=
df
.
filter
(
F
.
col
(
'new_ocean_profit'
)
.
isNull
()
&
F
.
col
(
'new_air_profit'
)
.
isNull
()
)
.
select
(
'asin'
,
'profit_key'
,
'tracking_since'
,
F
.
col
(
'tracking_since_type'
)
.
cast
(
'short'
)
)
self
.
write_combined_update
(
df_keepa_update
,
index_name
)
df
.
unpersist
()
#
9
. 30day 索引额外更新 cate_flag 相关字段(partial update,不影响其他字段)
#
8
. 30day 索引额外更新 cate_flag 相关字段(partial update,不影响其他字段)
# inner join df_es 确保只更新索引中已存在的 asin,避免 doc missing 报错
# inner join df_es 确保只更新索引中已存在的 asin,避免 doc missing 报错
if
base_date
is
None
:
if
base_date
is
None
:
print
(
f
"[30day] 开始更新 cate_flag 字段:asin_source_flag / bsr / nsr"
)
print
(
f
"[30day] 开始更新 cate_flag 字段:asin_source_flag / bsr / nsr"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment