Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
Amazon-Selection-Data
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
abel_cjy
Amazon-Selection-Data
Commits
3ce637b6
Commit
3ce637b6
authored
Jun 09, 2026
by
hejiangming
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
单复数 归一+符号清洗公共类
parent
9e144349
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
229 additions
and
0 deletions
+229
-0
word_normalize.py
Pyspark_job/utils/word_normalize.py
+229
-0
No files found.
Pyspark_job/utils/word_normalize.py
0 → 100644
View file @
3ce637b6
"""
英文单词归一 / 清洗公共模块(单一权威版,替代原先散落在 5 个文件里的副本)。
提供:
- IRREGULAR_PLURAL 不规则复数强制归一表
- SEMANTIC_DISTINCT_WORDS 单复数语义独立白名单
- to_base_form(word, engine) 单复数归一(单数原形)
- clean_word(word) 单词字符清洗(脏词返回 None)
背景:这套逻辑原本在 dws_st_theme / dws_aba_word_heat / dwt_st_theme_agg /
word_head / holiday_aba_word 各写一份(dwt 还有内联 UDF 副本),靠人工"N 处同步",
加一个词要改 5~6 处、极易漏。现统一抽到本模块,各文件 import 使用。
注:抽成独立模块后,cloudpickle 序列化 UDF 闭包时按"模块名+函数名"引用、不按值 pickle,
绕开了当初 dwt 内联的那个 `args[0] from __newobj__` PickleError,所以 UDF 也能直接 import 用。
更新规则:业务要加词,只改本文件这一处即可,全项目自动生效。
"""
import
re
from
nltk.stem
import
WordNetLemmatizer
from
nltk.corpus
import
wordnet
# Module-level singleton: avoids constructing a new lemmatizer on every to_base_form call.
_LEMMATIZER = WordNetLemmatizer()
# ---------------------------------------------------------------
# IRREGULAR_PLURAL: words the business requires to be force-merged but that
# WordNet/inflect do not normalise on their own.
# ---------------------------------------------------------------
# men/teeth: per the original author's notes, NLTK WordNet does not normalise these
#   (it returns the word unchanged), so Step 3 of to_base_form would return them as-is
#   and they would never merge with man/tooth. The explicit mapping forces them through Step 1.
# mens/mans/womens/womans: apostrophe-less possessives / incorrect plurals that are common
#   in search queries. inflect only reduces them to men/women, not to the singular, so they
#   are mapped to man/woman by hand.
#   This works together with the apostrophe stripping in to_base_form:
#   women's -> (strip apostrophe) womens -> (table lookup) woman; men's -> mens -> man.
# childrens/feets/mices/geeses/teeths: "double plural" / non-standard plural spellings of
#   irregular nouns. inflect/WordNet can only step back once (childrens -> children) and never
#   reach the singular (child), so the apostrophe form (children's -> children -> child) and the
#   double-plural form (childrens -> children) would not line up and one word family would be split.
#   Mapping them straight to the true singular unifies every spelling. (Measured on the US site,
#   only "childrens" has real volume; the rest are zero and were added defensively.)
# people -> person is kept apart via the SEMANTIC_DISTINCT_WORDS whitelist and is deliberately not listed here.
IRREGULAR_PLURAL = {
    'men': 'man',
    'mens': 'man',
    'mans': 'man',
    'womens': 'woman',
    'womans': 'woman',
    'teeth': 'tooth',
    'childrens': 'child',
    'feets': 'foot',
    'mices': 'mouse',
    'geeses': 'goose',
    'teeths': 'tooth',
}
# ---------------------------------------------------------------
# SEMANTIC_DISTINCT_WORDS: whitelist of words whose singular and plural are semantically
# independent (the opposite role of IRREGULAR_PLURAL).
# ---------------------------------------------------------------
# The singular and plural of these words mean different things (e.g. short = not long /
# shorts = the garment, person = one human / people = a group), so to_base_form must not merge
# them; otherwise two different meanings would share heat / theme labels (cross-semantic pollution).
# The list is confirmed by the business side; add newly discovered words directly here.
SEMANTIC_DISTINCT_WORDS = {
    'ten',
    'tens',
    'bar',
    'bars',
    'short',
    'shorts',
    'scrub',
    'scrubs',
    'leave',
    'leaves',
    'person',
    'people',
    'flat',
    'flats',
    'string',
    'strings',
    'boxer',
    'boxers',
    'trunk',
    'trunks',
    'slack',
    'slacks',
    'sis',
    'ses',
    'in',
    'ins',
    # in = inch / preposition, and "ins" is not the plural of "in". Without this entry inflect
    # would reduce ins to in (singular_noun('ins') == 'in', per the original author's note).
}
def to_base_form(word, engine):
    """Return the singular base form of an English word.

    Singular and plural spellings are collapsed so that they can share one
    label / heat value. The steps below run in order and the first one that
    produces an answer returns it.

    Args:
        word: The word to normalise. Falsy values (None, "") are returned unchanged.
        engine: An ``inflect.engine()`` instance; its ``plural`` and
            ``singular_noun`` methods are used.

    Returns:
        The normalised word. Words of at most two characters are returned
        untouched (original casing); every other path returns a lower-cased,
        apostrophe-free string.
    """
    # ============================================================
    # Step 0: words of length <= 2 are returned unchanged.
    # Why (original author's note): inflect guesses wildly on very short words and
    # "restores" pronouns / copulas as if they were plurals,
    # e.g. to_base_form('us') -> 'us' (inflect alone would give 'me'); 'is' -> 'is'.
    # ============================================================
    if not word or len(word) <= 2:
        return word

    w = word.lower()

    # ============================================================
    # Apostrophe stripping: delete apostrophes of possessives / contractions so the
    # apostrophe form lands on the apostrophe-less form.
    # Deleting (instead of replacing with a space) turns women's into womens, which
    # IRREGULAR_PLURAL then maps to woman; a space would split it into "women" + "s".
    # Handles the ASCII apostrophe and both curly quotes. Words without an apostrophe
    # are unaffected because replace() simply finds nothing.
    # ============================================================
    for _ch in ("'", "’", "‘"):
        w = w.replace(_ch, "")

    # Stripping can shorten the word to <= 2 characters (e.g. "u's" -> "us"), which would
    # slip past Step 0 and reach the very lemmatizer/inflect logic that Step 0 protects
    # short words from. Re-apply the guard on the stripped form.
    if len(w) <= 2:
        return w

    # ============================================================
    # Step 0.5: semantically distinct whitelist words are returned as-is
    # (short/shorts, person/people, ...). Checked first so that WordNet/inflect
    # below can never merge them.
    # ============================================================
    if w in SEMANTIC_DISTINCT_WORDS:
        return w

    # ============================================================
    # Step 1: manual IRREGULAR_PLURAL table (business-mandated merges).
    # ============================================================
    if w in IRREGULAR_PLURAL:
        return IRREGULAR_PLURAL[w]

    # ============================================================
    # Step 2: WordNetLemmatizer singularisation (regular plurals and irregular
    # ones such as mice -> mouse), guarded by a round-trip check that rejects
    # misspellings: the lemma is accepted only if inflect pluralises it back to w.
    # e.g. girls -> girl, plural('girl') == 'girls' -> accepted;
    #      dresss -> dress, plural('dress') == 'dresses' != 'dresss' -> rejected.
    # Library failures are swallowed on purpose: normalisation is best-effort and
    # falls through to the next step.
    # ============================================================
    try:
        wnl = _LEMMATIZER.lemmatize(w, pos='n')
    except Exception:
        wnl = w
    if wnl != w:
        try:
            if engine.plural(wnl) == w:
                return wnl
        except Exception:
            pass

    # ============================================================
    # Step 3: a word WordNet knows is kept as-is (stops Step 4 from chopping the
    # final "s" off real singulars such as analysis / bus / glass).
    # ============================================================
    try:
        if wordnet.synsets(w):
            return w
    except Exception:
        pass

    # ============================================================
    # Step 4: inflect fallback for modern category words missing from WordNet
    # (airpods -> airpod).
    # ============================================================
    try:
        inf = engine.singular_noun(w)
    except Exception:
        return w
    if not inf:
        # inflect does not see a plural here -> misspelling or modern singular; keep the word.
        return w

    # Step 4a/4b: patch for -ss / -sis singulars that inflect merely strips one "s"
    # from (dress -> dres, analysis -> analysi): keep the original word.
    if w.endswith(('ss', 'sis')) and inf == w[:-1]:
        return w

    # Step 4c: same round-trip check as Step 2; accept only if plural(inf) == w.
    try:
        if engine.plural(inf) == w:
            return inf
    except Exception:
        return w
    return w
# ---------------------------------------------------------------
# clean_word: character-level cleaning of a single word (per the original note, run on the
# driver over the distinct words). A return value of None means the word is discarded.
# ---------------------------------------------------------------
# Contraction whitelist (rule 6): the apostrophe of these words is not cleaned; the whole word is kept as-is.
ABBREV_WHITELIST = {
    "i'm",
    "it's",
    "don't",
    "didn't",
    "can't",
    "that's",
    "doesn't",
    "isn't",
    "aren't",
    "you're",
}
# 中文 + 韩文(谚文音节/字母/兼容字母)字符范围(规则7)
_CJK_RE
=
re
.
compile
(
r'[一-鿿가-힣ᄀ-ᇿ-]'
)
# Measure marks that are kept when they follow a digit (rule 2).
_MEASURE = {'"', "'", '°', '%'}
# Symbols that are kept when sandwiched between letters (rule 4; '-' is handled by rule 5).
_ALPHA_MID = {'\\', '&'}
def
_is_an
(
ch
):
"""ASCII 字母或数字(非 ASCII 的字母/数字一律当符号处理)。"""
return
ch
.
isascii
()
and
ch
.
isalnum
()
def clean_word(word):
    """Clean a single word; return the cleaned text, or None if the word must be dropped.

    Rules:
      1. Any symbol (or run of symbols) between two digits is kept.
      2. A measure mark (" ' ° %) after a digit is kept.
      3. A $ in front of a digit is kept.
      4. \\ and & between two letters are kept.
      5. Hyphens are kept in the middle and removed at both ends.
      6. Words in ABBREV_WHITELIST are returned unchanged.
      7. A word containing a character matched by _CJK_RE is dropped entirely.
      8. Every other symbol is removed (including symbols before/after letters);
         a word left without any letter or digit is dropped.

    "Between", "after" and "in front of" refer to the nearest ASCII letter/digit
    on that side, skipping over any symbols in the way.
    """
    if word is None:
        return None

    text = word.strip().lower()
    if not text:
        return None

    # Rule 6: whitelisted contractions pass through untouched.
    if text in ABBREV_WHITELIST:
        return text

    # Rule 7: any Chinese/Korean character discards the whole word.
    if _CJK_RE.search(text):
        return None

    length = len(text)
    kept = []
    for pos, ch in enumerate(text):
        # Letters/digits always survive. Rule 5: hyphens are all kept for now
        # (runs such as "--" included) and trimmed from both ends after the loop.
        if _is_an(ch) or ch == '-':
            kept.append(ch)
            continue

        # Nearest letter/digit to the left, skipping symbols ('' when there is none).
        left = ''
        for k in range(pos - 1, -1, -1):
            if _is_an(text[k]):
                left = text[k]
                break

        # Nearest letter/digit to the right, skipping symbols ('' when there is none).
        right = ''
        for k in range(pos + 1, length):
            if _is_an(text[k]):
                right = text[k]
                break

        left_is_digit = left.isdigit()
        right_is_digit = right.isdigit()
        left_is_alpha = left.isascii() and left.isalpha()
        right_is_alpha = right.isascii() and right.isalpha()

        between_digits = left_is_digit and right_is_digit             # rule 1
        measure_after_digit = left_is_digit and ch in _MEASURE        # rule 2
        dollar_before_digit = right_is_digit and ch == '$'            # rule 3
        joiner_between_letters = (
            left_is_alpha and right_is_alpha and ch in _ALPHA_MID     # rule 4
        )

        if (between_digits or measure_after_digit
                or dollar_before_digit or joiner_between_letters):
            kept.append(ch)

    # Rule 5: drop leading/trailing hyphens; inner (even consecutive) ones stay.
    cleaned = ''.join(kept).strip('-')

    # Rule 8: nothing alphanumeric left -> discard.
    if not re.search(r'[0-9a-z]', cleaned):
        return None
    return cleaned
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment