单复数归一+符号清洗公共类

3ce637b6 · hejiangming · 9e144349 · 3ce637b6
Commit 3ce637b6 authored Jun 09, 2026 by hejiangming
Show whitespace changes
Inline Side-by-side

Showing with 229 additions and 0 deletions

word_normalize.py Pyspark_job/utils/word_normalize.py +229 -0

No files found.
--- a/Pyspark_job/utils/word_normalize.py
+++ b/Pyspark_job/utils/word_normalize.py
+"""
+英文单词归一 / 清洗公共模块（单一权威版，替代原先散落在 5 个文件里的副本）。
+
+提供：
+  - IRREGULAR_PLURAL        不规则复数强制归一表
+  - SEMANTIC_DISTINCT_WORDS 单复数语义独立白名单
+  - to_base_form(word, engine)  单复数归一（单数原形）
+  - clean_word(word)            单词字符清洗（脏词返回 None）
+
+背景：这套逻辑原本在 dws_st_theme / dws_aba_word_heat / dwt_st_theme_agg /
+  word_head / holiday_aba_word 各写一份（dwt 还有内联 UDF 副本），靠人工"N 处同步"，
+  加一个词要改 5~6 处、极易漏。现统一抽到本模块，各文件 import 使用。
+  注：抽成独立模块后，cloudpickle 序列化 UDF 闭包时按"模块名+函数名"引用、不按值 pickle，
+  绕开了当初 dwt 内联的那个 `args[0] from __newobj__` PickleError，所以 UDF 也能直接 import 用。
+
+更新规则：业务要加词，只改本文件这一处即可，全项目自动生效。
+"""
+import re
+
+from nltk.stem import WordNetLemmatizer
+from nltk.corpus import wordnet
+
+# Module-level singleton so that to_base_form does not construct a new lemmatizer on every call
+_LEMMATIZER = WordNetLemmatizer()
+
+# ---------------------------------------------------------------
+# IRREGULAR_PLURAL: words the business requires to be force-merged but that WordNet/inflect do not normalize on their own
+# ---------------------------------------------------------------
+# men/teeth: NLTK WordNet does not normalize them (returns the input) -> Step 3 matches and returns directly
+#   -> they never merge with man/tooth, so a manual mapping forces them through Step 1.
+# mens/mans/womens/womans: "possessive without apostrophe / incorrect plural" spellings common in search queries;
+#   inflect only trims them to men/women, not to the singular, so they are mapped to man/woman by hand.
+#   Works together with the apostrophe stripping in to_base_form:
+#   women's -> (drop apostrophe) womens -> (table lookup) woman, men's -> mens -> man
+# childrens/feets/mices/geeses/teeths: "double plural / non-standard plural" spellings of irregular nouns.
+#   inflect/WordNet can only step these back once (childrens -> children) and never reach the singular (child),
+#   so the apostrophe-split form "children's -> children -> child" and the double-plural form
+#   "childrens -> children" do not line up and the same word family gets split.
+#   Map them manually to the true singular so every spelling converges.
+#   (Measured: in the us data only childrens has real volume, the others are 0; they are added defensively.)
+# people -> person is kept separate through the SEMANTIC_DISTINCT_WORDS whitelist and is not handled here
+IRREGULAR_PLURAL = {
+    'men': 'man',
+    'mens': 'man',
+    'mans': 'man',
+    'womens': 'woman',
+    'womans': 'woman',
+    'teeth': 'tooth',
+    'childrens': 'child',
+    'feets': 'foot',
+    'mices': 'mouse',
+    'geeses': 'goose',
+    'teeths': 'tooth',
+}
+
+# ---------------------------------------------------------------
+# SEMANTIC_DISTINCT_WORDS: whitelist of words whose singular and plural are semantically independent
+# (the opposite role of IRREGULAR_PLURAL)
+# ---------------------------------------------------------------
+# The singular and plural of these words mean completely different things
+# (e.g. short = not long / shorts = the garment, person = one human / people = a group),
+# so to_base_form must not merge them; otherwise two words with different meanings would share
+# heat values / theme labels (cross-semantic pollution).
+# The list is confirmed by the business side; add new words here as they are discovered.
+SEMANTIC_DISTINCT_WORDS = {
+    'ten', 'tens',
+    'bar', 'bars',
+    'short', 'shorts',
+    'scrub', 'scrubs',
+    'leave', 'leaves',
+    'person', 'people',
+    'flat', 'flats',
+    'string', 'strings',
+    'boxer', 'boxers',
+    'trunk', 'trunks',
+    'slack', 'slacks',
+    'sis', 'ses',
+    'in', 'ins',  # in = inch / preposition, ins is not the plural of in; without this entry inflect maps ins to in (singular_noun('ins')='in')
+}
+
+
+def _plural_round_trips(engine, singular, original):
+    """
+    Report whether pluralizing ``singular`` with ``engine`` gives back ``original``.
+
+    Any exception raised by ``engine.plural`` is treated as "no match".
+    """
+    try:
+        return engine.plural(singular) == original
+    except Exception:
+        return False
+
+
+def to_base_form(word, engine):
+    """
+    Normalize an English word to its singular base form so that singular and
+    plural spellings share the same label / heat value.
+
+    The steps below run in order; the first one that accepts the word decides
+    the result.
+
+    Args:
+        word: the word to normalize.
+        engine: object whose ``plural`` and ``singular_noun`` methods are used
+            (documented by the module as an ``inflect.engine()``).
+
+    Returns:
+        ``word`` itself, untouched, when it is falsy or at most 2 characters
+        long. Otherwise a lower-cased, apostrophe-free string: either the
+        base form chosen by one of the steps or the cleaned word.
+    """
+    # Step 0: empty / very short input is returned as-is.
+    # inflect guesses wildly on tiny words (per the module notes it would turn
+    # 'us' into 'me'), so pronouns and copulas such as 'us' / 'is' are never touched.
+    if not word or len(word) <= 2:
+        return word
+
+    # Lower-case, then delete apostrophes (straight ' and both curly variants,
+    # since search terms contain all of them). Deleting instead of substituting
+    # a space keeps women's as the single token "womens", which IRREGULAR_PLURAL
+    # then maps to "woman"; a space would split it into "women" + "s".
+    apostrophes = ("'", "’", "‘")
+    token = "".join(ch for ch in word.lower() if ch not in apostrophes)
+
+    # Step 0.5: semantically distinct singular/plural pairs (short/shorts,
+    # person/people, ...) are returned unchanged. Highest priority, so neither
+    # WordNet nor inflect can ever merge them.
+    if token in SEMANTIC_DISTINCT_WORDS:
+        return token
+
+    # Step 1: forced mappings required by the business.
+    if token in IRREGULAR_PLURAL:
+        return IRREGULAR_PLURAL[token]
+
+    # Step 2: WordNet lemma (regular plurals plus irregular ones like mice -> mouse),
+    # accepted only when pluralizing the lemma reproduces the input.
+    # e.g. girls -> girl is accepted because plural('girl') == 'girls';
+    # the typo dresss -> dress is rejected because plural('dress') == 'dresses'.
+    try:
+        lemma = _LEMMATIZER.lemmatize(token, pos='n')
+    except Exception:
+        lemma = token
+    if lemma != token and _plural_round_trips(engine, lemma, token):
+        return lemma
+
+    # Step 3: a word WordNet knows is kept as-is, which stops Step 4 from
+    # chopping the trailing s off real singulars such as analysis / bus / glass.
+    try:
+        in_wordnet = bool(wordnet.synsets(token))
+    except Exception:
+        in_wordnet = False
+    if in_wordnet:
+        return token
+
+    # Step 4: inflect as the fallback for modern category words that WordNet
+    # does not contain (airpods -> airpod).
+    try:
+        singular = engine.singular_noun(token)
+    except Exception:
+        return token
+    if not singular:
+        # inflect does not see a plural here: typo or modern singular, keep it.
+        return token
+
+    # Step 4a/4b: guard against -ss / -sis singulars being trimmed by one letter
+    # (dress -> dres, analysis -> analysi); keep the word in that case.
+    if token.endswith(('ss', 'sis')) and singular == token[:-1]:
+        return token
+
+    # Step 4c: same round-trip check as Step 2 to reject typos.
+    return singular if _plural_round_trips(engine, singular, token) else token
+
+
+# ---------------------------------------------------------------
+# clean_word: per-word character cleaning (run on the driver over distinct words). Returning None means the word is dropped.
+# ---------------------------------------------------------------
+# Contraction whitelist (rule 6): apostrophes in these words are not cleaned; the whole word is kept as-is
+ABBREV_WHITELIST = {
+    "i'm", "it's", "don't", "didn't", "can't",
+    "that's", "doesn't", "isn't", "aren't", "you're",
+}
+# Character ranges for Chinese + Korean (Hangul syllables / jamo / compatibility jamo) (rule 7)
+_CJK_RE = re.compile(r'[一-鿿가-힣ᄀ-ᇿ㄰-㆏]')
+# Measurement marks that are kept when they follow a digit (rule 2)
+_MEASURE = {'"', "'", '°', '%'}
+# Symbols that are kept when they sit between letters (rule 4; '-' is handled by rule 5)
+_ALPHA_MID = {'\\', '&'}
+
+
+def _is_an(ch):
+    """Return True for an ASCII letter or digit; anything non-ASCII counts as a symbol."""
+    if not ch.isascii():
+        return False
+    return ch.isalnum()
+
+
+def clean_word(word):
+    """
+    Clean a single word and return the cleaned text, or None when the word must be dropped.
+
+    The input is stripped and lower-cased first. Rules:
+      1. any symbols (including runs of them) between two digits are kept;
+      2. a measurement mark (" ' ° %) following a digit is kept;
+      3. a $ in front of a digit is kept;
+      4. \\ and & between two letters are kept;
+      5. hyphens are kept inside the word and removed from both ends;
+      6. words in ABBREV_WHITELIST are returned as-is;
+      7. a word containing Chinese or Korean characters is dropped entirely;
+      8. every other symbol is removed (including symbols around letters).
+
+    "Between", "following" and "in front of" look past other symbols: the
+    context of a symbol is the closest ASCII letter/digit on each side.
+    """
+    if word is None:
+        return None
+    text = word.strip().lower()
+    if not text:
+        return None
+    # Rule 6: whitelisted contractions are kept verbatim.
+    if text in ABBREV_WHITELIST:
+        return text
+    # Rule 7: any Chinese / Korean character discards the whole word.
+    if _CJK_RE.search(text):
+        return None
+
+    # For every position, remember the closest letter/digit strictly to its
+    # left and strictly to its right ('' when there is none).
+    nearest_left = []
+    last_seen = ''
+    for ch in text:
+        nearest_left.append(last_seen)
+        if _is_an(ch):
+            last_seen = ch
+    nearest_right = []
+    last_seen = ''
+    for ch in reversed(text):
+        nearest_right.append(last_seen)
+        if _is_an(ch):
+            last_seen = ch
+    nearest_right.reverse()
+
+    kept = []
+    for ch, before, after in zip(text, nearest_left, nearest_right):
+        # Letters/digits always stay. Rule 5: hyphens all stay for now
+        # (runs included); the leading/trailing ones are trimmed afterwards.
+        if ch == '-' or _is_an(ch):
+            kept.append(ch)
+            continue
+        # before/after are ASCII letters/digits or '', so isdigit/isalpha suffice.
+        digit_before = before.isdigit()
+        digit_after = after.isdigit()
+        if digit_before and digit_after:
+            kept.append(ch)        # rule 1: between digits (any symbol, any count)
+        elif digit_before and ch in _MEASURE:
+            kept.append(ch)        # rule 2: digit on the left + measurement mark
+        elif digit_after and ch == '$':
+            kept.append(ch)        # rule 3: $ + digit on the right
+        elif ch in _ALPHA_MID and before.isalpha() and after.isalpha():
+            kept.append(ch)        # rule 4: \ or & between letters
+
+    # Rule 5: trim hyphens at both ends (inner and repeated hyphens stay).
+    cleaned = re.sub(r'^-+|-+$', '', ''.join(kept))
+    # Rule 8: nothing but symbols left -> drop the word.
+    if re.search(r'[0-9a-z]', cleaned) is None:
+        return None
+    return cleaned