# match_test.py 1.75 KB
import re

from hdfs import InsecureClient


def parse_word(word):
    """Return *word* together with its naively suffixed variants.

    Every variant is ``word`` with one fixed suffix appended: plural,
    possessive, ``er``/``ed`` forms (each with and without a hyphen) and
    ``ing``. No spelling rules are applied; suffixes are concatenated as-is.

    Args:
        word: The base string to expand.

    Returns:
        A set containing ``word`` itself plus one entry per suffix.
    """
    # Possessive suffixes use the typographic apostrophe (U+2019), not the
    # ASCII one. The empty suffix keeps the unmodified word in the result.
    suffixes = (
        '',
        's', 'es', '-s', '-es',
        '’s', 's’', '-’s', '-s’',
        'er', '-er',
        'ed', '-ed',
        'ing',
    )

    # Rule 2 (currently disabled): for a two-token phrase "A B", also emit
    # "A Bs", "As B" and "As Bs".

    return {word + suffix for suffix in suffixes}

def process_file(input_file, output_hdfs_directory):
    """Expand every word of a local text file and upload the result to HDFS.

    The file is read as UTF-8, reduced to ASCII letters, digits, whitespace
    and straight apostrophes, split on whitespace, and each resulting word
    is expanded with ``parse_word``. The de-duplicated variants are written,
    one per line and in sorted order, to ``<output_hdfs_directory>/101.txt``.

    Args:
        input_file: Path of the local text file to read.
        output_hdfs_directory: HDFS directory that receives ``101.txt``.
            A trailing ``/`` is tolerated.

    Returns:
        None.
    """
    # Read the whole input file.
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split into words: drop every character that is not an ASCII letter,
    # digit, whitespace or straight apostrophe, then split on whitespace.
    words = re.sub(r"[^a-zA-Z0-9\s']", '', content).split()

    # Expand each word and collect the variants without duplicates.
    parsed_words = set()
    for word in words:
        parsed_words.update(parse_word(word))

    # Connect to HDFS.
    hdfs_client = InsecureClient('http://192.168.10.224:9870', user='chenyuanjie')

    # Strip trailing slashes so the joined path never contains "//".
    hdfs_output_file = output_hdfs_directory.rstrip('/') + '/101.txt'

    # NOTE(review): `write` is called without `overwrite`; per the HdfsCLI
    # docs this is expected to fail if the file already exists - confirm
    # that this is intended.
    with hdfs_client.write(hdfs_output_file, encoding='utf-8') as hdfs_output:
        # Set iteration order for strings varies between interpreter runs,
        # so sort to make the uploaded file reproducible.
        hdfs_output.write('\n'.join(sorted(parsed_words)))


if __name__ == "__main__":
    # Script entry point: expand the words of one local text file and
    # upload the result into the given HDFS directory.
    output_hdfs_directory = "/home/big_data_selection/dim/dim_sensitive"
    input_file_path = "/home/chenyuanjie/data/002.txt"

    process_file(
        input_file=input_file_path,
        output_hdfs_directory=output_hdfs_directory,
    )