1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
from hdfs import InsecureClient
def parse_word(word):
    """Return the set of spelling variants generated for ``word``.

    The result holds ``word`` itself plus ``word`` followed by each entry of
    a fixed suffix list: plural, possessive (written with the typographic
    apostrophe U+2019), ``er``, ``ed`` and ``ing`` endings, most of them also
    in a hyphen-joined form.

    Args:
        word: The base token to expand.

    Returns:
        A ``set`` of strings containing the base token and all its variants.
    """
    suffixes = (
        '',                          # the unmodified word
        's', 'es', '-s', '-es',      # plural forms
        '’s', 's’', '-’s', '-s’',    # possessive forms
        'er', '-er',
        'ed', '-ed',
        'ing',
    )
    # Rule 2 (currently disabled): for a two-token phrase "A B" the variants
    # "A Bs", "As B" and "As Bs" would be generated as well.
    return {word + suffix for suffix in suffixes}
def process_file(input_file, output_hdfs_directory, *,
                 hdfs_url='http://192.168.10.224:9870',
                 hdfs_user='chenyuanjie',
                 output_name='101.txt',
                 overwrite=False):
    """Expand every word of a local text file and write the variants to HDFS.

    The file is read as UTF-8, stripped of every character that is not an
    ASCII letter, digit, whitespace or ASCII apostrophe, and split on
    whitespace. Each resulting token is passed to ``parse_word`` and the
    union of all returned variants is written, one per line and in sorted
    order, to ``<output_hdfs_directory>/<output_name>``.

    Args:
        input_file: Path of the local UTF-8 text file to read.
        output_hdfs_directory: HDFS directory that receives the output file.
        hdfs_url: URL handed to ``InsecureClient``.
        hdfs_user: User name handed to ``InsecureClient``.
        output_name: File name created inside ``output_hdfs_directory``.
        overwrite: Forwarded to ``hdfs_client.write``. The default ``False``
            keeps the previous behaviour; per the hdfs client documentation
            the write then fails if the target file already exists.

    Returns:
        None.
    """
    # Read the whole input file.
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split into words.
    # NOTE(review): disallowed characters are deleted, not replaced by a
    # space, so "foo,bar" becomes "foobar" and hyphens vanish - confirm this
    # is the intended tokenisation.
    words = re.sub(r"[^a-zA-Z0-9\s']", '', content).split()

    # Expand each distinct word once; duplicates would only repeat the work.
    parsed_words = set()
    for word in set(words):
        parsed_words.update(parse_word(word))

    # Connect to HDFS.
    hdfs_client = InsecureClient(hdfs_url, user=hdfs_user)

    # Strip trailing slashes so a directory given as "dir/" does not yield
    # "dir//<name>".
    hdfs_output_file = output_hdfs_directory.rstrip('/') + '/' + output_name

    # Sort before writing: set iteration order varies between runs, which
    # would otherwise make the output file differ for identical input.
    with hdfs_client.write(hdfs_output_file, encoding='utf-8',
                           overwrite=overwrite) as hdfs_output:
        hdfs_output.write('\n'.join(sorted(parsed_words)))
if __name__ == "__main__":
    # Script entry point: process the local input file and store the result
    # under the given HDFS directory.
    process_file(
        input_file="/home/chenyuanjie/data/002.txt",
        output_hdfs_directory="/home/big_data_selection/dim/dim_sensitive",
    )