"""Expand a word list into simple inflected variants and upload them to HDFS.

The script reads a local text file, strips every character that is not an
ASCII letter, digit, whitespace or a straight apostrophe, splits the remainder
on whitespace, generates suffix variants for every token and writes the
de-duplicated result (one variant per line) to a file on HDFS.
"""
import re

try:
    from hdfs import InsecureClient
except ImportError:
    # Keep the pure-Python helpers (e.g. ``parse_word``) importable on machines
    # without the HDFS client library; ``process_file`` raises a clear error
    # if it is actually needed.
    InsecureClient = None

# Defaults used by ``process_file`` when the caller does not override them.
DEFAULT_HDFS_URL = 'http://192.168.10.224:9870'
DEFAULT_HDFS_USER = 'chenyuanjie'
DEFAULT_OUTPUT_NAME = '101.txt'

# Suffixes appended to every word; the empty suffix keeps the word itself.
# The apostrophe in the possessive forms is the typographic U+2019 character.
_WORD_SUFFIXES = (
    '',
    's',
    'es',
    '-s',
    '-es',
    '’s',
    's’',
    '-’s',
    '-s’',
    'er',
    '-er',
    'ed',
    '-ed',
    'ing',
)

# Everything except ASCII letters, digits, whitespace and the straight
# apostrophe is removed before the text is split into words.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9\s']")


def parse_word(word):
    """Return the set of suffix variants generated for *word*.

    Args:
        word: The base word (``str``).

    Returns:
        A ``set`` containing *word* itself plus *word* followed by each of the
        fixed suffixes (plural, possessive, ``er``, ``ed``, ``ing`` forms and
        their hyphenated counterparts).
    """
    variants = {word + suffix for suffix in _WORD_SUFFIXES}

    # Rule 2 (two-token phrases "A B") is currently disabled:
    # tokens = word.split()
    # if len(tokens) == 2:
    #     variants.add(tokens[0] + ' ' + tokens[1] + 's')
    #     variants.add(tokens[0] + 's ' + tokens[1])
    #     variants.add(tokens[0] + 's ' + tokens[1] + 's')

    return variants


def _extract_words(content):
    """Strip disallowed characters from *content* and split it on whitespace."""
    return _DISALLOWED_CHARS.sub('', content).split()


def process_file(
    input_file,
    output_hdfs_directory,
    *,
    hdfs_url=DEFAULT_HDFS_URL,
    hdfs_user=DEFAULT_HDFS_USER,
    output_name=DEFAULT_OUTPUT_NAME,
):
    """Generate word variants from a local file and write them to HDFS.

    Args:
        input_file: Path of the local UTF-8 text file to read.
        output_hdfs_directory: HDFS directory that receives the output file.
        hdfs_url: URL passed to ``InsecureClient``.
        hdfs_user: User name passed to ``InsecureClient``.
        output_name: Name of the file created inside *output_hdfs_directory*.

    Raises:
        ImportError: If the ``hdfs`` package is not installed.
        OSError: If *input_file* cannot be opened or read.
    """
    if InsecureClient is None:
        raise ImportError(
            "The 'hdfs' package is required to upload results to HDFS"
        )

    # Read the whole input file.
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split into words, then collect the variants of every word.
    parsed_words = set()
    for word in _extract_words(content):
        parsed_words.update(parse_word(word))

    # Connect to HDFS.
    hdfs_client = InsecureClient(hdfs_url, user=hdfs_user)

    # Write the result to a new file in the target HDFS directory.
    # NOTE(review): no ``overwrite`` flag is passed, so the client's default
    # applies - confirm whether re-runs against an existing file should fail.
    hdfs_output_file = output_hdfs_directory + '/' + output_name
    with hdfs_client.write(hdfs_output_file, encoding='utf-8') as hdfs_output:
        # Sort so the file content is reproducible; iterating a set of strings
        # yields a different order from run to run.
        hdfs_output.write('\n'.join(sorted(parsed_words)))


if __name__ == "__main__":
    input_file_path = "/home/chenyuanjie/data/002.txt"
    output_hdfs_directory = "/home/big_data_selection/dim/dim_sensitive"
    process_file(input_file_path, output_hdfs_directory)