from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Text Feature Extraction") \
    .getOrCreate()

# Simple text data for demonstration
data = [
    (1, "This is an example."),
    (2, "This is another example."),
    (3, "Yet another example.")
]

# Convert the data to a DataFrame
columns = ["id", "text"]
data_df = spark.createDataFrame(data, columns)

# Tokenization: split the text on non-word characters
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
words_data = tokenizer.transform(data_df)

# Stop-word removal
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = remover.transform(words_data)

# Term frequencies via CountVectorizer
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
cv_model = cv.fit(filtered_data)
cv_data = cv_model.transform(filtered_data)

# Compute TF-IDF feature vectors
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(cv_data)
result_df = idf_model.transform(cv_data)

# Show the results
result_df.select("id", "features").show(truncate=False)
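
A quick way to sanity-check the fitted pipeline is to look at the learned term index and its weights; the sketch below continues from the variables above and uses two attributes of the fitted PySpark models, CountVectorizerModel.vocabulary and IDFModel.idf:

# The vocabulary maps each index in the feature vector to a term
print(cv_model.vocabulary)

# The IDF vector holds the inverse-document-frequency weight at each
# vocabulary index; terms appearing in every document get weight 0
print(idf_model.idf)

# Release cluster resources when finished
spark.stop()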