"""Build TF-IDF feature vectors from raw text using the PySpark ML pipeline stages.

Pipeline: RegexTokenizer -> StopWordsRemover -> CountVectorizer -> IDF.
Runs on a small in-memory sample DataFrame and prints the resulting
(id, features) rows.
"""
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession — the entry point for DataFrame operations.
spark = SparkSession.builder \
    .appName("Text Feature Extraction") \
    .getOrCreate()

# Simple text data for demonstration.
data = [
    (1, "This is an example."),
    (2, "This is another example."),
    (3, "Yet another example."),
]

# Convert the rows into a DataFrame with explicit column names.
columns = ["id", "text"]
data_df = spark.createDataFrame(data, columns)

# Tokenization: split on non-word characters (raw string avoids escape issues).
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern=r"\W")
words_data = tokenizer.transform(data_df)

# Stop-word removal: drop common English words ("is", "an", ...).
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = remover.transform(words_data)

# Term frequency: CountVectorizer builds the vocabulary and raw count vectors.
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
cv_model = cv.fit(filtered_data)
cv_data = cv_model.transform(filtered_data)

# TF-IDF: rescale raw counts by inverse document frequency.
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(cv_data)
result_df = idf_model.transform(cv_data)

# Show the resulting feature vectors without truncating the output.
result_df.select("id", "features").show(truncate=False)

# Release cluster resources — the original script never stopped the session.
spark.stop()