from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql import SparkSession

# Demo script: build a TF-IDF feature-extraction pipeline over a tiny
# in-memory corpus using Spark ML (tokenize -> remove stop words ->
# term counts -> IDF rescaling), then print the resulting vectors.

# Create (or reuse) a SparkSession — the entry point for DataFrame and ML APIs.
spark = SparkSession.builder \
    .appName("Text Feature Extraction") \
    .getOrCreate()

# Small sample corpus for the demonstration: (id, raw text) pairs.
data = [
    (1, "This is an example."),
    (2, "This is another example."),
    (3, "Yet another example.")
]

# Convert the sample data into a DataFrame with named columns.
columns = ["id", "text"]
data_df = spark.createDataFrame(data, columns)

# Tokenization: split each text on runs of non-word characters.
# Raw string for the regex avoids escape-sequence pitfalls.
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern=r"\W")
words_data = tokenizer.transform(data_df)

# Stop-word removal: drop common English words ("is", "an", ...) that
# carry little signal for feature extraction.
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = remover.transform(words_data)

# Term frequency: CountVectorizer learns the vocabulary from the corpus
# and emits sparse raw-count vectors.
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
cv_model = cv.fit(filtered_data)
cv_data = cv_model.transform(filtered_data)

# TF-IDF: rescale raw counts by inverse document frequency so terms that
# appear in every document are down-weighted.
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(cv_data)
result_df = idf_model.transform(cv_data)

# Show the resulting feature vectors (untruncated for readability).
result_df.select("id", "features").show(truncate=False)

# Release driver/cluster resources — the original script leaked the session.
spark.stop()