一、步骤
1、使用 TF-IDF 将文本数据转换为特征向量
2、使用这些特征向量训练SVM模型
二、代码
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


def print_top_tfidf_terms(vectorizer, matrix):
    """Print each document's non-zero TF-IDF terms, highest weight first.

    Optional debugging aid; it is not called by the script itself.

    Args:
        vectorizer: A fitted ``TfidfVectorizer``.
        matrix: The sparse TF-IDF matrix produced by that vectorizer.
    """
    feature_names = vectorizer.get_feature_names_out()
    # Densifying is fine for a toy corpus; avoid it on large matrices.
    for doc in matrix.toarray():
        nonzero_idx = doc.nonzero()[0]
        ranked = sorted(nonzero_idx, key=lambda idx: doc[idx], reverse=True)
        print({feature_names[idx]: float(doc[idx]) for idx in ranked})


# Sample dataset of (text, sentiment label) pairs: 0 = negative, 1 = positive.
data = [
    ("I love this product!", 1),
    ("This is terrible.", 0),
    ("The movie was fantastic.", 1),
    ("I dislike this feature.", 0),
    ("Amazing experience!", 1),
    ("Not recommended.", 0),
]

# Separate the texts from their labels.
texts, labels = zip(*data)

# Split the raw texts BEFORE vectorizing. Fitting TF-IDF on the full corpus
# would leak the test set's vocabulary and IDF statistics into the training
# features and make the evaluation optimistic.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Learn the vocabulary and IDF weights from the training texts only, then
# apply that fitted transformation to the held-out texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# To inspect the learned weights, call:
#     print_top_tfidf_terms(vectorizer, X_train)

# Initialise and train a linear-kernel SVM classifier.
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = svm_classifier.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the performance metrics.
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)