数据来源是:头条新闻数据,经过处理之后的部分数据如下:
首先通过交叉验证来选择模型:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# Candidate classifiers, all evaluated with the same cross-validation setup.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
cv = 5  # number of cross-validation folds per model

# Collect one (model_name, fold_idx, accuracy) tuple per model per fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    # `features` and `labels` are defined outside this section.
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=cv)
    for fold_idx, accuracy in enumerate(accuracies):
        print(model_name, fold_idx, accuracy)
        entries.append((model_name, fold_idx, accuracy))
print(entries[:10])  # each entry appended above is a tuple

# Long-format table: one row per (model, fold) pair.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
也可以将结果可视化展示:
import seaborn as sns
# Per-model accuracy across folds: a box plot for the summary statistics,
# overlaid with a strip plot showing each individual fold score.
shared_axes = {'x': 'model_name', 'y': 'accuracy', 'data': cv_df}
sns.boxplot(**shared_axes)
sns.stripplot(
    **shared_axes,
    size=8,
    jitter=True,
    edgecolor='gray',
    linewidth=2,
)
plt.show()
通过混淆矩阵查看各个类别的分类结果:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels; `y_test` and `y_pred`
# are defined outside this section.
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): assumes category_id_df.label_content is ordered the same way
# as the matrix rows/columns -- confirm against where category_id_df is built.
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',  # render cell annotations as integers
    xticklabels=category_id_df.label_content.values,
    yticklabels=category_id_df.label_content.values,
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()