[Kaggle] Heart Disease Prediction

文章目录

- 1. 数据探索
- 2. 特征处理管道
- 3. 训练模型
- 4. 预测

kaggle项目地址

1. 数据探索

import pandas as pd
# Load the Kaggle training and test sets from the working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Print column dtypes and non-null counts for both frames.
train.info()
test.info()

# Absolute correlation of every column with the label, strongest first
# (displayed as the cell result when run in a notebook).
train.corr()['target'].abs().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       241 non-null    int64
 1   sex       241 non-null    int64
 2   cp        241 non-null    int64
 3   trestbps  241 non-null    int64
 4   chol      241 non-null    int64
 5   fbs       241 non-null    int64
 6   restecg   241 non-null    int64
 7   thalach   241 non-null    int64
 8   exang     241 non-null    int64
 9   oldpeak   241 non-null    float64
 10  slope     241 non-null    int64
 11  ca        241 non-null    int64
 12  thal      241 non-null    int64
 13  target    241 non-null    int64
dtypes: float64(1), int64(13)
memory usage: 26.5 KB

训练数据241条，13个特征（全部为数字特征），标签为 target

特征与标签的相关系数

target      1.000000
cp          0.457688
exang       0.453784
ca          0.408107
thalach     0.390346
oldpeak     0.389787
slope       0.334991
thal        0.324611
sex         0.281272
age         0.242338
restecg     0.196018
chol        0.170592
trestbps    0.154086
fbs         0.035450
Name: target, dtype: float64

查看特征的值

# Print the distinct values of every column, one column at a time.
for col in train.columns:
    print(col)
    print(train[col].unique())

age
[37 41 56 44 52 57 54 48 64 50 66 43 69 42 61 71 59 65 46 51 45 47 53 63
 58 35 62 29 55 60 68 39 34 67 74 49 76 70 38 77 40]
sex
[1 0]
cp
[2 1 0 3]
trestbps
[130 140 120 172 150 110 160 125 142 135 155 104 138 128 108 134 122 115
 118 100 124  94 112 102 152 101 132 178 129 136 106 156 170 117 145 180
 165 192 144 123 126 154 148 114 164]
chol
[250 204 294 263 199 168 239 275 211 219 226 247 233 243 302 212 177 273
 304 232 269 360 308 245 208 235 257 216 234 141 252 201 222 260 303 265
 309 186 203 183 220 209 258 227 261 221 205 318 298 277 197 214 248 255
 207 223 160 394 315 270 195 240 196 244 254 126 313 262 215 193 271 268
 267 210 295 178 242 180 228 149 253 342 157 175 286 229 256 224 206 230
 276 353 225 330 290 266 172 305 188 282 185 326 274 164 307 249 341 407
 217 174 281 288 289 246 322 299 300 293 184 409 283 259 200 327 237 319
 166 218 335 169 187 176 241 264 236]
fbs
[0 1]
restecg
[1 0 2]
thalach
[187 172 153 173 162 174 160 139 144 158 114 171 151 179 178 137 157 140
 152 170 165 148 142 180 156 115 175 186 185 159 130 190 132 182 143 163
 147 154 202 161 166 164 184 122 168 169 138 111 145 194 131 133 155 167
 192 121  96 126 105 181 116 149 150 125 108 129 112 128 109 113  99 177
 141 146 136 127 103 124  88 120 195  95 117  71 118 134  90 123]
exang
[0 1]
oldpeak
[3.5 1.4 1.3 0.  0.5 1.6 1.2 0.2 1.8 2.6 1.5 0.4 1.  0.8 3.  0.6 2.4 0.1
 1.9 4.2 1.1 2.  0.7 0.3 0.9 2.3 3.6 3.2 2.2 2.8 3.4 6.2 4.  5.6 2.1 4.4]
slope
[0 2 1]
ca
[0 2 1 4 3]
thal
[2 3 0 1]
target
[1 0]

一些特征不能用大小来度量，将其转为分类变量（string 类型，后续进行 one-hot 编码）

# Integer-coded columns that are treated as category labels rather than
# magnitudes; they are cast to str so they can be one-hot encoded later.
object_cols = ['cp', 'restecg', 'slope', 'ca', 'thal']


def strfeatures(data):
    """Return a copy of ``data`` with the ``object_cols`` columns cast to str.

    The input frame is left untouched. The columns to convert are read from
    the module-level ``object_cols`` list.
    """
    data_ = data.copy()
    for col in object_cols:
        data_[col] = data_[col].astype(str)
    return data_


train_ = strfeatures(train)
test_ = strfeatures(test)

2. 特征处理管道

数字特征、文字特征分离

def num_cat_split(data):
    """Split the column names of ``data`` into non-object and object-dtype groups.

    Returns a ``(num_cols, object_cols)`` tuple of lists. Both lists keep the
    original column order of ``data``, so the layout of the feature matrix
    built from them is the same on every run (a set difference would not
    guarantee that).
    """
    is_object = data.dtypes == 'object'
    object_cols = list(is_object[is_object].index)
    num_cols = list(is_object[~is_object].index)
    return num_cols, object_cols


num_cols, object_cols = num_cat_split(train_)
# The label is not a feature, so drop it from the numeric feature list.
num_cols.remove('target')

抽取部分数据作为本地验证

# Local evaluation: stratified sampling into a training part and a validation part.
from sklearn.model_selection import StratifiedShuffleSplit

splt = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)

# n_splits=1, so take the single (train, validation) pair of index arrays.
train_idx, valid_idx = next(splt.split(train_, train_['target']))

# split() returns positional indices, so rows are selected with .iloc;
# .loc would only be correct while the frame has a default RangeIndex.
train_part = train_.iloc[train_idx]
valid_part = train_.iloc[valid_idx]

# Separate the label from the features in both parts.
train_part_y = train_part['target']
valid_part_y = valid_part['target']
train_part = train_part.drop(['target'], axis=1)
valid_part = valid_part.drop(['target'], axis=1)

数据处理管道

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns of a DataFrame and returns their values.

    ``attribute_name`` is the column label (or list of labels) used to index
    the frame passed to ``transform``.
    """

    def __init__(self, attribute_name):
        self.attribute_name = attribute_name

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_name].values``."""
        return X[self.attribute_name].values


# Numeric branch: only column selection is active. The imputer and scaler
# steps are kept here, disabled, as optional steps.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_cols)),
    # ('imputer', SimpleImputer(strategy='median')),
    # ('std_scaler', StandardScaler()),
])

# The dense-output keyword of OneHotEncoder was renamed from ``sparse`` to
# ``sparse_output``; try the new name first and fall back to the old one so
# the snippet runs on both newer and older scikit-learn releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical branch: select the string columns and one-hot encode them.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(object_cols)),
    ('cat_encoder', cat_encoder),
])

# Concatenate the outputs of the numeric and categorical branches.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

3. 训练模型

# Local evaluation: grid-search each candidate model on the training part and
# score the best estimator on the held-out validation part.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
knn = KNeighborsClassifier()
lr = LogisticRegression()
svc = SVC()
gbdt = GradientBoostingClassifier()
perceptron = Perceptron()

models = [perceptron, knn, lr, svc, rf, gbdt]

# One parameter grid per model, in the same order as ``models``.
# NOTE(review): some 'model__max_features' values below may exceed the number
# of columns produced by the preparation step - confirm how the installed
# scikit-learn version treats that.
param_grid_list = [
    # perceptron
    [{
        'model__max_iter': [10000, 5000],
    }],
    # knn
    [{
        'model__n_neighbors': [3, 5, 10, 15, 35],
        'model__leaf_size': [3, 5, 10, 20, 30, 40, 50],
    }],
    # lr: the default solver cannot fit an l1 penalty, so the l1 candidates
    # are paired with 'liblinear'; the l2 candidates keep the default solver.
    [
        {
            'model__penalty': ['l1'],
            'model__solver': ['liblinear'],
            'model__C': [0.05, 0.1, 0.2, 0.5, 1, 1.2],
            'model__max_iter': [50000],
        },
        {
            'model__penalty': ['l2'],
            'model__C': [0.05, 0.1, 0.2, 0.5, 1, 1.2],
            'model__max_iter': [50000],
        },
    ],
    # svc
    [{
        'model__degree': [3, 5, 7],
        'model__C': [0.2, 0.5, 1, 1.2, 1.5],
        'model__kernel': ['rbf', 'sigmoid', 'poly'],
    }],
    # rf
    [{
        # 'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators': [100, 200, 250, 300, 350],
        'model__max_features': [5, 8, 10, 12, 15, 20, 30, 40, 50],
        'model__max_depth': [3, 5, 7],
    }],
    # gbdt
    [{
        'model__learning_rate': [0.02, 0.05, 0.1, 0.2],
        'model__n_estimators': [30, 50, 100, 150],
        'model__max_features': [5, 8, 10, 20, 30, 40],
        'model__max_depth': [3, 5, 7],
        'model__min_samples_split': [10, 20, 40],
        'model__min_samples_leaf': [5, 10, 20],
        'model__subsample': [0.5, 0.8, 1],
    }],
]

for model, param_grid in zip(models, param_grid_list):
    # Preprocessing and model are wrapped in one pipeline so the 'model__'
    # prefixed parameters address the estimator step.
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model),
    ])
    grid_search = GridSearchCV(pipe, param_grid, cv=3,
                               scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(train_part, train_part_y)
    print(grid_search.best_params_)

    # Score the best configuration on the validation part.
    final_model = grid_search.best_estimator_
    pred = final_model.predict(valid_part)
    print('accuracy score: ', accuracy_score(valid_part_y, pred))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
{'model__max_iter': 10000}
accuracy score:  0.4489795918367347

Fitting 3 folds for each of 35 candidates, totalling 105 fits
{'model__leaf_size': 3, 'model__n_neighbors': 3}
accuracy score:  0.5306122448979592

Fitting 3 folds for each of 12 candidates, totalling 36 fits
{'model__C': 0.1, 'model__max_iter': 50000, 'model__penalty': 'l2'}
accuracy score:  0.8979591836734694

Fitting 3 folds for each of 45 candidates, totalling 135 fits
{'model__C': 1, 'model__degree': 5, 'model__kernel': 'poly'}
accuracy score:  0.6326530612244898

Fitting 3 folds for each of 135 candidates, totalling 405 fits
{'model__max_depth': 5, 'model__max_features': 5, 
'model__n_estimators': 250}
accuracy score:  0.8775510204081632

Fitting 3 folds for each of 7776 candidates, totalling 23328 fits
{'model__learning_rate': 0.05, 'model__max_depth': 7, 
'model__max_features': 20, 'model__min_samples_leaf': 10, 
'model__min_samples_split': 40, 'model__n_estimators': 150, 
'model__subsample': 0.5}
accuracy score:  0.8163265306122449

LR，RF，GBDT 表现较好

4. 预测

# Train on the full training set and write one submission file per model.
# Hyper-parameters are chosen with a randomized search.
y_train = train_['target']
X_train = train_.drop(['target'], axis=1)
X_test = test_

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import numpy as np

select_model = [lr, rf, gbdt]
name = ['lr', 'rf', 'gbdt']

# One search space per model, in the same order as ``select_model``.
# NOTE(review): the upper bounds of 'model__max_features' may exceed the
# number of columns produced by the preparation step - confirm how the
# installed scikit-learn version treats that.
param_distribs = [
    # lr: the default solver cannot fit an l1 penalty, so the l1 candidates
    # are paired with 'liblinear'; the l2 candidates keep the default solver.
    [
        {
            'model__penalty': ['l1'],
            'model__solver': ['liblinear'],
            'model__C': np.linspace(0.01, 0.5, 10),
            'model__max_iter': [50000],
        },
        {
            'model__penalty': ['l2'],
            'model__C': np.linspace(0.01, 0.5, 10),
            'model__max_iter': [50000],
        },
    ],
    # rf
    [{
        # 'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
        'model__n_estimators': randint(low=50, high=500),
        'model__max_features': randint(low=3, high=30),
        'model__max_depth': randint(low=2, high=20),
    }],
    # gbdt
    [{
        'model__learning_rate': np.linspace(0.01, 0.3, 10),
        'model__n_estimators': randint(low=30, high=500),
        'model__max_features': randint(low=5, high=50),
        'model__max_depth': randint(low=3, high=20),
        'model__min_samples_split': randint(low=10, high=100),
        'model__min_samples_leaf': randint(low=3, high=50),
        # subsample is a fraction of the samples and must not exceed 1.0,
        # so the range stops at 1.0 instead of 1.5.
        'model__subsample': np.linspace(0.5, 1.0, 6),
    }],
]

for model, model_name, distribs in zip(select_model, name, param_distribs):
    pipe = Pipeline([
        ('preparation', full_pipeline),
        ('model', model),
    ])
    rand_search = RandomizedSearchCV(pipe, param_distributions=distribs, cv=5,
                                     n_iter=1000, scoring='accuracy',
                                     verbose=2, n_jobs=-1)
    rand_search.fit(X_train, y_train)
    print(rand_search.best_params_)

    final_model = rand_search.best_estimator_
    pred = final_model.predict(X_test)
    print(model, "\nFINISH !!!")

    # Ids are 1-based and sized from the predictions, so the submission
    # stays valid if the number of test rows changes.
    res = pd.DataFrame()
    res['Id'] = range(1, len(pred) + 1)
    res['Prediction'] = pred
    res.to_csv('{}_pred.csv'.format(model_name), index=False)