数据展示
首先登录 Kaggle,下载泰坦尼克号(Titanic)训练相关数据
import pandas as pd
import numpy as np

# Load the Titanic training data (train.csv downloaded from Kaggle).
data = pd.read_csv('train.csv')
print(data.shape)
# Bug fix: .head is a method — without parentheses this printed the
# bound-method repr instead of the first rows of the frame.
print(data.head())

# Simple positional hold-out: first 800 rows for training, the rest
# (91 rows for the standard 891-row train.csv) as a local test set.
train = data[:800]
test = data[800:]
print(train.shape)
print(test.shape)
选择特征
# Feature columns used for modeling (mix of categorical and numeric).
selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
X_train = train[selected_features]
X_test = test[selected_features]
# Bug fix: this assignment was fused onto the previous line in the
# original paste and never ran as its own statement.
y_train = train['Survived']

# Inspect Embarked distributions to pick a sensible fill value below.
print(X_train['Embarked'].value_counts())
print(X_test['Embarked'].value_counts())
对缺失数据进行填充:Embarked 用出现最多的 'S' 填充,Age 和 Fare 采用训练集均值填充
# Fill missing values.  Embarked gets 'S' (the most frequent port per the
# value_counts above); numeric columns get the TRAINING-set mean so no
# test-set statistics leak into preprocessing.
# NOTE(review): X_train/X_test are slices of the original frames, so
# inplace fillna may trigger SettingWithCopyWarning; consider taking
# .copy() at selection time — verify against the pandas version in use.
X_train['Embarked'].fillna('S', inplace=True)
X_test['Embarked'].fillna('S', inplace=True)
# Bug fix: this statement was fused onto the previous line in the
# original paste and never ran as its own statement.
X_train['Age'].fillna(X_train['Age'].mean(), inplace=True)
X_test['Age'].fillna(X_train['Age'].mean(), inplace=True)
X_test['Fare'].fillna(X_train['Fare'].mean(), inplace=True)
特征向量化
from sklearn.feature_extraction import DictVectorizer

# One-hot encode the categorical columns (Sex, Embarked) by converting
# each row to a dict and letting DictVectorizer expand string features.
dict_vec = DictVectorizer(sparse=False)
# Bug fix: the valid to_dict orient value is 'records' (plural);
# 'record' raises ValueError in modern pandas.
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
print(dict_vec.feature_names_)
# transform (not fit_transform) so the test set reuses the training
# feature mapping and column order.
X_test = dict_vec.transform(X_test.to_dict(orient='records'))
引入随机森林和XGB分类器
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Instantiate both baseline models with their default hyper-parameters;
# they are compared via cross-validation below.
rfc = RandomForestClassifier()
xgbc = XGBClassifier()
交叉验证
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# 4-fold cross-validated accuracy on the training split for each model.
rfc_cv_mean = cross_val_score(rfc, X_train, y_train, cv=4, scoring='accuracy').mean()
print(rfc_cv_mean)
xgbc_cv_mean = cross_val_score(xgbc, X_train, y_train, cv=4, scoring='accuracy').mean()
print(xgbc_cv_mean)

# Ground-truth labels for the held-out rows, used to score predictions below.
y_test = test['Survived']
使用RandomForestClassifier 进行预测操作
# Fit the random forest on the full training split and predict the hold-out.
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
rfc_submission.to_csv('rfc_submission.csv', index=False)
# Bug fix: the score is computed against the held-out y_test, so the
# label was wrong — this is TEST accuracy, not train accuracy.
print('Test Accuracy: %.1f%%' % (np.mean(rfc_y_predict == y_test) * 100))
使用默认配置的XGBClassifier进行预测操作
# Fit the default-configuration XGBoost classifier and predict the hold-out.
xgbc.fit(X_train, y_train)
# NOTE(review): a bare `XGBClassifier(...)` expression followed here in
# the original — a no-op paste artifact of notebook repr output; removed.
xgbc_y_predict = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_y_predict})
xgbc_submission.to_csv('xgbc_submission.csv', index=False)
# Bug fix: scored against the held-out y_test — this is TEST accuracy.
print('Test Accuracy: %.1f%%' % (np.mean(xgbc_y_predict == y_test) * 100))
使用并行网格搜索的方式寻找更好的超参组合
# Grid-search over tree depth, number of trees and learning rate:
# 5 * 5 * 5 = 125 candidates, 5-fold CV, all CPU cores.
params = {
    'max_depth': range(2, 7),
    'n_estimators': range(100, 1100, 200),
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0],
}
xgbc_best = XGBClassifier()
gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
# Bug fix: this statement was fused onto the previous line in the
# original paste; gs.predict uses the refit best estimator.
xgbc_best_y_predict = gs.predict(X_test)
xgbc_best_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_best_y_predict})
# Bug fix: index=False, consistent with the other submissions above —
# otherwise an extra unnamed index column is written, which Kaggle rejects.
xgbc_best_submission.to_csv('xgbc_best_submission.csv', index=False)