Compute the correlation coefficients (a Filter-style method) and screen the features:
```python
import pandas as pd


def feature_select_pearson(train, test):
    """
    Correlation-based (Filter) feature selection using the Pearson coefficient.
    :param train: training set
    :param test: test set
    :return: training and test sets after feature selection
    """
    print('feature_select...')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")
    featureSelect = features[:]

    # Drop features whose missing-value ratio is 0.99 or higher
    for fea in features:
        if train[fea].isnull().sum() / train.shape[0] >= 0.99:
            featureSelect.remove(fea)

    # Compute the absolute Pearson correlation between each remaining feature and the target
    corr = []
    for fea in featureSelect:
        corr.append(abs(train[[fea, 'target']].fillna(0).corr().values[0][1]))

    # Keep the top 300 features for modeling; the exact number is adjustable
    se = pd.Series(corr, index=featureSelect).sort_values(ascending=False)
    feature_select = ['card_id'] + se[:300].index.tolist()
    print('done')
    return train[feature_select + ['target']], test[feature_select]
```
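For reference, below is a minimal usage sketch, assuming preprocessed train/test DataFrames that already contain the `card_id` and `target` columns; the file paths are placeholders rather than the original project's:

```python
import pandas as pd

# Placeholder paths; substitute the project's actual preprocessed files
train = pd.read_csv('preprocess/train.csv')
test = pd.read_csv('preprocess/test.csv')

# Keep card_id, target and the 300 features most correlated with the target
train, test = feature_select_pearson(train, test)
print(train.shape, test.shape)
```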
```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


def param_grid_search(train):
    """
    Hyperparameter tuning via grid search.
    :param train: training set
    :return: the fitted grid search object
    """
    # Step 1. Build the grid search space
    print('param_grid_search')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")
    parameter_space = {
        "n_estimators": [81],
        "min_samples_leaf": [31],
        "min_samples_split": [2],
        "max_depth": [10],
        "max_features": [80]
    }

    # Step 2. Run the grid search
    print("Tuning hyper-parameters for mse")
    # Instantiate the random forest model
    # (on scikit-learn >= 1.0 the criterion is named "squared_error" instead of "mse")
    clf = RandomForestRegressor(
        criterion="mse",
        n_jobs=15,
        random_state=22)
    # Wrap it in the grid search
    grid = GridSearchCV(clf, parameter_space, cv=2, scoring="neg_mean_squared_error")
    grid.fit(train[features].values, train['target'].values)

    # Step 3. Report the grid search results
    print("best_params_:")
    print(grid.best_params_)
    means = grid.cv_results_["mean_test_score"]
    stds = grid.cv_results_["std_test_score"]
    # Additionally, inspect the cross-validation score of every hyperparameter combination
    for mean, std, params in zip(means, stds, grid.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    return grid
```
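Because `GridSearchCV` refits the best model on the full training set by default, the tuned estimator can be pulled directly from the returned object through its standard `best_estimator_` attribute. A brief sketch of how the result might be consumed:

```python
# Run the grid search on the feature-selected training set
grid = param_grid_search(train)

# Mean CV score (negative MSE) of the best hyperparameter combination
print(grid.best_score_)

# The refitted model with the best hyperparameters; it is passed to the
# cross-validation routine in the next section as best_clf
best_clf = grid.best_estimator_
```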
4. Random Forest Cross-Validation Evaluation and Intermediate Result Saving
There are many ways to optimize a model in practice, including switching to more complex and efficient models, model ensembling, and further feature engineering. Beyond these, there is another approach that is often overlooked yet just as effective: ensembling the results of multiple models via cross-validation. "Multiple models" here does not mean different estimators; rather, the same estimator (for example, a random forest) is trained several times on different subsets of the data, yielding several models. Each of these models then predicts on the test set, and the final prediction is the average of their outputs. For instance, once a set of optimal hyperparameters has been selected, we can run five-fold cross-validation training with it: the predictions on the validation folds can later feed a Stacking ensemble, while the averaged predictions on the test set can be submitted as the final result. The process is illustrated below:
(Figure: five-fold cross-validation workflow, where out-of-fold predictions feed later Stacking and averaged test-set predictions form the final submission)
Cross-validation can of course be carried out directly with sklearn's estimators; the process above can be implemented with the following code:
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


def train_predict(train, test, best_clf):
    """
    Train with cross-validation and output predictions.
    :param train: training set
    :param test: test set
    :param best_clf: the best model found by the grid search
    :return:
    """
    # Step 1. Select features
    print('train_predict...')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")

    # Step 2. Create result holders
    # Accumulator for the test-set predictions
    prediction_test = 0
    # Holder for the cross-validation scores
    cv_score = []
    # Holder for the out-of-fold (validation) predictions
    prediction_train = pd.Series(dtype=float)

    # Step 3. Cross-validation
    # Instantiate the cross-validation splitter
    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    # Run the cross-validation loop
    for train_part_index, eval_index in kf.split(train[features], train['target']):
        # Fit the model on the training folds
        best_clf.fit(train[features].loc[train_part_index].values,
                     train['target'].loc[train_part_index].values)
        # After training, accumulate this fold's test-set predictions into prediction_test
        prediction_test += best_clf.predict(test[features].values)
        # Predict on the validation fold (eval_pre is a temporary variable)
        eval_pre = best_clf.predict(train[features].loc[eval_index].values)
        # Score the validation-fold predictions; the metric is RMSE
        score = np.sqrt(mean_squared_error(train['target'].loc[eval_index].values, eval_pre))
        # Append this fold's RMSE to cv_score
        cv_score.append(score)
        print(score)
        # Store the validation-fold predictions in prediction_train
        # (Series.append is removed in pandas >= 2.0; use pd.concat there)
        prediction_train = prediction_train.append(
            pd.Series(best_clf.predict(train[features].loc[eval_index]),
                      index=eval_index))
```
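After the cross-validation loop, `prediction_test` holds the sum of the five folds' test-set predictions and `prediction_train` holds the out-of-fold predictions. Assuming those variables, together with `cv_score` and the `test` DataFrame, are made available from `train_predict`, the sketch below shows one way to consolidate and save these intermediate results for later Stacking and for submission; the helper name and file paths are illustrative assumptions, not taken from the original project:

```python
import numpy as np
import pandas as pd


def save_cv_results(prediction_train, prediction_test, cv_score, test,
                    oof_path="preprocess/train_randomforest_oof.csv",  # assumed path
                    sub_path="result/submission_randomforest.csv"):    # assumed path
    """Sketch: consolidate and persist the cross-validation outputs."""
    # Overall validation performance across the five folds
    print('mean RMSE over 5 folds:', np.mean(cv_score))

    # Out-of-fold predictions, restored to the original row order,
    # can serve as a first-level feature in a later Stacking stage
    prediction_train.sort_index().rename('target').to_csv(oof_path, index=False)

    # prediction_test accumulated five folds, so average before submitting
    submission = test[['card_id']].copy()
    submission['target'] = prediction_test / 5
    submission.to_csv(sub_path, index=False)
```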