The goal of this article is to demonstrate how to build a relatively complete machine learning workflow.
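All of the snippets below assume the usual imports have already been run earlier in the notebook. As a minimal sketch (assuming scikit-learn, xgboost, and scikit-optimize/skopt are the libraries in use), they would look roughly like this:

import time, timeit
from collections import Counter
import numpy as np
import pandas as pd
from sklearn import (ensemble, gaussian_process, linear_model, naive_bayes,
                     neighbors, svm, tree, discriminant_analysis, model_selection)
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer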
1. First, set up the basic parameter configuration for the project.
# Basic modeling configuration
SCORE_EVA = 'roc_auc'
random_state_clf = 1
n_jobs = 4
cv_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_split2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
X, y = data_of_features, label
2. Evaluate each ML model with its default parameters to get a baseline sense of how well it handles the task, and shortlist the promising models for hyperparameter tuning.
# Machine Learning Algorithm (MLA) selection and initialization
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(random_state=random_state_clf),
    ensemble.BaggingClassifier(random_state=random_state_clf),
    ensemble.ExtraTreesClassifier(random_state=random_state_clf),
    ensemble.GradientBoostingClassifier(random_state=random_state_clf),
    ensemble.RandomForestClassifier(random_state=random_state_clf),

    # Gaussian processes
    gaussian_process.GaussianProcessClassifier(random_state=random_state_clf),

    # GLM
    linear_model.LogisticRegressionCV(random_state=random_state_clf),
    linear_model.PassiveAggressiveClassifier(random_state=random_state_clf),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(random_state=random_state_clf),
    linear_model.Perceptron(random_state=random_state_clf),

    # Naive Bayes (these estimators take no random_state)
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest neighbors (no random_state)
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True, random_state=random_state_clf),
    svm.NuSVC(probability=True, random_state=random_state_clf),
    svm.LinearSVC(random_state=random_state_clf),

    # Trees
    tree.DecisionTreeClassifier(random_state=random_state_clf),
    tree.ExtraTreeClassifier(random_state=random_state_clf),

    # Discriminant analysis (no random_state)
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # xgboost
    XGBClassifier(random_state=random_state_clf),
]

MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Metric Mean', 'MLA Test Metric Mean', 'MLA Test Metric 3*STD', 'MLA Time']
MLA_compare = pd.DataFrame(columns=MLA_columns)

row_index = 0
for alg in MLA:
    # Record name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

    # Score the model with cross-validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
    cv_results = model_selection.cross_validate(alg, X, y, cv=cv_split, return_train_score=True, scoring=SCORE_EVA)
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Metric Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Metric Mean'] = cv_results['test_score'].mean()
    # If this were an unbiased random sample, +/- 3 standard deviations from the mean would
    # capture about 99.7% of the scores -- a sense of the worst case.
    MLA_compare.loc[row_index, 'MLA Test Metric 3*STD'] = cv_results['test_score'].std() * 3
    row_index += 1

MLA_compare = MLA_compare.sort_values(by='MLA Test Metric Mean', ascending=False)  # sort descending
MLA_compare
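To carry the shortlist into step 3, one simple option (a sketch that is not in the original notebook; the top-5 cutoff is an arbitrary choice) is to keep the best-scoring rows of MLA_compare:

# Keep the top-scoring models as tuning candidates (the top-5 cutoff is arbitrary)
promising_models = MLA_compare.head(5)['MLA Name'].tolist()
print('Promising models:', promising_models)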
3. Take the better-performing models and, combining cross-validation with recursive feature elimination (RFE), carry out hyperparameter tuning and feature selection at the same time.
# This function can use either RFE or SelectKBest for feature selection, combined with Bayesian or grid search cross-validation.
def SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, cv_split, SCORE_EVA='roc_auc',
                                   Search_method='Bayes', feature_method='ref',
                                   bayes_n_iter=10, verbose=0, n_jobs=1):
    # feature_method='ref' selects features with RFE; anything else falls back to SelectKBest
    if feature_method == 'ref':
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('feature_selector', RFE(estimator=clf_model)),
                         ('model', clf_model)])
    else:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('feature_selector', SelectKBest(f_classif)),
                         ('model', clf_model)])

    if Search_method == 'grid':
        grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv_split, verbose=verbose,
                                   scoring=SCORE_EVA, n_jobs=n_jobs)
    else:
        grid_search = BayesSearchCV(pipe, search_spaces=param_grid, verbose=verbose, scoring=SCORE_EVA,
                                    cv=cv_split, n_iter=bayes_n_iter, n_jobs=n_jobs)
    grid_search.fit(X, y)
    return grid_search
# This function finds the best feature set and hyperparameters via repeated k-fold cross-validation:
# across the outer folds, the features and parameter values that appear most often are taken as the
# final choice. Note that with the Bayesian method the hyperparameters found in each outer fold may
# all be unique, so every value ends up with a count of 1.
def mutil_times_SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, cv_outter=10, cv_inner=5,
                                               SCORE_EVA='roc_auc', Search_method='Bayes', feature_method='ref',
                                               bayes_n_iter=2, verbose=0, n_jobs=1):
    start_time = timeit.default_timer()
    inner_cv = StratifiedKFold(n_splits=cv_inner, shuffle=True, random_state=1)

    if cv_outter == 1:
        # Single (non-nested) search on the full data
        grid_search_result = SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid, inner_cv, SCORE_EVA,
                                                            Search_method, feature_method, bayes_n_iter, verbose, n_jobs)
        end_time = timeit.default_timer()
        print(f"Run time: {(end_time - start_time)/60} minutes")
        print("Best score found: ", grid_search_result.best_score_)
        print("Best parameters found: ", grid_search_result.best_params_)
        # features43 / features: feature-name lists assumed to be defined earlier in the notebook
        print("Selected features:", np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_])
        return grid_search_result
    else:
        outer_cv = StratifiedKFold(n_splits=cv_outter, shuffle=True, random_state=0)
        roc = []
        best_params_history = []
        selected_features_history = []

        # Run the hyperparameter search inside each outer fold
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            search = SearchCV_Feature_and_Parameter(X_train, y_train, clf_model, param_grid, inner_cv, SCORE_EVA,
                                                    Search_method, feature_method, bayes_n_iter, verbose, n_jobs)
            best_params_history.append(search.best_params_)
            best_model = search.best_estimator_
            selected_features = best_model.named_steps['feature_selector'].get_support()
            selected_features_history.append(selected_features)
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]
            roc.append(roc_auc_score(y_test, y_pred_proba))

        best_params_history_df = pd.DataFrame([dict(ordered_dict) for ordered_dict in best_params_history])
        best_params_history_df[SCORE_EVA] = roc
        print(f"Mean ROC over {cv_outter} outer folds ({cv_inner}-fold inner CV): {np.mean(roc):.4f} std: {np.std(roc):.4f}, {[round(meta, 3) for meta in roc]}")
        #for i, selected_features in enumerate(selected_features_history, start=1):
        #    print(f"Features selected in outer fold {i}: {np.array(features)[selected_features]}")

        # Take the most frequent value across outer folds as the final hyperparameter
        param_names = best_params_history[0].keys()
        overall_best_params = {}
        for param_name in param_names:
            value_counts = Counter([params[param_name] for params in best_params_history])
            most_common_value = value_counts.most_common(1)[0][0]
            overall_best_params[param_name] = most_common_value
        print("Overall best hyperparameters: ", overall_best_params)

        # Count how many times each feature was selected across the outer folds
        total_features = X.shape[1]
        feature_selection_counts = np.zeros(total_features)
        for selected_features in selected_features_history:
            feature_selection_counts += selected_features.astype(int)
        # Features selected in more than half of the folds form the overall best feature set
        threshold = len(selected_features_history) // 2
        overall_best_features = feature_selection_counts > threshold
        print("Overall best feature set: ", np.array(features)[overall_best_features])

        end_time = timeit.default_timer()
        print(f"Run time: {(end_time - start_time)/60} minutes")
        return best_params_history_df, selected_features_history, roc
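As noted above, Bayesian search over continuous spaces tends to return a unique value in every outer fold, so the frequency vote degenerates to counts of 1. One possible workaround (a sketch, not part of the original code) is to aggregate numeric parameters with the median and only vote on categorical ones:

# Sketch: aggregate numeric hyperparameters by median, categorical ones by majority vote
def aggregate_params(best_params_history):
    overall = {}
    for name in best_params_history[0]:
        values = [p[name] for p in best_params_history]
        if all(isinstance(v, (int, float, np.integer, np.floating)) and not isinstance(v, bool) for v in values):
            overall[name] = float(np.median(values))  # robust for continuous search spaces
        else:
            overall[name] = Counter(values).most_common(1)[0][0]
    return overall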
def model_evaluate(X, y, model, n_times, test_size=0.3):
    scores = []
    # Repeat the random train/test split n_times
    for i in range(n_times):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=i)
        # Each call to fit retrains the model from scratch
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)
    return scores
# Build the ML pipeline for the best-performing model: RFE feature selection plus hyperparameter tuning. This becomes the core model of this project.
grid_n_estimator = Integer(1, 300)
grid_ratio = Real(0.01, 1.0, 'log-uniform')
grid_learn = Real(0.01, 1.0, 'log-uniform')
grid_max_depth = Integer(1, 15)
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

# Define the hyperparameter search space
param_grid = {
    #'feature_selector__k': Integer(5, 15),                      # for SelectKBest
    'feature_selector__n_features_to_select': Integer(5, 15),    # for RFE
    'model__learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'model__max_depth': Integer(1, 50),
    'model__n_estimators': Integer(50, 200),
    'model__random_state': grid_seed,
}

clf_model = XGBClassifier(scale_pos_weight=2, objective='binary:logistic', seed=0)
grid_search_result = mutil_times_SearchCV_Feature_and_Parameter(X, y, clf_model, param_grid,
                                                                cv_outter=1, cv_inner=5, SCORE_EVA='roc_auc',
                                                                Search_method='Bayes', feature_method='ref',
                                                                bayes_n_iter=10, verbose=0, n_jobs=n_jobs)
# Once the optimal feature subset and hyperparameters are found, evaluate overall and generalization performance with repeated train/test splits; generalization is reflected by the std of the scores.
X_best = X[np.array(features43)[grid_search_result.best_estimator_.named_steps['feature_selector'].support_]]
X_best = StandardScaler().fit_transform(X_best)
clf_model.set_params(**{k.replace("model__", ""): v for k, v in grid_search_result.best_params_.items() if k.startswith("model__")})
scores = model_evaluate(X_best, y, clf_model, n_times=100, test_size=0.3)
mean_score = round(np.mean(scores),3)
std_score = round(np.std(scores),3)
print('Best model:', mean_score, std_score)
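Besides the mean and std, the 100 repeated-split scores can also be summarized as an empirical range (a small sketch using the scores list computed above):

# Empirical 2.5th-97.5th percentile range of the repeated-split ROC-AUC scores
low, high = np.percentile(scores, [2.5, 97.5])
print(f'ROC-AUC 95% empirical range: {low:.3f} - {high:.3f}')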
4. With feature selection done, tune the hyperparameters of the candidate models on the selected feature subset using Bayesian cross-validation.
# Multi-model ensembling on the optimal feature subset
#why choose one model, when you can pick them all with voting classifier
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
#removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
vote_est = [
    # Ensemble methods: http://scikit-learn.org/stable/modules/ensemble.html
    ('ada', ensemble.AdaBoostClassifier()),
    ('rfc', ensemble.RandomForestClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgb', XGBClassifier()),

    #('bc', ensemble.BaggingClassifier()),
    #('etc', ensemble.ExtraTreesClassifier()),

    # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
    #('gpc', gaussian_process.GaussianProcessClassifier()),

    # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    #('lr', linear_model.LogisticRegressionCV()),

    # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
    #('bnb', naive_bayes.BernoulliNB()),
    #('gnb', naive_bayes.GaussianNB()),

    # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
    #('knn', neighbors.KNeighborsClassifier()),

    # SVM: http://scikit-learn.org/stable/modules/svm.html
    #('svc', svm.SVC(probability=True)),
]

#WARNING: Running this is very computationally intensive and time-consuming.
#Code is written for experimental/developmental purposes and not production ready!
#Hyperparameter tuning (BayesSearchCV from skopt is used below; GridSearchCV reference: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
grid_param = [
    [{
        # AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
        'n_estimators': grid_n_estimator,  # default=50
        'learning_rate': grid_learn,       # default=1
        #'algorithm': ['SAMME', 'SAMME.R'],  # default='SAMME.R'
        'random_state': grid_seed,
    }],
    [{
        # RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        'n_estimators': grid_n_estimator,  # default=10
        'criterion': grid_criterion,       # default='gini'
        'max_depth': grid_max_depth,       # default=None
        'oob_score': [True],               # default=False -- 12/31/17 set to reduce runtime -- best was {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0}, runtime 146.35 seconds
        'random_state': grid_seed,
    }],
    [{
        # GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
        #'loss': ['deviance', 'exponential'],  # default='deviance'
        'learning_rate': [.05],   # default=0.1 -- 12/31/17 set to reduce runtime -- best was {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0}, runtime 264.45 seconds
        'n_estimators': [300],    # default=100 -- 12/31/17 set to reduce runtime (see above)
        #'criterion': ['friedman_mse', 'mse', 'mae'],  # default='friedman_mse'
        'max_depth': grid_max_depth,  # default=3
        'random_state': grid_seed,
    }],
    [{
        # XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
        'learning_rate': grid_learn,       # default=.3
        'max_depth': [1, 2, 4, 6, 8, 10],  # default=2
        'n_estimators': grid_n_estimator,
        'seed': grid_seed,
    }],

    # The grids below are kept for reference but disabled, since their models are commented out of vote_est:
    '''
    [{
        # ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        'n_estimators': grid_n_estimator,  # default=10
        'criterion': grid_criterion,       # default='gini'
        'max_depth': grid_max_depth,       # default=None
        'random_state': grid_seed,
    }],
    [{
        # BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
        'n_estimators': grid_n_estimator,  # default=10
        'max_samples': grid_ratio,         # default=1.0
        'random_state': grid_seed,
    }],
    [{
        # GaussianProcessClassifier
        'max_iter_predict': grid_n_estimator,  # default=100
        'random_state': grid_seed,
    }],
    [{
        # LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
        'fit_intercept': grid_bool,  # default=True
        #'penalty': ['l1', 'l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  # default=lbfgs
        'random_state': grid_seed,
    }],
    [{
        # BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
        'alpha': grid_ratio,  # default=1.0
    }],
    # GaussianNB -
    [{}],
    [{
        # KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7],  # default=5
        'weights': ['uniform', 'distance'],    # default='uniform'
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }],
    [{
        # SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
        # http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
        #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [1, 2, 3, 4, 5],  # default=1.0
        'gamma': grid_ratio,   # default=auto
        'decision_function_shape': ['ovo', 'ovr'],  # default=ovr
        'probability': [True],
        'random_state': grid_seed,
    }],
    '''
]

start_total = time.perf_counter()  # https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip(vote_est, grid_param):  # https://docs.python.org/3/library/functions.html#zip
    #print(clf[0], clf[1])  # vote_est is a list of tuples: index 0 is the name, index 1 is the algorithm
    #print(param)
    start = time.perf_counter()
    #best_search = model_selection.GridSearchCV(estimator=clf[1], param_grid=param, cv=cv_split, scoring=SCORE_EVA)
    best_search = BayesSearchCV(clf[1], search_spaces=param, scoring=SCORE_EVA, cv=cv_split, n_iter=50, n_jobs=16)
    best_search.fit(X_best, y)
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    # set_params updates the estimator objects inside vote_est, so the tuned
    # hyperparameters carry over into the voting/stacking step below
    clf[1].set_params(**best_param)

    # Evaluate the tuned model over repeated splits to gauge its generalization ability
    scores = model_evaluate(X_best, y, clf[1], 10, test_size=0.3)
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds, scoring {:.3f}, std: {:.3f}'.format(
        clf[1].__class__.__name__, best_param, run, np.mean(scores), np.std(scores)))

run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))
print('-'*10)
5. Once every model has been tuned, combine them into a multi-model ensemble, using either voting or stacking.
# Multi-model ensemble via voting
#Soft Vote or weighted probabilities w/Tuned Hyperparameters
vote = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft') #voting = 'hard'
vote_cv = model_selection.cross_validate(vote, X_best, y, cv = cv_split,scoring = SCORE_EVA,return_train_score=True,n_jobs=16)
print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_cv['train_score'].mean()*100))
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_cv['test_score'].std()*100*3))
print('-'*10)

# Stacking
meta_learner = LogisticRegression()
stacking_model = StackingClassifier(estimators=vote_est, final_estimator=meta_learner)
stacking_cv = model_selection.cross_validate(stacking_model, X_best, y, cv = cv_split,scoring = SCORE_EVA,return_train_score=True,n_jobs=16)
print("Stacking Training w/bin score mean: {:.2f}". format(stacking_cv['train_score'].mean()*100))
print("Stacking Test w/bin score mean: {:.2f}". format(stacking_cv['test_score'].mean()*100))
print("Stacking Test w/bin score 3*std: +/- {:.2f}". format(stacking_cv['test_score'].std()*100*3))
print('-'*10)
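As a final step, one option (a sketch, not in the original notebook) is to keep whichever ensemble cross-validates better and refit it on the full feature-selected data:

# Keep the better-scoring ensemble and refit it on all of the selected data
best_ensemble = vote if vote_cv['test_score'].mean() >= stacking_cv['test_score'].mean() else stacking_model
best_ensemble.fit(X_best, y)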
Reference: A Data Science Framework: To Achieve 99% Accuracy | Kaggle