RStudio is an open-source integrated development environment (IDE) for the R programming language, built for development and data analysis. R is a popular language for statistical computing and data visualization, widely used in data science, statistics, and machine learning.
RStudio provides many powerful tools, including a code editor, a debugger, data visualization tools, and integrated help documentation. For applied machine learning, RStudio supports data preprocessing, model training, evaluation, and deployment.
R offers many packages for machine learning, such as caret, tidymodels, and tensorflow; these provide a rich set of algorithms and tools for building and deploying machine learning models. A minimal example is sketched below.
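As a quick, self-contained illustration (separate from the Ames analysis that follows, using caret on the built-in mtcars data), a cross-validated model fit takes just a few lines:

library(caret)
set.seed(123)
fit <- train(
  mpg ~ ., data = mtcars,
  method = "lm",                                       # one of caret's many supported methods
  trControl = trainControl(method = "cv", number = 5)  # 5-fold cross-validation
)
fit$results  # cross-validated RMSE, R-squared, and MAE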
library(rlang)
library(gbm)        # gradient boosting machines
library(xgboost)    # extreme gradient boosting
library(vip)        # variable importance plots
library(pdp)        # partial dependence plots
library(ggplot2)
library(tidyverse)
library(recipes)    # feature engineering
rm(list = ls())
s <- Sys.time()

# Data
# prereqs-data
# ames data
#install.packages("AmesHousing")
library(AmesHousing)
ames <- AmesHousing::make_ames()

# split data
set.seed(123)
split <- rsample::initial_split(ames, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test <- rsample::testing(split)

## GBM
## * Very popular
## * Very accurate
## * Similar to random forests, yet fundamentally different: trees are grown
##   sequentially, not independently (see the sketch below)
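# To make the last point concrete, here is a minimal hand-rolled sketch of
# the boosting idea for squared-error loss (an illustration of the principle,
# assuming the rpart package; not how the gbm package is implemented):
boost_sketch <- function(data, n_trees = 100, shrinkage = 0.1) {
  pred <- rep(mean(data$Sale_Price), nrow(data))       # start from the mean
  for (b in seq_len(n_trees)) {
    data$resid <- data$Sale_Price - pred               # current residuals
    stump <- rpart::rpart(resid ~ . - Sale_Price, data = data,
                          maxdepth = 1, cp = 0)        # one shallow tree
    pred <- pred + shrinkage * predict(stump, data)    # small corrective step
  }
  pred
}
# e.g. preds <- boost_sketch(ames_train)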
## First GBM model
# shrinkage: learning rate, defaults to 0.1
# basic-gbm
tick <- proc.time()
set.seed(123)
ames_gbm <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",  # or bernoulli, multinomial, etc.
  n.trees = 5000,
  shrinkage = 0.1,
  interaction.depth = 1,
  n.minobsinnode = 10,
  cv.folds = 5
)
tock <- proc.time() - tick; tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm$cv.error[min_MSE])

# check for overfitting
gbm.perf(ames_gbm, method = "cv")  # or "OOB"
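# A quick visual cross-check (a sketch using base graphics): plot the stored
# CV error curve and mark its minimum, which is the optimal number of trees
# that gbm.perf(method = "cv") reports.
plot(ames_gbm$cv.error, type = "l",
     xlab = "Number of trees", ylab = "CV error (MSE)")
abline(v = min_MSE, lty = 2, col = "red")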
## Tuning
## * Start with the default learning rate of 0.1, or something in 0.05-0.2
## * Make sure the number of trees does not overfit
## * Tune the learning rate
## * Tune the tree parameters
## * Possibly lower the learning rate and increase the number of trees

## Fix the tree parameters
## Tree depth 3
## Default min obs
## Set learning rate to 0.01
## More CV folds give a better error estimate
## ~1 minute
## Uses up all of the trees
# tune1
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",  # or bernoulli, multinomial, etc.
  n.trees = 5000,
  shrinkage = 0.01,
  interaction.depth = 3,
  n.minobsinnode = 10,
  cv.folds = 10
)
tock <- proc.time() - tick; tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])

gbm.perf(ames_gbm1, method = "cv")

## Increase the number of trees
## Try shrinkage = 0.001
## Not as accurate
## ~4 minutes
# tune2
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",  # or bernoulli, multinomial, etc.
  n.trees = 20000,
  shrinkage = 0.001,
  interaction.depth = 3,
  n.minobsinnode = 10,
  cv.folds = 10
)
tock <- proc.time() - tick; tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])

gbm.perf(ames_gbm1, method = "cv")

# Tune the tree parameters: ~30 minutes
# assess 3 values for tree depth
# assess 3 values for min obs
# tune12
# eval=FALSE or cache = TRUE

# search grid
hyper_grid <- expand.grid(
  n.trees = 6000,
  shrinkage = .01,
  interaction.depth = c(3, 5, 7),
  n.minobsinnode = c(5, 10, 15)
)

# fit a cross-validated GBM for one hyperparameter combination; return its RMSE
model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode,
    cv.folds = 10
  )
  # compute RMSE
  sqrt(min(m$cv.error))
}

tick <- proc.time()
hyper_grid$rmse <- purrr::pmap_dbl(
  hyper_grid,
  ~ model_fit(
    n.trees = ..1,
    shrinkage = ..2,
    interaction.depth = ..3,
    n.minobsinnode = ..4
  )
)
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## n.trees shrinkage interaction.depth n.minobsinnode rmse
## 1 6000 0.01 7 5 20683.65
## 2 6000 0.01 5 5 20746.36
## 3 6000 0.01 7 10 21114.62
## 4 6000 0.01 3 5 21126.33
## 5 6000 0.01 5 10 21228.68
## 6 6000 0.01 7 15 21246.73
## 7 6000 0.01 5 15 21272.91
## 8 6000 0.01 3 10 21767.08
## 9 6000 0.01 3 15 21862.35

# Add stochasticity (bag.fraction)
# * 0.5-0.8
# * May need to zoom in on this range
# * ~10 minutes
# stochastic
bag_frac <- c(.5, .65, .8)

tick <- proc.time()
for (i in bag_frac) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = 6000,
    shrinkage = 0.01,
    interaction.depth = 7,
    n.minobsinnode = 5,
    bag.fraction = i,
    cv.folds = 10
  )
  # compute RMSE
  print(sqrt(min(m$cv.error)))
}
tock <- proc.time() - tick; tock

## [1] 20683.65
## [1] 20398.93
## [1] 20287.76

## XGBoost
## XGBoost requires numeric (e.g. one-hot encoded) features
## caret can one-hot encode automatically
## * Other: lump low-frequency levels together
## * Encode: integer-encode as ordinal; improves speed and performance
# xgb-feature-prep
xgb_prep <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice()

X <- as.matrix(xgb_prep[setdiff(names(xgb_prep), "Sale_Price")])
Y <- xgb_prep$Sale_Price
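# For comparison, a genuinely one-hot encoded design matrix can be built with
# recipes::step_dummy(one_hot = TRUE) in place of step_integer() (a sketch
# only; the matrix gets much wider, which is why the one-hot timing noted
# below is ~30 minutes):
xgb_prep_onehot <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice()
X_onehot <- as.matrix(xgb_prep_onehot[setdiff(names(xgb_prep_onehot), "Sale_Price")])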
# First XGBoost model
# Takes only a few seconds
# Early stopping rounds: stop if CV RMSE does not improve for 50 trees
# https://xgboost.readthedocs.io/en/latest/parameter.html
# Default eta = 0.3
# basic-xgboost
tick <- proc.time()
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 5000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0
)
tock <- proc.time() - tick; tock

ames_xgb$evaluation_log %>% tail()
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 195 600.2057 55.89026 24382.85 1725.531
## 2: 196 591.9463 55.26891 24381.77 1725.488
## 3: 197 582.9737 53.65047 24382.20 1724.203
## 4: 198 572.7220 52.89840 24382.79 1723.236
## 5: 199 561.9686 49.76037 24382.41 1724.374
## 6: 200 554.3745 48.24520 24382.83 1723.857

# xgboost-eta
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0,
  params = list(eta = .05)
)

ames_xgb$evaluation_log %>% tail()
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 1023 1034.059 51.96559 21879.08 1248.379
## 2: 1024 1032.371 52.17706 21879.89 1248.430
## 3: 1025 1030.903 52.01260 21879.53 1248.890
## 4: 1026 1029.302 51.47370 21879.29 1249.095
## 5: 1027 1027.053 50.80923 21879.43 1248.593
## 6: 1028 1025.107 50.38078 21879.46 1249.280

## - eta = .3 (default): 24,382 w/200 trees (< 1 min)
## - eta = .1: 22,333 w/398 trees (< 1 min)
## - eta = .05: 21,877 w/978 trees (1.5 min)
## - eta = .01: 22,094 w/2843 trees (4 min)
## - one-hot encoded: 30 min

# * Learning rate = 0.05
# * 6000 trees
# * Early stopping
# * Tune
#   - tree depth
#   - child weight
# ~30 minutes
# xgboost-tree-specific
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = c(1, 3, 5, 7, 9),
  min_child_weight = c(1, 3, 5, 7, 9),
  rmse = 0  # a place to dump results
)

# grid search
tick <- proc.time()
for (i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i]
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## eta max_depth min_child_weight rmse
## 1 0.05 3 3 20989.27
## 2 0.05 3 1 21062.92
## 3 0.05 3 5 21453.00
## 4 0.05 5 1 21685.04
## 5 0.05 5 3 21748.12
## 6 0.05 3 7 22058.70
## 7 0.05 7 1 22110.09
## 8 0.05 3 9 22181.64
## 9 0.05 5 5 22185.64
## 10 0.05 7 3 22468.97
## 11 0.05 9 1 22632.80
## 12 0.05 9 3 22664.75
## 13 0.05 7 5 22721.41
## 14 0.05 5 7 22801.02
## 15 0.05 7 7 22951.21
## 16 0.05 5 9 22970.42
## 17 0.05 9 5 22987.65
## 18 0.05 9 7 23212.10
## 19 0.05 1 1 23252.66
## 20 0.05 7 9 23452.08
## 21 0.05 9 9 23714.29
## 22 0.05 1 3 24147.17
## 23 0.05 1 5 25089.55
## 24 0.05 1 7 26534.20
## 25 0.05 1 9 26784.77

# * Learning rate = 0.05
# * 6000 trees
# * Early stopping
# * tree depth = 3
# * child weight = 3
# * Tune the stochastic components
#   - subsampling rows for each tree
#   - subsampling columns for each tree
# ~12 minutes
# xgb-stochastic-grid
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = c(.5, .65, .8, 1),
  colsample_bytree = c(.5, .65, .8, 1),
  rmse = 0  # a place to dump results
)

# grid search
tick <- proc.time()
for (i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i],
      colsample_bytree = hyper_grid$colsample_bytree[i]
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## eta max_depth min_child_weight subsample colsample_bytree rmse
## 1 0.05 3 3 0.80 1.00 20732.22
## 2 0.05 3 3 0.50 1.00 20752.65
## 3 0.05 3 3 1.00 1.00 20989.27
## 4 0.05 3 3 0.65 1.00 21018.13
## 5 0.05 3 3 1.00 0.80 21425.00
## 6 0.05 3 3 0.80 0.80 21444.68
## 7 0.05 3 3 1.00 0.65 21597.04
## 8 0.05 3 3 0.65 0.80 21890.11
## 9 0.05 3 3 0.80 0.65 21907.55
## 10 0.05 3 3 0.65 0.65 22047.80
## 11 0.05 3 3 0.50 0.80 22211.92
## 12 0.05 3 3 0.50 0.65 22460.97
## 13 0.05 3 3 0.80 0.50 26071.94
## 14 0.05 3 3 1.00 0.50 26555.12
## 15 0.05 3 3 0.65 0.50 26577.24
## 16 0.05 3 3 0.50 0.50 26952.51

# xgb-regularize
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = .8,
  colsample_bytree = 1,
  # gamma = c(1, 100, 1000, 10000),
  # lambda = c(1e-2, 0.1, 1, 100, 1000, 10000),
  alpha = c(1e-2, 0.1, 1, 100, 1000, 10000),
  rmse = 0  # a place to dump results
)

# grid search
tick <- proc.time()
for (i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i],
      colsample_bytree = hyper_grid$colsample_bytree[i],
      # gamma = hyper_grid$gamma[i],
      # lambda = hyper_grid$lambda[i],
      alpha = hyper_grid$alpha[i]
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick; tock

arrange(hyper_grid, rmse)
## eta max_depth min_child_weight subsample colsample_bytree alpha rmse
## 1 0.05 3 3 0.8 1 1e+02 20581.31
## 2 0.05 3 3 0.8 1 1e+03 20605.30
## 3 0.05 3 3 0.8 1 1e+04 20615.19
## 4 0.05 3 3 0.8 1 1e-01 20732.23
## 5 0.05 3 3 0.8 1 1e-02 20732.23
## 6 0.05 3 3 0.8 1 1e+00 20732.23

# final-xgb-model
# parameter list
params <- list(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = .8,
  colsample_bytree = 1,
  alpha = 100
)

# final cv fit
set.seed(123)
tick <- proc.time()
final_cv <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0,
  params = params
)
tock <- proc.time() - tick; tock

# plot the CV RMSE curve
final_cv$evaluation_log %>%
  ggplot(aes(x = iter, y = test_rmse_mean)) +
  geom_line()

# train final model
ames_final_xgb <- xgboost(
  data = X,
  label = Y,
  nrounds = final_cv$best_iteration,
  objective = "reg:squarederror",
  params = params,
  verbose = 0
)

# xgb-vip
vip::vip(ames_final_xgb, num_features = 25)

# xgb-pdp
ames_final_xgb %>%
  pdp::partial(
    pred.var = "Gr_Liv_Area",
    n.trees = ames_final_xgb$niter,
    grid.resolution = 50,
    train = X
  ) %>%
  autoplot(rug = TRUE, train = X)

# top-bottom-vip
ames_vi <- vi(ames_final_xgb, feature_names = colnames(X))
feats <- c(head(ames_vi, n = 4)$Variable, tail(ames_vi, n = 4)$Variable)
pds <- lapply(feats, FUN = function(x) {
  pd <- cbind(x, pdp::partial(ames_final_xgb, pred.var = x, train = X))
  names(pd) <- c("xvar", "xval", "yhat")
  pd
})
pds <- do.call(rbind, pds)
pds$xvar <- factor(pds$xvar, levels = feats)

ggplot(pds, aes(x = xval, y = yhat)) +
  geom_line(size = 1.5) +
  geom_hline(yintercept = mean(ames$Sale_Price), linetype = 2, col = "red2") +
  facet_wrap(~ xvar, scales = "free_x", nrow = 2) +
  labs(x = "", y = "Partial dependence") +
  theme_light()

## Predict on the test data
# recipe
test <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_test)

# obtain X for test data
xgtest <- xgb.DMatrix(as.matrix(test %>% select(-Sale_Price)))

# make predictions and calculate error
test <- test %>%
  mutate(prediction = predict(ames_final_xgb, xgtest)) %>%
  mutate(error = (prediction - Sale_Price)^2)  # squared error

# plot predicted vs actual
test %>%
  ggplot(aes(Sale_Price, prediction)) +
  geom_point()

# test error (RMSE)
test %>%
  summarize(mean(error)) %>%
  sqrt()

save.image("6_boosting_CCB_cohort2.RData")
e1 <- Sys.time()
print(e1 - s)
Overall, RStudio is a powerful tool for applied machine learning, helping users work more efficiently through data analysis and model development.