Tuning the alpha and lambda parameters of an elastic net in an optimal way

Tags: r, feature-selection, caret, glmnet, elastic-net
Asked 2022-04-06 03:31:24

I am trying to tune the alpha and lambda parameters of an elastic net based on the glmnet package. I found several sources that propose different options for this purpose. Following this tutorial, I set up an optimization based on the caret package; following this thread, I tuned the parameters manually. Both approaches give me valid results, but the selected parameters are very different. See the reproducible R example below:

library("caret")
library("glmnet")

set.seed(1234)

# Some example data
N <- 1000
y <- rnorm(N, 5, 10)
x1 <- y + rnorm(N, 2, 10)
x2 <- y + rnorm(N, - 5, 20)
x3 <- y + rnorm(N, 10, 200)
x4 <- rnorm(N, 20, 50)
x5 <- rnorm(N, - 7, 200)
x6 <- rbinom(N, 1, exp(x1) / (exp(x1) + 1))
x7 <- rbinom(N, 1, exp(x2) / (exp(x2) + 1))
x8 <- rbinom(N, 1, exp(x3) / (exp(x3) + 1))
x9 <- rbinom(N, 1, exp(x4) / (exp(x4) + 1))
x10 <- rbinom(N, 1, exp(x5) / (exp(x5) + 1))

data <- data.frame(y, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10)

# Tune parameters with caret and glmnet

# Set up grid and cross validation method for train function
lambda_grid <- seq(0, 3, 0.1)
alpha_grid <- seq(0, 1, 0.1)

trnCtrl <- trainControl(method = "repeatedcv",
                        number = 10,
                        repeats = 5)

srchGrid <- expand.grid(.alpha = alpha_grid, .lambda = lambda_grid)

# Cross validation
my_train <- train(y ~ ., data,
                  method = "glmnet",
                  tuneGrid = srchGrid,
                  trControl = trnCtrl)

# Best tuning parameters
my_train$bestTune


# Tune parameters with glmnet only

alphasOfInterest <- seq(0, 1, 0.1)

# Step 1: Run the cross-validation for each alpha
cvs <- lapply(alphasOfInterest, function(curAlpha) {
  cv.glmnet(x = as.matrix(data[, colnames(data) != "y"]),
            y = y, alpha = curAlpha, family = "gaussian")
})

# Step 2: Collect the optimum lambda for each alpha
optimumPerAlpha <- sapply(seq_along(alphasOfInterest), function(curi) {
  curcvs <- cvs[[curi]]
  curAlpha <- alphasOfInterest[curi]
  indOfMin <- match(curcvs$lambda.min, curcvs$lambda)
  c(lam = curcvs$lambda.min, alph = curAlpha, cvup = curcvs$cvup[indOfMin])
})

# Step 3: Find the overall optimum
posOfOptimum <- which.min(optimumPerAlpha["lam", ])
overall.lambda.min <- optimumPerAlpha["lam", posOfOptimum]
overall.alpha.min <- optimumPerAlpha["alph", posOfOptimum]
overall.criterionthreshold <- optimumPerAlpha["cvup", posOfOptimum]

# Step 4: For each alpha, find the largest lambda whose CV error stays within the threshold
corrected1se <- sapply(seq_along(alphasOfInterest), function(curi) {
  curcvs <- cvs[[curi]]
  lams <- curcvs$lambda
  lams[lams < overall.lambda.min] <- NA
  lams[curcvs$cvm > overall.criterionthreshold] <- NA
  lam1se <- max(lams, na.rm = TRUE)
  c(lam = lam1se, alph = alphasOfInterest[curi])
})

# Step 5: Find the best (largest) of these lambdas
overall.lambda.1se <- max(corrected1se["lam", ])
pos <- match(overall.lambda.1se, corrected1se["lam", ])
overall.alpha.1se <- corrected1se["alph", pos]


# Comparison --> Parameters are very different
my_train$bestTune # Parameters according to caret
c(overall.alpha.1se, overall.lambda.1se) # Parameters according to glmnet only

It seems like I am doing something wrong, but unfortunately I cannot figure out what the problem is. Question: How can I tune alpha and lambda for an elastic net in R?

UPDATE: Added a small simulation study comparing the caret-based and the manual tuning of alpha and lambda

Following Hong Ooi's suggestion, I compared the results of the two tuning approaches in a small simulation study. The two approaches still lead to very different optimal parameters, and the manual tuning performs slightly better than the caret package. This result surprised me, since I would have expected the caret package to produce better estimates than my manual programming. So I am wondering whether the manual tuning really is superior to the caret package, or whether I have made a mistake somewhere. Any advice is very welcome!
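One point worth noting when comparing the two approaches: every call to cv.glmnet in Step 1 above draws its own random folds, so the error curves for different alpha values are not computed on the same data splits. Below is a minimal sketch (not part of the original code) of holding the folds fixed across alphas via the foldid argument of cv.glmnet, as suggested in ?cv.glmnet; the foldid vector is the only new object introduced:

# Assign each observation to one of 10 folds once, then reuse that
# assignment for every alpha so the CV errors are directly comparable
set.seed(1234)
foldid <- sample(rep(1:10, length.out = N))

cvs_fixed <- lapply(alphasOfInterest, function(curAlpha) {
  cv.glmnet(x = as.matrix(data[, colnames(data) != "y"]),
            y = y, alpha = curAlpha, family = "gaussian",
            foldid = foldid)
})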

##### Small simulation #####


alpha_caret <- numeric()
lambda_caret <- numeric()
MSE_caret <- numeric()
alpha_without_caret <- numeric()
lambda_without_caret <- numeric()
MSE_without_caret <- numeric()

R <- 20 # Simulation runs

for(r in 1:R) {


  ##### Tune parameters with caret and glmnet #####


  # Set up grid and cross validation method for train function
  lambda_grid <- seq(0, 3, 0.1)
  alpha_grid <- seq(0, 1, 0.1)

  trnCtrl <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 5)

  srchGrid <- expand.grid(.alpha = alpha_grid, .lambda = lambda_grid)

  # Cross validation
  my_train <- train(y ~ ., data,
                    method = "glmnet",
                    tuneGrid = srchGrid,
                    trControl = trnCtrl)

  # Best parameters
  alpha_caret[r] <- as.numeric(my_train$bestTune[1]) # alpha according to caret
  lambda_caret[r] <- as.numeric(my_train$bestTune[2]) # lambda according to caret

  # Elastic net with best parameters
  mod_elnet <- glmnet(x = as.matrix(data[, colnames(data) != "y"]),
                      y = data$y, alpha = alpha_caret[r], family = "gaussian",
                      lambda = lambda_caret[r])

  # Estimation of lm with the variables that have been selected in the elastic net
  vars_elnet <- names(mod_elnet$beta[ , 1])[as.numeric(mod_elnet$beta[ , 1]) != 0]
  mod_elnet_lm <- lm(y ~ ., data[ , colnames(data) %in% c(vars_elnet, "y")])

  # MSE
  MSE_caret[r] <- mean(mod_elnet_lm$residuals^2)


  ##### Tune parameters with glmnet only #####


  alphasOfInterest <- seq(0, 1, 0.1)

  # Step 1: Run the cross-validation for each alpha
  cvs <- lapply(alphasOfInterest, function(curAlpha) {
    cv.glmnet(x = as.matrix(data[, colnames(data) != "y"]),
              y = y, alpha = curAlpha, family = "gaussian")
  })

  # Step 2: Collect the optimum lambda for each alpha
  optimumPerAlpha <- sapply(seq_along(alphasOfInterest), function(curi) {
    curcvs <- cvs[[curi]]
    curAlpha <- alphasOfInterest[curi]
    indOfMin <- match(curcvs$lambda.min, curcvs$lambda)
    c(lam = curcvs$lambda.min, alph = curAlpha, cvup = curcvs$cvup[indOfMin])
  })

  # Step 3: Find the overall optimum
  posOfOptimum <- which.min(optimumPerAlpha["lam", ])
  overall.lambda.min <- optimumPerAlpha["lam", posOfOptimum]
  overall.alpha.min <- optimumPerAlpha["alph", posOfOptimum]
  overall.criterionthreshold <- optimumPerAlpha["cvup", posOfOptimum]

  # Step 4: For each alpha, find the largest lambda whose CV error stays within the threshold
  corrected1se <- sapply(seq_along(alphasOfInterest), function(curi) {
    curcvs <- cvs[[curi]]
    lams <- curcvs$lambda
    lams[lams < overall.lambda.min] <- NA
    lams[curcvs$cvm > overall.criterionthreshold] <- NA
    lam1se <- max(lams, na.rm = TRUE)
    c(lam = lam1se, alph = alphasOfInterest[curi])
  })

  # Step 5: Find the best (largest) of these lambdas
  overall.lambda.1se <- max(corrected1se["lam", ])
  pos <- match(overall.lambda.1se, corrected1se["lam", ])
  overall.alpha.1se <- corrected1se["alph", pos]

  # Best parameters
  alpha_without_caret[r] <- as.numeric(overall.alpha.1se) # alpha according to glmnet only
  lambda_without_caret[r] <- as.numeric(overall.lambda.1se) # lambda according to glmnet only

  # Elastic net with best parameters
  mod_elnet_wc <- glmnet(x = as.matrix(data[, colnames(data) != "y"]),
                         y = data$y, alpha = alpha_without_caret[r], family = "gaussian",
                         lambda = lambda_without_caret[r])

  # Estimation of lm with the variables that have been selected in the elastic net
  vars_elnet_wc <- names(mod_elnet_wc$beta[ , 1])[as.numeric(mod_elnet_wc$beta[ , 1]) != 0]
  mod_elnet_wc_lm <- lm(y ~ ., data[ , colnames(data) %in% c(vars_elnet_wc, "y")])

  # MSE
  MSE_without_caret[r] <- mean(mod_elnet_wc_lm$residuals^2)
}


# Compare results
data.frame(alpha_caret, lambda_caret, MSE_caret, alpha_without_caret, lambda_without_caret, MSE_without_caret)

mean(MSE_caret)
mean(MSE_without_caret) # Better results

The results look as follows:

   alpha_caret lambda_caret MSE_caret alpha_without_caret lambda_without_caret MSE_without_caret
1          0.9          0.2  40.28436                 0.0             1.850340          40.14838
2          1.0          0.2  40.28436                 0.4             1.228666          40.48928
3          1.0          0.2  40.28436                 0.0             1.850340          40.14838
4          1.0          0.2  40.28436                 0.2             1.693744          40.23916
5          1.0          0.2  40.28436                 0.0             2.030746          40.14838
6          1.0          0.2  40.28436                 0.2             1.858882          40.36684
7          1.0          0.2  40.28436                 0.0             2.684526          40.14838
8          1.0          0.2  40.28436                 0.1             2.127441          40.16517
9          0.7          0.1  40.16302                 0.1             1.766239          40.16011
10         1.0          0.2  40.28436                 0.1             2.127441          40.16517
11         0.7          0.1  40.16302                 0.0             1.536185          40.14838
12         1.0          0.2  40.28436                 0.2             2.239030          40.36684
13         1.0          0.2  40.28436                 0.1             1.938445          40.16011
14         1.0          0.2  40.28436                 0.1             2.127441          40.16517
15         1.0          0.2  40.28436                 0.1             2.334864          40.16517
16         0.9          0.2  40.28436                 0.1             2.127441          40.16517
17         0.8          0.1  40.16302                 0.2             1.543276          40.22040
18         1.0          0.2  40.28436                 0.1             2.562510          40.22040
19         1.0          0.2  40.28436                 0.0             2.946264          40.14838
20         1.0          0.2  40.28436                 0.1             1.938445          40.16011

The MSEs from the manual tuning are slightly smaller than those based on the caret package, while the estimated optimal alpha and lambda values are very different.

Question: Why do the two approaches lead to such different estimates of alpha and lambda?

1 Answer

Cross-validation is a noisy process, so you should not expect two runs to give similar results even when everything is working correctly. You could try repeating your experiment a few times and see what happens.
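For illustration, a minimal sketch of such a repetition (not from the original answer; it reuses data, srchGrid and trnCtrl from the question):

# Rerun the caret tuning with a few different seeds and compare
# the selected tuning parameters across runs
best_tunes <- lapply(1:5, function(i) {
  set.seed(i)
  fit <- train(y ~ ., data,
               method = "glmnet",
               tuneGrid = srchGrid,
               trControl = trnCtrl)
  fit$bestTune
})
do.call(rbind, best_tunes)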

That said, here is a narrow answer to this specific question:

Question: How can I tune alpha and lambda for an elastic net in R?

My glmnetUtils package includes a function, cva.glmnet, that does exactly this. It cross-validates over both alpha and lambda, keeping the CV folds constant for each alpha (as recommended in ?cv.glmnet).

Example code:

library(glmnetUtils)

# it also includes a formula interface: no more messing around with model.matrix()
cva <- cva.glmnet(y ~ ., data = data)
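
To extract the overall best (alpha, lambda) pair, one possibility is the sketch below. It assumes the returned object exposes $alpha and $modlist (a list of cv.glmnet fits, one per alpha value), which is the case in current glmnetUtils versions:

# CV error at the optimum of each per-alpha cv.glmnet fit
cv_err <- sapply(cva$modlist, function(mod) min(mod$cvm))
best <- which.min(cv_err)

c(alpha = cva$alpha[best], lambda = cva$modlist[[best]]$lambda.min)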