有时,如果残差的分布是重尾的,我会看到建议使用student-t 残差拟合回归,而不是使用OLS(相当于假设正态分布的残差)。然而,由于 OLS 估计量是蓝色的(由 Gauss-Markov 提出),它的方差(因此 MSE)应该比假设 student-t 残差通过最大似然拟合的回归要低。即使残差确实是 t 分布的,也是如此。
在一个简单的模拟中(参见底部的 R 代码),OLS 和 t 回归在样本外 MSE 方面基本上是等效的,并且真实残差跟随学生-分布。如果 OLS 在预测(由 MSE 判断)或推理(由模型参数的 CI 覆盖率判断)等标准任务上表现相同,那么假设学生 t 误差拟合回归有什么优势?我不应该通过假设正确指定的可能性而不是错误指定的可能性来估计参数而受益吗?还是我的模拟具有误导性?
对这篇文章的回答表明,如果没有使用 t 分布的误差,预测区间将是错误的。这真的是唯一的好处吗?
下面是模拟代码:
library(hett) # for fitting t-based regressions
get_regression_function <- function(beta) {
reg_function <- function(x){
beta[1] + beta[2] * x
}
return(reg_function)
}
get_t_prediction <- function(df, model) {
t_beta_hat <- model$loc.fit$coefficients
design_mat <- df$x
intercept <- rep(1, length(df$x))
design_mat <- cbind(intercept, design_mat)
pred <- design_mat %*% as.matrix(t_beta_hat)
return(pred)
}
get_ols_ci_coverage <- function(ols_fit, par_name, true_theta) {
CI_mat <- confint(ols_fit)
lw <- CI_mat[par_name, '2.5 %']
up <- CI_mat[par_name, '97.5 %']
cover_beta_1_ols <- (lw <= true_theta) & (true_theta <= up)
return(cover_beta_1_ols)
}
get_t_ci_coverage <- function(t_fit, par_name, true_theta) {
t_summary <- as.data.frame(summary(t_fit)$loc.summary$coefficients)
t_pt_est <- t_summary[par_name, 'Estimate']
t_se <- t_summary[par_name, 'Std. Error']
t_ci <- t_pt_est + 1.96 * t_se * c(-1, 1)
cover_t <- (t_ci[1] <= true_theta) &
(true_theta <= t_ci[2])
return(cover_t)
}
## simulation
true_beta_0 = 0.5
true_beta_1 = 1.5
reg_function <- get_regression_function(
beta = c(true_beta_0, true_beta_1))
ssr_ols <- c()
ssr_t <- c()
ols_ci_coverage <- c()
t_ci_coverage <- c()
num_sims <- 10000
set.seed(1)
for (i in 1:num_sims){
# Generate a dataset of size N
N = 10000
x <- rnorm(n = N, mean = 0, sd = 1)
errors <- rt(N, df = 3)
y_mean <- reg_function(x = x)
y <- y_mean + errors
df <- data.frame(y = y, x = x)
df_train <- df[1:(N/2), ]
df_test <- df[(N/2 + 1):N, ]
# fit linear models
OLS_fit <- lm(y ~ x, data = df_train)
t_fit <- tlm(y ~ x,
data = df_train)
# generate test set predictions
lm_pred <- predict(OLS_fit, newdata = df_test)
t_pred <- get_t_prediction(
df = df_test, model = t_fit)
# test residuals
lm_res <- lm_pred - df_test$y
t_res <- t_pred - df_test$y
# MSE -- test set
ssr_ols[i] <- mean(lm_res^2)
ssr_t[i] <- mean(t_res^2)
# CI coverage for beta1
ols_ci_coverage[i] <- get_ols_ci_coverage(
ols_fit = OLS_fit, par_name = 'x', true_theta = true_beta_1)
t_ci_coverage[i] <- get_t_ci_coverage(
t_fit = t_fit, par_name = 'x', true_theta = true_beta_1)
}
mean(ssr_ols)
mean(ssr_t)
mean(ols_ci_coverage)
mean(t_ci_coverage)