根据 StatsStudent 的回答中 Sherman 和 Cessie (1997) 的引述,我在 R 中写了一些东西供我自己使用。
它在具有不同大小的集群的集群数据上实现引导复制。
它确保多次采样的集群(由于替换)被视为引导样本中的不同集群(在涉及沿着集群维度的固定效应的估计中尤其重要)。
最后,请注意,这种方法允许计算通过用户提供的自定义函数计算的任何统计数据的单向集群引导标准误差。在这方面,它比 Sandwich::vcovBS 或 sandwich::vcovCL 更灵活,后者只接受一些拟合的模型对象作为输入。
library(boot)
library(sandwich)
## Make some necessary objects
# unbalanced panel data
data("PetersenCL", package = "sandwich")
data <- subset(PetersenCL, !(firm %in% c(1:10) & year == 10))
cluster_var <- "firm"
# get the different cluster sizeS. This is necessary to cluster bootstrapping with clusters of different sizes.
sizes <- table(data[,cluster_var])
u_sizes <- sort(unique(sizes))
# names and numbers of clusters of every sizes
cl_names <- list()
n_clusters <- list()
for(s in u_sizes){
cl_names[[s]] <- names(sizes[sizes == s])
n_clusters[[s]] <- length(cl_names[[s]])
}
par_list <- list(unique_sizes = u_sizes,
cluster_names = cl_names,
number_clusters = n_clusters)
## Design the bootstrap sampling function.
# It will tell boot::boot how to sample data at each replicate
ran.gen_cluster <- function(original_data, arg_list){
cl_boot_dat <- NULL
# non-unique names of clusters (repeated when there is more than one obs. in a cluster)
nu_cl_names <- as.character(original_data[,cluster_var])
for(s in arg_list[["unique_sizes"]]){
# sample, in the vector of names of clusters of size s, as many draws as there are clusters of that size, with replacement
sample_cl_s <- sample(arg_list[["cluster_names"]][[s]],
arg_list[["number_clusters"]][[s]],
replace = TRUE)
# because of replacement, some names are sampled more than once
sample_cl_s_tab <- table(sample_cl_s)
# we need to give them a new cluster identifier, otherwise a cluster sampled more than once
# will be "incorrectly treated as one large cluster rather than two distinct clusters" (by the fixed effects) (Cameron 2015)
for(n in 1:max(sample_cl_s_tab)){ # from 1 to the max number of times a name was sampled bc of replacement
# vector to select obs. that are within the sampled clusters.
names_n <- names(sample_cl_s_tab[sample_cl_s_tab == n])
sel <- nu_cl_names %in% names_n
# select data accordingly to the cluster sampling (duplicating n times observations from clusters sampled n times)
clda <- original_data[sel,][rep(seq_len(sum(sel)), n), ]
#identify row names without periods, and add ".0"
row.names(clda)[grep("\\.", row.names(clda), invert = TRUE)] <- paste0(grep("\\.", row.names(clda), invert = TRUE, value = TRUE),".0")
# add the suffix due to the repetition after the existing cluster identifier.
clda[,cluster_var] <- paste0(clda[,cluster_var], sub(".*\\.","_",row.names(clda)))
# stack the bootstrap samples iteratively
cl_boot_dat <- rbind(cl_boot_dat, clda)
}
}
return(cl_boot_dat)
}
#test that the returned data are the same dimension as input
test_boot_d <- ran.gen_cluster(original_data = data,
arg_list = par_list)
dim(test_boot_d)
dim(data)
# test new clusters are not duplicated (correct if anyDuplicated returns 0)
base::anyDuplicated(test_boot_d[,c("firm","year")])
## Custom ("black-box") estimation function
est_fun <- function(est_data){
est <- lm(as.formula("y ~ x"), est_data)
# statistics we want to evaluate the variance of:
return(est$coefficients[2])
}
# see ?boot::boot for more details on these arguments
boot(data = data,
statistic = est_fun,
ran.gen = ran.gen_cluster,
mle = par_list,
sim = "parametric",
parallel = "no",
R = 200)
与渐近解比较(可用于不平衡数据,但仅适用于 lm 或 glm 对象):
sdw_cl <- vcovCL(lm(as.formula("y ~ x"), data), cluster = ~firm)
sqrt(sdw_cl["x","x"])
注意:我还没有设法确定为什么我提供的代码的输出不等于 vcovBS 的输出。即使对于平衡数据:
## compare with sandwich::vcovBS, on balanced panel data
set.seed(1234)
custom_bs <- boot(data = PetersenCL,
statistic = est_fun,
ran.gen = ran.gen_cluster,
mle = par_list,
sim = "parametric",
parallel = "no",
R = 200)
sd(custom_bs$t[,2])
set.seed(1234)
sdw_bs <- vcovBS(lm(as.formula("y ~ x"), PetersenCL), cluster = ~firm, R=200)#
sqrt(sdw_bs["x","x"])