我在 R 中分别用 5 种不同的树数量(3,10,30,100,300)训练了 5 个随机森林模型。我的目的是计算每个模型的误分类率,并绘制误分类率随树数量变化的曲线,以说明在通常情况下,随机森林模型中树的数量越多,误分类率越低。
我的几位同事在 Python 中运行了同样的模型,他们的 300 棵树模型的误分类率约为 0.08。然而,我在 R 中运行模型时,误分类率在 100 棵树时就稳定在 0.2 左右,到 300 棵树时也没有进一步降低。我想知道是什么原因导致了这种差异。我的代码如下。
# Fit random forests of increasing size on the Madelon training set and plot
# the misclassification rate on the training and validation data against the
# number of trees.
#
# Objects this section reads but does not define:
#   madelon_train_data, madelon_valid_data     - predictor columns
#   madelon_train_labels, madelon_valid_labels - data frames whose V1 column
#                                                holds the class labels
# NOTE(review): no RNG seed is set in this section, so the fitted forests (and
# the plotted rates) change from run to run unless a seed is set earlier.

# Forest sizes to compare; every later step is keyed on this vector.
tree_counts <- c(3, 10, 30, 100, 300)

# Predictors and labels side by side. The label column is addressed as V1.1
# here (V1 in madelon_train_labels) - presumably data.frame() de-duplicated it
# against a predictor that is also called V1; confirm against the input data.
madelon_train <- data.frame(madelon_train_data, madelon_train_labels)

# One forest per entry of tree_counts, stored in a list named by tree count,
# e.g. modellist[["300"]] is the 300-tree forest.
modellist <- lapply(tree_counts, function(n_trees) {
  randomForest(
    # The response is resolved inside `data`, so `.` expands to every other
    # column of madelon_train.
    as.factor(V1.1) ~ .,
    data = madelon_train,
    ntree = n_trees,
    # Derived from the predictor count instead of a hard-coded sqrt(500);
    # both give 22 when there are 500 predictors.
    mtry = floor(sqrt(ncol(madelon_train_data))),
    replace = FALSE
  )
})
names(modellist) <- as.character(tree_counts)

# Fraction of rows in `features` for which `model` predicts a class different
# from the matching entry of the numeric vector `labels`.
misclassification_rate <- function(model, features, labels) {
  predicted <- as.numeric(as.character(
    predict(model, features, type = "class")
  ))
  mean(predicted != labels)
}

# Misclassification rate on the training data, one value per forest.
# NOTE: this is the resubstitution error - each forest scores the rows it was
# fitted on. predict() for randomForest objects has no `OOB` argument; for an
# out-of-bag estimate call predict(model) without newdata instead.
classerrlisttrain <- vapply(
  modellist,
  misclassification_rate,
  numeric(1),
  features = madelon_train_data,
  labels = madelon_train_labels$V1
)

# Misclassification rate on the held-out validation data, one value per forest.
classerrlisttest <- vapply(
  modellist,
  misclassification_rate,
  numeric(1),
  features = madelon_valid_data,
  labels = madelon_valid_labels$V1
)

# Plot misclassification rate against the number of trees:
# red = training data, blue = validation data.
plot(
  tree_counts, classerrlisttrain,
  type = "l",
  xlab = "Number of Trees",
  ylab = "Misclassification Rate",
  xlim = c(1, max(tree_counts)),
  ylim = c(0, 0.5),
  col = "red"
)
lines(tree_counts, classerrlisttest, type = "l", col = "blue")
legend(
  1, 0.1,
  legend = c("Train Data", "Test Data"),
  col = c("red", "blue"),
  lty = 1,
  cex = 0.8
)