Improving classifier performance on an imbalanced dataset in R

data-mining machine-learning r xgboost class-imbalance ensemble-modeling
2021-10-11 02:16:56

I used an "adabag" (boosting + bagging) model on an imbalanced dataset (6% positive). I am trying to maximize sensitivity while keeping accuracy above 70%, and the best results I got were:

  • ROC = 0.711
  • Sens = 0.94
  • Spec = 0.21

The results are not great, especially the poor specificity. Any suggestions on how to improve them? Could the tuning be improved, or would adding a penalty term help?

Here is the code:

library(caret)  # trainControl(), train()

ctrl <- trainControl(method = "repeatedcv",  # "cv" ignores `repeats`; repeatedcv runs 5-fold CV twice
                     number = 5,
                     repeats = 2,
                     search = "grid",
                     verboseIter = FALSE,
                     returnData = TRUE,
                     returnResamp = "final",
                     savePredictions = "all",
                     classProbs = TRUE,                  # needed to compute ROC/Sens/Spec
                     summaryFunction = twoClassSummary,
                     preProcOptions = list(thresh = 0.80, ICAcomp = 3, k = 7,
                                           freqCut = 90/10, uniqueCut = 10,
                                           cutoff = 0.2),  # only applied if train() is given a preProcess argument
                     sampling = "smote",                 # SMOTE applied inside each resample
                     selectionFunction = "best",
                     predictionBounds = rep(FALSE, 2),
                     seeds = NA,
                     trim = FALSE,
                     allowParallel = TRUE)

# Single fixed candidate: depth-25 trees, 4000 boosting iterations
grid <- expand.grid(maxdepth = 25, mfinal = 4000)

classifier <- train(x = training_set[, -1], y = training_set[, 1],  # outcome is column 1
                    method = 'AdaBag', trControl = ctrl,
                    metric = "ROC", tuneGrid = grid)
prediction <- predict(classifier, newdata = test_set[, -1], type = 'prob')  # drop the outcome column, as in training
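
To trade sensitivity against specificity explicitly, the class probabilities can be thresholded by hand. A minimal sketch (my addition, assuming the outcome is the first column of test_set and the positive level is named "yes"):

library(caret)  # confusionMatrix()

cutoff <- 0.5  # raise the cutoff to gain specificity at the cost of sensitivity
pred_class <- factor(ifelse(prediction[, "yes"] > cutoff, "yes", "no"),
                     levels = levels(test_set[, 1]))
confusionMatrix(pred_class, test_set[, 1], positive = "yes")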

Plots from the classifierplots package:

[plot image omitted]

I also tried xgboost.

Here is the code:

gbmGrid <- expand.grid(nrounds = 50, eta = 0.3, max_depth = 3, gamma = 0,
                       colsample_bytree = 0.6, min_child_weight = 1,
                       subsample = 0.75)

ctrl <- trainControl(method = "repeatedcv",  # again, plain "cv" would ignore `repeats`
                     number = 10,
                     repeats = 2,
                     search = "grid",
                     verboseIter = FALSE,
                     returnData = TRUE,
                     returnResamp = "final",
                     savePredictions = "all",
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     sampling = "smote",
                     selectionFunction = "best",
                     predictionBounds = rep(FALSE, 2),
                     seeds = NA,
                     trim = FALSE,
                     allowParallel = TRUE)

classifier <- train(x = training_set[, -1], y = training_set[, 1],
                    method = 'xgbTree', metric = "ROC",
                    trControl = ctrl, tuneGrid = gbmGrid)
prediction <- predict(classifier, newdata = test_set[, -1], type = 'prob')
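
The test-set AUC can then be computed directly from those probabilities. A sketch using the pROC package (the level names "no"/"yes" are my assumption):

library(pROC)

roc_obj <- roc(response = test_set[, 1],
               predictor = prediction[, "yes"],
               levels = c("no", "yes"))  # control level first, case level second
auc(roc_obj)   # area under the ROC curve on held-out data
plot(roc_obj)  # visualise the sensitivity/specificity trade-off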

Plots from the classifierplots package:

[plot image omitted]

Update:

I tried asymmetric AdaBoost; here is the code:

# One weight per observation: "yes" rows get 0.4 / (count of the first
# factor level), "no" rows get 0.6 / (count of the second level). Note that
# table(...)[1] indexes the FIRST level of readmmited ("no" if the levels
# are in the default alphabetical order), not the "yes" class.
model_weights <- ifelse(training_set$readmmited == "yes",
                        (1/table(training_set$readmmited)[1]) * 0.4,
                        (1/table(training_set$readmmited)[2]) * 0.6)


ctrl <- trainControl(method = "repeatedcv",
                     number = 5,
                     repeats = 2, 
                     search = "grid", 
                     returnData = TRUE,
                     returnResamp = "final",
                     savePredictions = "all",
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     selectionFunction = "best",
                     allowParallel = TRUE)

classifier <- train(x = training_set[, -1], y = training_set[, 1],
                    method = 'ada', trControl = ctrl,
                    metric = "ROC", weights = model_weights)
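
To see which single weight each class actually received, a quick sanity-check sketch (assuming readmmited has the default alphabetical level order "no", "yes"):

# One distinct weight per class; since table(...)[1] counts the FIRST
# factor level ("no" by default), the intended asymmetry may be inverted.
tapply(model_weights, training_set$readmmited, unique)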

But the specificity is zero. What am I doing wrong?

2 Answers

You should try to compensate for the imbalanced data first; then you can try many different classifiers. Either balance the classes, interpolate with SMOTE (which has always struck me as a bit too magical), or assign class weights. A minimal sketch of those options in caret follows (my example, not your exact configuration):
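
library(caret)

# Same control object, three choices of `sampling`; caret applies the
# resampling inside each CV fold, so the held-out folds stay untouched.
make_ctrl <- function(samp) {
  trainControl(method = "repeatedcv", number = 5, repeats = 2,
               classProbs = TRUE, summaryFunction = twoClassSummary,
               sampling = samp)
}

ctrl_down  <- make_ctrl("down")   # drop majority-class rows
ctrl_up    <- make_ctrl("up")     # replicate minority-class rows
ctrl_smote <- make_ctrl("smote")  # synthesize minority points (needs DMwR/themis installed)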

Here is a good article on this that uses caret, which is what you are using:

http://dpmartin42.github.io/blogposts/r/imbalanced-classes-part-1

SMOTE is a good strategy, and I have also obtained significant accuracy and ROC gains with cost-sensitive classification. In the life sciences we deal with a lot of imbalanced datasets, and this paper describes approaches for handling them: https://jcheminf.springeropen.com/articles/10.1186/1758-2946-1-21
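
As one concrete form of cost-sensitive classification (my sketch, not from the paper; the column name readmmited is taken from the question), rpart accepts a loss matrix, so misclassifying one class can be made much more expensive than the other:

library(rpart)

# Loss matrix L: L[i, j] is the cost of predicting class j for a true class i.
# With levels ("no", "yes"), L[2, 1] = 10 makes a missed "yes" ten times as
# costly as a false alarm, tilting the tree toward higher sensitivity;
# weight the other cell instead to favor specificity.
loss <- matrix(c(0,  1,
                 10, 0), nrow = 2, byrow = TRUE)

fit <- rpart(readmmited ~ ., data = training_set, method = "class",
             parms = list(loss = loss))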