我正在处理一个不平衡的数据集。训练数据中有 11567 个负样本和 3737 个正样本。验证数据中有 2892 个负样本和 935 个正样本。这是一个二元分类问题,我使用微观和宏观平均 ROC 来评估我的模型。但是,我注意到 Micro 平均 Roc-Auc 分数高于特定类别的 Roc-Auc 分数。这对我来说没有意义。
正如您在图中看到的那样,所有点的微平均 roc-auc 得分都更高。如果可能的话,你能解释一下背后的原因吗?我使用了sklearn-link并将其转换为二进制分类(y-true -> 一种热表示)。我还在下面添加了我的代码。
xgboost_model = XGBClassifier(n_estimators= 450,max_depth= 5,min_child_weight=2)
xgboost_model.fit(X_train,y_train)
yy_true,yy_pred = yy_val, xgboost_model.predict_proba(Xx_val)# .predict_proba gives probability for each class
# Compute ROC curve and ROC area for each class
y_test = flat(yy_true) # Convert labels to one hot encoded version
y_score = yy_pred
n_classes=2
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=2)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=2)
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for nonSampling Training Data')
plt.legend(loc="lower right")
plt.savefig('nonsample.png', format='png', dpi=600)
plt.show()