数据挖掘 - ROC AUC 分数远低于平均交叉验证分数 - 吾爱随笔录

我正在使用 Lending Club 数据集预测违约概率，并用 hyperopt 库调优 XGBClassifier 的超参数，目标是最大化 ROC AUC 分数。我在管道（pipeline）内使用了随机过采样，并对整个管道执行交叉验证。问题是：Hyperopt 在交叉验证中得到的分数，与我用这组参数在整个训练集上拟合模型、再在验证集上计算出的 ROC AUC 分数相差很大。

尽管进行了交叉验证，模型似乎仍然过拟合，我不知道该如何处理。交叉验证分数：0.74

验证分数：0.66

代码如下：

# Build the per-type column lists used by the preprocessing step.
# float64 columns are treated as numerical; every other dtype as categorical.
numerical_features = sorted(features.select_dtypes(include=['float64']).columns)
categorical_features = sorted(features.select_dtypes(exclude=['float64']).columns)
text_features = ['emp_title', 'title']
ordinal_features = ['grade']
# Free-text and ordinal columns are pulled out of the categorical list
# (list.remove raises ValueError if a column is not present).
for column_name in text_features + ordinal_features:
    categorical_features.remove(column_name)
# NOTE(review): presumably dropped to avoid target leakage - confirm.
numerical_features.remove('int_rate')
#%%
numerical_features.remove('total_pymnt')
#label encoding label/target variable combining different classes
#le = preprocessing.LabelEncoder()
#eh=le.fit_transform(target)
#%%
# Hold out 20% of the rows as a validation set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=777,
)
#%%
# Column-wise preprocessing:
#   numerical   -> iterative imputation (median start, missing-value indicators), then scaling
#   categorical -> constant-fill imputation (with indicators), then one-hot encoding
#   ordinal     -> ordinal encoding
numeric_steps = make_pipeline(
    IterativeImputer(initial_strategy='median', add_indicator=True, verbose=2, max_iter=100),
    StandardScaler(),
)
categorical_steps = make_pipeline(
    SimpleImputer(strategy='constant', fill_value="Not Available", add_indicator=True),
    OneHotEncoder(handle_unknown='ignore'),
)
preprocess = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
    (OrdinalEncoder(), ordinal_features),
)
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

#RandomOverSampler(sampling_strategy=sampling,random_state=777)
#%%
import numpy as np


# Count how many training rows belong to each distinct target value.
unique, counts = np.unique(y_train, return_counts=True)

# Two-column table: one row per target value, as (value, row count).
counts2 = np.array((unique, counts)).transpose()
#%%



#%%
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
def objective(space):
    """Hyperopt objective: cross-validated ROC AUC of the full pipeline.

    Builds preprocess -> random over-sampling -> XGBClassifier from the
    sampled hyperparameters, scores that pipeline with 3-fold
    cross-validation on the training split, and turns the mean ROC AUC
    into a loss for fmin to minimize.

    Args:
        space: Mapping of sampled hyperparameters with the keys
            'n_estimators', 'max_depth', 'learning_rate', 'gamma',
            'min_child_weight', 'subsample' and 'colsample_bytree'.

    Returns:
        Dict with 'loss' (1 - mean cross-validated ROC AUC) and
        'status' (STATUS_OK).
    """
    from sklearn.model_selection import cross_val_score

    # Settings that stay constant across all trials.
    fixed_params = dict(
        n_jobs=-1,
        verbosity=3,
        objective='binary:logistic',
        nthread=-1,
        scale_pos_weight=1,
        seed=27,
        tree_method='hist',
    )
    # Settings drawn by hyperopt for this trial; max_depth is cast to int.
    sampled_params = dict(
        n_estimators=space['n_estimators'],
        max_depth=int(space['max_depth']),
        learning_rate=space['learning_rate'],
        gamma=space['gamma'],
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
    )

    classifier = make_pipeline(
        preprocess,
        RandomOverSampler(random_state=777),
        XGBClassifier(**fixed_params, **sampled_params),
    )

    # The whole pipeline is cross-validated, so preprocessing and
    # over-sampling are re-fitted for every fold.
    fold_scores = cross_val_score(
        estimator=classifier,
        X=X_train,
        y=y_train,
        cv=3,
        scoring='roc_auc',
    )
    CrossValMean = fold_scores.mean()

    print("CrossValMean:", CrossValMean)

    # fmin minimizes, so a higher AUC must map to a lower loss.
    return {'loss': 1 - CrossValMean, 'status': STATUS_OK}

# Search space sampled by hyperopt for the XGBoost hyperparameters.
# 'max_depth' and 'n_estimators' are hp.choice dimensions; the rest are
# quantized uniform ranges.
space = {
    'max_depth' : hp.choice('max_depth', range(5, 50, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(20, 500, 10)),
    'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}

# Run 300 TPE trials, minimizing the loss returned by objective.
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=300,
            trials=trials)

print("Best: ", best)

# For hp.choice dimensions fmin reports the INDEX of the selected option, not
# the option itself (e.g. max_depth index 6 means range(5, 50)[6] == 11).
# Decode the result so the printed values can be passed to the model as-is.
from hyperopt import space_eval

best_params = space_eval(space, best)
print("Best (decoded): ", best_params)
#%%
# Refit the full pipeline on the training split with fixed hyperparameters.
# NOTE(review): these values are hard-coded, presumably copied from a hyperopt
# run. fmin reports hp.choice dimensions as option indexes, so confirm that
# max_depth and n_estimators here are decoded values, not indexes.
tuned_xgb = XGBClassifier(
    n_jobs=-1,
    verbosity=3,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    tree_method='hist',
    n_estimators=370,
    max_depth=6,
    learning_rate=0.16,
    gamma=0.45,
    min_child_weight=7.0,
    subsample=0.52,
    colsample_bytree=0.76,
)
grid_clf = make_pipeline(
    preprocess,
    RandomOverSampler(random_state=777),
    tuned_xgb,
)

print(grid_clf)
#%%

clf = grid_clf.fit(X_train, y_train)
#, xgbclassifier__early_stopping_rounds=20, xgbclassifier__eval_set=[(X_test, y_test)],xgbclassifier__eval_metric='refit_score')
#%%
# Hard class predictions feed the threshold-dependent reports below; compute
# them once and reuse them.
y_predicted = grid_clf.predict(X_test)
print(classification_report(y_test, y_predicted))
#%%
print(confusion_matrix(y_test, y_predicted))
#%%
from sklearn.metrics import roc_auc_score
# ROC AUC is a ranking metric and must be computed from continuous scores.
# Scoring the hard 0/1 labels from predict() collapses the ROC curve to a
# single operating point and understates the AUC, which makes the hold-out
# number incomparable with cross_val_score(scoring='roc_auc'), which scores
# the classifier's continuous outputs. Column 1 of predict_proba is the
# probability of the second class in classes_ (the positive class for the
# binary:logistic objective used here).
y_scores = grid_clf.predict_proba(X_test)[:, 1]
r_adj_test = roc_auc_score(y_test, y_scores)
print(r_adj_test )