I am trying to predict the probability of default on the Lending Club dataset. I use the hyperopt library to tune the hyperparameters of an XGBClassifier, with the goal of maximizing the ROC AUC score. Random oversampling is applied inside the pipeline, and cross-validation is run over the whole pipeline. The problem is that the parameters hyperopt finds via cross-validation give a very different score once I fit the model on the entire training data and compute ROC AUC on a held-out validation set. Despite the cross-validation, the model appears to be overfitting, and I am not sure what to do about it.
Cross-validation score: 0.74
Validation score: 0.66
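For context, here is the overall shape of what I am doing as a minimal, self-contained sketch (synthetic data from make_classification, a simplified pipeline without my preprocessing, and arbitrary hyperparameters, so the numbers it prints are meaningless): the oversampler sits inside an imblearn pipeline, cross_val_score scores the whole pipeline with ROC AUC on the training split, and the refitted pipeline is then scored once on a held-out split from its predicted probabilities.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

# toy imbalanced data standing in for the Lending Club features/target
X, y = make_classification(n_samples=2000, n_features=20, weights=[0.85, 0.15], random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# the sampler lives inside the pipeline, so it is re-fitted on each training fold
# and never touches the fold (or the hold-out set) being scored
pipe = make_pipeline(RandomOverSampler(random_state=0),
                     XGBClassifier(n_estimators=100, max_depth=4, tree_method='hist'))

cv_auc = cross_val_score(pipe, X_tr, y_tr, cv=3, scoring='roc_auc').mean()

# refit on the full training split and score once on the held-out split
pipe.fit(X_tr, y_tr)
holdout_auc = roc_auc_score(y_val, pipe.predict_proba(X_val)[:, 1])
print("CV ROC AUC:", cv_auc, "holdout ROC AUC:", holdout_auc)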
My full code is below:
# imports used by the code below; make_pipeline comes from imblearn so that the
# RandomOverSampler step is applied only when the pipeline is fitted, not when it predicts
from sklearn.experimental import enable_iterative_imputer  # must precede the IterativeImputer import
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

# creating lists of numerical, text, categorical and ordinal features for the preprocessing step
numerical_features = sorted(features.select_dtypes(include=['float64']).columns)
categorical_features = sorted(features.select_dtypes(exclude=['float64']).columns)
text_features = ['emp_title', 'title']
ordinal_features = ['grade']
categorical_features.remove('emp_title')
categorical_features.remove('title')
categorical_features.remove('grade')
numerical_features.remove('int_rate')
#%%
numerical_features.remove('total_pymnt')
# label encoding the label/target variable, combining the different classes
#le = preprocessing.LabelEncoder()
#eh=le.fit_transform(target)
#%%
#creating training and validation sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,random_state=777)
#%%
preprocess = make_column_transformer(
    (make_pipeline(IterativeImputer(initial_strategy='median', add_indicator=True, verbose=2, max_iter=100),
                   StandardScaler()), numerical_features),
    (make_pipeline(SimpleImputer(strategy='constant', fill_value="Not Available", add_indicator=True),
                   OneHotEncoder(handle_unknown='ignore')), categorical_features),
    (OrdinalEncoder(), ordinal_features))
#RandomOverSampler(sampling_strategy=sampling,random_state=777)
#%%
# class distribution in the training set
import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
counts2 = np.asarray((unique, counts)).T
#%%
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
def objective(space):
    classifier = make_pipeline(
        preprocess,
        RandomOverSampler(random_state=777),
        XGBClassifier(n_jobs=-1, verbosity=3,
                      objective='binary:logistic',
                      nthread=-1,
                      scale_pos_weight=1,
                      seed=27, tree_method='hist',
                      n_estimators=space['n_estimators'],
                      max_depth=int(space['max_depth']),
                      learning_rate=space['learning_rate'],
                      gamma=space['gamma'],
                      min_child_weight=space['min_child_weight'],
                      subsample=space['subsample'],
                      colsample_bytree=space['colsample_bytree']))
    # Applying k-Fold Cross Validation
    from sklearn.model_selection import cross_val_score
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=3, scoring='roc_auc')
    CrossValMean = accuracies.mean()
    print("CrossValMean:", CrossValMean)
    return {'loss': 1 - CrossValMean, 'status': STATUS_OK}
space = {
    'max_depth': hp.choice('max_depth', range(5, 50, 1)),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators': hp.choice('n_estimators', range(20, 500, 10)),
    'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=300,
            trials=trials)
print("Best: ", best)
#%%
# training the final model with the parameters found by hyperopt
grid_clf = make_pipeline(
    preprocess,
    RandomOverSampler(random_state=777),
    XGBClassifier(n_jobs=-1, verbosity=3,
                  objective='binary:logistic',
                  nthread=-1,
                  scale_pos_weight=1,
                  seed=27, tree_method='hist',
                  n_estimators=370,
                  max_depth=6,
                  learning_rate=0.16,
                  gamma=0.45,
                  min_child_weight=7.0,
                  subsample=0.52,
                  colsample_bytree=0.76))
print(grid_clf)
#%%
clf = grid_clf.fit(X_train, y_train)
#, xgbclassifier__early_stopping_rounds=20, xgbclassifier__eval_set=[(X_test, y_test)],xgbclassifier__eval_metric='refit_score')
#%%
print(classification_report(y_test, grid_clf.predict(X_test)))
#%%
print(confusion_matrix(y_test, grid_clf.predict(X_test)))
#%%
from sklearn.metrics import roc_auc_score
# ROC AUC on the held-out validation set, computed from hard class labels via predict()
y_predicted = grid_clf.predict(X_test)
r_adj_test = roc_auc_score(y_test, y_predicted)
print(r_adj_test)