How to plot learning curves and validation curves when using a pipeline

data-mining  classification  scikit-learn
2021-09-14 09:28:53

I would be grateful if you could explain two things about the following example code:

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve, validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()


def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()

X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
print('Original dataset shape {}'.format(Counter(y)))

ln = X.shape
names = ["x%s" % i for i in range(1, ln[1] + 1)]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
st = StandardScaler()

rg = LogisticRegression(class_weight={0: 1, 1: 6.5}, random_state=42, solver='saga', max_iter=100, n_jobs=-1)

param_grid = {'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, 0.0006, 0.0005],
              'clf__class_weight': [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]
              }

pipeline = Pipeline(steps=[('scaler', st),
                           ('clf', rg )])

cv = StratifiedKFold(n_splits=5, random_state=42)
rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))

ypred = rg_cv.predict(X_train)
print(classification_report(y_train, ypred))
print('######################')
ypred2 = rg_cv.predict(X_test)
print(classification_report(y_test, ypred2))

plt.figure(figsize=(9, 6))
param_range1 = [i / 10000.0 for i in range(1, 11)]
param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]

if __name__ == '__main__':
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.arange(0.1, 1.1, 0.1), cv=cv, scoring='f1', n_jobs=-1)

    plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning curve for Logistic Regression')

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__C", param_range=param_range1,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range1, train_scores, test_scores, title="Validation Curve for C", alpha=0.1)

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)

  • Why, when GridSearchCV's best estimator is passed to the learning_curve function, does it re-print all of the earlier print output several times?

  • How can I plot a validation curve for class_weight? I get: TypeError: float() argument must be a string or a number, not 'dict'

2 Answers

Regarding both questions, the code should be changed as shown below. The repeated print output happens because learning_curve and validation_curve are run with n_jobs=-1: joblib spawns worker processes that re-import the script (notably on Windows), so every statement at module level executes again in each worker. Putting all executable code under an if __name__ == '__main__': guard prevents that. The TypeError occurs because the class_weight values are dicts, which Matplotlib cannot place on a numeric axis; extracting the numeric weight of class 1 from each dict before plotting fixes it:

from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve, validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt


def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()


def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    # class_weight arrives as a list of dicts such as {0: 1, 1: 6}; extract the
    # numeric weight of class 1 so the values can be placed on a numeric axis.
    # Plain numeric ranges (e.g. the C values) pass through unchanged.
    if isinstance(param_range[0], dict):
        param_range = [x[1] for x in param_range]
    # sort by parameter value so the line plot does not zigzag
    sort_idx = np.argsort(param_range)
    param_range = np.array(param_range)[sort_idx]
    train_mean = np.mean(train_scores, axis=1)[sort_idx]
    train_std = np.std(train_scores, axis=1)[sort_idx]
    test_mean = np.mean(test_scores, axis=1)[sort_idx]
    test_std = np.std(test_scores, axis=1)[sort_idx]
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('Mean F1-score and standard deviation')
    plt.legend(loc='best')
    plt.show()


if __name__ == '__main__':
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    print('Original dataset shape {}'.format(Counter(y)))

    ln = X.shape
    names = ["x%s" % i for i in range(1, ln[1] + 1)]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    st = StandardScaler()

    rg = LogisticRegression(class_weight={0: 1, 1: 6.5}, random_state=42, solver='saga', max_iter=100, n_jobs=-1)

    param_grid = {'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, .0006, 0.0005],
                  'clf__class_weight': [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]
                  }

    pipeline = Pipeline(steps=[('scaler', st),
                               ('clf', rg)])

    cv = StratifiedKFold(n_splits=5, random_state=42)
    rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
    rg_cv.fit(X_train, y_train)
    print("Tuned rg best params: {}".format(rg_cv.best_params_))

    ypred = rg_cv.predict(X_train)
    print(classification_report(y_train, ypred))
    print('######################')
    ypred2 = rg_cv.predict(X_test)
    print(classification_report(y_test, ypred2))

    plt.figure(figsize=(9, 6))
    param_range1 = [i / 10000.0 for i in range(1, 11)]
    param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}, {0: 1, 1: 4.5}, {0: 1, 1: 5}]

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.arange(0.1, 1.1, 0.1), cv=cv, scoring='f1', n_jobs=-1)

    plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning curve for Logistic Regression')

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__C", param_range=param_range1,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range1, train_scores, test_scores, title="Validation Curve for C", alpha=0.1)

    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight", param_range=param_range2,
        cv=cv, scoring="f1", n_jobs=-1)

    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)
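
For the second question, note that validation_curve itself accepts dicts in param_range (each candidate value is applied to the pipeline via set_params, and class_weight takes a dict); only the plotting needs plain numbers. The extraction in plot_validation_curve above reduces each dict to the weight of class 1, for example:

param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}]
weights = [d[1] for d in param_range2]  # [6, 4, 5.5] -- plottable on a numeric axis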

As of sklearn 0.22, running the example provided in the question emits a FutureWarning announcing that sklearn 0.24 will raise an error on this line:

cv = StratifiedKFold(n_splits=5, random_state=42)

\lib\site-packages\sklearn\model_selection\_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
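
To surface the warning deterministically while testing (a generic warnings-module sketch, not part of the original answer), the FutureWarning can be promoted to an exception:

import warnings
from sklearn.model_selection import StratifiedKFold

with warnings.catch_warnings():
    warnings.simplefilter('error', FutureWarning)      # treat FutureWarning as an error
    cv = StratifiedKFold(n_splits=5, random_state=42)  # raises under sklearn 0.22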

From the StratifiedKFold documentation:

shuffle : boolean, optional
    Whether to shuffle each class's samples before splitting into batches.
random_state : int, RandomState instance or None, optional, default=None
    If int, random_state is the seed used by the random number generator;
    If RandomState instance, random_state is the random number generator;
    If None, the random number generator is the RandomState instance used
    by np.random. Only used when shuffle is True. This should be left to
    None if shuffle is False.

Notes
The implementation is designed to: (...)
Preserve order dependencies in the dataset ordering, when shuffle=False:
all samples from class k in some test set were contiguous in y, or
separated in y by samples from classes other than k.

Apart from the FutureWarning that needs to be addressed, the default shuffle=False ensures reproducibility, since it preserves the dataset ordering. So far so good.

However, unless the dataset ordering is known to be random itself, only StratifiedKFold(..., shuffle=True) guarantees that no dataset-ordering bias leaks into the folds produced by StratifiedKFold.
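
A minimal sketch on hypothetical toy data: with shuffle=False the fold assignment is a pure function of the dataset order, while shuffle=True randomizes the assignment within each class:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(10, 2)               # 10 toy samples
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])   # deliberately ordered labels

# shuffle=False (default): folds follow the dataset order on every run
for _, test_idx in StratifiedKFold(n_splits=5).split(X, y):
    print(test_idx)

# shuffle=True: each class's samples are shuffled before being assigned,
# so an ordering bias in the data cannot leak into the folds
for _, test_idx in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y):
    print(test_idx)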

Since the dataset generator make_classification is used here with its default shuffle=True, there is no dataset-ordering bias problem this time:

From the make_classification documentation:

Without shuffling, X horizontally stacks features in the following order: the primary n_informative features, followed by n_redundant linear combinations of the informative features, followed by n_repeated duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns X[:, :n_informative + n_redundant + n_repeated].
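
To illustrate with a hypothetical call (not the question's settings): with shuffle=False the informative and redundant features land in the leading columns and the samples are grouped by class; the default shuffle=True, as used in the question, permutes both rows and columns:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=20, n_informative=3,
                           n_redundant=1, shuffle=False, random_state=10)
# without shuffling, the useful features sit in X[:, :4]
# (n_informative + n_redundant + n_repeated columns) and y is grouped by class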

To resolve the future error, remove the ineffective random_state=42:

cv = StratifiedKFold(n_splits=5)

To resolve the future error when the dataset ordering cannot be guaranteed to be random:

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

or even

cv = StratifiedShuffleSplit(n_splits=5, random_state=42)
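
For completeness, a sketch of the two alternatives (StratifiedShuffleSplit needs its own import). The two are not interchangeable: a shuffled StratifiedKFold still places every sample in exactly one test fold, whereas StratifiedShuffleSplit draws each split independently, so test sets may overlap:

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

# shuffled stratified k-fold: disjoint test folds that together cover the dataset
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# stratified shuffle split: n_splits independent random splits; test sets may overlap
cv = StratifiedShuffleSplit(n_splits=5, random_state=42)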