数据挖掘 - 当训练分数高而测试分数低时，如何避免岭回归和套索回归中的过度拟合或数据泄漏？ - 吾爱随笔录

我使用了这里提供的代码： https ://towardsdatascience.com/ridge-and-lasso-regression-a-complete-guide-with-python-scikit-learn-e20e34bcbf0b

唯一的区别是我在下面给出的数据上使用了 StandardScalar：

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)
print len(X_test), len(y_test)

这是我的岭回归结果： linear regression train score: 1.0 linear regression test score: -0.07550729376673715 ridge regression train score low alpha: 0.9999999970240117 ridge regression test score low alpha: -0.07532716978805554 ridge regression train score high alpha: 0.8659167364307487 ridge regression test score high alpha: 0.013702748149851396

我的套索结果： training score: 0.48725444995774625 test score: -0.3393210376146986 number of features used: 4 training score for alpha=0.01: 0.9998352085084429 test score for alpha =0.01: -0.6995903332119675 number of features used: for alpha =0.01: 24 training score for alpha=0.0001: 0.9999999830932269 test score for alpha =0.0001: -0.7189894474663594 number of features used: for alpha =0.0001: 25 LR training score: 1.0 LR test score: -0.7217224228737649

我无法理解为什么我会得到这样的结果！非常感谢任何帮助。

编辑：代码如下

    #Importing modules

        import sys
        import math 
        import itertools
        import numpy as np
        import pandas as pd
        from numpy import genfromtxt
        from matplotlib import style
        import matplotlib.pyplot as plt
        from sklearn import linear_model
        from matplotlib import style, figure
        from sklearn.linear_model import Lasso
        from sklearn.linear_model import Ridge
        from sklearn.linear_model import LinearRegression
        from sklearn.cross_validation import train_test_split

    #Importing data
    df = np.genfromtxt('/Users/pfc.csv', delimiter=',')

    X = df[0:,1:298]
    y = df[0:,0]
    print (X).shape
    print (y).shape
    display (X)
    display (y)
    print (y)



#print type(newY)# pandas core frame
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=4)

  #Apply StandardScaler for feature scaling
        from sklearn.preprocessing import StandardScaler
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform (X_test)
        print len(X_test), len(y_test)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    rr = Ridge(alpha=0.01) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely restricted and in this case linear and ridge regression resembles

    from sklearn.metrics import mean_squared_error
    from math import sqrt
    rr.fit(X_train, y_train)
    rr100 = Ridge(alpha=115.5) #  comparison with alpha value
    rr100.fit(X_train, y_train)
    train_score=lr.score(X_train, y_train)
    test_score=lr.score(X_test, y_test)
    Ridge_train_score = rr.score(X_train,y_train)
    Ridge_test_score = rr.score(X_test, y_test)
    Ridge_train_score100 = rr100.score(X_train,y_train)
    Ridge_test_score100 = rr100.score(X_test, y_test)

    print "linear regression train score:", train_score
    print "linear regression test score:", test_score
    print "ridge regression train score low alpha:", Ridge_train_score
    print "ridge regression test score low alpha:", Ridge_test_score
    print "ridge regression train score high alpha:", Ridge_train_score100
    print "ridge regression test score high alpha:", Ridge_test_score100
    plt.figure (figsize= (12.8,9.6), dpi =100)
    plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers
    plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency
    plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')
    plt.xlabel('Coefficient Index',fontsize=16)
    plt.ylabel('Coefficient Magnitude',fontsize=16)
    plt.legend(fontsize=13,loc=4)
    plt.show()

    # difference of lasso and ridge regression is that some of the coefficients can be zero i.e. some of the features are 
    # completely neglected
    lasso = Lasso()
    lasso.fit(X_train,y_train)
    train_score=lasso.score(X_train,y_train)
    test_score=lasso.score(X_test,y_test)
    coeff_used = np.sum(lasso.coef_!=0)
    print "training score:", train_score 
    print "test score: ", test_score
    print "number of features used: ", coeff_used
    lasso001 = Lasso(alpha=0.01, max_iter=10e5)
    lasso001.fit(X_train,y_train)
    train_score001=lasso001.score(X_train,y_train)
    test_score001=lasso001.score(X_test,y_test)
    coeff_used001 = np.sum(lasso001.coef_!=0)
    print "training score for alpha=0.01:", train_score001 
    print "test score for alpha =0.01: ", test_score001
    print "number of features used: for alpha =0.01:", coeff_used001
    lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
    lasso00001.fit(X_train,y_train)
    train_score00001=lasso00001.score(X_train,y_train)
    test_score00001=lasso00001.score(X_test,y_test)
    coeff_used00001 = np.sum(lasso00001.coef_!=0)
    print "training score for alpha=0.0001:", train_score00001 
    print "test score for alpha =0.0001: ", test_score00001
    print "number of features used: for alpha =0.0001:", coeff_used00001
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    lr_train_score=lr.score(X_train,y_train)
    lr_test_score=lr.score(X_test,y_test)
    print "LR training score:", lr_train_score 
    print "LR test score: ", lr_test_score
    plt.figure (figsize= (12.8,9.6), dpi =100)
    plt.subplot(1,2,1)
    plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
    plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
    plt.xlabel('Coefficient Index',fontsize=16)
    plt.ylabel('Coefficient Magnitude',fontsize=16)
    plt.legend(fontsize=13,loc=4)
    plt.subplot(1,2,2)
    plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
    plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
    plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=6,color='black',label=r'Lasso; $\alpha = 0.00001$') # alpha here is for transparency
    plt.plot(lr.coef_,alpha=0.7,linestyle='none',marker='o',markersize=5,color='green',label='Linear Regression',zorder=2)
    plt.xlabel('Coefficient Index',fontsize=16)
    plt.ylabel('Coefficient Magnitude',fontsize=16)
    plt.legend(fontsize=13,loc=4)
    plt.tight_layout()
    plt.show()

PS：请忽略缩进。