Problems with a self-implemented logistic regression

data-mining machine-learning python scikit-learn logistic-regression implementation
2022-03-02 21:37:10

I'm trying to implement a logistic regression algorithm from scratch as a self-study exercise, but I'm having trouble reaching accuracy comparable to sklearn's logistic regression.

Here is the code I'm using (the dataset is the Titanic 'train.csv' dataset from Kaggle; it can be downloaded there if you want to test this yourself).

import numpy as np
import random
import matplotlib.pyplot as plt
#%matplotlib inline

def cost(X, Y, W):
    """
    X = matrix of features
    Y = vector of ground truth labels
    W = vector of weight parameters
    """
    m = len(Y)
    if isinstance(Y, list):
        Y = np.array(Y)
    # flatten to 1-D: Y arrives as an (m, 1) column while sigmoid(W, X)
    # is (1, m), and multiplying those would broadcast to an (m, m) matrix
    h = np.ravel(sigmoid(W, X))
    Y = np.ravel(Y)
    return -(1/m) * np.sum(Y*np.log(h) + (1-Y)*np.log(1-h))

def sigmoid(w, x):
    """Hypothesis function of the logisitic regression.
    
    w = column vector of weights
    x = column vector of features
    """

    z = np.dot(w.T, x)
    return 1 / (1 + np.exp(-z))

def grad_descent(A, w, y, 
                 lr = 0.01, 
                 stochastic = False, 
                 max_iter = 1000, 
                 mute = True, 
                 plot = True):
    """
    A = design matrix
    w = weights column vector
    y = ground truth label
    lr = learning rate
    stochastic = whether to use stochastic gradient descent or not
    max_iter = maximum number of epochs to train for
    mute = whether to print the current epoch to the screen
    plot = whether to plot the loss function after training ends
    """
    if not isinstance(A, np.ndarray):
        m = "A must be a numpy array, got %s"
        raise TypeError(m % type(A).__name__)
    
    if isinstance(y, list):
        y = np.array(y)
        y = y.T
        y = np.expand_dims(y, axis = 1)
    if isinstance(w, list):
        w = np.array(w)
        # Make w a column vector
        w.shape = (A.shape[1], 1)
    
    losses = []
    i = 0
    
    while i < max_iter:
        old_weights = w
        # create/update the vector of predicted probabilities, shape (m, 1)
        alpha = np.array([sigmoid(w, A[j, :]) for j in range(A.shape[0])])
        if not mute:
            print("Epoch %d" % (i+1))
        
        if stochastic:
            # stochastic grad descent chooses a training point at random
            # so here we choose a random row from the matrix A
            rand = random.randint(0, A.shape[0]-1)
            # select random entries
            temp_A = A[rand].T
            temp_A = temp_A.reshape(A.shape[1], 1)
            temp_b = alpha[rand] - y[rand]
            
            # Calc gradient
            grad = np.dot(temp_A, (temp_b))
            # Update weights
            w = (w.T - (lr * grad)).T
            
        # perform batch gradient descent
        else:
            # number of samples
            m = len(y)
            # Calc gradient
            grad = (1/m) * np.dot(A.T, (alpha - y))
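            # (this is the batch gradient of the average cross-entropy loss:
            #  dJ/dw = (1/m) * A^T (sigmoid(Aw) - y))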
            # Update weights
            w = w - (lr * grad)
        
        if i != 0:
            # if the loss starts increasing then stop and return the
            # weights from before this update
            if cost(A.T, y, w) > losses[-1]:
                print("Stopping at epoch %d" % i)
                if plot:
                    print('Losses')
                    plt.plot(losses)
                return old_weights
        
        # Track the loss function value
        losses.append(cost(A.T, y, w))
        
        # iterate epoch counter
        i += 1
        
    print("Stopping at epoch %d" % i)
    if plot:
        print('Losses')
        plt.plot(losses)
    
    return w


#############################################################################
#############################################################################
#############################################################################
if __name__ == "__main__":

    import pandas as pd

    train = pd.read_csv(r'C:\Users\LENOVO\Documents\Self_Study\titanic\train.csv')
    
    # convert the Sex column to zeros and ones
    train['Sex'] = train['Sex'].map({'female': 1, 'male': 0})
    
    # Some passengers have a fare of zero (the column minimum); replace these with a more plausible value
    rows = np.where(train.Fare == np.min(train.Fare))
    # assign the mean fare value for the given class these people were staying in
    class_ = train.iloc[rows[0], 2].values

    for clas, row in zip(class_, rows[0]):
        
        # get the mean
        Pclass = train.loc[(train['Pclass'] == clas)]
        c_mean = np.mean(Pclass['Fare'])
        # assign the value to the proper row
        train.iloc[row, 9] = c_mean

        
    train.head()  # (only displays output in a notebook / interactive session)
    
    
    # set a learning rate
    lr = 0.01
    
    sexes = train.Sex
    fares = train.Fare
    
    # scale the fare values by the maximum so they fall in the range 0 - 1
    max_fare = np.max(fares)
    fares = fares/max_fare
    
    # put into matrix format
    A = np.array([[1, s, f] for s, f in zip(sexes, fares)])
    # create initial weights
    w = [0, 0, 0]
    # get the ground truth
    y = list(train.Survived)
    
    # train the model
    weights = grad_descent(A, w, y, lr = lr,
                           stochastic = False)
    
    # Let's use these weights to make predictions on the training data and see how it looks
    def classification(weights, features):
        prob = sigmoid(weights, features)
        if prob > .5:
            return 1
        else:
            return 0
    
    correct = 0
    
    for i, row in train.iterrows():
        # scale Fare the same way as during training; the learned weights
        # expect features in the 0 - 1 range
        fare = row['Fare'] / max_fare
        sex = row['Sex']
        features = np.array([1, sex, fare])
        pred = classification(weights, features)
            
        if row['Survived'] == pred:
            correct += 1
    
    print(correct/len(train.index))


In the end I get about 65% accuracy, whereas with sklearn I can get 78%. I know sklearn's algorithm is probably much more sophisticated than mine, but I was hoping to at least get close (somewhere in the 70s, say). Any suggestions?
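
(For reference, a minimal sketch of how the sklearn comparison could look, reusing sexes, fares and train from the script above; no bias column is needed because sklearn fits its own intercept.)

from sklearn.linear_model import LogisticRegression

X = np.column_stack([sexes, fares])   # same two features, no bias column
clf = LogisticRegression().fit(X, train['Survived'])
print("sklearn training accuracy: %.3f" % clf.score(X, train['Survived']))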

1 Answer

You are using (stochastic) gradient descent. For it to work properly, the learning rate (step size) has to be set correctly. I think that is where the error lies.
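
A quick way to check is to sweep a few step sizes and compare training accuracy. A rough sketch, assuming the grad_descent and classification helpers plus A and y from the question are in scope:

import numpy as np

for lr in (0.001, 0.01, 0.1, 1.0):
    w = grad_descent(A, [0, 0, 0], y, lr = lr, plot = False)
    preds = [classification(w, A[j, :]) for j in range(A.shape[0])]
    acc = np.mean(np.array(preds) == np.ravel(y))
    print("lr = %g -> training accuracy %.3f" % (lr, acc))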

Instead, you could try logistic regression via IRLS (iteratively reweighted least squares; see its definition), and also compare how IRLS behaves versus gradient descent.
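
For completeness, a minimal IRLS (Newton-Raphson) sketch for logistic regression, assuming a design matrix A whose first column is the bias term and a 0/1 label vector y as in the question (the function name is just illustrative):

import numpy as np

def irls_logistic(A, y, n_iter = 25, tol = 1e-8):
    """Fit logistic regression weights by iteratively reweighted least squares."""
    y = np.ravel(np.asarray(y, dtype = float))
    w = np.zeros(A.shape[1])
    for _ in range(n_iter):
        p = 1.0 / (1.0 + np.exp(-A @ w))   # current predicted probabilities
        S = p * (1.0 - p)                  # per-sample variances = IRLS weights
        H = A.T @ (A * S[:, None])         # Hessian of the negative log-likelihood
        g = A.T @ (y - p)                  # gradient of the log-likelihood
        step = np.linalg.solve(H + 1e-10 * np.eye(A.shape[1]), g)
        w = w + step                       # full Newton update, no learning rate
        if np.linalg.norm(step) < tol:
            break
    return w

Since each update is a full Newton step, IRLS typically converges in a handful of iterations and there is no step size to tune.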

Or, for the inputs you tested, you have simply found a bad local optimum.