我正在尝试自我实现一个逻辑回归算法来进行一些自我学习,但是我在实现与 sklearn 的逻辑回归相似的准确性方面遇到了一些麻烦。
这是我正在使用的代码(我正在使用的数据集是来自 kaggle 的 Titanic 'training.csv' 数据集,如果你想自己测试,可以在这里下载。)
import numpy as np
import random
import matplotlib.pyplot as plt
#%matplotlib inline
def cost(X, Y, W):
"""
x = matrix of features
y = vector of ground truth labels
w = vector of weight parameters
"""
m = len(Y)
if isinstance(Y, list):
Y = np.array(Y)
return -(1/m) * np.sum([Y*np.log(sigmoid(W, X)), (1-Y)*np.log(1-sigmoid(W, X))])
def sigmoid(w, x):
"""Hypothesis function of the logisitic regression.
w = column vector of weights
x = column vector of features
"""
z = np.dot(w.T, x)
return 1 / (1 + np.exp(-z))
def grad_descent(A, w, y,
lr = 0.01,
stochastic = False,
max_iter = 1000,
mute = True,
plot = True):
"""
A = design matrix
w = weights column vector
y = ground truth label
lr = learning rate
stochastic = whether to use stochastic gradient descent or not
max_iter = maximum number of epochs to train for
mute = whether to print the current epoch to the screen
plot = whether to plot the loss function after training ends
"""
if not isinstance(A, np.ndarray):
m = "A must be a numpy array, got %s"
raise TypeError(m % type(A).__name__)
if isinstance(y, list):
y = np.array(y)
y = y.T
y = np.expand_dims(y, axis = 1)
if isinstance(w, list):
w = np.array(w)
# Make w a column vector
w.shape = (A.shape[1], 1)
losses = []
i = 0
while i < max_iter:
old_weights = w
# create/update the alpha vector
alpha = [sigmoid(w, A[i, :]) for i in range(A.shape[0])]
if not mute:
print("Epoch %d" % (i+1))
if stochastic:
# stochastic grad descent chooses a training point at random
# so here we choose a random row from the matrix A
rand = random.randint(0, A.shape[0]-1)
# select random entries
temp_A = A[rand].T
temp_A = temp_A.reshape(A.shape[1], 1)
temp_b = alpha[rand] - y[rand]
# Calc gradient
grad = np.dot(temp_A, (temp_b))
# Update weights
w = (w.T - (lr * grad)).T
# perform batch gradient descent
else:
# number of samples
m = len(y)
# Calc gradient
grad = (1/m) * np.dot(A.T, (alpha - y))
# Update weights
w = w - (lr * grad)
if i != 0:
# if loss starts increasing then stop
if cost(A.T, y, w) > losses[-1]:
print("Stopping at epoch %d" % i)
if plot:
print('Losses')
plt.plot(losses)
return old_weights
break
# Track the loss function value
losses.append(cost(A.T, y, w))
# iterate epoch counter
i += 1
print("Stopping at epoch %d" % i)
if plot:
print('Losses')
plt.plot(losses)
return w
#############################################################################
#############################################################################
#############################################################################
if __name__ == "__main__":
import pandas as pd
train = pd.read_csv(r'C:\Users\LENOVO\Documents\Self_Study\titanic\train.csv')
# convert the Sex column to zeros and ones
train['Sex'] = train['Sex'].map({'female': 1, 'male': 0})
# There are some zero values in the fare column, replace these with a more likely value
rows = np.where(train.Fare == np.min(train.Fare))
# assign the mean fare value for the given class these people were staying in
class_ = train.iloc[rows[0], 2].values
for clas, row in zip(class_, rows[0]):
# get the mean
Pclass = train.loc[(train['Pclass'] == clas)]
c_mean = np.mean(Pclass['Fare'])
# assign the value to the proper row
train.iloc[row, 9] = c_mean
train.head()
# set a learning rate
lr = 0.01
sexes = train.Sex
fares = train.Fare
# scale the fare value by dividing by the highest value so we get a range of 0 - 1
fares = fares/np.max(fares)
# put into matrix format
A = np.array([[1, s, f] for s, f in zip(sexes, fares)])
# create initial weights
w = [0, 0, 0]
# get the ground truth
y = list(train.Survived)
# train the model
weights = grad_descent(A, w, y, lr = 0.01,
stochastic = False)
# Lets use these weights to make predictions on the training data and see how it looks
def classification(weights, features):
prob = sigmoid(weights, features)
if prob > .5:
return 1
else:
return 0
correct = 0
for i, row in train.iterrows():
fare = row['Fare']
sex = row['Sex']
A = np.array([[1, sex, fare]])
A.shape = (1, 3)
pred = classification(weights, A[0,:])
if row['Survived'] == pred:
correct += 1
print(correct/len(train.index))
最后我得到大约 65% 的准确率,而使用 sklearn 我可以得到 78% 的准确率。我知道 sklearn 的算法可能比我的复杂得多,但我希望我至少可以接近(可能在 70 年代)。有什么建议吗?