我正在学习机器学习,在阅读了有关逻辑回归的材料后,我尝试从头开始在 python 中使用梯度下降实现逻辑回归。
它在某些情况下效果很好,但在某些情况下会导致数学错误,如果我们看到下面的情况,这是可以理解的。
逻辑回归中的成本函数是 -( ylog(predicted) + (1-y)log(1-predicted))
当预测为 1 时会发生什么?代码失败,因为它试图计算未定义的 log(1-1) = log(0)。明确地,我们在 python 中得到了这个错误
ValueError('数学域错误')
请帮助我了解如何防止这种情况。
代码如下:
from numpy.random import RandomState
import pandas as panda
import matplotlib.pyplot as plot
import random
from math import sqrt, exp, log
remote_location = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
def standard_deviation(values):
average = sum(values) / len(values)
variance = sum([(average - i)**2/len(values) for i in values])
return sqrt(variance)
class LogisticRegression(object):
def __init__(self, epochs, learning_rate, _x_training_set, _y_training_set, standardize = False, random_state = None):
self.epochs = epochs
self.learning_rate = learning_rate
self.standardize = standardize
self._x_training_set = _x_training_set
self._y_training_set = _y_training_set
self.number_of_training_set = len(self._y_training_set)
self.weights = []
self.random_state = RandomState(random_state if random_state else 1)
def standardizeInputData(self):
"""
Standardizing of feature set means substracting the mean of
each training sample from the feature value and dividing it by
the standard deviation
1. take average of j features from i th training sample . say avg
2. calculate the variance of each j feature
3. variance(j) = (avg - x(j))**2/len(features)
4. standard deviation of x(j) = sq rt(variance(j))
so standardized(x(j)) = x(j) - avg / standard deviation(x(j))
"""
temp = []
for i in range(len(self._x_training_set)):
mean = sum(self._x_training_set[i])/ len(self._x_training_set[i])
std_deviation = standard_deviation(self._x_training_set[i])
temp.append([ (j - mean)/std_deviation for j in self._x_training_set[i]])
return temp
def setup(self):
if self.standardize:
self._x_training_set = self.standardizeInputData()
self.initialize_weights(len(self._x_training_set[0]) + 1)
def initialize_weights(self, number_of_weights):
self.weights = list(self.random_state.normal(loc = 0.0, scale = 0.01, size = len(self._x_training_set[0]) + 1))
def learn(self):
self.setup()
epoch_data = {}
error = 0
for epoch in range(self.epochs):
cost =0
for i in range(self.number_of_training_set):
_x = self._x_training_set[i]
_desired = self._y_training_set[i]
_weight = self.weights
weighted_sum = _weight[0] + sum([_weight[j+1] * _x[j] for j in range(len(_x))])
guess = 1 / ( 1 + exp(- weighted_sum))
error = _desired - guess
## i am going to reset all the weights
if error!= 0 :
## resetting the bias unit
self.weights[0] = error * self.learning_rate
self.weights[1:] =[self.weights[j+1] + error * self.learning_rate * _x[j] \
for j in range(len(_x))]
## cost entropy loss function
cost+= - ( _desired * log(guess) + (1 - _desired) *log(1-guess))
#saving error at the end of the training set
epoch_data[epoch] = cost ##summation of all such y predictions for a training set
print(epoch_data)
def predict(self, _x_test_data):
"""
Given algorithm has been trained using the #learn method
this method will predict the y values based on the last
values calculated for weights. This is because
by the end of the learn method, algorithm has already
converged as close to 0 error as it can
"""
prediction = []
for i in range(len(_x_test_data)):
weighted_sum = self.weights[0] + \
sum([self.weights[j+1] * _x_test_data[i][j] \
for j in range(len(_x_test_data[i]))])
guess = 1 / ( 1 + exp(- weighted_sum))
prediction.append( 1 if guess >= 0.5 else 0)
print(prediction)
return prediction
客户端代码:
import pandas as panda
from sklearn.model_selection import train_test_split
from predicting_logistic_regression import LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn import datasets
remote_location = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# data = panda.read_csv(remote_location)
# _x_training_set = list(data.iloc[0:, [0,2]].values)
# _y_training_set = [0 if i.lower()!='iris-setosa' else 1 for i in data.iloc[0:, 4].values]
data = datasets.load_iris()
_x_training_set = data.data[:,[2,3]]
_y_training_set = data.target
_x_train, _x_test, _y_train, _y_test = train_test_split( \
_x_training_set,\
_y_training_set, \
test_size = 0.3, \
random_state = 1, \
stratify = _y_training_set)
random_generator_start = -1
random_generator_end = 1
logistic_regression = LogisticRegression( \
learning_rate = 0.01, \
epochs = 40, \
_x_training_set = _x_train, \
_y_training_set = _y_train,
standardize= False
)
logistic_regression.learn()
_y_predicted = logistic_regression.predict(_x_test)
print(_y_predicted)
print(_y_test)
print(accuracy_score(_y_test, _y_predicted))
print(mean_absolute_error(_y_test, _y_predicted))