I've implemented my own gradient descent algorithm for OLS; the code is below. However, when the learning rate is too large (i.e. learn_rate >= .3), my method is unstable: the coefficients blow up and I get an overflow error. I understand that an overly large learning rate gives poor results, since the algorithm takes steps that are too big and keeps overshooting the optimum. Still, given that the OLS loss function is a convex optimization problem, I'm surprised that a large learning rate leads to exploding coefficient estimates. Any insight would be appreciated (coding suggestions too, although I know this isn't really the place for that).
import numpy as np

y1 = np.array([0, 1, 2])
X1 = np.array([[0, 0], [1, 1], [2, 2]])

def gradient_descent(LHS, RHS, N, learn_rate, tol, params, max_iter):
    # Append an intercept column to the design matrix
    constant = np.ones((N, 1))
    RHS = np.hstack((constant, RHS))

    # Gradient of the mean squared error loss, in matrix notation
    def gradient(X, y, p):
        grad_part1 = -2 * np.dot(np.transpose(X), y)
        grad_part2 = 2 * np.dot(np.dot(np.transpose(X), X), p)
        return (1 / len(y)) * (grad_part1 + grad_part2)

    # One gradient descent step
    def param_update(p, lr, X, y):
        return p - lr * gradient(X, y, p)

    # Check whether we already start at the optimum
    old_params = params
    params = param_update(params, learn_rate, RHS, LHS)
    if np.all(np.abs(params - old_params) <= tol):
        return params

    # If not, run gradient descent until convergence or max_iter
    iter = 1
    while np.any(np.abs(params - old_params) > tol) and iter < max_iter:
        old_params = params
        params = param_update(params, learn_rate, RHS, LHS)
        iter += 1
    return [params, iter]

LHS = y1.reshape(len(y1), 1)
RHS = X1

myres = gradient_descent(LHS, RHS, len(LHS), .1, .1, np.array([[1], [1], [1]]), 10000)
myres = gradient_descent(LHS, RHS, len(LHS), .3, .1, np.array([[1], [1], [1]]), 10000)
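Update, in case it's relevant: while digging around I came across the standard stability bound for fixed-step gradient descent on a quadratic, learn_rate < 2/λ_max, where λ_max is the largest eigenvalue of the Hessian of the loss (here (2/N)·XᵀX). Below is a quick sanity check of that bound against my design matrix; this is my own exploratory snippet, not part of the algorithm above, and I'm assuming the bound applies to my setup.

import numpy as np

y1 = np.array([0, 1, 2])
X1 = np.array([[0, 0], [1, 1], [2, 2]])

# Rebuild the design matrix with the intercept, as inside gradient_descent
X = np.hstack((np.ones((len(y1), 1)), X1))

# Hessian of the mean squared error loss: (2/N) * X'X
hessian = (2 / len(y1)) * X.T @ X
lam_max = np.max(np.linalg.eigvalsh(hessian))

print(lam_max)        # 8.0 for this X
print(2 / lam_max)    # 0.25, which sits between my stable .1 and my unstable .3

# Closed-form OLS for reference; lstsq handles the duplicated column in X1
beta, *_ = np.linalg.lstsq(X, y1.reshape(-1, 1), rcond=None)
print(beta)

If I'm reading this right, convexity by itself says nothing about the step size: once learn_rate exceeds 2/λ_max, the error component along the top eigenvector is multiplied by |1 - learn_rate·λ_max| > 1 at every step, so the iterates grow geometrically, which would explain the overflow at .3. Is that the correct way to think about it?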