My weights go from their initial values between 0 and 1 to exploding into the tens of thousands on the very next iteration. By the third iteration they are so large that the array shows only nan values.
How can I fix this?
Is it related to the unstable nature of the sigmoid function, or is one of my equations during back-propagation incorrect, causing my gradients to explode?
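To illustrate what I mean by the sigmoid being unstable: `exp(-z)` overflows once `z` is a large negative number, so even this tiny standalone check (a sketch, independent of my network code) triggers an overflow warning:

```
import numpy as np
from numpy import exp

# Standalone illustration: exp(-z) overflows for large negative z,
# which is the kind of instability I suspect in my sigmoid.
z = np.array([-1000.0, 0.0, 1000.0])
print(1/(1+exp(-z)))   # RuntimeWarning: overflow encountered in exp -> [0.  0.5 1. ]
```

The full code is below.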
```
import numpy as np
from numpy import exp
import matplotlib.pyplot as plt
import h5py
# LOAD DATASET
MNIST_data = h5py.File('data/MNISTdata.hdf5', 'r')
x_train = np.float32(MNIST_data['x_train'][:])
y_train = np.int32(np.array(MNIST_data['y_train'][:,0]))
x_test = np.float32(MNIST_data['x_test'][:])
y_test = np.int32(np.array(MNIST_data['y_test'][:,0]))
MNIST_data.close()
##############################################################################
# PARAMETERS
number_of_digits = 10 # number of outputs
nx = x_test.shape[1] # number of inputs ... 784 --> 28*28
ny = number_of_digits
m_train = x_train.shape[0]
m_test = x_test.shape[0]
Nh = 30 # number of hidden layer nodes
alpha = 0.001
iterations = 3
##############################################################################
# ONE HOT ENCODER - encoding y data into 'one hot encoded'
lr = np.arange(number_of_digits)
y_train_one_hot = np.zeros((m_train, number_of_digits))
y_test_one_hot = np.zeros((m_test, number_of_digits))
for i in range(len(y_train_one_hot)):
    y_train_one_hot[i,:] = (lr == y_train[i].astype(int))
for i in range(len(y_test_one_hot)):
    y_test_one_hot[i,:] = (lr == y_test[i].astype(int))
# VISUALISE SOME DATA
for i in range(5):
    img = x_train[i].reshape((28,28))
    plt.imshow(img, cmap='Greys')
    plt.show()
y_train = np.array([y_train]).T
y_test = np.array([y_test]).T
##############################################################################
# INITIALISE WEIGHTS & BIASES
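# np.random.rand draws weights uniformly from [0, 1); biases start at zero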
params = {"W1": np.random.rand(nx, Nh),
          "b1": np.zeros((1, Nh)),
          "W2": np.random.rand(Nh, ny),
          "b2": np.zeros((1, ny))}
# TRAINING
# activation function
def sigmoid(z):
    return 1/(1+exp(-z))
# derivative of activation function
def sigmoid_der(z):
    return z*(1-z)
# softmax function
def softmax(z):
    return 1/sum(exp(z)) * exp(z)
# softmax derivative is alike to sigmoid
def softmax_der(z):
    return sigmoid_der(z)
def cross_entropy_error(v,y):
    return -np.log(v[y])
# forward propagation
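# shapes: A0 is (m, nx), Z1/A1 are (m, Nh), Z2/A2 are (m, ny)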
def forward_prop(X, y, params):
    outs = {}
    outs['A0'] = X
    outs['Z1'] = np.matmul(outs['A0'], params['W1']) + params['b1']
    outs['A1'] = sigmoid(outs['Z1'])
    outs['Z2'] = np.matmul(outs['A1'], params['W2']) + params['b2']
    outs['A2'] = softmax(outs['Z2'])
    outs['error'] = cross_entropy_error(outs['A2'], y)
    return outs
# back propagation
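# Eo and Eh are my error terms for the output and hidden layers; gradients are summed over the whole batch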
def back_prop(X, y, params, outs):
    grads = {}
    Eo = (y - outs['A2']) * softmax_der(outs['Z2'])
    Eh = np.matmul(Eo, params['W2'].T) * sigmoid_der(outs['Z1'])
    dW2 = np.matmul(Eo.T, outs['A1']).T
    dW1 = np.matmul(Eh.T, X).T
    db2 = np.sum(Eo, 0)
    db1 = np.sum(Eh, 0)
    grads['dW2'] = dW2
    grads['dW1'] = dW1
    grads['db2'] = db2
    grads['db1'] = db1
    # print('dW2:', grads['dW2'])
    return grads
# optimise weights and biases
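# subtract alpha times each gradient from the corresponding parameter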
def optimise(X, y, params, grads):
    params['W2'] -= alpha * grads['dW2']
    params['W1'] -= alpha * grads['dW1']
    params['b2'] -= alpha * grads['db2']
    params['b1'] -= alpha * grads['db1']
    return
# main
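# full-batch training: each iteration runs forward and backward over all of x_train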
for epoch in range(iterations):
    print(epoch)
    outs = forward_prop(x_train, y_train, params)
    grads = back_prop(x_train, y_train, params, outs)
    optimise(x_train, y_train, params, grads)
    loss = 1/ny * np.sum(outs['error'])
    print(loss)
```
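For reference, the per-row, numerically stable form of softmax that I have seen elsewhere looks like the sketch below. I am including it only for comparison with my `softmax` above; I have not confirmed that this is what causes the blow-up.

```
import numpy as np

# Reference sketch only -- NOT the function used in the listing above.
# Softmax applied per row, with the row maximum subtracted so exp() cannot overflow.
def softmax_rowwise(z):
    z_shift = z - z.max(axis=1, keepdims=True)
    e = np.exp(z_shift)
    return e / e.sum(axis=1, keepdims=True)   # each row sums to 1
```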