I'm trying to implement my own LSTM network. I implemented the backpropagation algorithm, but it doesn't pass gradient checking, and I can't figure out where the error is. Please help.
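For context, this is the shape of the numerical check I'm comparing against (a minimal sketch; the loss closure `f` and the parameter handling here are illustrative, not my exact harness):

import numpy as np

def grad_check(f, param, analytic_grad, h=1e-5, tol=1e-6):
    """Compare an analytic gradient against a central finite difference.

    f             -- callable returning the scalar loss for the current parameters
    param         -- parameter array to perturb (modified in place, then restored)
    analytic_grad -- gradient of the loss w.r.t. `param` from backward_propagation
    """
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        ix = it.multi_index
        old = param[ix]
        param[ix] = old + h
        loss_plus = f()
        param[ix] = old - h
        loss_minus = f()
        param[ix] = old  # restore the original value
        num_grad = (loss_plus - loss_minus) / (2.0 * h)
        # relative error between numerical and analytic gradients
        rel_err = abs(num_grad - analytic_grad[ix]) / max(1e-8, abs(num_grad) + abs(analytic_grad[ix]))
        if rel_err > tol:
            print(f"mismatch at {ix}: numeric {num_grad:.6e} vs analytic {analytic_grad[ix]:.6e}")
        it.iternext()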
Here is the relevant piece of code:
def backward_propagation(self, x, y, cache):
    # T - the length of the sequence
    T = len(y)
    # perform forward propagation
    cache = self.forward_propagation(x)
    # ...
    # delta for output layer
    dy = cache['y'].copy()
    dy[np.arange(len(y)), y] -= 1.  # softmax loss gradient
    for t in reversed(range(T)):
        dV += np.outer(dy[t], h[t].T)
        dh[t] += self.V.T.dot(dy[t])
        dhtmp = self.V.T.dot(dy[t])
        dctmp = dct[0] * (1.0 - ct[0]**2)
        for bptt_step in np.arange(t)[::-1]:
            # add to gradients at each previous step
            do[bptt_step] = dhtmp * ct[bptt_step]
            dct[bptt_step] = dhtmp * o[bptt_step]
            dctmp += dct[bptt_step] * (1.0 - ct[bptt_step]**2)
            di[bptt_step] = dctmp * g[bptt_step]
            df[bptt_step] = dctmp * c[bptt_step-1]
            dg[bptt_step] = dctmp * i[bptt_step]
            # backprop activation functions
            diga[bptt_step] = di[bptt_step] * i[bptt_step] * (1.0 - i[bptt_step])
            dfga[bptt_step] = df[bptt_step] * f[bptt_step] * (1.0 - f[bptt_step])
            doga[bptt_step] = do[bptt_step] * o[bptt_step] * (1.0 - o[bptt_step])
            dgga[bptt_step] = dg[bptt_step] * (1.0 - g[bptt_step] ** 2)
            # backprop matrix multiply
            dWi += np.dot(diga[bptt_step].T, h[bptt_step-1])
            dWf += np.dot(dfga[bptt_step].T, h[bptt_step-1])
            dWo += np.dot(doga[bptt_step].T, h[bptt_step-1])
            dWg += np.dot(dgga[bptt_step].T, h[bptt_step-1])
            dUi[:, x[bptt_step]] += diga[bptt_step]
            dUf[:, x[bptt_step]] += dfga[bptt_step]
            dUo[:, x[bptt_step]] += doga[bptt_step]
            dUg[:, x[bptt_step]] += dgga[bptt_step]
            # update deltas for next step
            dhtmp = np.dot(self.Wi.T, diga[bptt_step])
            dhtmp += np.dot(self.Wf.T, dfga[bptt_step])
            dhtmp += np.dot(self.Wo.T, doga[bptt_step])
            dhtmp += np.dot(self.Wg.T, dgga[bptt_step])
            dctmp += dctmp * f[bptt_step]
    return [dV, dWi, dWf, dWo, dWg, dUi, dUf, dUo, dUg]
The "ga" suffix here stands for "gate activation" — it denotes the gate inputs before the nonlinear activation functions are applied.
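For reference, the cached variables (i, f, o, g, c, ct, h) come from a standard LSTM forward step like the one below (a sketch of what my forward_propagation computes per time step; the function signature and the word-index input are assumptions inferred from how dU* is indexed above):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, Ui, Uf, Uo, Ug, Wi, Wf, Wo, Wg):
    # x_t is a word index, so U*[:, x_t] selects the input column;
    # W* multiplies the previous hidden state
    i_t = sigmoid(Ui[:, x_t] + Wi.dot(h_prev))  # input gate
    f_t = sigmoid(Uf[:, x_t] + Wf.dot(h_prev))  # forget gate
    o_t = sigmoid(Uo[:, x_t] + Wo.dot(h_prev))  # output gate
    g_t = np.tanh(Ug[:, x_t] + Wg.dot(h_prev))  # candidate cell value
    c_t = f_t * c_prev + i_t * g_t              # new cell state (cached as c)
    ct_t = np.tanh(c_t)                         # squashed cell state (cached as ct)
    h_t = o_t * ct_t                            # new hidden state (cached as h)
    return i_t, f_t, o_t, g_t, c_t, ct_t, h_t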