I am trying to use an RNN for POS tagging, but I can't figure out what is wrong with my implementation, because gradient checking fails. Please help. I have pasted the relevant parts below:
# weights
Wxh = randn(inputLayerSize, hiddenLayerSize)*0.01; # input to hidden
Whh = randn(hiddenLayerSize, hiddenLayerSize)*0.01; # hidden to hidden
Bh = zeros(1, hiddenLayerSize); # hidden bias
Why = randn(hiddenLayerSize, outputLayerSize)*0.01; # hidden to output
By = zeros(1, outputLayerSize); # output bias
function forwardRNN(x, y, h, p, hPrev)
    global Wxh, Whh, Why, Bh, By;
    cost = 0;
    for time in 1:length(x)
        # hidden state from the current input and the previous hidden state (hPrev at t == 1)
        if time == 1
            h[time] = tanh(x[time]'*Wxh + hPrev*Whh .+ Bh);
        else
            h[time] = tanh(x[time]'*Wxh + h[time-1]*Whh .+ Bh);
        end
        # softmax over the output scores
        score = h[time]*Why .+ By;
        p_softmax = exp(score) / sum(exp(score));
        p[time] = vec(p_softmax);
        # cross-entropy loss against the one-hot target
        cost += -sum(log(y[time]'*p[time]));
    end
    return cost;
end
function backwardRNN(x, y, h, p, hPrev)
    global Wxh, Whh, Why, Bh, By;
    dWxh = zeros(size(Wxh));
    dWhh = zeros(size(Whh));
    dBh = zeros(size(Bh));
    dWhy = zeros(size(Why));
    dBy = zeros(size(By));
    dh = zeros(size(Bh));
    dhnext = zeros(size(h[1]));
    for time in length(x):-1:1
        # gradient of softmax + cross-entropy at the output
        dy = p[time] - y[time];
        dWhy = dWhy + (dy * h[time])';
        dBy = dBy + dy';
        # backprop into the hidden state, adding the gradient arriving from the next time step
        dh = (Why*dy)' + dhnext;
        dhRaw = (1 - (h[time].*h[time])) .* dh;  # through the tanh
        dWxh = dWxh + (x[time] * dhRaw);
        dBh = dBh + dhRaw;
        if time == 1
            dWhh = dWhh + (hPrev' * dhRaw);
        else
            dWhh = dWhh + (h[time-1]' * dhRaw);
        end
        dhnext = dhRaw*Whh;
    end
    return dWxh, dWhh, dBh, dWhy, dBy;
end
# gradient checking
function gradCheck(inputs, targets, h, p, hPrev)
    paramNameList = ["Wxh", "Whh", "Bh", "Why", "By"];
    global Wxh, Whh, Why, Bh, By;
    paramList = [x for x=(Wxh, Whh, Bh, Why, By)];
    num_checks = 2;
    delta = 1e-5;
    cost = forwardRNN(inputs, targets, h, p, hPrev);
    dWxh, dWhh, dBh, dWhy, dBy = backwardRNN(inputs, targets, h, p, hPrev);
    dParamList = [x for x=(dWxh, dWhh, dBh, dWhy, dBy)];
    for (param, dparam, name) in zip(paramList, dParamList, paramNameList)
        s0 = size(dparam);
        s1 = size(param);
        if s0 != s1
            println("Error: dims don't match: ", s0, " and ", s1);
        end
        println(name)
        for i in 1:num_checks
            # perturb one randomly chosen entry and compare the central-difference
            # numerical gradient with the analytic one
            ri = rand(1:length(param));
            old_val = param[ri];
            param[ri] = old_val + delta;
            cg0 = forwardRNN(inputs, targets, h, p, hPrev);
            param[ri] = old_val - delta;
            cg1 = forwardRNN(inputs, targets, h, p, hPrev);
            param[ri] = old_val;
            grad_analytic = dparam[ri];
            grad_numerical = (cg0 - cg1) / (2 * delta);
            rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic);
            println(grad_numerical, ", ", grad_analytic, " => ", rel_error);
            if rel_error > 1e-5
                error("Gradient check failed.");
            end
            println("Gradient check passed.")
        end
    end
end
The code is written in Julia and is inspired by Karpathy's min-char-rnn.py.
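
In case it matters, this is roughly how I wire everything together for the check. The sizes and the random one-hot data below are just toy values, not my real tagging data: x and y are arrays of one-hot column vectors, h is an array of 1×hiddenLayerSize row vectors, p holds the softmax outputs, and the weight initialisation at the top is run with these sizes first.

inputLayerSize = 10; hiddenLayerSize = 8; outputLayerSize = 5;
# ... re-run the weight initialisation above with these sizes, then:
T = 4;                                          # sequence length
x = [zeros(inputLayerSize)  for t in 1:T];      # one-hot inputs (column vectors)
y = [zeros(outputLayerSize) for t in 1:T];      # one-hot targets (column vectors)
for t in 1:T
    x[t][rand(1:inputLayerSize)]  = 1.0;
    y[t][rand(1:outputLayerSize)] = 1.0;
end
h = [zeros(1, hiddenLayerSize) for t in 1:T];   # hidden states (row vectors)
p = [zeros(outputLayerSize) for t in 1:T];      # softmax outputs, overwritten by forwardRNN
hPrev = zeros(1, hiddenLayerSize);              # initial hidden state
gradCheck(x, y, h, p, hPrev);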