RNN 的梯度检查失败

数据挖掘 机器学习 nlp 深度学习 朱莉娅 rnn
2021-09-18 20:39:37

我正在尝试使用 RNN 进行 POS 标记,但无法弄清楚我的实现中出了什么问题,因为梯度检查失败了。请帮忙。我在下面粘贴相关部分:

# weights
Wxh = randn(inputLayerSize, hiddenLayerSize)*0.01; # input to hidden
Whh = randn(hiddenLayerSize, hiddenLayerSize)*0.01; # hidden to hidden
Bh = zeros(1, hiddenLayerSize); # hidden bias
Why = randn(hiddenLayerSize, outputLayerSize)*0.01; # hidden to output
By = zeros(1, outputLayerSize); # output bias

function forwardRNN(x, y, h, p, hPrev)
  global Wxh, Whh, Why, Bh, By;
  cost = 0;
  for time in 1:length(x)
    if time == 1
      h[time] = tanh(x[time]'*Wxh + hPrev*Whh .+ Bh);
    else
      h[time] = tanh(x[time]'*Wxh + h[time-1]*Whh .+ Bh);
    end
    score = h[time]*Why .+ By;
    p_softmax = exp(score) / sum(exp(score));
    p[time] = vec(p_softmax);
    cost += -sum(log(y[time]'*p[time]));
  end
  return cost;
end

function backwardRNN(x, y, h, p, hPrev)
  global Wxh, Whh, Why, Bh, By;
  dWxh = zeros(size(Wxh));
  dWhh = zeros(size(Whh));
  dBh = zeros(size(Bh));
  dWhy = zeros(size(Why));
  dBy = zeros(size(By));
  dh = zeros(size(Bh));
  dhnext = zeros(size(h[1]));
  for time in length(x):-1:1
    dy = p[time] - y[time];
    dWhy = dWhy + (dy * h[time])';
    dBy = dBy + dy';
    dh = (Why*dy)' + dhnext;
    dhRaw = (1 - (h[time].*h[time])) .* dh;
    dWxh = dWxh + (x[time] * dhRaw);
    dBh = dBh + dhRaw;
    if time == 1
      dWhh = dWhh + (hPrev' * dhRaw);
    else
      dWhh = dWhh + (h[time-1]' * dhRaw);
    end
    dhnext = dhRaw*Whh;
  end
  return dWxh, dWhh, dBh, dWhy, dBy;
end

# gradient checking
function gradCheck(inputs, targets, h, p, hPrev)
  paramNameList = ["Wxh", "Whh", "Bh", "Why", "By"];
  global Wxh, Whh, Why, Bh, By;
  paramList = [x for x=(Wxh, Whh, Bh, Why, By)];
  num_checks = 2;
  delta = 1e-5;
  cost = forwardRNN(inputs, targets, h, p, hPrev);
  dWxh, dWhh, dBh, dWhy, dBy = backwardRNN(inputs, targets, h, p, hPrev);
  dParamList = [x for x=(dWxh, dWhh, dBh, dWhy, dBy)];
  for (param,dparam,name) in zip(paramList, dParamList, paramNameList)
    s0 = size(dparam);
    s1 = size(param);
    if s0 != s1
      println("Error dims dont match: ", s0," and ",s1);
    end
    println(name)
    for i in 1:num_checks
      ri = rand(1:length(param));
      old_val = param[ri];
      param[ri] = old_val + delta;
      cg0 = forwardRNN(inputs, targets, h, p, hPrev);
      param[ri] = old_val - delta;
      cg1 = forwardRNN(inputs, targets, h, p, hPrev);
      param[ri] = old_val
      grad_analytic = dparam[ri];
      grad_numerical = (cg0 - cg1) / ( 2 * delta );
      rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic);
      println(grad_numerical,", ", grad_analytic, " => ",rel_error);
      if rel_error > 1e-5
        error("Gradient check failed.");
      end
      println("Gradient check passed.")
    end
  end
end

代码采用 Julia 编程语言,灵感来自 Karpathy 的min-char-rnn.py

1个回答

据我了解,你有错误的反向传播梯度实现。在这里您应该考虑到,rnn 的隐藏状态 h 在等式中具有其先前的状态:h[time-1]。这也必须通过链式法则提取。更多信息建议参考这篇文章
它还包含 Python rnn 实现。