Neural network for MNIST: very low accuracy

data-mining machine-learning neural-network julia
2022-02-11 19:55:06

I am working on handwritten digit recognition by implementing a neural network. However, the network's accuracy is very low, around 11% on the training set. I am not sure what is wrong with my program. I have tried changing the learning rate and the number of hidden units, but with no luck. Could someone take a look and help me figure out what I am missing? My Julia code is pasted below:

# install
using Pkg; Pkg.add("MNIST");
using MNIST

# training data
X,y = traindata(); 
m = size(X, 2);
inputLayerSize = size(X,1); 
hiddenLayerSize = 300;
outputLayerSize = 10;

# representing each output as an array of size of the output layer
eyeY = eye(outputLayerSize);
intY = [convert(Int64,i)+1 for i in y];
Y = zeros(outputLayerSize, m);
for i = 1:m
    Y[:,i] = eyeY[:,intY[i]];
end

# weights with bias
Theta1 = randn(inputLayerSize+1, hiddenLayerSize); 
Theta2 = randn(hiddenLayerSize+1, outputLayerSize); 

function sigmoid(z)
    g = 1.0 ./ (1.0 + exp(-z));
    return g;
end

function sigmoidGradient(z)
  return sigmoid(z).*(1-sigmoid(z));
end

# learning rate
alpha = 0.01;
# number of iterations
epoch = 20;
# cost per epoch
J = zeros(epoch,1);
# backpropagation algorithm
for i = 1:epoch
    for j = 1:m # for each input
        # Feedforward
        # input layer
        # add one bias element
        x1 = [1; X[:,j]];

        # hidden layer
        z2 = Theta1'*x1;
        x2 = sigmoid(z2);
        # add one bias element
        x2 = [1; x2];

        # output layer
        z3 = Theta2'*x2;
        x3 = sigmoid(z3);

        # Backpropagation process
        # delta for output layer
        delta3 = x3 - Y[:,j];
        delta2 = (Theta2[2:end,:]*delta3).*sigmoidGradient(z2) ;

        # update weights
        Theta1 = Theta1 - alpha* x1*delta2';
        Theta2 = Theta2 - alpha* x2*delta3';
    end
end

function predict(Theta1, Theta2, X)
    m = size(X, 2); 
    p = zeros(m, 1);
    h1 = sigmoid(Theta1'*[ones(1,size(X,2)); X]);
    h2 = sigmoid(Theta2'*[ones(1,size(h1,2)); h1]);
    # 1 index is for 0, 2 for 1 ...so forth
    for i=1:m
        p[i,:] = indmax(h2[:,i])-1;
    end
    return p;
end

function accuracy(truth, prediction)
    m = length(truth);
    correct = 0;
    for i = 1:m
        if truth[i,:] == prediction[i,:]
            correct = correct + 1;
        end
    end
    return (correct/m)*100;
end

pred = predict(Theta1, Theta2, X);
println("train accuracy: ", accuracy(y, pred));
1 Answer

What loss function are you using? It looks to me like you are using the squared-error loss (is that right?). That can work, but consider using the cross-entropy loss instead, which is better suited to classification problems.

Also, by using the logistic function as the activation in the last layer, you are effectively treating the problem as ten independent binary classification problems. That can also work, but since this is a multi-class classification problem, you should probably change the last layer's activation to softmax.
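As a rough sketch of those two suggestions (the names `softmax` and `crossentropy` here are illustrative helpers, not part of your code):

```julia
# Numerically stable softmax: subtracting the maximum before
# exponentiating avoids overflow without changing the result.
function softmax(z)
    e = exp.(z .- maximum(z))
    return e ./ sum(e)
end

# Cross-entropy loss for a single example with one-hot target y;
# eps() guards against taking log(0).
crossentropy(p, y) = -sum(y .* log.(p .+ eps()))
```

In your feedforward pass this would amount to replacing `x3 = sigmoid(z3)` with `x3 = softmax(z3)`.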

One bug I found in your code is that you do not account for the derivative of the nonlinearity in the last layer. To fix it, you should change

delta3 = x3 - Y[:,j];

to

delta3 = (x3 - Y[:,j]) .* sigmoidGradient(z3);

analogous to the way you compute delta2.
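A related point worth knowing: if you combine a softmax output layer with the cross-entropy loss, the output-layer delta simplifies to exactly `x3 - Y[:,j]`, with no extra gradient factor. A quick numerical check of that identity (a standalone sketch, not your code):

```julia
softmax(z) = (e = exp.(z .- maximum(z)); e ./ sum(e))
loss(z, y) = -sum(y .* log.(softmax(z)))

z = randn(10)
y = zeros(10); y[3] = 1.0

# Analytic gradient of cross-entropy-of-softmax w.r.t. z.
analytic = softmax(z) - y

# Central-difference numerical gradient, one coordinate at a time.
h = 1e-6
numeric = [(loss(z .+ h .* ((1:10) .== k), y) -
            loss(z .- h .* ((1:10) .== k), y)) / (2h) for k in 1:10]

@assert maximum(abs.(analytic - numeric)) < 1e-4
```

This is one reason the softmax + cross-entropy pairing is standard: the clean delta makes bugs like the one above impossible at the output layer.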