I'm training my neural network implementation on the MONK's problems dataset. I've read that with the hyperparameters I'm using (chapter 9) it's possible to get almost 0 loss. These are my results on the training set after 90 epochs, and as you can see they are not good:
Training error after 0 iterations: 0.26427015361
Training error after 20 iterations: 0.231331617693
Training error after 40 iterations: 0.216396390383
Training error after 60 iterations: 0.198107908553
Training error after 80 iterations: 0.166181613583
TrErr: 0.155389641876
Here is the code for my neural network. I think the code is fairly simple and readable for an expert; let me know if you need any clarification.
import numpy as np

# parameters
eta = 0.1          # learning rate
hidden_dim = 2     # hidden layer size
alpha = 0.5        # momentum coefficient
reg_lambda = 0.0   # L2 regularization strength
input_dim = 17     # one-hot encoding of the 6 MONK attributes
output_dim = 1
# read and parse MONK data
def parse_data(text):
    with open(text) as f:
        content = f.readlines()
    t = np.zeros((len(content), output_dim))
    X = np.zeros((len(content), input_dim))
    for i, val in enumerate(content):
        s = val.split(" ")
        t[i] = float(s[1])
        # one-hot layout of the 17 input columns:
        # |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 |10|11|12|13|14|15|16
        # |a1|a1|a1|a2|a2|a2|a3|a3|a4|a4|a4|a5|a5|a5|a5|a6|a6
        X[i][int(s[2]) - 1] = 1.0   # a1 (3 values, columns 0-2)
        X[i][int(s[3]) + 2] = 1.0   # a2 (3 values, columns 3-5)
        X[i][int(s[4]) + 5] = 1.0   # a3 (2 values, columns 6-7)
        X[i][int(s[5]) + 7] = 1.0   # a4 (3 values, columns 8-10)
        X[i][int(s[6]) + 10] = 1.0  # a5 (4 values, columns 11-14)
        X[i][int(s[7]) + 14] = 1.0  # a6 (2 values, columns 15-16)
    return (X, t)
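To sanity-check the parser I verify the shapes and the one-hot invariant; this assumes the UCI MONK file layout, where each line reads " <class> a1 a2 a3 a4 a5 a6 <id>" with a leading space, and the file path below is hypothetical:

X, t = parse_data("monks-1.train")  # hypothetical path; substitute your MONK training file
print X.shape, t.shape              # expect (n_samples, 17) and (n_samples, 1)
print X.sum(axis=1)[:5]             # each row should sum to exactly 6.0: one bit per attribute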
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# derivative of the sigmoid written in terms of its output: s'(x) = s(x) * (1 - s(x))
def sigmoid_output_to_derivative(output):
    return output * (1 - output)
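As a quick check that the derivative helper is right (this test is just for illustration, not part of the network), it can be compared against a central finite difference:

# compare the analytic derivative with a numerical slope at a few points
eps = 1e-5
for x in (-2.0, 0.0, 3.0):
    numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
    analytic = sigmoid_output_to_derivative(sigmoid(x))
    assert abs(numeric - analytic) < 1e-8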
# initialize weights uniformly in [-0.7, 0.7]; biases start at zero
W0 = np.random.uniform(-0.7, 0.7, size=(input_dim, hidden_dim))
W1 = np.random.uniform(-0.7, 0.7, size=(hidden_dim, output_dim))
b0 = np.zeros((1, hidden_dim))
b1 = np.zeros((1, output_dim))
# momentum accumulators for the weight updates
dW0 = 0
dW1 = 0
for j in xrange(90):
    # forward propagation
    l0 = X
    l1 = sigmoid(np.dot(l0, W0) + b0)
    l2 = sigmoid(np.dot(l1, W1) + b1)
    # back propagation
    l2_error = t - l2
    l2_delta = l2_error * sigmoid_output_to_derivative(l2)
    l1_error = l2_delta.dot(W1.T)
    l1_delta = l1_error * sigmoid_output_to_derivative(l1)
    if (j % 20) == 0:
        print "Training error after", j, "iterations:", np.mean(np.square(l2_error))
    # updates with momentum; the L2 term is subtracted because l2_delta already
    # points down the error gradient (no effect here since reg_lambda = 0.0)
    dW1 = eta * (l1.T.dot(l2_delta) - reg_lambda * W1) + alpha * dW1
    dW0 = eta * (l0.T.dot(l1_delta) - reg_lambda * W0) + alpha * dW0
    db1 = eta * np.sum(l2_delta, axis=0)
    db0 = eta * np.sum(l1_delta, axis=0)
    W1 += dW1
    W0 += dW0
    b1 += db1
    b0 += db0
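When the error plateaus like this, the first thing worth ruling out is a bug in the backprop step via a finite-difference gradient check. A minimal sketch (the mse_loss helper is mine, not part of the code above; it just recomputes the same forward pass the loop uses):

def mse_loss(W0, W1, b0, b1):
    l1 = sigmoid(np.dot(X, W0) + b0)
    l2 = sigmoid(np.dot(l1, W1) + b1)
    return np.mean(np.square(t - l2))

# numerical derivative of the loss with respect to a single weight
eps = 1e-5
W0[0, 0] += eps
loss_plus = mse_loss(W0, W1, b0, b1)
W0[0, 0] -= 2 * eps
loss_minus = mse_loss(W0, W1, b0, b1)
W0[0, 0] += eps  # restore the original weight
numeric_grad = (loss_plus - loss_minus) / (2 * eps)
# backprop accumulates l0.T.dot(l1_delta); up to the constant factor -2/len(X)
# coming from the mean and from l2_error = t - l2, it should match numeric_grad
print "numeric dE/dW0[0,0]:", numeric_grad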