# Neural Network Architecture
no_hid_layers = 1
hid = 3
no_out = 1
# Xavier Ininitialization of weights w
w1 = np.random.randn(hid, n+1)*np.sqrt(2/(hid+n+1))
w2 = np.random.randn(no_out, hid+1)*np.sqrt(2/(no_out+hid+1))
# Sigmoid Activation Function
def g(x):
sig = 1/(1+np.exp(-x))
return sig
def frwrd_prop(X, w1, w2):
z2 = w1 @ X.T
z2 = norm(z2, axis=0)
a2 = np.insert(g(z2), 0, 1, axis=0)
h = g((w2@a2))
return (h,a2)
# Calculating Cost and Gradient
def Cost(X, y, w1, w2, lmbda=0):
# Initializing Cost J and Gradients dw
J = 0
dw1 = np.zeros(w1.shape)
dw2 = np.zeros(w2.shape)
# Forward Propagation to calculate the value of the output
h, a2 = frwrd_prop(X, w1, w2)
# Calculate the Cost Function J
J = -(np.sum(y.T*np.log(h) + (1-y).T*np.log(1-h)) - lmbda/2*(np.sum(np.sum(w1[:,1:].T@w1[:,1:])) + np.sum(w2[:,1:].T@w2[:,1:])))/m
# Applying Back Propagation to calculate the Gradients dw
D3 = h-y
D2 = (w2.T@D3)*a2*(1-a2)
dw1[:,0] = (D2[1:]@X)[:,0]/m
dw2[:,0] = (D3@a2.T)[:,0]/m
dw1[:, 1:] = ((D2[1:]@X)[:,1:] + lmbda*w1[:,1:])/m
dw2[:, 1:] = ((D3@a2.T)[:,1:] + lmbda*w2[:,1:])/m
# Gradient clipping
if(abs(np.linalg.norm(dw1))>4.5):
dw1 = dw1*4.5/(np.linalg.norm(dw1))
if(abs(np.linalg.norm(dw2))>4.5):
dw1 = dw1*4.5/(np.linalg.norm(dw2))
return (J, dw1, dw2)
# Adam's Optimization technique for training w
def Train(w1, w2, maxIter=50):
# Algorithm
a, b1, b2, e = 0.001, 0.9, 0.999, 10**(-8)
V1 = np.zeros(w1.shape)
V2 = np.zeros(w2.shape)
S1 = np.zeros(w1.shape)
S2 = np.zeros(w2.shape)
for i in range(maxIter):
J, dw1, dw2 = Cost(X, y, w1, w2)
V1 = b1*V1 + (1-b1)*dw1
S1 = b2*S1 + (1-b2)*(dw1**2)
V2 = b1*V2 + (1-b1)*dw2
S2 = b2*S2 + (1-b2)*(dw2**2)
if i!=0:
V1 = V1/(1-b1**i)
S1 = S1/(1-b2**i)
V2 = V2/(1-b1**i)
S2 = S2/(1-b2**i)
w1 = w1 - a*V1/(np.sqrt(S1)+e)*dw1
w2 = w2 - a*V2/(np.sqrt(S2)+e)*dw2
print("\t\t\tIteration : ", i+1, " \tCost : ", J)
return (w1, w2)
# Training Neural Network
w1, w2 = Train(w1,w2)
我正在使用亚当优化将梯度下降收敛到全局最小值,但在大约 15 次迭代后成本变得停滞(没有变化)(数量不固定)。由于权重的随机初始化导致的初始成本在变得恒定之前会发生非常微小的变化。这使得完全相同代码的不同运行的训练准确度从 45% 提高到 70%。你能帮我解释一下这背后的原因吗?