Is Adam optimization susceptible to local minima?

data-mining optimization gradient-descent loss-function
2021-09-17 16:37:17
import numpy as np

# Neural Network Architecture
# (X, y, n = number of input features and m = number of training examples
#  are defined elsewhere in the script)

no_hid_layers = 1   # one hidden layer
hid = 3             # hidden units
no_out = 1          # output unit

# Xavier Initialization of weights w

w1 = np.random.randn(hid, n+1)*np.sqrt(2/(hid+n+1))
w2 = np.random.randn(no_out, hid+1)*np.sqrt(2/(no_out+hid+1))

# Sigmoid Activation Function
def g(x):
    sig = 1/(1+np.exp(-x))
    return sig

def frwrd_prop(X, w1, w2):
    # Hidden-layer pre-activation (X already contains a bias column)
    z2 = w1 @ X.T
    # norm() is a normalization helper defined elsewhere in the script
    z2 = norm(z2, axis=0)
    # Hidden activations with a bias unit prepended
    a2 = np.insert(g(z2), 0, 1, axis=0)
    # Output-layer activation
    h = g(w2 @ a2)
    return (h, a2)

# Calculating Cost and Gradient

def Cost(X, y, w1, w2, lmbda=0):
    # Initializing Cost J and Gradients dw
    J = 0
    dw1 = np.zeros(w1.shape)
    dw2 = np.zeros(w2.shape)
    # Forward Propagation to calculate the value of the output
    h, a2 = frwrd_prop(X, w1, w2)
    # Calculate the regularized cross-entropy Cost J
    reg = lmbda/2*(np.sum(np.sum(w1[:,1:].T@w1[:,1:]))
                   + np.sum(w2[:,1:].T@w2[:,1:]))
    J = -(np.sum(y.T*np.log(h) + (1-y).T*np.log(1-h)) - reg)/m
    # Applying Back Propagation to calculate the Gradients dw
    D3 = h - y
    D2 = (w2.T@D3)*a2*(1-a2)
    # Bias columns (index 0) are not regularized
    dw1[:,0] = (D2[1:]@X)[:,0]/m
    dw2[:,0] = (D3@a2.T)[:,0]/m
    dw1[:, 1:] = ((D2[1:]@X)[:,1:] + lmbda*w1[:,1:])/m
    dw2[:, 1:] = ((D3@a2.T)[:,1:] + lmbda*w2[:,1:])/m
    # Gradient clipping (rescale each gradient if its norm exceeds 4.5)
    if np.linalg.norm(dw1) > 4.5:
        dw1 = dw1*4.5/np.linalg.norm(dw1)
    if np.linalg.norm(dw2) > 4.5:
        dw2 = dw2*4.5/np.linalg.norm(dw2)
    return (J, dw1, dw2)

# Adam optimization for training the weights w

def Train(w1, w2, maxIter=50):
    # Adam hyperparameters: learning rate, moment decay rates, epsilon
    a, b1, b2, e = 0.001, 0.9, 0.999, 10**(-8)
    # First (V) and second (S) moment estimates for each weight matrix
    V1 = np.zeros(w1.shape)
    V2 = np.zeros(w2.shape)
    S1 = np.zeros(w1.shape)
    S2 = np.zeros(w2.shape)
    for i in range(maxIter):
        J, dw1, dw2 = Cost(X, y, w1, w2)
        # Update the moment estimates with the current gradients
        V1 = b1*V1 + (1-b1)*dw1
        S1 = b2*S1 + (1-b2)*(dw1**2)
        V2 = b1*V2 + (1-b1)*dw2
        S2 = b2*S2 + (1-b2)*(dw2**2)
        # Bias correction (applied in place, skipped on the first iteration)
        if i != 0:
            V1 = V1/(1-b1**i)
            S1 = S1/(1-b2**i)
            V2 = V2/(1-b1**i)
            S2 = S2/(1-b2**i)
        # Weight update
        w1 = w1 - a*V1/(np.sqrt(S1)+e)*dw1
        w2 = w2 - a*V2/(np.sqrt(S2)+e)*dw2
        print("\t\t\tIteration : ", i+1, " \tCost : ", J)
    return (w1, w2)

# Training Neural Network     

w1, w2 = Train(w1,w2)

I am using Adam optimization to make gradient descent converge to the global minimum, but after roughly 15 iterations (the exact number varies) the cost stagnates and stops changing. The initial cost, which depends on the random initialization of the weights, changes only very slightly before becoming constant. As a result, the training accuracy of different runs of exactly the same code varies between 45% and 70%. Can you help me explain the reason behind this?
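Since the only source of randomness in the code above is the weight initialization, one minimal way to make individual runs repeatable for comparison (the seed value below is arbitrary) is to fix the NumPy seed before the weights are drawn:

import numpy as np

np.random.seed(0)   # arbitrary seed; makes the Xavier initialization, and hence the run, repeatable
w1 = np.random.randn(hid, n+1)*np.sqrt(2/(hid+n+1))
w2 = np.random.randn(no_out, hid+1)*np.sqrt(2/(no_out+hid+1))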

1 Answer

All stochastic gradient descent (SGD) optimizers, including Adam, have randomization built in, and none of them is guaranteed to reach the global minimum. The randomization comes from training on a subsample of the data at each step. There is no guarantee of reaching the global minimum because gradient-descent optimizers are first-order iterative optimization techniques.
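Because Adam uses only local, first-order gradient information, it can also stall on a plateau or at a saddle point of the loss surface, which is consistent with a cost that stops changing after a few iterations. For reference, here is a minimal sketch of one textbook Adam step (Kingma & Ba, 2015) on a single weight matrix; the function name adam_step and its arguments are illustrative and not taken from your code, and the bias-corrected estimates V_hat and S_hat are kept separate from the running moments V and S:

import numpy as np

def adam_step(w, dw, V, S, t, a=0.001, b1=0.9, b2=0.999, e=1e-8):
    # Running (biased) first- and second-moment estimates of the gradient
    V = b1*V + (1-b1)*dw
    S = b2*S + (1-b2)*dw**2
    # Bias-corrected estimates; t is the 1-based step count
    V_hat = V/(1-b1**t)
    S_hat = S/(1-b2**t)
    # Parameter update uses only the corrected moments
    w = w - a*V_hat/(np.sqrt(S_hat)+e)
    return w, V, S

Keeping V_hat and S_hat separate means the running moments are never rescaled in place, and the step is not multiplied by the raw gradient again; both of those details differ in your Train function, so they are worth checking independently of the local-minimum question.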