Neural network returns approximately the same output (the mean) for every input

artificial-intelligence neural-network deep-learning image-recognition
2021-11-12 23:20:45

I tried to build a neural network from scratch to make a cat-or-dog binary classifier with a sigmoid output unit. For every input I seem to get an output value of roughly 0.5 (+/- 0.002), which seems really strange to me. Here is my code; please let me know if there is a mistake in the implementation.

import numpy as np
import matplotlib.pyplot as plt

def initialize_parameters_deep(layer_dims):
    parameters={}
    for l in range(1,len(layer_dims)):
        parameters['W'+str(l)]=np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters['b'+str(l)]=np.zeros((layer_dims[l],1))
    return parameters

def linear_forward(A,W,b):
    Z=np.dot(W,A)+b
    cache=(A,W,b)
    return Z,cache


def sigmoid(Z):
    A = 1/(1+np.exp(-Z))
    cache=Z
    return A, cache


def relu(Z):
    A = np.maximum(0,Z)

    assert(A.shape == Z.shape)

    cache = Z 
    return A, cache

def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.

    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0

    assert (dZ.shape == Z.shape)

    return dZ

def sigmoid_backward(dA, cache):
    Z = cache

    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)

    assert (dZ.shape == Z.shape)

    return dZ


def linear_activation_forward(A_prev,W,b,activation):
    if(activation=='sigmoid'):
        Z,linear_cache=linear_forward(A_prev,W,b)
        A,activation_cache=sigmoid(Z)
    elif activation=='relu':
        Z,linear_cache=linear_forward(A_prev,W,b)
        A,activation_cache=relu(Z)
    cache=(linear_cache,activation_cache)
    return A,cache

def L_model_forward(X,parameters):
    A=X
    L=len(parameters)//2
    caches=[]
    for l in range(1,L):
        A,cache=linear_activation_forward(A,parameters['W'+str(l)],parameters['b'+str(l)],'relu')
        caches.append(cache)
    AL,cache=linear_activation_forward(A,parameters['W'+str(L)],parameters['b'+str(L)],'sigmoid')
    caches.append(cache)
    return AL,caches

def compute_cost(AL,Y):
    m=Y.shape[1]
    cost=-1/m*np.sum(np.multiply(np.log(AL),Y)+np.multiply(np.log(1-AL),1-Y))
    return cost

def linear_backward(dZ,cache):
    A_prev,W,b=cache
    m=A_prev.shape[1]
    dW = np.dot(dZ,A_prev.T)/m
    db = np.sum(dZ,axis=1,keepdims=True)/m
    dA_prev = np.dot(W.T,dZ)
    return dA_prev,dW,db

def linear_activation_backward(activation,dA_prev,cache):
    linear_cache,activation_cache=cache
    if activation=='sigmoid':

        dZ=sigmoid_backward(dA_prev,activation_cache)
        dA_prev,dW,db=linear_backward(dZ,linear_cache)
    if activation=='relu':
        dZ=relu_backward(dA_prev,activation_cache)
        dA_prev,dW,db=linear_backward(dZ,linear_cache)
    return dA_prev,dW,db

def L_model_backward(AL,Y,caches):
    L=len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    grads={}
    current_cache=caches[-1]
    grads['dA'+str(L-1)],grads['dW'+str(L)],grads['db'+str(L)]=linear_activation_backward('sigmoid',dAL,current_cache)

    for l in reversed(range(L-1)):
        current_cache=caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward('relu',grads['dA'+str(l+1)],current_cache)
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
def Grad_Desc(parameters,grads,learning_rate):
    L=len(parameters)//2
    for l in range(L):
        parameters['W'+str(l+1)]=parameters['W'+str(l+1)]-learning_rate*grads['dW'+str(l+1)]
        parameters['b'+str(l+1)]=parameters['b'+str(l+1)]-learning_rate*grads['db'+str(l+1)] 
    return parameters

def L_layer_model(X,Y,learning_rate,num_iter,layer_dims):
    parameters=initialize_parameters_deep(layer_dims)
    costs=[]
    for i in range(num_iter):
        AL,caches=L_model_forward(X,parameters)
        cost=compute_cost(AL,Y)
        grads=L_model_backward(AL,Y,caches)
        parameters=Grad_Desc(parameters,grads,learning_rate)
        if i%100==0:
            print(cost)
            costs.append(cost)
    plt.plot(np.squeeze(costs))
def predict(X,parameters):
    AL,caches=L_model_forward(X,parameters)
    prediction=(AL>0.5)
    return AL,prediction

L_layer_model(x_train,y_train,0.0075,12000,[12288,20,7,5,1])
prediction=predict(x_train,initialize_parameters_deep([12288,20,7,5,1])) 
1 Answer

There is a technique called gradient checking.

The idea is to approximate each partial derivative of the cost numerically with a centered difference and compare it to the gradient produced by your backpropagation:

gradapprox_i = (J(θ_i + ε) − J(θ_i − ε)) / (2ε),    difference = ‖grad − gradapprox‖₂ / (‖grad‖₂ + ‖gradapprox‖₂)

With it you can verify whether you are computing the correct gradients in the components of your ANN. A code implementation is:

def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox with a centered difference for each parameter
    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare the analytic gradient with the numerical approximation
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference > 2e-7:
        print("There is a mistake in the backward propagation. Difference = " + str(difference))
    else:
        print("Backward propagation is Okay. Difference = " + str(difference))

    return difference

Here parameters is the dictionary with the parameters "W1", "b1", ..., "Wl", "bl", and gradients is the output of L_model_backward, which contains the gradients of the cost with respect to the parameters. Also, it would be great if you could share x_train and y_train so we can debug this. Good luck.
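Note that gradient_check_n also relies on a few helpers that are not shown above (dictionary_to_vector, vector_to_dictionary, gradients_to_vector and forward_propagation_n). Below is a minimal sketch of how they could be wired to the functions from your question; the hard-coded key order, the SHAPES table and the forward_propagation_n wrapper are my assumptions, so adapt them to your code, and run the check on a much smaller architecture and a tiny batch, since it does two forward passes per parameter.

import numpy as np

LAYER_DIMS = [12288, 20, 7, 5, 1]   # same architecture as in the question (assumed)
KEYS = [p + str(l) for l in range(1, len(LAYER_DIMS)) for p in ('W', 'b')]
SHAPES = {'W' + str(l): (LAYER_DIMS[l], LAYER_DIMS[l - 1]) for l in range(1, len(LAYER_DIMS))}
SHAPES.update({'b' + str(l): (LAYER_DIMS[l], 1) for l in range(1, len(LAYER_DIMS))})

def dictionary_to_vector(parameters):
    # Flatten W1, b1, ..., WL, bL into one column vector
    vec = np.concatenate([parameters[k].reshape(-1, 1) for k in KEYS])
    return vec, KEYS

def vector_to_dictionary(vec):
    # Inverse of dictionary_to_vector: rebuild the parameter dictionary
    parameters, start = {}, 0
    for k in KEYS:
        size = int(np.prod(SHAPES[k]))
        parameters[k] = vec[start:start + size].reshape(SHAPES[k])
        start += size
    return parameters

def gradients_to_vector(gradients):
    # Stack dW1, db1, ..., dWL, dbL in the same order as the parameters
    return np.concatenate([gradients['d' + k].reshape(-1, 1) for k in KEYS])

def forward_propagation_n(X, Y, parameters):
    # Wrapper around your forward pass that returns the cost
    AL, caches = L_model_forward(X, parameters)
    return compute_cost(AL, Y), caches

# Example usage on a tiny batch:
# params = initialize_parameters_deep(LAYER_DIMS)
# AL, caches = L_model_forward(x_train[:, :5], params)
# grads = L_model_backward(AL, y_train[:, :5], caches)
# gradient_check_n(params, grads, x_train[:, :5], y_train[:, :5])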