使用 TensorFlow 的 MNIST 深度神经网络

数据挖掘 Python 神经网络 深度学习 TensorFlow
2022-03-02 17:26:50

我已经在这段代码上工作了一段时间,在我开始工作之前它让我很头疼。它基本上尝试使用 mnist 数据集对手写数字进行分类。我没有在 TensorFlow 中使用预先打包的 mnist,因为我想自己学习预处理数据并更深入地了解 TensorFlow。

它终于可以运行了,但如果有相关专业知识的人能看一看,告诉我他们的看法——它产生的结果是否真实可信,还是模型其实过拟合了、甚至根本没有在学习——我将不胜感激。

它使我在测试数据集中获得了 83% 到 91% 的准确率。

我使用的数据集来自 https://pjreddie.com/projects/mnist-in-csv/ ,基本上就是该页面顶部的那两个链接。

这是代码:

import numpy as np
import tensorflow as tf
# NOTE(review): this session is never used -- it is immediately shadowed by
# the `with tf.Session() as sess:` block below, so this line just leaks a
# session object.
sess = tf.Session()
from sklearn import preprocessing
import matplotlib.pyplot as plt
with tf.Session() as sess:
    # Load the MNIST CSVs: column 0 is the digit label, columns 1..784 are
    # the raw pixel values (dataset from pjreddie.com/projects/mnist-in-csv).
    train_file = 'mnist_train.csv'
    test_file = 'mnist_test.csv'
    #train_file = 'mnist_train_small.csv'
    #test_file = 'mnist_test_small.csv'

    train = np.loadtxt(train_file, delimiter=',')
    test = np.loadtxt(test_file, delimiter=',')

    # Split features from labels. Slicing with `:1` keeps the labels 2-D
    # with shape (N, 1) rather than a flat vector.
    x_train = train[:,1:785]
    y_train = train[:,:1]

    x_test = test[:,1:785]
    y_test = test[:,:1]
    print(x_test.shape)

    # Normalize the pixel data.
    def normalize(input_data):
        """Scale each sample (row) of `input_data` to unit L2 norm.

        Fix: the original body computed per-column `minimum`/`maximum`
        and never used them (dead work over 784 columns); its actual
        behavior was sklearn's ``preprocessing.normalize(..., norm='l2')``,
        reproduced here in plain numpy.  Despite the surrounding comments,
        this is per-row L2 scaling, NOT min-max scaling to [0, 1].

        Args:
            input_data: 2-D array of shape (N, features).

        Returns:
            New float array of the same shape; all-zero rows are returned
            unchanged (matching sklearn's behavior) instead of dividing
            by zero.
        """
        norms = np.linalg.norm(input_data, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0  # leave zero rows as-is
        return input_data / norms

    # Convert a column of digit labels to a one-hot matrix.
    def one_hot(input_data):
        """Convert an (N, 1) or (N,) array of digit labels to (N, 10) one-hot rows.

        Replaces the original 10-branch if/elif chain.  That chain had a
        latent bug: for a label outside 0..9 it silently reused the
        previous row's encoding (or raised NameError on the first row);
        this version raises IndexError for out-of-range labels instead.

        Args:
            input_data: array of float/int digit labels in 0..9.

        Returns:
            float64 array of shape (N, 10) with a single 1.0 per row.
        """
        indices = np.asarray(input_data).reshape(-1).astype(int)
        # Row i of the 10x10 identity matrix is the one-hot vector for digit i.
        return np.eye(10)[indices]
    def one_hot_tf(val):
        """Graph-mode alternative to one_hot(): returns an (N, 10) float32
        tensor with 1.0 at each label index and 0.0 elsewhere.  Unused in
        the active code path (see the commented-out calls below)."""
        return tf.one_hot(val, 10,
                          on_value=1.0, off_value=0.0,
                          axis=-1, dtype=tf.float32,
                          name='ONEHOT')
    # Normalize the pixel data (per-row L2 scaling -- see normalize()).
    x_train = normalize(x_train)
    x_test =  normalize(x_test)
    #    x_train = sess.run(tf.convert_to_tensor(x_train))
    #    x_test =  sess.run(tf.convert_to_tensor(x_test))

    '''
    data_initializer = tf.placeholder(dtype=x_train.dtype,
                                        shape=x_train.shape)
    label_initializer = tf.placeholder(dtype=x_test.dtype,
                                         shape=x_test.shape)
    x_train= sess.run(tf.Variable(data_initializer, trainable=False, collections=[]))
    x_test = sess.run(tf.Variable(label_initializer, trainable=False, collections=[]))
    '''


    # One-hot encode the labels: (N, 1) digit column -> (N, 10) matrix.
    y_test =  one_hot(y_test)
    y_train =  one_hot(y_train)
    print(y_test[:5])
    #   y_test =  sess.run(one_hot_tf(y_test))
    #   y_train =  sess.run(one_hot_tf(y_train))


    # Network hyperparameters: 784 inputs, three 500-unit hidden layers,
    # 10 output classes.
    input_nodes = 784
    output_nodes = 10
    hl1_nodes = 500
    hl2_nodes = 500
    hl3_nodes = 500
    # NOTE(review): epochs is 10 here, but the pasted run output prints
    # "/ 5" -- those runs were apparently made with a different value.
    epochs = 10
    x = tf.placeholder(tf.float32, [None, input_nodes])
    # NOTE(review): y has no shape constraint; [None, output_nodes] would
    # catch feed mistakes earlier.
    y = tf.placeholder(tf.float32)

    # Per-batch loss values collected during training, plotted at the end.
    loss_rate = []


    def nn(data):
        """Build the network graph: three ReLU hidden layers
        (784 -> 500 -> 500 -> 500) followed by a linear 10-unit output.

        Args:
            data: float32 tensor of shape (batch, input_nodes).

        Returns:
            Raw logits tensor of shape (batch, output_nodes) -- no softmax,
            as expected by softmax_cross_entropy_with_logits.
        """
        widths = [input_nodes, hl1_nodes, hl2_nodes, hl3_nodes, output_nodes]
        activation = data
        last = len(widths) - 2
        for idx in range(len(widths) - 1):
            w = tf.Variable(tf.random_normal([widths[idx], widths[idx + 1]]))
            b = tf.Variable(tf.random_normal([widths[idx + 1]]))
            activation = tf.add(tf.matmul(activation, w), b)
            # ReLU on hidden layers only; the output layer stays linear.
            if idx < last:
                activation = tf.nn.relu(activation)
        return activation


    def train(x):
        """Build loss/optimizer, run `epochs` passes over x_train in
        mini-batches, then print accuracy on the held-out test set.

        Bug fix: the original batch bookkeeping (`batches += batch` with
        `next_batch = batches + batch`) produced an EMPTY first slice --
        reduce_mean over zero rows is NaN, which is exactly the
        "Loss nan" seen in the pasted output -- and then skipped or
        overlapped most of the training data.  Batches are now contiguous,
        non-overlapping slices covering the whole training set.
        """
        prediction = nn(x)
        # Mean softmax cross-entropy over the batch (labels are one-hot rows).
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
        optimizer = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

        init = tf.global_variables_initializer()
        sess.run(init)

        batch_size = 10
        num_batches = len(x_train) // batch_size
        for epoch in range(epochs):
            epochloss = 0
            for b in range(num_batches):
                start = b * batch_size
                end = start + batch_size
                _, c = sess.run([optimizer, loss],
                                feed_dict={x: x_train[start:end, :],
                                           y: y_train[start:end, :]})
                epochloss += c
                loss_rate.append(c)

            print("Epoch ", epoch, " / ", epochs, " - Loss ", epochloss)

        # Fraction of test samples whose arg-max logit matches the label.
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        print("Accuracy : ", accuracy.eval({x:x_test, y:y_test}))


    train(x)

    # Plot the per-batch training loss collected during training.
    plt.plot(loss_rate)
    plt.show()

3次不同运行的输出是:

=========== RESTART: /Users/macbookpro/Desktop/AI/tf/OWN/test3.py ===========
(10000, 784)
[[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]]
Epoch  0  /  5  - Loss  nan
Epoch  1  /  5  - Loss  nan
Epoch  2  /  5  - Loss  nan
Epoch  3  /  5  - Loss  nan
Epoch  4  /  5  - Loss  nan
Accuracy :  0.9053

=========== RESTART: /Users/macbookpro/Desktop/AI/tf/OWN/test3.py ===========
(10000, 784)
[[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]]
Epoch  0  /  5  - Loss  nan
Epoch  1  /  5  - Loss  nan
Epoch  2  /  5  - Loss  nan
Epoch  3  /  5  - Loss  nan
Epoch  4  /  5  - Loss  nan
Accuracy :  0.8342

=========== RESTART: /Users/macbookpro/Desktop/AI/tf/OWN/test3.py ===========
(10000, 784)
[[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]]
Epoch  0  /  5  - Loss  nan
Epoch  1  /  5  - Loss  nan
Epoch  2  /  5  - Loss  nan
Epoch  3  /  5  - Loss  nan
Epoch  4  /  5  - Loss  nan
Accuracy :  0.9

--- 更新 --- 我通过重写代码找到了答案,重写后的代码如下:

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

sess = tf.Session()

# Load the MNIST training CSV: column 0 is the digit label, columns 1..784
# are the pixel values.
file = "mnist_train.csv"
data = np.loadtxt(file, delimiter=',')


# Keep labels 2-D with shape (N, 1); features are (N, 784).
y_vals = data[:,0:1]
x_vals = data[:,1:785]

# Fix both RNG seeds so runs are reproducible.
seed = 3
tf.set_random_seed(seed)
np.random.seed(seed)
batch_size = 90

# split into 80/20 datasets, normalize between 0:1 with min max scaling
train_indices = np.random.choice(len(x_vals), round(len(x_vals)*0.8), replace=False)
# up there we chose randomly 80% of the data
# NOTE(review): set() ordering is arbitrary, so test_indices comes out in an
# unspecified order -- harmless here since rows are independent samples.
test_indices = np.array(list(set(range(len(x_vals))) - set(train_indices)))
# up we chose the remaining 20%
print(test_indices)

x_vals_train = x_vals[train_indices]
x_vals_test = x_vals[test_indices]
y_vals_train = y_vals[train_indices]
y_vals_test = y_vals[test_indices]

def normalize_cols(m):
    """Min-max scale every column of `m` into [0, 1].

    A constant column yields 0/0 -> NaN (numpy warns); both call sites
    immediately wrap the result in np.nan_to_num to zero those out.
    """
    col_lo = m.min(axis=0)
    col_span = np.ptp(m, axis=0)  # per-column (max - min)
    return (m - col_lo) / col_span
# Column-wise min-max scaling; nan_to_num zeroes the NaNs produced by
# constant (all-equal) pixel columns such as the image borders.
x_vals_train = np.nan_to_num(normalize_cols(x_vals_train))
x_vals_test = np.nan_to_num(normalize_cols(x_vals_test))

# Helpers that create trainable Variables for layer weights and biases.
def init_weight(shape, std_dev):
    """Return a trainable weight Variable drawn from N(0, std_dev^2)."""
    return tf.Variable(tf.random_normal(shape, stddev=std_dev))

def init_bias(shape, std_dev):
    """Return a trainable bias Variable drawn from N(0, std_dev^2)."""
    return tf.Variable(tf.random_normal(shape, stddev=std_dev))

# Initialize the feed placeholders.
x_data = tf.placeholder(shape=[None, 784], dtype=tf.float32)
# NOTE(review): the target is one scalar per sample -- the digit value is
# regressed directly (see the |y - y_hat| loss below), not classified.
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)


# Shared layer builder used for all three hidden layers and the output.
def fully_connected(input_layer, weights, biases):
    """Affine transform plus ReLU: relu(input_layer @ weights + biases)."""
    pre_activation = tf.add(tf.matmul(input_layer, weights), biases)
    return tf.nn.relu(pre_activation)

# Now create the model for each layer and the output layer: for each one we
# initialize a weight matrix and a bias vector and wire up a fully
# connected layer.  Hidden layer sizes are 500, 500, and 10.

'''
This will mean many variables variables to fit. This is because between the data and the first hidden layer we have 
784*500+500 = 392,500 variables to change.
continuing this way we will have end up with how many variables we have overall to fit
'''

# create first layer (500 hidden nodes)
# NOTE(review): stddev=10.0 is a very large initialization scale for layers
# this wide (typical values are <= 0.1); with ReLU units this risks
# exploding activations or dead gradients.
weight_1 = init_weight(shape=[784,500], std_dev=10.0)
bias_1 = init_bias(shape=[500], std_dev=10.0)
layer_1 = fully_connected(x_data, weight_1, bias_1)

# create second layer (500 hidden nodes)
weight_2 = init_weight(shape=[500,500], std_dev=10.0)
bias_2 = init_bias(shape=[500], std_dev=10.0)
layer_2 = fully_connected(layer_1, weight_2, bias_2)

# create third layer (10 hidden nodes)
weight_3 = init_weight(shape=[500,10], std_dev=10.0)
bias_3 = init_bias(shape=[10], std_dev=10.0)
layer_3 = fully_connected(layer_2, weight_3, bias_3)

# create output layer (1 output value)
# NOTE(review): fully_connected applies ReLU, so the final "regression"
# output can never be negative and has zero gradient whenever its
# pre-activation is negative; a linear output layer is conventional here.
weight_4 = init_weight(shape=[10,1], std_dev=10.0)
bias_4 = init_bias(shape=[1], std_dev=10.0)
final_output = fully_connected(layer_3, weight_4, bias_4)


# Define the loss function and the optimizer, then initialize the model.
# NOTE(review): mean absolute error against the raw digit value treats the
# labels as ordered quantities (regression), not as 10 classes -- test
# accuracy cannot be read off this loss directly.
loss = tf.reduce_mean(tf.abs(y_target - final_output))
optimizer = tf.train.AdamOptimizer(0.05)
train_step = optimizer.minimize(loss)

init = tf.global_variables_initializer()
sess.run(init)

# Train for 10 steps, storing train and test loss and printing the status
# every generation.
# NOTE(review): 10 batches of 90 samples touch under 1000 of the ~48k
# training rows -- far too little for the network to learn much.

# initialize the loss vectors
loss_vec = []
test_loss = []
for i in range(10):
    # choose random indices for batch selection
    rand_index = np.random.choice(len(x_vals_train), size=batch_size)
    # get random batch
    rand_x = x_vals_train[rand_index]
    #rand_y = np.transpose(y_vals_train[rand_index])
    # y_vals_train is already (N, 1), so no transpose is needed here.
    rand_y = y_vals_train[rand_index] #???????????
    # run the training step
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
    # get and store train loss
    temp_loss = sess.run(loss, feed_dict={x_data:rand_x, y_target:rand_y})
    loss_vec.append(temp_loss)
    # get and store test loss (again, y_vals_test is already (N, 1))
    #test_temp_loss = sess.run(loss, feed_dict={x_data:x_vals_test, y_target:np.transpose([y_vals_test])})
    test_temp_loss = sess.run(loss, feed_dict={x_data:x_vals_test, y_target:y_vals_test}) #???????
    test_loss.append(test_temp_loss)
    if(i+1) %1==0:
        print('Generation: '+str(i+1)+". Loss = "+str(temp_loss))

plt.plot(loss_vec, 'k-', label='Train Loss')
plt.plot(test_loss, 'r--', label='Test Loss')
plt.title('Loss Per generation ')
plt.xlabel('Generation')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.show()

我给大部分代码加了注释,这样如果有人偶然看到这里并需要帮助,就能理解代码在做什么。

1个回答

鉴于您在测试集上有如此高的错误并且有如此多的隐藏层/节点,您的模型很可能是过度拟合的。尝试使用 dropout 或权重衰减来规范网络的权重。