Originally asked on Stack Overflow; since deleted.
I cannot reliably train a convolutional neural network in TensorFlow. When I start my program, sometimes the model learns well (the cost/cross_entropy goes down, accuracy on the training and test data goes up) and it seems to converge after 50-100 epochs.
Sometimes, however, training appears to get stuck after only a few (2-4) epochs. The cost hovers around the same value (1.25) and the accuracy sticks at the same values: 0.45970 on the training data and 0.016666 on the test data. Nothing in the code or the training data changes between runs.
The "good" runs (when the model learns) converge to a variety of final accuracies. Every "bad" run, on the other hand, ends up at exactly the same accuracy: 0.45970.
I have 330,000 training images (grayscale), converted to numpy arrays. The generator I use to create the TensorFlow dataset works by listing all files in a folder, taking every N-th file from that list, and feeding only those selected files to the dataset. Currently I use every 100th file, which works out to 33 batches of 100 images, i.e. 3,300 training images in total. There are 4 classes, with 904, 321, 558, and 1,517 samples respectively.
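While writing this up I did a quick sanity check (my own arithmetic, assuming the 3,300-image subset keeps the per-class counts above): the value every "bad" run gets stuck at, 0.45970, is exactly the share of the largest class, i.e. the accuracy the network would get if it only ever predicted that one class.

# Quick check of the class balance in the 3,300-image training subset (counts from above)
class_counts = [904, 321, 558, 1517]
total = sum(class_counts)                         # 3300
largest_share = max(class_counts) / float(total)  # 1517 / 3300
print(total, largest_share)                       # 3300 0.459696... ~= the "stuck" 0.45970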
I have tried using more training samples (33,000) and different learning rates, but the result is the same. Bad runs are more common than good ones.
The only difference I can think of between runs is the random weight initialization, but can that alone produce such an effect, especially since the weights are initialized with small standard deviations (0.03/0.01)? Or is there a problem in my code?
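One way I can think of to test this (a minimal sketch, not part of the code below; it assumes TF 1.x graph-level seeding is enough to make the truncated_normal initializers and dataset.shuffle repeatable) would be to pin all the seeds before building the graph:

# Sketch only (my own helper, not in the files below): pin every RNG I am aware of,
# so two runs start from identical weights and an identical shuffle order.
import random
import numpy as np
import tensorflow as tf

def set_all_seeds(seed=42):
    random.seed(seed)         # Python stdlib RNG
    np.random.seed(seed)      # numpy RNG
    tf.set_random_seed(seed)  # TF 1.x graph-level seed (truncated_normal, shuffle, ...)

# would be called once, before run_cnn() builds the graph
set_all_seeds(42)

With a fixed seed, repeated runs should behave identically; varying only the seed would then show whether the initialization alone is enough to produce the stuck runs.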
The code lives in two files: the main script, and stream_data.py, which handles loading the images from disk.
Main script:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import os
from timeit import default_timer as timer
import time
import stream_data as sd #for reading train/eval data
tf.logging.set_verbosity(tf.logging.INFO)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}
epochs = 10000
learning_rate = 0.03
#use every 100-th training image
sd.generator_filenames_every_nth_train = 100
#use every testing image
sd.generator_filenames_every_nth_eval = 1
batch_size = 100
batch_size_eval = 100
#num training and evaluation batches per epoch - this works out to 33 training batches of 100 images and 12 batches of 100 images for evaluation
num_train_samples = len(os.listdir(sd.path_to_train_files_npy_stream))
total_batch = int(num_train_samples /( sd.generator_filenames_every_nth_train * batch_size))
print('Num training samples:', num_train_samples, 'Generator train skip:', sd.generator_filenames_every_nth_train, 'Num train batches:', total_batch)
num_eval_samples = len(os.listdir(sd.path_to_eval_files_npy_stream))
total_batch_eval = int(num_eval_samples / (sd.generator_filenames_every_nth_eval * batch_size_eval))
print('Num eval samples:', num_eval_samples, 'Generator eval skip:', sd.generator_filenames_every_nth_eval, 'Num eval batches:', total_batch_eval)
def run_cnn():
    #for logging results
    log_accuracy_train=[]
    log_accuracy_eval=[]
    log_cross_entropy=[]
    # SET UP THE NETWORK
    # declare the training data placeholders
    x = tf.placeholder(tf.float32, [None, sd.image_width * sd.image_height * sd.num_channels_used], name='placeholder_x')
    x_shaped = tf.reshape(x, [-1, sd.image_height, sd.image_width, sd.num_channels_used], name='placeholder_x_shaped')
    y = tf.placeholder(tf.float32, [None, sd.num_output_classes], name='placeholder_y')
    # create some convolutional layers
    layer1 = create_new_conv_layer(x_shaped, 1, 10, [5, 5], [4, 4], [4,4], name='layer1')
    layer2 = create_new_conv_layer(layer1, 10, 20, [5, 5], [5,5], [5,5], name='layer2')
    # flatten the output ready for the fully connected output stage
    flattened = tf.reshape(layer2, [-1, 17 * 66 * 20])
    # setup some weights and bias values for this layer, then activate with ReLU
    wd1 = tf.Variable(tf.truncated_normal([17 * 66 * 20, 50], stddev=0.03), name='wd1')
    bd1 = tf.Variable(tf.truncated_normal([50], stddev=0.01), name='bd1')
    dense_layer1 = tf.matmul(flattened, wd1) + bd1
    dense_layer1 = tf.nn.relu(dense_layer1, name = 'dense1')
    # another layer with softmax activations - this is output layer
    wd2 = tf.Variable(tf.truncated_normal([50, sd.num_output_classes], stddev=0.03), name='wd2')
    bd2 = tf.Variable(tf.truncated_normal([sd.num_output_classes], stddev=0.01), name='bd2')
    dense_layer2 = tf.matmul(dense_layer1, wd2) + bd2
    y_prediction = tf.nn.softmax(dense_layer2, name='y_prediction')
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=dense_layer2, labels=y), name='cross_entropy')
    # add an optimizer
    adam_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, name='adam_optimizer_var')
    optimizer = adam_optimizer.minimize(cross_entropy, name='adam_optimizer_minimize')
    # define an accuracy assessment operation
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_prediction, 1), name = 'correct_prediction')
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
    #define how to get batches
    batch_xt, batch_yt, batch_ft = sd.get_inputs_train(batch_size)
    batch_xe, batch_ye, batch_fe = sd.get_inputs_eval(batch_size_eval)
    # setup the initialisation operator
    init_op = tf.global_variables_initializer()
    model_saver = tf.train.Saver(name='model_saver')
    with tf.Session() as sess:
        sess.run(init_op)
        #in each epoch, train all batches and then evaluate the network on testing data
        for epoch in range(epochs):
            avg_cost = 0
            avg_accuracy = 0.0
            print('Training batches in epoch', epoch)
            for i in range(total_batch):
                time_0 = timer()
                #get training data
                batch_x_np, batch_y_np, batch_f_np = sess.run([batch_xt, batch_yt, batch_ft])
                time_1 = timer()
                #train network
                _, c, train_acc = sess.run([optimizer, cross_entropy, accuracy], feed_dict={x: batch_x_np, y: batch_y_np})
                avg_cost += c / total_batch
                avg_accuracy += train_acc / total_batch
                time_2 = timer()
                print('Trained batch ', i, '/', total_batch, ' Time: {:.3f} :'.format(time_2 - time_0), '{:.3f},'.format(time_1 - time_0), '{:.3f}'.format(time_2 - time_1),'Acc: {:.3f}'.format(train_acc))
            print('After train epoch:', epoch, "train cost =", "{:.5f}".format(avg_cost), 'Avg train accuracy: {:.5f}'.format(avg_accuracy))
            #save cross entropy and accuracy after training one epoch
            log_accuracy_train = np.append(log_accuracy_train, avg_accuracy)
            log_cross_entropy = np.append(log_cross_entropy, avg_cost)
            np.save('./results/train_cost.npy', log_cross_entropy)
            np.save('./results/train_accuracy.npy', log_accuracy_train)
            print('Saving model after epoch', epoch)
            model_saver.save(sess, './models/merc_proper/model_merc_full', global_step = epoch, write_meta_graph = True)
            #evaluating model on testing data
            avg_accuracy=0.0
            for i in range(total_batch_eval):
                time_0 = timer()
                #get testing data
                batch_x_np, batch_y_np, batch_f_np = sess.run([batch_xe, batch_ye, batch_fe])
                time_1 = timer()
                #feedforward and get accuracy
                test_acc = sess.run(accuracy, feed_dict={x: batch_x_np, y: batch_y_np})
                time_2 = timer()
                avg_accuracy += test_acc/total_batch_eval
                print("Epoch:", (epoch), "eval accuracy: {:.3f}".format(test_acc), ' Time: {:.3f} :'.format(time_2 - time_0), '{:.3f},'.format(time_1 - time_0), '{:.3f}'.format(time_2 - time_1))
            print('Average eval accuracy after epoch', epoch, ':', avg_accuracy)
            #save accuracy on testing data after epoch
            log_accuracy_eval = np.append(log_accuracy_eval, avg_accuracy)
            np.save('./results/eval_accuracy.npy', log_accuracy_eval)
    print("\nTraining complete!")
def create_new_conv_layer(input_data, num_input_channels, num_filters, filter_shape, pool_shape, pool_stride, name):
    # setup the filter input shape for tf.nn.conv_2d
    conv_filt_shape = [filter_shape[0], filter_shape[1], num_input_channels, num_filters]
    # initialise weights and bias for the filter
    #orig mean=0, std=0.03
    weights = tf.Variable(tf.truncated_normal(conv_filt_shape, mean=0.0, stddev=0.03), name=name+'_W')
    bias = tf.Variable(tf.truncated_normal([num_filters]), name=name+'_b')
    # setup the convolutional layer operation
    out_layer = tf.nn.conv2d(input_data, weights, [1, 1, 1, 1], padding='SAME')
    # add the bias
    out_layer += bias
    # apply a ReLU non-linear activation
    out_layer = tf.nn.relu(out_layer)
    # now perform max pooling
    ksize = [1, pool_shape[0], pool_shape[1], 1]
    strides = [1, pool_stride[0], pool_stride[1], 1]
    out_layer = tf.nn.max_pool(out_layer, ksize=ksize, strides=strides, padding='SAME')
    return out_layer
def stop(msg = None):
    if(msg != None):
        print(msg)
    raw_input('Press ENTER to continue')
if __name__ == "__main__":
    run_cnn()
stream_data.py:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os #listdir
import numpy as np
import tensorflow as tf
from timeit import default_timer as timer
import pickle
import random
import copy
from PIL import Image
import threading
#resolution of images
image_width = 1320
image_height = 340
#number of channels - images are grayscale
num_channels_raw = 1
num_channels_used = 1
#how many images should be skipped - only every N-th image will be used
#so if this is == 100, only every 100th image will be used
generator_filenames_every_nth_train = 1
generator_filenames_every_nth_eval = 1
num_output_classes = 4
path_to_train_files_npy_stream = '/media/user/DiskNaData_1TB/data/merc/train_artificial_npy/'
path_to_eval_files_npy_stream = '/media/user/DiskNaData_1TB/data/merc/eval_artificial_npy/'
def stop(msg = None):
    if(msg != None):
        print(msg)
    raw_input('Press ENTER to continue')
#generator that produces train/eval data
#returns data (grayscale values), label(onehot) and filename (for debugging purposes)
#reads list of all files in a folder, then skips some of them and only yields every N-th
def read_stream_files(path_to_files_npy_stream, every_nth):
    files = os.listdir(path_to_files_npy_stream)
    for i in range(0, len(files), every_nth):
        filename=files[i]
        data = np.load(path_to_files_npy_stream + filename)
        data=data.astype(float)
        data=data / 255.0
        label = int(filename[0:2])
        #make one-hot
        onehot = np.zeros(num_output_classes)
        onehot[label] = 1
        label=onehot
        yield data, label, filename
def get_dataset_train():
    generator = lambda: read_stream_files(path_to_train_files_npy_stream, generator_filenames_every_nth_train)
    return tf.data.Dataset.from_generator(
        generator, (tf.float32, tf.int32, tf.string), ((image_width*image_height*num_channels_used,), (num_output_classes, ), ()))
def get_dataset_eval():
    generator = lambda: read_stream_files(path_to_eval_files_npy_stream, generator_filenames_every_nth_eval)
    return tf.data.Dataset.from_generator(
        generator, (tf.float32, tf.int32, tf.string), ((image_width*image_height*num_channels_used,), (num_output_classes, ), ()))
def get_inputs_train(batch_size):
    dataset = get_dataset_train()
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat() # repeat indefinitely
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(3)
    features, label, filename = dataset.make_one_shot_iterator().get_next()
    return features, label, filename
def get_inputs_eval(batch_size):
    dataset = get_dataset_eval()
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat()
    dataset = dataset.prefetch(3)
    features, label, filename = dataset.make_one_shot_iterator().get_next()
    return features, label, filename

