我正在处理街景门牌号识别问题。我正在尝试用 Keras 训练 CNN。
这是我准备输入的方式:
from PIL import Image
from PIL import ImageFilter
train_folders = 'sv_train/train'
test_folders = 'test'
extra_folders = 'extra'
SV_IMG_SIZE = 28
SV_CHANNELS = 1
train_imsize = np.ndarray([len(train_data),2])
k = 500
sv_images = []
max_images = 20000#len(train_data)
max_digits = 5
sv_labels = np.ones([max_images, max_digits], dtype=int) * 10 # init to 10 cause it would be no digit
nboxes = [[] for i in range(max_images)]
print ("%d to load" % len(train_data))
def getBBox(i,perc):
'''
Given i, the desired i.png, returns
x_min, y_min, x_max, y_max,
the four numbers which define the small rectangular bounding
box that contains all individual character bounding boxes
'''
boxes = train_data[i]['boxes']
x_min=9990
y_min=9990
x_max=0
y_max=0
for bid,b in enumerate(boxes):
x_min = b['left'] if b['left'] <= x_min else x_min
y_min = b['top'] if b['top'] <= y_min else y_min
x_max = b['left']+b['width'] if b['left']+b['width'] >= x_max else x_max
y_max = b['top']+b['height'] if b['top']+b['height'] >= y_max else y_max
dy = y_max-y_min
dx = x_max-x_min
dpy = dy*perc
dpx = dx*perc
nboxes[i]=[dpx,dpy,dx,dy]
return x_min-dpx, y_min-dpy, x_max+dpx, y_max+dpy
for i in range(max_images):
print (" \r%d" % i ,end="")
filename = train_data[i]['filename']
fullname = os.path.join(train_folders, filename)
boxes = train_data[i]['boxes']
label = [10,10,10,10,10]
lb = len(boxes)
if lb <= max_digits:
im = Image.open(fullname)
x_min, y_min, x_max, y_max = getBBox(i,0.3)
im = im.crop([x_min,y_min,x_max,y_max])
owidth, oheight = im.size
wr = SV_IMG_SIZE/float(owidth)
hr = SV_IMG_SIZE/float(oheight)
for bid,box in enumerate(boxes):
sv_labels[i][max_digits-lb+bid] = int(box['label'])
box = nboxes[i]
box[0]*=wr
box[1]*=wr
box[2]*=hr
box[3]*=hr
im = im.resize((SV_IMG_SIZE,SV_IMG_SIZE),Image.ANTIALIAS)
img = img - np.mean(img)
im = im.filter(ImageFilter.EDGE_ENHANCE)
img = img - np.mean(img)
array = np.asarray(im)
array = array.reshape((SV_IMG_SIZE,SV_IMG_SIZE,3)).astype(np.float32)
na = np.zeros([SV_IMG_SIZE,SV_IMG_SIZE],dtype=int)
for x in range (array.shape[0]):
for y in range (array.shape[1]):
na[x][y]=np.average(array[x][y][:])
na = na.reshape(SV_IMG_SIZE,SV_IMG_SIZE,1)
#print(array.shape)
sv_images.append(na.astype(np.float32))
sv_train, sv_validation, svt_labels, svv_labels = train_test_split(sv_images, sv_labels, test_size=0.33, random_state=42)
以下是我创建和训练模型的方式:
model = Sequential()
x = Input((28, 28,1))
y = Convolution2D(16, 3, 3, border_mode="same")(x)
#y = MaxPooling2D(pool_size = (2, 2), strides = (2, 2)) (y)
#y = Dropout(0.25)(y)
y = Convolution2D(32, 4, 4, border_mode="same")(y)
y = MaxPooling2D(pool_size = (3, 3)) (y)
#y = Dropout(0.25)(y)
y = Convolution2D(64, 5, 5, border_mode="same", activation="relu")(y)
y = MaxPooling2D((2, 2))(y)
#y = Dropout(0.25)(y)
y = Convolution2D(128, 5, 5, border_mode="same", activation="relu")(y)
y = MaxPooling2D((2, 2))(y)
#y = Dropout(0.25)(y)
y = Flatten()(y)
y = Dense(1024, activation="relu")(y)
digit1 = Dense(11, activation="softmax")(y)
digit2 = Dense(11, activation="softmax")(y)
digit3 = Dense(11, activation="softmax")(y)
digit4 = Dense(11, activation="softmax")(y)
digit5 = Dense(11, activation="softmax")(y)
model = Model(input=x, output=[digit1, digit2, digit3,digit4,digit5])
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
print(model.layers[0].output_shape)
print(model.layers[2].output_shape)
print(model.layers[4].output_shape)
print(model.layers[6].output_shape)
print(model.layers[8].output_shape)
sv_train_labels = [svt_labels[:,0],svt_labels[:,1],svt_labels[:,2],svt_labels[:,3],svt_labels[:,4]]
sv_validation_labels = [svv_labels[:,0],svv_labels[:,1],svv_labels[:,2],svv_labels[:,3],svv_labels[:,4]]
model.fit(sv_train, sv_train_labels, nb_epoch=10, batch_size=64,validation_data=(sv_validation, sv_validation_labels))
问题是我得到的精度非常低,在每个时期都保持相同的值:
Train on 13400 samples, validate on 6600 samples
Epoch 1/10
13400/13400 [==============================] - 78s - loss: 34.7407 - dense_740_loss: 0.1161 - dense_741_loss: 0.6879 - dense_742_loss: 4.7988 - dense_743_loss: 14.7893 - dense_744_loss: 14.3486 - dense_740_acc: 0.9902 - dense_741_acc: 0.9542 - dense_742_acc: 0.7001 - dense_743_acc: 0.0810 - dense_744_acc: 0.1055 - val_loss: 34.7760 - val_dense_740_loss: 0.0049 - val_dense_741_loss: 0.7131 - val_dense_742_loss: 4.8721 - val_dense_743_loss: 14.8091 - val_dense_744_loss: 14.3769 - val_dense_740_acc: 0.9997 - val_dense_741_acc: 0.9558 - val_dense_742_acc: 0.6977 - val_dense_743_acc: 0.0812 - val_dense_744_acc: 0.1080
Epoch 2/10
13400/13400 [==============================] - 70s - loss: 34.7032 - dense_740_loss: 0.0036 - dense_741_loss: 0.6760 - dense_742_loss: 4.7861 - dense_743_loss: 14.8118 - dense_744_loss: 14.4257 - dense_740_acc: 0.9998 - dense_741_acc: 0.9581 - dense_742_acc: 0.7031 - dense_743_acc: 0.0810 - dense_744_acc: 0.1050 - val_loss: 34.7760 - val_dense_740_loss: 0.0049 - val_dense_741_loss: 0.7131 - val_dense_742_loss: 4.8721 - val_dense_743_loss: 14.8091 - val_dense_744_loss: 14.3769 - val_dense_740_acc: 0.9997 - val_dense_741_acc: 0.9558 - val_dense_742_acc: 0.6977 - val_dense_743_acc: 0.0812 - val_dense_744_acc: 0.1080