Loss plateau in neural style transfer

data-mining deep-learning keras tensorflow implementation neural-style-transfer
2022-02-28 05:07:04

I am writing an implementation of style transfer by loading the VGG model from Keras and feeding it into a TensorFlow model.

I am using the Adam optimizer. The loss is decreasing, but only very slowly, and it plateaus at around 10^8. Also, the colors of the generated image seem to be changing in the right direction, but the result still clearly looks like noise.

Also, the style loss is huge (on the order of 10^8), while the content loss is much smaller (on the order of 10^5). This is odd, because the style-transfer paper says to scale the content loss down by a factor of 100 or 1000 when computing the total loss.
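For reference, a minimal sketch of the weighting described in Gatys et al. (alpha, beta, and the function name here are illustrative, not values from my notebook):

# Sketch of the paper's total loss: L = alpha * L_content + beta * L_style,
# with the ratio alpha/beta on the order of 1e-3 to 1e-4.
def paper_total_loss(content_term, style_term, alpha=1e-3, beta=1.0):
    return alpha * content_term + beta * style_term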

I tried increasing the learning rate, but that only makes the gradients overshoot.

I suspect there must be a bug in my implementation, but despite searching endlessly I cannot find the problem.

Here is the code:

# coding: utf-8
# In[1]:

from keras.applications.vgg16 import VGG16
from keras.models import Model
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


content_image_path = './skyline.jpg'
style_image_path = './starry_night.jpg'
output_image_path = './output.jpg'

# In[4]:

from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input

# In[5]:

content_image = image.load_img(content_image_path, target_size=(224, 224))
#plt.imshow(content_image)
content_arr = image.img_to_array(content_image)
content_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(content_arr, axis=0)), tf.float64)
print(content_arr.shape)  # shape check: (1, 224, 224, 3); no tf.Session exists yet, so avoid sess.run here

# In[6]:

style_image = image.load_img(style_image_path, target_size=(224, 224))
#plt.imshow(style_image)
style_arr = image.img_to_array(style_image)
style_arr = tf.convert_to_tensor(preprocess_input(np.expand_dims(style_arr, axis=0)), tf.float64)
print(style_arr.shape)  # shape check: (1, 224, 224, 3)

# In[7]:

#generate random image with pixel values b/w 0 -> 255
o_input = np.random.randint(low=0, high=256, size=(224, 224, 3)).astype('float64')
plt.imshow(o_input.astype('uint8'))  # cast for display: imshow expects uint8 or floats in [0, 1]
o_input_old = np.copy(o_input)
o_input = preprocess_input(np.expand_dims(o_input, axis=0))
print(o_input_old)

o_input_var = tf.Variable(o_input, name="gen_img_vector", trainable=True)

# In[8]:

content_model = VGG16(include_top=False, weights='imagenet', input_tensor=content_arr, input_shape=(224, 224, 3))
style_model = VGG16(include_top=False, weights='imagenet', input_tensor=style_arr, input_shape=(224, 224, 3))
train_model = VGG16(include_top=False, weights='imagenet', input_tensor=o_input_var, input_shape=(224, 224, 3))
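# Note: these are three separate VGG16 graphs, each initialized with the same
# ImageNet weights; none of them is trained below -- only o_input_var is updated.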

# In[10]:

content_model.summary()

# In[11]:

def get_feature_rep(layer_type, layer_names, model):

    outputs = []
    for name in layer_names:
        out = model.get_layer(name=name).output

        N = tf.shape(out)[3]  # number of channels (feature maps)
        M = tf.multiply(tf.shape(out)[1], tf.shape(out)[2])  # spatial positions per channel (H * W)

        out = tf.transpose(tf.reshape(out, (M, N)))  # flatten spatial dims; each row of the (N, M) result is one channel
        if layer_type == 'style':
            out = get_gram_matrix(out)
        print(out)
        outputs.append(out)
    return outputs

# In[12]:

def get_gram_matrix(F):
    G = tf.matmul(F, tf.transpose(F))
    return G
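# Illustrative shape check (not part of the original run): for an (N, M)
# feature matrix F, e.g. F = tf.ones((64, 56 * 56), tf.float64),
# get_gram_matrix(F) has shape (N, N) = (64, 64).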


# In[13]:


def style_loss(Gs, As):

    total = tf.constant(0.0, tf.float64)

    for G, A in zip(Gs, As):
        loss = tf.reduce_sum(tf.cast(tf.squared_difference(G, A), tf.float64), [0, 1])
        # G is the (N x N) Gram matrix, so both dimensions here are the channel count
        N_layer = tf.shape(G)[0]
        M_layer = tf.shape(G)[1]
        den = tf.square(tf.cast(tf.multiply(N_layer, M_layer), tf.float64))
        loss = loss/den
        loss = loss*0.2/4.0  # per-layer weight w_l = 0.2 and the paper's 1/4 factor
        total = total + loss

    return total
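# For reference, the paper's per-layer style term is
#   E_l = (1 / (4 * N_l**2 * M_l**2)) * sum((G_l - A_l)**2)
# where N_l is the number of channels and M_l the spatial size (H * W) of layer l.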


# In[14]:

def content_loss(P, F):
    loss = tf.reduce_sum(tf.cast(tf.squared_difference(P, F), tf.float64), [0, 1])
    loss = loss/2.0  # the 1/2 factor from the paper's content loss
    return loss

# In[15]:

content_layer_names = ['block4_conv2']
style_layer_names = ['block1_conv1', 'block2_conv1', 'block3_conv1', 'block4_conv1']

# In[32]:

P = tf.squeeze(get_feature_rep('content', content_layer_names, content_model))

# In[34]:

F = tf.squeeze(get_feature_rep('content', content_layer_names, train_model))

# In[18]:

#Each member of As consists of a feature map corresponding to a particular layer (dim. channels x pixels per channel)
As = get_feature_rep('style', style_layer_names, style_model)

# In[19]:

Gs = get_feature_rep('style', style_layer_names, train_model)

# In[20]:

styleloss = style_loss(Gs, As)

# In[21]:

contentloss = content_loss(P, F)

# In[22]:

total_loss = tf.add(styleloss, tf.multiply(tf.constant(0.01, tf.float64), contentloss))
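# The 0.01 factor plays the role of the paper's alpha/beta ratio
# (content term scaled down by 100x relative to style).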


# In[23]:

optimizer = tf.train.AdamOptimizer(5).minimize(total_loss, var_list=[o_input_var])  # Adam with learning rate 5; only the image variable is optimized

# In[26]:

def reprocess(x):
    # preprocess_input converted RGB -> BGR and subtracted the ImageNet means,
    # so add the means back in BGR order before flipping the channels
    VGG_MEAN = [103.939, 116.779, 123.68]  # B, G, R
    means = tf.reshape(tf.constant(VGG_MEAN, tf.float64), [1, 1, 3])
    x = tf.add(x, means)
    x = tf.clip_by_value(x, 0, 255)  # clip_by_value returns a new tensor; keep the result
    #bgr to rgb
    x = x[..., ::-1]
    return x

# In[27]:

saver = tf.train.Saver(tf.global_variables())

# In[28]:

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

#     saver.restore(sess, './model/nst_model.ckpt')

    for epoch in range(100):
        _, styleloss_curr, contentloss_curr, loss_curr, new_arr = sess.run([optimizer, styleloss, contentloss, total_loss, o_input_var])

        print('Epoch: %i    Content Loss: %.2f    Style Loss: %.2f    Total Loss: %.2f' % (epoch, contentloss_curr, styleloss_curr, loss_curr))

        if epoch % 15 == 0:
            saver.save(sess, './model/nst_model.ckpt')

# In[30]:

with tf.Session() as sess:
    new_arr = reprocess(new_arr)
    new_im = sess.run(tf.cast(tf.round(tf.squeeze(new_arr)), tf.uint8))
#     new_im = new_im[...,::-1]
#     print(sess.run(new_arr[0]/255))
    print(sess.run(tf.shape(new_im)))
    plt.imshow(new_im)
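For comparison, many implementations initialize the generated image from the content image plus a little noise instead of pure random noise, which reportedly makes the early loss drop much steeper. A minimal sketch of that initialization (variable names are illustrative and not part of my notebook):

# Hypothetical alternative initialization: preprocessed content image + Gaussian noise
content_np = image.img_to_array(image.load_img(content_image_path, target_size=(224, 224)))
content_np = preprocess_input(np.expand_dims(content_np, axis=0))
noisy_init = content_np + np.random.normal(scale=10.0, size=content_np.shape)
o_input_var_alt = tf.Variable(noisy_init, name="gen_img_vector_alt", trainable=True)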

Here is a plot of the style (blue) and content (red) losses after 150 iterations (6-7 minutes): [plot: style-transfer losses]

Typical implementations are known to converge after 15-20 minutes, with a steep initial drop in the loss. In my case, even after 500 iterations the generated image is still essentially colored noise.
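To narrow down which layer dominates the style loss, one could log the raw per-layer terms from the Gs/As built above (a diagnostic sketch, not something I have run):

# Per-layer raw style terms, before normalization and weighting
per_layer_terms = [tf.reduce_sum(tf.cast(tf.squared_difference(G, A), tf.float64))
                   for G, A in zip(Gs, As)]
# running sess.run(per_layer_terms) inside the training session gives one value per style layer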
