L2 regularization combined with He normal weight initialization causes NaN loss when training a ResNet3D

data-mining python keras tensorflow
2022-02-11 14:24:46

I am running into a problem while training a ResNet3D architecture from scratch on a classification task over a video dataset. The problem appears when I enable L2 regularization together with He normal weight initialization: with this setup, the training and validation losses become NaN at some random training iteration and remain NaN until the end of the training process.

Each video clip in the dataset that is fed to the ResNet3D network has the following format:

16 images x 112 width x 112 height x 3 channels
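
That is, each sample is a 4-D array and a batch is a 5-D tensor. A quick shape sanity check (NumPy is used only for illustration; the batch size of 8 is a placeholder):

import numpy as np

clip = np.zeros((16, 112, 112, 3), dtype=np.float32)      # one clip, channels_last
batch = np.zeros((8, 16, 112, 112, 3), dtype=np.float32)  # the 5-D input Conv3D expects
print(clip.shape, batch.shape)  # (16, 112, 112, 3) (8, 16, 112, 112, 3)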

Below I include the code of the ResNet 3D architecture I am using; I took it from https://github.com/JihongJu/keras-resnet3d/blob/master/resnet3d/resnet3d.py. (I have reproduced the imports the file relies on.)

import math
import six
from keras import backend as K
from keras.models import Model
from keras.layers import (Input, Activation, Dense, Flatten, Conv3D,
                          MaxPooling3D, AveragePooling3D, BatchNormalization,
                          add)
from keras.regularizers import l2

def _bn_relu(input):
    """Helper to build a BN -> relu block (by @raghakot)."""
    norm = BatchNormalization(axis=CHANNEL_AXIS)(input)
    return Activation("relu")(norm)

def _conv_bn_relu3D(**conv_params):
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    strides = conv_params.setdefault("strides", (1, 1, 1))
    # kernel_initializer = conv_params.setdefault(
    #     "kernel_initializer", "he_normal")
    padding = conv_params.setdefault("padding", "same")
    kernel_regularizer = conv_params.setdefault("kernel_regularizer",
                                                l2(1e-4))

    def f(input):
        conv = Conv3D(filters=filters, kernel_size=kernel_size,
                      strides=strides, 
                      # kernel_initializer=kernel_initializer,
                      kernel_regularizer=kernel_regularizer,
                      padding=padding)(input)
        return _bn_relu(conv)

    return f

def _bn_relu_conv3d(**conv_params):
    """Helper to build a  BN -> relu -> conv3d block."""
    filters = conv_params["filters"]
    kernel_size = conv_params["kernel_size"]
    strides = conv_params.setdefault("strides", (1, 1, 1))
    # kernel_initializer = conv_params.setdefault("kernel_initializer",
    #                                             "he_normal")
    padding = conv_params.setdefault("padding", "same")
    kernel_regularizer = conv_params.setdefault("kernel_regularizer",
                                                l2(1e-4))

    def f(input):
        activation = _bn_relu(input)
        return Conv3D(filters=filters, kernel_size=kernel_size,
                      strides=strides, 
                      # kernel_initializer=kernel_initializer,
                      kernel_regularizer=kernel_regularizer,
                      padding=padding)(activation)
    return f


def _shortcut3d(input, residual):
    """3D shortcut to match input and residual and merges them with "sum"."""
    stride_dim1 = math.ceil(int(input.shape[DIM1_AXIS]) \
        / int(residual.shape[DIM1_AXIS]))
    stride_dim2 = math.ceil(int(input.shape[DIM2_AXIS]) \
        / int(residual.shape[DIM2_AXIS]))
    stride_dim3 = math.ceil(int(input.shape[DIM3_AXIS]) \
        / int(residual.shape[DIM3_AXIS]))
    equal_channels = int(residual.shape[CHANNEL_AXIS]) \
        == int(input.shape[CHANNEL_AXIS])

    shortcut = input
    if stride_dim1 > 1 or stride_dim2 > 1 or stride_dim3 > 1 \
            or not equal_channels:
        shortcut = Conv3D(
            filters=int(residual.shape[CHANNEL_AXIS]),
            kernel_size=(1, 1, 1),
            strides=(stride_dim1, stride_dim2, stride_dim3),
            # kernel_initializer="he_normal",
            kernel_regularizer=l2(1e-4),
            padding="valid"
            )(input)
    return add([shortcut, residual])


def _residual_block3d(block_function, filters, 
                      kernel_regularizer,
                      repetitions,
                      is_first_layer=False):
    def f(input):
        for i in range(repetitions):
            strides = (1, 1, 1)
            if i == 0 and not is_first_layer:
                strides = (2, 2, 2)
            input = block_function(filters=filters, strides=strides,
                                   kernel_regularizer=kernel_regularizer,
                                   is_first_block_of_first_layer=(
                                       is_first_layer and i == 0)
                                   )(input)
        return input

    return f


def basic_block(filters, strides=(1, 1, 1), kernel_regularizer=l2(1e-4),
                is_first_block_of_first_layer=False):
    """Basic 3 X 3 X 3 convolution blocks. Extended from raghakot's 2D impl."""
    def f(input):
        if is_first_block_of_first_layer:
            # don't repeat bn->relu since we just did bn->relu->maxpool
            conv1 = Conv3D(filters=filters, kernel_size=(3, 3, 3),
                           strides=strides,
                           # kernel_initializer="he_normal",
                           kernel_regularizer=kernel_regularizer,
                           padding="same"
                           )(input)
        else:
            conv1 = _bn_relu_conv3d(filters=filters,
                                    kernel_size=(3, 3, 3),
                                    kernel_regularizer=kernel_regularizer,
                                    strides=strides
                                    )(input)

        residual = _bn_relu_conv3d(filters=filters, 
                                   kernel_regularizer=kernel_regularizer,
                                   kernel_size=(3, 3, 3)
                                   )(conv1)
        return _shortcut3d(input, residual)

    return f


def bottleneck(filters, strides=(1, 1, 1), kernel_regularizer=l2(1e-4),
               is_first_block_of_first_layer=False):
    """Basic 3 X 3 X 3 convolution blocks. Extended from raghakot's 2D impl."""
    def f(input):
        if is_first_block_of_first_layer:
            # don't repeat bn->relu since we just did bn->relu->maxpool
            conv_1_1 = Conv3D(filters=filters, kernel_size=(1, 1, 1),
                              strides=strides, 
                              # kernel_initializer="he_normal",
                              kernel_regularizer=kernel_regularizer,
                              padding="same"
                              )(input)
        else:
            conv_1_1 = _bn_relu_conv3d(filters=filters, kernel_size=(1, 1, 1),
                                       kernel_regularizer=kernel_regularizer,
                                       strides=strides
                                       )(input)

        conv_3_3 = _bn_relu_conv3d(filters=filters,
                                   kernel_regularizer=kernel_regularizer,
                                   kernel_size=(3, 3, 3)
                                   )(conv_1_1)
        residual = _bn_relu_conv3d(filters=filters * 4,
                                   kernel_regularizer=kernel_regularizer,
                                   kernel_size=(1, 1, 1)
                                   )(conv_3_3)

        return _shortcut3d(input, residual)

    return f


def _handle_data_format():
    global DIM1_AXIS
    global DIM2_AXIS
    global DIM3_AXIS
    global CHANNEL_AXIS
    if K.image_data_format() == 'channels_last':
        print("here CHANNELS last")
        DIM1_AXIS = 1
        DIM2_AXIS = 2
        DIM3_AXIS = 3
        CHANNEL_AXIS = 4
    else:
        CHANNEL_AXIS = 1
        DIM1_AXIS = 2
        DIM2_AXIS = 3
        DIM3_AXIS = 4


def _get_block(identifier):
    if isinstance(identifier, six.string_types):
        res = globals().get(identifier)
        if not res:
            raise ValueError('Invalid {}'.format(identifier))
        return res
    return identifier


class Resnet3DBuilder(object):
    """ResNet3D."""

    @staticmethod
    def build(input_shape, num_outputs, block_fn, repetitions, reg_factor):
        """Instantiate a vanilla ResNet3D keras model.
        # Arguments
            input_shape: Tuple of input shape in the format
            (conv_dim1, conv_dim2, conv_dim3, channels) if dim_ordering='tf'
            (channels, conv_dim1, conv_dim2, conv_dim3) if dim_ordering='th'
            num_outputs: The number of outputs at the final softmax layer
            block_fn: Unit block to use {'basic_block', 'bottleneck'}
            repetitions: Repetitions of unit blocks
        # Returns
            model: a 3D ResNet model that takes a 5D tensor (volumetric images
            in batch) as input and returns a 1D vector (prediction) as output.
        """
        _handle_data_format()
        if len(input_shape) != 4:
            raise ValueError("Input shape should be a tuple "
                             "(conv_dim1, conv_dim2, conv_dim3, channels) "
                             "for tensorflow as backend or "
                             "(channels, conv_dim1, conv_dim2, conv_dim3) "
                             "for theano as backend")

        block_fn = _get_block(block_fn)
        input = Input(shape=input_shape)
        # first conv
        conv1 = _conv_bn_relu3D(filters=64, kernel_size=(7, 7, 7),
                                kernel_regularizer=l2(reg_factor),
                                strides=(2, 2, 2)
                                )(input)
        pool1 = MaxPooling3D(pool_size=(3, 3, 3), strides=(2, 2, 2),
                             padding="same")(conv1)

        # repeat blocks
        block = pool1
        filters = 64
        for i, r in enumerate(repetitions):
            block = _residual_block3d(block_fn, filters=filters,
                                      kernel_regularizer=l2(reg_factor),
                                      repetitions=r, is_first_layer=(i == 0)
                                      )(block)
            filters *= 2

        # last activation
        block_output = _bn_relu(block)

        # average pool and classification
        pool2 = AveragePooling3D(pool_size=(int(block.shape[DIM1_AXIS]),
                                            int(block.shape[DIM2_AXIS]),
                                            int(block.shape[DIM3_AXIS])),
                                 strides=(1, 1, 1))(block_output)
        flatten1 = Flatten()(pool2)
        if num_outputs > 1:
            dense = Dense(units=num_outputs,
                          # kernel_initializer="he_normal",
                          kernel_regularizer=l2(reg_factor),
                          activation="softmax"
                          
                          )(flatten1)
        else:
            dense = Dense(units=num_outputs,
                          # kernel_initializer="he_normal",
                          kernel_regularizer=l2(reg_factor),
                          activation="sigmoid"
                          )(flatten1)

        model = Model(inputs=input, outputs=dense)
        return model

    @staticmethod
    def build_resnet_18(input_shape, num_outputs, reg_factor=1e-4):
        """Build resnet 18."""
        return Resnet3DBuilder.build(input_shape, num_outputs, basic_block,
                                     [2, 2, 2, 2], reg_factor=reg_factor)

    @staticmethod
    def build_resnet_34(input_shape, num_outputs, reg_factor=1e-4):
        """Build resnet 34."""
        return Resnet3DBuilder.build(input_shape, num_outputs, basic_block,
                                     [3, 4, 6, 3], reg_factor=reg_factor)

    @staticmethod
    def build_resnet_50(input_shape, num_outputs, reg_factor=1e-4):
        """Build resnet 50."""
        return Resnet3DBuilder.build(input_shape, num_outputs, bottleneck,
                                     [3, 4, 6, 3], reg_factor=reg_factor)

    @staticmethod
    def build_resnet_101(input_shape, num_outputs, reg_factor=1e-4):
        """Build resnet 101."""
        return Resnet3DBuilder.build(input_shape, num_outputs, bottleneck,
                                     [3, 4, 23, 3], reg_factor=reg_factor)

    @staticmethod
    def build_resnet_152(input_shape, num_outputs, reg_factor=1e-4):
        """Build resnet 152."""
        return Resnet3DBuilder.build(input_shape, num_outputs, bottleneck,
                                     [3, 8, 36, 3], reg_factor=reg_factor)

I am training the 34-layer architecture.
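
For reference, this is roughly how I instantiate it (num_classes is a placeholder for the number of classes in my dataset; its exact value is not important here):

num_classes = 10  # placeholder
model = Resnet3DBuilder.build_resnet_34(
    input_shape=(16, 112, 112, 3),  # frames, width, height, channels
    num_outputs=num_classes)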

When L2 regularization is enabled, I train the Resnet3D network with the following regularization factor value: 1e-4.
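
For concreteness, l2(1e-4) adds 1e-4 * sum(W ** 2) to the training loss for each regularized kernel W. A small worked example (the kernel shape here is just an illustration):

import numpy as np
import keras.backend as K
from keras.regularizers import l2

W = np.ones((3, 3, 3, 3, 64), dtype=np.float32)  # dummy 3x3x3 kernel, 3 in / 64 out channels
penalty = K.eval(l2(1e-4)(K.constant(W)))
print(penalty)  # 1e-4 * 3*3*3*3*64 = 0.5184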

For all the cases I use the following hyperparameters:

from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import Adam

reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                   patience=1, verbose=1, mode='min', min_lr=1e-4)
callbacks_list = [checkpoint, reduceLROnPlat, earlyStop]
optim = Adam(lr=1e-3)
model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])
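
(checkpoint and earlyStop are defined earlier in my script, roughly as below; the file name and patience value are placeholders:)

from keras.callbacks import ModelCheckpoint, EarlyStopping

checkpoint = ModelCheckpoint('weights.best.hdf5', monitor='val_loss',
                             verbose=1, save_best_only=True, mode='min')
earlyStop = EarlyStopping(monitor='val_loss', patience=5, mode='min')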

In the figures below I show the evolution of the validation loss and the validation accuracy when (1) only He normal weight initialization is enabled, (2) only L2 regularization is enabled, and (3) both settings are enabled. Note that the original code on GitHub has both settings enabled, but when applied to my dataset this does not work.

[Figure: validation loss]

[Figure: validation accuracy]

Here, starting from epoch 18, the validation loss of the green curve becomes NaN, which triggers the collapse of the validation accuracy.

My question is: why does the NaN loss happen when both L2 regularization and He normal weight initialization are enabled? Since it only occurs when L2 and He normal are combined, I suspect an exploding/vanishing gradient problem.
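
(For reference, Keras' he_normal draws weights from a truncated normal distribution with stddev = sqrt(2 / fan_in).) One sanity check I can run is to measure how much of the total loss comes from the L2 penalty itself, since Keras collects the per-layer regularization terms in model.losses. A minimal sketch, assuming the model built above:

import keras.backend as K

reg_terms = model.losses                       # one L2 tensor per regularized layer
total_reg = K.eval(K.sum(K.stack(reg_terms)))  # total L2 penalty at the current weights
print('L2 penalty contribution to the loss:', total_reg)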
