Why do the training and validation losses steadily increase (eventually reaching NaN) in this CNN?

data-mining machine-learning image-classification pytorch convolutional-neural-network
2022-03-07 00:03:46

Dear machine learning and data science experts,

Each biological specimen in my dataset comes as a 4-layer grayscale image. I am trying to train a 4-convolution CNN (see the PyTorch architecture below) to classify the specimens into 3 classes.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset #custom dataset class
from torchvision import transforms
import numpy as np
import pandas as pd #read csv
import matplotlib.pyplot as plt

"""
Define the neural network architecture
"""
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        #input shape (4,224,224)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=4,out_channels=64,kernel_size=5,stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),#output shape (64,112,112)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(64,256,5,1,2),#shape (256,112,112)
            nn.ReLU(),
            nn.MaxPool2d(2),#output shape (256,56,56)
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(256,512,4,4,4), #shape (512,16,16)
            nn.ReLU(),
            nn.MaxPool2d(2), #output shape (512,8,8)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(512,1024,5,1,2), #shape (1024,8,8)
            nn.ReLU(),
            nn.MaxPool2d(2), #output shape (1024,4,4)
        )

        self.fc1 = nn.Linear(1024*4*4,1024) 
        self.fc2 = nn.Linear(1024,32)
        self.fc3 = nn.Linear(32,3) #3 classes              

    def forward(self,x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(-1,1024*4*4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

"""
statistics I have calculated on my own from the training dataset
"""
DATASET_MEAN = [0.136,0.113,0.182,0.428]
DATASET_SD = [0.259,0.181,0.230,0.190]
normalize = transforms.Normalize(DATASET_MEAN,DATASET_SD)

data_transforms = {
    #input is ndarray of shape H,W,4
    'train': transforms.Compose([
        transforms.ToPILImage(), #wrongly assumes it's RGBA, but there's no choice because these transforms can only be applied to PIL Images
        transforms.Resize(size=256),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomResizedCrop(size=224),
        transforms.ToTensor(),
        normalize]),
    'val': transforms.Compose([        
        transforms.ToPILImage(), #same thing: wrongly assumes the 4 channels are RGBA
        transforms.Resize(size=224),
        transforms.ToTensor(),
        normalize]),
    }
"""
Define the dataset class
"""
class CTCDataset(Dataset):
    def __init__(self,csv_path,transforms):
        self.transformations = transforms
        self.csv_data = pd.read_csv(csv_path,header=None)
        self.image_arr = np.asarray(self.csv_data.iloc[:,0])
        self.label_arr = np.asarray(self.csv_data.iloc[:,1])
        self.data_len = len(self.csv_data.index)

    def __getitem__(self,index):
        single_image_name = self.image_arr[index]
        img_as_np = np.load(single_image_name)
        img_as_tensor = self.transformations(img_as_np)
        single_image_label = self.label_arr[index]
        return (img_as_tensor,single_image_label)

    def __len__(self):
        return self.data_len

train_csv_path = r'train.csv'
train_dataset = CTCDataset(
    csv_path=train_csv_path,
    transforms=data_transforms['train']
    )

val_csv_path = r'val.csv'
val_dataset = CTCDataset(
    csv_path=val_csv_path,
    transforms=data_transforms['val']
    )

from torch.utils.data.sampler import SubsetRandomSampler

num_train = len(train_dataset)
train_indices = list(range(num_train))
num_val = len(val_dataset)
val_indices = list(range(num_val))
trainloader = torch.utils.data.DataLoader(train_dataset,
                                          batch_size = 16,
                                          sampler=SubsetRandomSampler(train_indices))

testloader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size = 4,
                                         sampler=SubsetRandomSampler(val_indices))

model = Net()
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-6)
"""
following code is from https://towardsdatascience.com/how-to-train-an-image-classifier-in-pytorch-and-use-it-to-perform-basic-inference-on-single-images-99465a1e9bf5
"""
epochs = 5
steps= 0
running_loss = 0
print_every = 10
train_losses, test_losses = [],[]

for epoch in range(epochs):
    for inputs, labels in trainloader:
        steps += 1
        optimizer.zero_grad()
        logps = model.forward(inputs)
        loss = criterion(logps,labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if steps % print_every == 0:
            test_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in testloader:
                    logps = model.forward(inputs)
                    batch_loss = criterion(logps,labels)
                    test_loss += batch_loss.item()
                    ps = torch.exp(logps)
                    top_p, top_class = ps.topk(1,dim=1)
                    equals = (top_class == labels.view(*top_class.shape))                    
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

            train_losses.append(running_loss/len(trainloader))
            test_losses.append(test_loss/len(testloader))
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.."
                  f"Test loss: {test_loss/len(testloader):.3f}.."
                  f"Test accuracy: {accuracy/len(testloader):.3f}")
            running_loss = 0
            model.train()

#plot the training and validation losses
plt.plot(train_losses,label='Training loss')
plt.plot(test_losses,label='Validation loss')
plt.legend(frameon=False)
plt.show()

torch.save(model,'model_v1.1.pth')

Here is the loss plot:

[Loss plot: the training and validation losses grow exponentially in magnitude]

And here are the loss and accuracy values at the end of each epoch:

Epoch 1/5.. Train loss: -1.906..Test loss: -1.412..Test accuracy: 0.056
Epoch 2/5.. Train loss: -72.720..Test loss: -52.513..Test accuracy: 0.056
Epoch 3/5.. Train loss: -573.684..Test loss: -390.878..Test accuracy: 0.014
Epoch 4/5.. Train loss: -2662.921..Test loss: -1772.838..Test accuracy: 0.014
Epoch 5/5.. Train loss: -8421.151..Test loss: -5458.454..Test accuracy: 0.014

My network does not seem to be training at all. Do you think the problem is a mistake in my network architecture? Am I using too few layers for my multi-channel input?

I would also like to show you what my input looks like. See below the 4 channels of one biological specimen: [Image: the 4 input channels]

I have 2746 such images on my hard drive for the training set and another 100 or so for the validation set. I should point out that the classes are heavily imbalanced: 54% are class '0', 45% class '1', and only 1% class '2'. This reflects the actual distribution of the classes in nature.
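
I have not tried to compensate for this imbalance yet. One idea would be inverse-frequency class weights in the loss; the snippet below is only a rough sketch of that idea (the frequencies are just the percentages above, and the weighted nn.NLLLoss is not something my training code currently uses):

import torch
import torch.nn as nn

#class proportions stated above: 54% class 0, 45% class 1, 1% class 2
class_freq = torch.tensor([0.54, 0.45, 0.01])

#inverse-frequency weights, rescaled so that they sum to the number of classes
class_weights = 1.0 / class_freq
class_weights = class_weights / class_weights.sum() * len(class_freq)

#hypothetical weighted criterion (my current code uses an unweighted nn.NLLLoss())
weighted_criterion = nn.NLLLoss(weight=class_weights)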

1 Answer

After posting this question on LinkedIn, I found the answer. The following answers came from ML researchers and data science managers at leading companies:

"Check whether your variables are normalized. NaNs show up in cases of vanishing and exploding gradients..."
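
One quick way to check this on my own data (a minimal sketch using the trainloader defined in the question; this snippet is my own, not part of the LinkedIn answers):

#grab one batch and verify that, after the Normalize transform,
#each of the 4 channels has roughly zero mean and unit standard deviation
images, labels = next(iter(trainloader))
print("per-channel mean:", images.mean(dim=(0, 2, 3)))
print("per-channel std: ", images.std(dim=(0, 2, 3)))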

"Check your weights. They are most likely going off to infinity."

Based on the two answers above, we can tell that our weights are exploding...
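
To follow up on the second suggestion, here is a minimal sketch of how one can watch for that during training (my own illustration, using the model from the question; not code from the LinkedIn answers). Calling it after each optimizer.step() reports any parameter or gradient that has become non-finite:

import torch

def check_finiteness(model):
    #warn about any parameter or gradient containing inf/NaN,
    #and report the largest absolute weight per layer
    for name, param in model.named_parameters():
        if not torch.isfinite(param).all():
            print(f"non-finite values in weights of {name}")
        if param.grad is not None and not torch.isfinite(param.grad).all():
            print(f"non-finite values in gradients of {name}")
        print(f"{name}: max |w| = {param.abs().max().item():.3e}")

#example usage inside the training loop:
#    optimizer.step()
#    check_finiteness(model)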