Training a neural network on 4 GPUs with PyTorch gives less than 2x the performance of a single GPU (somewhere between 1x and 2x, incidentally). From nvidia-smi we see that the GPUs are busy for only a few milliseconds, and for the next 5-10 seconds they appear to be unloading and loading data for the next step (GPU utilization is 0% most of the time). Is there any way in PyTorch to speed up uploading data to, and offloading it from, the GPUs?
Why does training a network on 4 GPUs not give 4x the performance of one GPU?
artificial-intelligence
training
pytorch
gpu
2021-11-05 20:03:34
1 Answer
Your Dataset class probably does a lot of preprocessing. You should use a DataLoader: it prefetches data from the dataset while the GPU is busy computing. You can also preprocess all the data up front and save it to a file. Multiple GPUs do not scale linearly because they have to transfer all their data back to one GPU to compute the loss; 4 GPUs give roughly 3.5x the performance of one. A large batch size also helps, since each GPU then receives 1/4 of the batch; a batch size of 64-128 works well with 4 GPUs. For multi-GPU code, see the CIFAR-10 example further below; it uses a DataLoader and DataParallel.
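First, a minimal sketch of just the DataLoader settings that control prefetching and host-to-GPU copies. The pin_memory, prefetch_factor, and non_blocking arguments are my additions for illustration and are not part of the original script:

import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stand-in dataset so the sketch runs on its own; substitute your real dataset.
dataset = TensorDataset(torch.randn(10000, 3, 32, 32),
                        torch.randint(0, 10, (10000,)))

# num_workers > 0 spawns background processes that load and preprocess the
# next batches while the GPU is busy; pin_memory places each batch in
# page-locked host RAM, which makes the copy to the GPU faster and lets the
# copy overlap with compute when issued with non_blocking=True.
loader = DataLoader(dataset, batch_size=128, shuffle=True,
                    num_workers=4, pin_memory=True, prefetch_factor=2)

for inputs, targets in loader:
    inputs = inputs.to(device, non_blocking=True)
    targets = targets.to(device, non_blocking=True)
    # ... forward pass, loss, backward pass, optimizer step ...

The full multi-GPU training script follows: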
import os
import time
import datetime
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from model import pyramidnet  # PyramidNet definition from the same repository

parser = argparse.ArgumentParser(description='cifar10 classification models')
parser.add_argument('--lr', type=float, default=0.1, help='learning rate')
parser.add_argument('--resume', default=None, help='checkpoint to resume from')
parser.add_argument('--batch_size', type=int, default=768,
                    help='total batch size, split across the GPUs')
parser.add_argument('--num_worker', type=int, default=4,
                    help='number of DataLoader worker processes')
parser.add_argument('--gpu_devices', type=int, nargs='+', default=None,
                    help='GPU ids to use, e.g. 0 1 2 3')
args = parser.parse_args()

# Restrict CUDA to the requested GPUs (only if --gpu_devices was given).
if args.gpu_devices is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(id) for id in args.gpu_devices)


def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print('==> Preparing data..')
    transforms_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    dataset_train = CIFAR10(root='../data', train=True, download=True,
                            transform=transforms_train)
    # Worker processes prefetch and augment batches while the GPUs compute.
    train_loader = DataLoader(dataset_train, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_worker)

    # The 10 CIFAR-10 classes (unused below; kept for reference).
    classes = ('plane', 'car', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck')

    print('==> Making model..')
    net = pyramidnet()
    # DataParallel splits each batch across all visible GPUs and gathers
    # the outputs back on one GPU.
    net = nn.DataParallel(net)
    net = net.to(device)
    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
    print('The number of parameters of model is', num_params)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    # optimizer = optim.SGD(net.parameters(), lr=args.lr,
    #                       momentum=0.9, weight_decay=1e-4)

    train(net, criterion, optimizer, train_loader, device)


def train(net, criterion, optimizer, train_loader, device):
    net.train()

    train_loss = 0
    correct = 0
    total = 0

    epoch_start = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        start = time.time()

        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        acc = 100 * correct / total

        batch_time = time.time() - start
        if batch_idx % 20 == 0:
            print('Epoch: [{}/{}]| loss: {:.3f} | acc: {:.3f} | batch time: {:.3f}s'.format(
                batch_idx, len(train_loader), train_loss / (batch_idx + 1), acc, batch_time))

    elapse_time = time.time() - epoch_start
    elapse_time = datetime.timedelta(seconds=elapse_time)
    print("Training time {}".format(elapse_time))


if __name__ == '__main__':
    main()
Source: https://github.com/dnddnjs/pytorch-multigpu/blob/master/data_parallel/train.py
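If you want to try the script, an invocation along these lines should work (the flags match its argparse definitions; adjust the GPU ids and batch size to your machine):

python train.py --gpu_devices 0 1 2 3 --batch_size 768 --num_worker 4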
Hope this helps, and have a great day!