I am experimenting with the different reduction modes offered by the built-in loss functions. Specifically, I want to compare the following:

- the average gradient obtained by performing a backward pass for each per-sample loss computed with reduction="none";
- the average gradient obtained by dividing the gradient from reduction="sum" by the batch size;
- the average gradient produced by reduction="mean";
- the average gradient computed with reduction="mean" when the data points are fed through the model one at a time.
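
My expectation is that, by linearity of differentiation, all four estimates should coincide up to floating-point error. Writing \ell_i(\theta) for the loss of the i-th sample in a batch of size n:

\nabla_\theta \Big( \frac{1}{n} \sum_{i=1}^{n} \ell_i(\theta) \Big)
  = \frac{1}{n} \sum_{i=1}^{n} \nabla_\theta \ell_i(\theta)
  = \frac{1}{n} \nabla_\theta \Big( \sum_{i=1}^{n} \ell_i(\theta) \Big)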
My experiment code is as follows:
import numpy as np
import torch
import torch.nn as nn


def estimate_gradient(model, optimizer, batch):
    criterion_no_reduction = nn.CrossEntropyLoss(reduction="none").cuda()
    criterion_sum = nn.CrossEntropyLoss(reduction="sum").cuda()
    criterion_avg = nn.CrossEntropyLoss().cuda()  # default reduction="mean"

    input, target = batch
    input, target = input.cuda(), target.cuda()
    output = model(input)
    n = len(output)

    # 1) reduction="none": backward through each per-sample loss, then average the gradients.
    loss_no_reduction = criterion_no_reduction(output, target)
    grad_list_no_reduction = []
    for i in range(n):
        optimizer.zero_grad()
        loss_no_reduction[i].backward(retain_graph=True)
        for j, param in enumerate(model.parameters()):
            if param.requires_grad:
                grad = param.grad.view(-1, 1)
                if i == 0:
                    grad_list_no_reduction.append(grad)
                else:
                    grad_list_no_reduction[j] = torch.cat((grad_list_no_reduction[j], grad), dim=1)
    grad_out_no_reduction = torch.cat(grad_list_no_reduction, dim=0)
    grad_out_no_reduction = (torch.sum(grad_out_no_reduction, dim=1) / n).cpu().detach().numpy().flatten()

    # 2) reduction="sum": one backward pass, then divide the gradient by the batch size.
    loss_sum = criterion_sum(output, target)
    optimizer.zero_grad()
    loss_sum.backward(retain_graph=True)
    for j, param in enumerate(model.parameters()):
        if param.requires_grad:
            if j == 0:
                grad_list_sum = param.grad.view(-1)
            else:
                grad_list_sum = torch.cat((grad_list_sum, param.grad.view(-1)))
    grad_out_sum = (grad_list_sum / n).cpu().detach().numpy().flatten()

    # 3) reduction="mean": one backward pass, gradient used as-is.
    loss_avg = criterion_avg(output, target)
    optimizer.zero_grad()
    loss_avg.backward(retain_graph=True)
    for j, param in enumerate(model.parameters()):
        if param.requires_grad:
            if j == 0:
                grad_list_avg = param.grad.view(-1)
            else:
                grad_list_avg = torch.cat((grad_list_avg, param.grad.view(-1)))
    grad_out_avg = grad_list_avg.cpu().detach().numpy().flatten()

    # 4) reduction="mean", one data point at a time, then average the per-sample gradients.
    target = target.view(-1, 1)
    grad_list_one_by_one = []
    for i in range(n):
        optimizer.zero_grad()
        curr_output = output[i].view(1, -1)
        loss = criterion_avg(curr_output, target[i])
        loss.backward(retain_graph=True)
        for j, param in enumerate(model.parameters()):
            if param.requires_grad:
                grad = param.grad.view(-1, 1)
                if i == 0:
                    grad_list_one_by_one.append(grad)
                else:
                    grad_list_one_by_one[j] = torch.cat((grad_list_one_by_one[j], grad), dim=1)
    grad_out_one_by_one = torch.cat(grad_list_one_by_one, dim=0)
    grad_out_one_by_one = (torch.sum(grad_out_one_by_one, dim=1) / n).cpu().detach().numpy().flatten()

    assert grad_out_no_reduction.shape == grad_out_sum.shape == grad_out_avg.shape == grad_out_one_by_one.shape
    print("Maximum discrepancy between reduction = none and sum: {}".format(np.max(np.abs(grad_out_no_reduction - grad_out_sum))))
    print("Maximum discrepancy between reduction = none and avg: {}".format(np.max(np.abs(grad_out_no_reduction - grad_out_avg))))
    print("Maximum discrepancy between reduction = none and one-by-one: {}".format(np.max(np.abs(grad_out_no_reduction - grad_out_one_by_one))))
    print("Maximum discrepancy between reduction = sum and avg: {}".format(np.max(np.abs(grad_out_sum - grad_out_avg))))
    print("Maximum discrepancy between reduction = sum and one-by-one: {}".format(np.max(np.abs(grad_out_sum - grad_out_one_by_one))))
    print("Maximum discrepancy between reduction = avg and one-by-one: {}".format(np.max(np.abs(grad_out_avg - grad_out_one_by_one))))
The results are as follows:
Maximum discrepancy between reduction = none and sum: 0.0316
Maximum discrepancy between reduction = none and avg: 0.0316
Maximum discrepancy between reduction = none and one-by-one: 0.0
Maximum discrepancy between reduction = sum and avg: 0.0
Maximum discrepancy between reduction = sum and one-by-one: 0.0316
Maximum discrepancy between reduction = avg and one-by-one: 0.0316
That is, reduction="none" and the one-by-one backward passes appear to produce identical results, and reduction="sum" agrees with reduction="mean", but the two pairs differ from each other. An explanation of this discrepancy (could it be caused by retain_graph=True?) would be really helpful. Thanks in advance!
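
For reference, at the level of the loss values I am relying on the usual relationship between the three reduction modes of an unweighted CrossEntropyLoss, which is also the assumption behind dividing the summed gradient by n. A quick sanity-check sketch with placeholder tensors:

import torch
import torch.nn as nn

# Placeholder logits and labels, purely for illustration.
logits = torch.randn(8, 10, requires_grad=True)
labels = torch.randint(0, 10, (8,))

loss_none = nn.CrossEntropyLoss(reduction="none")(logits, labels)  # shape (8,)
loss_sum = nn.CrossEntropyLoss(reduction="sum")(logits, labels)    # scalar
loss_mean = nn.CrossEntropyLoss(reduction="mean")(logits, labels)  # scalar

# With no class weights, these should agree up to floating-point error.
print(torch.allclose(loss_none.mean(), loss_mean))
print(torch.allclose(loss_sum / len(labels), loss_mean))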