PyTorch giving cuda runtime error


Problem description


I have made a slight modification in my code so that it does not use DataParallel and DistributedDataParallel. The code is as follows:

import argparse
import os
import shutil
import time

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet18)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N', help='mini-batch size (default: 256)')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--print-freq', '-p', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=1, type=int,
                    help='number of distributed processes')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='gloo', type=str,
                    help='distributed backend')

best_prec1 = 0


def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            #model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        #else:
            #model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        #model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomSizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]	'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})	'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})	'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})	'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})	'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.data[0], input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Test: [{0}/{1}]	'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})	'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})	'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})	'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   i, len(val_loader), batch_time=batch_time, loss=losses,
                   top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


if __name__ == '__main__':
    main()

And when I run this code on a set of images with the alexnet architecture, it gives a weird CUDA error, which is as follows:

=> creating model 'alexnet'
THCudaCheck FAIL file=/pytorch/torch/lib/THC/THCGeneral.c line=70 error=30 : unknown error
Traceback (most recent call last):
  File "imagenet2.py", line 319, in <module>
    main()
  File "imagenet2.py", line 87, in main
    model.cuda()
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in cuda
    return self._apply(lambda t: t.cuda(device_id))
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply
    module._apply(fn)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply
    module._apply(fn)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply
    module._apply(fn)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 124, in _apply
    param.data = fn(param.data)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in <lambda>
    return self._apply(lambda t: t.cuda(device_id))
  File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 66, in _cuda
    return new_type(self.size()).copy_(self, async)
  File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 266, in _lazy_new
    _lazy_init()
  File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 85, in _lazy_init
    torch._C._cuda_init()
RuntimeError: cuda runtime error (30) : unknown error at /pytorch/torch/lib/THC/THCGeneral.c:70

Command used for running the code: python imagenet.py --world-size 1 --arch 'alexnet' <image_folder>

Where did I go wrong?

PS: Running on an AWS g2.2xlarge Ubuntu instance.

The CUDA version is as follows:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2016 NVIDIA Corporation
Built on Tue_Jan_10_13:22:03_CST_2017
Cuda compilation tools, release 8.0, V8.0.61

Solution

  1. cuDNN gives useless error messages. For debugging, test your net on the CPU using net.cpu(), or simply remove the net.cuda(). You will have to do the same with the training, validation, and output variables; a minimal sketch of such a toggle follows.
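
As a rough illustration of that CPU-first debugging approach, here is a minimal sketch built around a hypothetical use_cuda flag (the flag is not part of the original script), reusing the imports and train_loader from the script above; every .cuda() call is gated behind it so the same code can first be verified on the CPU:

use_cuda = False  # hypothetical debug flag: flip to True once the CPU run works

model = models.alexnet()
criterion = nn.CrossEntropyLoss()
if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

for i, (input, target) in enumerate(train_loader):
    if use_cuda:
        input = input.cuda()
        target = target.cuda(async=True)
    input_var = torch.autograd.Variable(input)
    target_var = torch.autograd.Variable(target)
    output = model(input_var)            # forward pass runs on CPU or GPU depending on the flag
    loss = criterion(output, target_var)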

  2. It seems the problem is that you used the pre-trained AlexNet on images of a size different from 224x224. According to the documentation, it should work as long as the image size is at least 224x224; see the preprocessing sketch below.
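
For reference, a minimal preprocessing sketch that guarantees 224x224 inputs, reusing the same torchvision transforms the script above already applies to its validation set:

preprocess = transforms.Compose([
    transforms.Scale(256),        # resize the shorter side to 256 (older torchvision name for Resize)
    transforms.CenterCrop(224),   # crop to the 224x224 input the pre-trained AlexNet expects
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
val_dataset = datasets.ImageFolder(valdir, preprocess)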

  3. This is probably a tensor-shaping problem due to a hard-coded parameter in PyTorch's implementation of AlexNet. In vision/torchvision/models/alexnet.py, line 44, it says

x = x.view(x.size(0), 256 * 6 * 6)

change it to

x = x.view(x.size(0), -1)

This should allow it to work with different image sizes.
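
In context, the patched forward method of torchvision's AlexNet would look roughly like this (a sketch of the surrounding method, not a verbatim copy of the file):

def forward(self, x):
    x = self.features(x)
    # flatten with -1 so the reshape adapts to whatever spatial size
    # the feature extractor produces, instead of assuming 256 * 6 * 6
    x = x.view(x.size(0), -1)
    x = self.classifier(x)
    return x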

  I submitted this modification to the GitHub repository, but I guess it has not been updated yet.
