从设备复制到主机时出现CUDA unkown错误 [英] CUDA unkown error when copying from device to host

查看:372
本文介绍了从设备复制到主机时出现CUDA unkown错误的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我写了一些CUDA代码,一切似乎很棒,直到我尝试从代码中获得结果:

  #include cuda_runtime.h
#includedevice_launch_parameters.h
#include< cstdlib>
#include< ctime>
#include< iostream>

#define maskSize 3

__constant__ float masks [32 * maskSize * maskSize];

__global__ void myConv(float * res,const float * mats,int mSize)
{
extern __shared__ float curr [];
int rSize = maskSize + mSize-1;
int idxmod =(threadIdx.x + maskSize-1)%(mSize + 2 * maskSize-2); //这两个映射任何值不在内(mSize-1,mSize-1)到边界填充。
int idymod =(threadIdx.y + maskSize-1)%(mSize + 2 * maskSize-2);
if(threadIdx.x< mSize&& threadIdx.y< mSize)//将mats的值放在curr矩阵的中间
curr [(threadIdx.x + maskSize-1 )*(mSize + 2 *(maskSize-1))+ threadIdx.y + maskSize-1] = mats [mSize *(blockIdx.y * mSize + threadIdx.x)+ threadIdx.y];
else // zero padding
if(threadIdx.x curr [threadIdx.x *(mSize + 2 *(maskSize-1))+ idymod] = 0;
else
curr [idxmod *(mSize + 2 *(maskSize-1))+ threadIdx.y] = 0;

__syncthreads();
float tmp = 0;

if(threadIdx.x< mSize + maskSize-1&& threadIdx.y< mSize + maskSize-1)
{
#pragma unroll $ b $ (for int i = 0; i #pragma unroll
for(int j = 0; j
tmp + = curr (threadIdx.x + i)*(mSize + 2 *(maskSize-1))+ threadIdx.y + j] * masks [blockIdx.x * maskSize * maskSize + maskSize * i + j]
res [blockIdx.y * rSize * rSize + threadIdx.x * rSize + threadIdx.y] = tmp;
}
}

int main()
{
int MatSize = 5;
int bSize = 2000;
int maskNum = 10;
int resSize = MatSize + maskSize-1;
float * ms
ms =(float *)malloc(maskSize * maskSize * maskNum * sizeof(float));
float * resPtr =(float *)malloc((MatSize + maskSize-1)*(MatSize + maskSize-1)* bSize * maskNum * sizeof
for(int i = 0; i for(int j = 0; j for(int k = 0; k< maskNum; )
ms [k * maskSize * maskSize + j * maskSize + i] =(float)(rand()%1000)/ 100;
float * inp =(float *)malloc(MatSize * MatSize * bSize * sizeof(float));
for(int i = 0; i for(int j = 0; j for(int k = 0; k< bSize; )
inp [k * MatSize * MatSize + j * MatSize + i] =(float)(rand()%500)/ 100;
float * cudams,* cudaresPtr,* cudainp;
cudaMalloc((void **)& cudams,maskSize * maskSize * maskNum * sizeof(float));
cudaMalloc((void **)& cudaresPtr,(MatSize + maskSize-1)*(MatSize + maskSize-1)* bSize * maskNum * sizeof(float)
cudaMalloc((void **)& cudainp,MatSize * MatSize * bSize * sizeof(float));

cudaMemcpy((void *)cudams,(void *)ms,maskSize * maskSize * maskNum * sizeof(float),cudaMemcpyHostToDevice);

cudaMemcpy((void *)cudainp,(void *)inp,MatSize * MatSize * bSize * sizeof(float),cudaMemcpyHostToDevice);

cudaMemcpyToSymbol(mask,(void *)cudams,maskSize * maskSize * maskNum * sizeof(float),0,cudaMemcpyDeviceToDevice);
dim3 threadSize(MatSize + 2 *(maskSize-1),MatSize + 2 *(maskSize-1));
dim3 blockSize(1,1); //用于测试目的。应该是dim3 blockSize(maskNum,bSize);
myConv<<<< blockSize,threadSize,(MatSize + 2 *(maskSize-1))*(MatSize + 2 *(maskSize-1))>(cudaresPtr,cudainp,MatSize) ;
cudaMemcpy((void *)resPtr,(const void *)cudaresPtr,(MatSize + maskSize-1)*(MatSize + maskSize-1)* bSize * maskNum * sizeof(float),cudaMemcpyDeviceToHost);
//问题是这里 - 他们的复制将无法工作!

free(inp);
free(ms);
free(resPtr);
return 0;
}

我把printf放在不同的地方, string ...找不到任何会导致将指针内容复制到主机的错误。



编辑:memcheck result:no errors如果我理解正确:


O:\CudaTst> cuda-memcheck CUDA_TST
========= CUDA-MEMCHECK



所用时间:0.144000秒错误:无法读取
的错误记录的字符串
=========错误摘要:0错误


使用-l(leak) - 0泄漏重新运行。

解决方案

看起来你是(至少)启动内核,动态分配的内存不足,无法在内核内部缓冲区溢出。



每个块的共享内存量以字节为单位,因此我怀疑你想要的是:



size_t shmsz = sizeof(float)* size_t(MatSize + 2 *(maskSize-1))*
(MatSize + 2 * 1));
myConv << >>(cudaresPtr,cudainp,MatSize);

除此之外,我将调试留给你。


I wrote some CUDA code, and everything seems great until I try to get the results from the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib> 
#include <ctime> 
#include <iostream>

#define maskSize 3 

__constant__ float masks[32*maskSize*maskSize];

__global__ void myConv(float *res, const float* mats, int mSize)
{
    extern __shared__ float curr[];
    int rSize=maskSize+mSize-1;
    int idxmod=(threadIdx.x+maskSize-1) % (mSize+2*maskSize-2); //these two map any value not within (mSize-1,mSize-1) to the boarders for padding.
    int idymod=(threadIdx.y+maskSize-1) % (mSize+2*maskSize-2);
    if (threadIdx.x < mSize && threadIdx.y < mSize) //put the value of mats in the middle of the curr matrix
        curr[(threadIdx.x+ maskSize-1)*(mSize+2*(maskSize-1)) + threadIdx.y + maskSize-1]=mats[mSize*(blockIdx.y*mSize + threadIdx.x) + threadIdx.y];
    else //zero padding
        if (threadIdx.x < mSize)
            curr[threadIdx.x*(mSize+2*(maskSize-1)) +idymod] =0;
        else 
            curr[idxmod*(mSize+2*(maskSize-1)) +threadIdx.y] =0;

    __syncthreads();
    float tmp=0;

if (threadIdx.x < mSize+maskSize-1 && threadIdx.y < mSize+maskSize-1)
{
#pragma unroll
    for (int i=0;i<maskSize;i++)
        #pragma unroll
        for (int j=0;j<maskSize;j++)

            tmp+=curr[(threadIdx.x+i)*(mSize+2*(maskSize-1)) + threadIdx.y+j]*masks[blockIdx.x*maskSize*maskSize +maskSize*i +j];
    res[blockIdx.y*rSize*rSize + threadIdx.x*rSize + threadIdx.y]=tmp;
}
}

int main()
{
    int MatSize=5;
    int bSize=2000;
    int maskNum=10;
    int resSize=MatSize+maskSize-1;
    float* ms;
    ms=(float *)malloc(maskSize*maskSize*maskNum*sizeof(float));
    float* resPtr=(float *)malloc((MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
    for (int i=0; i<maskSize;i++)
        for (int j=0; j<maskSize; j++)
            for (int k=0; k<maskNum; k++)
                ms[k*maskSize*maskSize + j*maskSize + i]=(float)(rand() % 1000)/100;
    float* inp=(float *)malloc(MatSize*MatSize*bSize*sizeof(float));
    for (int i=0; i<MatSize; i++)
        for (int j=0; j<MatSize; j++)
            for (int k=0;k<bSize;k++)
                inp[k*MatSize*MatSize + j*MatSize + i]=(float)(rand() % 500)/100;
    float *cudams, *cudaresPtr,*cudainp;
    cudaMalloc((void **) &cudams,maskSize*maskSize*maskNum*sizeof(float));
    cudaMalloc((void **) &cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
    cudaMalloc((void **) &cudainp,MatSize*MatSize*bSize*sizeof(float));

    cudaMemcpy((void *)cudams,(void *)ms,maskSize*maskSize*maskNum*sizeof(float),cudaMemcpyHostToDevice);

    cudaMemcpy((void *)cudainp,(void *)inp,MatSize*MatSize*bSize*sizeof(float),cudaMemcpyHostToDevice);

    cudaMemcpyToSymbol(masks,(void *)cudams,maskSize*maskSize*maskNum*sizeof(float),0,cudaMemcpyDeviceToDevice);
    dim3 threadSize(MatSize+2*(maskSize-1),MatSize+2*(maskSize-1));
    dim3 blockSize(1, 1); //for testing purposes. should be dim3 blockSize(maskNum,bSize);
    myConv<<<blockSize, threadSize, (MatSize+2*(maskSize-1))*(MatSize+2*(maskSize-1))>>>(cudaresPtr,cudainp,MatSize);
    cudaMemcpy((void *)resPtr,(const void *)cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float),cudaMemcpyDeviceToHost);
    //The problem is here - They copying won't work!

    free(inp);
    free(ms);
    free(resPtr);
    return 0;
}

I put printf in various places, used error checking as recommended here, printed error string... Can't find anything that would cause an error copying the contents of the pointer back to the host.

Edit: memcheck result: no errors if I understand correctly:

O:\CudaTst>cuda-memcheck CUDA_TST ========= CUDA-MEMCHECK

Time spent: 0.144000 secondsError: Failed to read the strings for error record ========= ERROR SUMMARY: 0 errors

Re-ran with -l (leak) - 0 leaks.

解决方案

It would appear that you are (at least) launching your kernel with insufficient dynamically allocated shared memory for it to run without a buffer overflow inside the kernel.

The amount of shared memory per block is specific in bytes, so I suspect you want something like:

size_t shmsz = sizeof(float)*size_t((MatSize+2*(maskSize-1))*
                                    (MatSize+2*(maskSize-1));
myConv<<<blockSize, threadSize, shmz)>>>(cudaresPtr,cudainp,MatSize);

Beyond that, I leave the debugging to you.

这篇关于从设备复制到主机时出现CUDA unkown错误的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆