CUDA - 内存限制 - 向量和 [英] CUDA - Memory Limit - Vector Summation

查看:129
本文介绍了CUDA - 内存限制 - 向量和的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试学习CUDA,下面的代码适用于值N <= 16384,但对于较大的值失败(在代码结束时的求和检查失败,c值对于索引值始终为0 of i> = 16384)。

 #include< iostream> 
#includecuda_runtime.h
#include../ cuda_be / book.h

#define N(16384)

__global__ void add(int * a,int * b,int * c)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N)
{
c [tid] = a [tid] + b [tid]
tid + = blockDim.x * gridDim.x;
}
}

int main()
{
int a [N],b [N],c [N]
int * dev_a,* dev_b,* dev_c;

//分配mem on gpu
HANDLE_ERROR(cudaMalloc((void **)& dev_a,N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void **)& dev_b,N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void **)& dev_c,N * sizeof(int)));

for(int i = 0; i {
a [i] = -i;
b [i] = i * i;
}

HANDLE_ERROR(cudaMemcpy(dev_a,a,N * sizeof(int),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b,b,N * sizeof(int),cudaMemcpyHostToDevice));
system(PAUSE);
add<<<< 128/128>>>(dev_a,dev_b,dev_c);

//将数组'c'从gpu复制到cpu

HANDLE_ERROR(cudaMemcpy(c,dev_c,N * sizeof(int),cudaMemcpyDeviceToHost)

系统(PAUSE);
bool success = true;
for(int i = 0; i {
if((a [i] + b [i])!= c [i])
{
printf(%d中的错误:%d +%d!=%d \\\
,i,a [i],b [i],c [i]);
system(PAUSE);
success = false;
}

}

if(success)printf(We did it!\\\
);

cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);

return 0;
}

我认为这是一个共享内存相关的问题,但我不能来有一个很好的解释(可能缺乏知识)。你能为我提供一个解释和一个解决方法来运行N的值大于16384.这是我的GPU的规格:

 设备0的一般信息
名称:GeForce 9600M GT
计算能力:1.1
时钟速率:1250000
设备复制重叠:启用
内核执行超时:启用
设备的内存信息0
全局内存总数:536870912
总内存:65536
最大内存:2147483647
纹理对齐256
MP信息设备0
多处理器计数:4
共享内存每个mp:16384
注册每个mp:8192
线程在warp:32
每个块的最大线程:512
最大线程尺寸:(512,512,64)
最大网格尺寸:(65535,65535,1)




  while(tid< N) 

  if(tid< N)


I'm trying to learn CUDA and the following code works OK for the values N<= 16384, but fails for the greater values(Summation check at the end of the code fails, c values are always 0 for the index value of i>=16384).

#include<iostream>
#include"cuda_runtime.h"
#include"../cuda_be/book.h"

#define N (16384)

__global__ void add(int *a,int *b,int *c)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid<N)
    {
        c[tid] = a[tid] + b[tid];
        tid += blockDim.x * gridDim.x;
    }
}

int main()
{
    int a[N],b[N],c[N];
    int *dev_a,*dev_b,*dev_c;

    //allocate mem on gpu
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,N*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,N*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c,N*sizeof(int)));

    for(int i=0;i<N;i++)
    {
        a[i] = -i;
        b[i] = i*i;
    }

    HANDLE_ERROR(cudaMemcpy(dev_a,a,N*sizeof(int),cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b,b,N*sizeof(int),cudaMemcpyHostToDevice));
    system("PAUSE");
    add<<<128,128>>>(dev_a,dev_b,dev_c);

    //copy the array 'c' back from the gpu to the cpu

    HANDLE_ERROR( cudaMemcpy(c,dev_c,N*sizeof(int),cudaMemcpyDeviceToHost));

    system("PAUSE");
    bool success = true;
    for(int i=0;i<N;i++)
    {
        if((a[i] + b[i]) != c[i])
        {
            printf("Error in %d: %d + %d != %d\n",i,a[i],b[i],c[i]);
            system("PAUSE");
            success = false;
        }

    }

    if(success) printf("We did it!\n");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}

I think it's a shared memory related problem, but I can't come up with a good explanation(Possible lack of knowledge). Could you provide me an explanation and a workaround to run for the values of N greater than 16384. Here is the specs for my GPU:

General Info for device 0
Name: GeForce 9600M GT
Compute capability: 1.1
Clock rate: 1250000
Device copy overlap : Enabled
Kernel Execution timeout : Enabled
Mem info for device 0
Total global mem: 536870912
Total const mem: 65536
Max mem pitch: 2147483647
Texture Alignment 256
MP info about device 0
Multiproccessor count: 4
Shared mem per mp: 16384
Registers per mp: 8192
Threads in warp: 32
Max threads per block: 512
Max thread dimensions: (512,512,64)
Max grid dimensions: (65535,65535,1)

解决方案

You probably intended to write

while(tid<N)

not

if(tid<N)

这篇关于CUDA - 内存限制 - 向量和的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆