最小归约(min reduction)CUDA 不工作 [英] Min reduction cuda does not work

查看:116
本文介绍了最小减少cuda不工作的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我写了一个代码,通过减少找到最小值。但是结果总是为零。我不知道是什么问题。请帮帮我。



这里是内核代码:我修改了Nvidia的sum减少代码。

  #include< limits.h> 

#define NumThread 128
#define NumBlock 32

__global__ void min_reduce(int * In,int * Out,int n){
__shared__ int sdata [NumThread];
unsigned int i = blockIdx.x * NumThread + threadIdx.x;
unsigned int tid = threadIdx.x;
unsigned int gridSize = NumBlock * NumThread;
int myMin = INT_MAX;

while(i if(In [i] myMin = In [i]
i + = gridSize;
}
sdata [tid] = myMin;
__syncthreads();

if(NumThread> = 1024){
if(tid< 512)
if(sdata [tid]> sdata [tid + 512])sdata [tid ] = sdata [tid + 512];
__syncthreads();
}
if(NumThread> = 512){
if(sdata [tid]> sdata [tid + 256])sdata [tid] = sdata [tid + 256]
__syncthreads();
}
if(NumThread> = 256){
if(sdata [tid]> sdata [tid + 128]&& sdata [tid + 128]!= sdata [tid] = sdata [tid + 128];
__syncthreads();
}
if(NumThread> = 128){
if(sdata [tid]> sdata [tid + 64])sdata [tid] = sdata [tid + 64]
__syncthreads();
}
//以下做法已弃用:
if(tid< 32){
volatile int * smem = sdata;
if(NumThread> = 64)if(smem [tid]> smem [tid + 32])smem [tid] = smem [tid + 32];
if(NumThread> = 32)if(smem [tid]> smem [tid + 16])smem [tid] = smem [tid + 16];
if(NumThread> = 16)if(smem [tid]> smem [tid + 8])smem [tid] = smem [tid + 8];
if(NumThread> = 8)if(smem [tid]> smem [tid + 4])smem [tid] = smem [tid + 4];
if(NumThread> = 4)if(smem [tid]> smem [tid + 2])smem [tid] = smem [tid + 2];
if(NumThread> = 2)if(smem [tid]> smem [tid + 1])smem [tid] = smem [tid + 1];
}
if(tid == 0)
if(sdata [0]< sdata [1])Out [blockIdx.x] = sdata [0]
else Out [blockIdx.x] = sdata [1];
}

这里是我的主要代码:

  #include< stdio.h> 
#include< stdlib.h>

#includemin_reduction.cu

int main(int argc,char * argv []){
unsigned int length = 1048576;
int i,Size,min;
int * a,* out,* gpuA,* gpuOut;

cudaSetDevice(0);
Size = length * sizeof(int);
a =(int *)malloc(Size);
out =(int *)malloc(NumBlock * sizeof(int));
for(i = 0; i
cudaMalloc((void **)& gpuA,Size);
cudaMalloc((void **)& gpuOut,NumBlock * sizeof(int));
cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
min_reduce<<<< NumBlock,NumThread>>>(gpuA,gpuOut,length);
cudaDeviceSynchronize();
cudaMemcpy(out,gpuOut,NumBlock * sizeof(int),cudaMemcpyDeviceToHost);

min = out [0];
for(i = 1; i return 0;
}


解决方案

我不确定是否同意 @HubertApplebaum 所说的一切,但我同意"使用正确的 CUDA 错误检查"这一建议。正如你在代码中提到的,warp 同步编程可以被认为已被弃用,但我(目前)还不能支持"它已经失效"的断言。不过我不想就此辩论;这也不是你问题的核心。



另一个有用的调试建议是:按照此处的步骤,使用 -lineinfo 编译代码,并用 cuda-memcheck 运行程序。如果你这样做了,会看到许多类似这样的报告:

========= Invalid __shared__ read of size 4
=========     at 0x000001e0 in /home/bob/misc/t1074.cu:39:min_reduce(int*, int*, int)
=========     by thread (64,0,0) in block (24,0,0)
=========     Address 0x00000200 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
=========     Host Frame:./t1074 [0x16dc1]
=========     Host Frame:./t1074 [0x315d3]
=========     Host Frame:./t1074 [0x28f5]
=========     Host Frame:./t1074 [0x2623]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1074 [0x271d]

这表明代码中的一个主要问题是你没有正确地索引 __shared__ 内存数组,并且指出了发生该问题的具体代码行。很整齐!(在我这里是第 39 行,在你那里可能是另一行。)如果你深入查看那一行,你会想研究这段代码:

  #define NumThread 128
  ...
  __shared__ int sdata[NumThread];
  ...
  if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64]; // 在我这里是第 39 行
    __syncthreads();
  }

您已定义 NumThread 在128处,并且静态地分配了许多 int 数量的共享存储器阵列。一切都很好。 if语句中的代码怎么办?那个if条件将被满足,这意味着块中的所有128个线程将执行该if语句的主体。但是,您正在从共享内存中读取 sdata [tid + 64] ,对于 tid 大于63的线程(即每个块中的一半线程),这将产生大于127的共享内存的索引(这是超出界限的,即非法的)。



修复(对于您显示的特定代码)很简单,只需添加另一个if-test:

  if (NumThread >= 128){
    if (tid < 64)
      if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }

如果您修改代码,并重新运行 cuda-memcheck test,你会看到所有运行时报告的错误都消失了。 Yay!



但是代码仍然没有产生正确的答案。您在此处发生了另一个错误:

  for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];

如果你想查找最小值,并仔细考虑这个逻辑,你会意识到应该这样写:

  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
                                 ^
                                 |
                              大于

通过这两处更改,你的代码为我生成了正确的结果:

  $ cat t1074.cu 
#include <stdio.h>
#include <stdlib.h>

#include <limits.h>

#define NumThread 128
#define NumBlock 32

// Fixed kernel: identical to the question's version except the
// NumThread >= 128 stage now has an "if (tid < 64)" guard, which removes
// the out-of-bounds shared-memory reads reported by cuda-memcheck.
__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  while (i < n){
    if(In[i] < myMin)
      myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();

  if (NumThread >= 1024){
    if (tid < 512)
      if(sdata[tid] > sdata[tid + 512]) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if(sdata[tid] > sdata[tid + 256]) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] != 0) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if (tid < 64)
      if(sdata[tid] > sdata[tid + 64]) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  //the following practice is deprecated
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32]) smem[tid] = smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4]) smem[tid] = smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2]) smem[tid] = smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1]) smem[tid] = smem[tid+1];
  }
  if (tid == 0)
    if(sdata[0] < sdata[1]) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}

// Fixed host driver: plants a known minimum (a[10] = 5), and the final
// host-side combine now uses ">" so it keeps the smallest block result.
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));
  for(i=0;i<length;i++) a[i] = (i + 10);
  a[10]=5;
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  min = out[0];
  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
  printf("min = %d\n", min);
  return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$

请注意,你已经在 1024 线程的情况下有了 if 检查;你可能还想给 512 和 256 线程的情况添加相应的 if 检查,就像我在上面为 128 线程的情况所添加的那样。

I wrote a code to find the minimum by reduction. However the result is always zero. I don't know what is the problem. Please help me.

Here it is the kernel code: I modified the sum reduction code from the Nvidia.

#include <limits.h>

#define NumThread 128
#define NumBlock 32

// Block-level minimum reduction (the question's original, buggy version).
// Expected launch: <<<NumBlock, NumThread>>>; each thread strides through
// In[0..n) accumulating a private minimum, the block then reduces the
// per-thread minima in shared memory and writes one result per block.
__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  // Total number of threads across the whole grid (stride for the loop below).
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  // Grid-stride loop: each thread folds every gridSize-th element into myMin.
  while (i < n){
    if(In[i] < myMin)
    myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();

  // Shared-memory tree reduction. Only the 1024 stage has the "tid < half"
  // guard; the stages below are missing it (see notes).
  if (NumThread >= 1024){
    if (tid < 512)
    if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    // NOTE(review): missing "if (tid < 256)" guard -- out-of-bounds reads
    // whenever this stage is active.
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    // NOTE(review): missing guard, and the "!= 0" test is wrong for a min
    // reduction (zero is a legitimate data value).
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] =  sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    // BUG (diagnosed in the answer below): with NumThread == 128 all 128
    // threads run this line, so tid > 63 reads sdata[128..191] out of bounds.
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64];
    __syncthreads();
  }
  //the following practice is deprecated
   if (tid < 32){
    // Warp-synchronous final stage through a volatile pointer; relies on
    // implicit warp lockstep (deprecated on newer architectures).
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] =  smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] =  smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] =  smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] =  smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] =  smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1] )      smem[tid] =  smem[tid+1];
  }
  // After the warp stage sdata[0] already holds min(sdata[0], sdata[1]),
  // so this final compare is redundant but harmless.
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];      
}

And here it is my main code:

#include <stdio.h>
#include <stdlib.h>

#include "min_reduction.cu"

// Host driver (the question's original version). Fills a[i] = i + 10 so the
// true minimum is 10, runs the GPU reduction, then combines the per-block
// results on the host. No CUDA error checking is performed.
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));
  // a[i] = i + 10, so the expected minimum is a[0] == 10.
  for(i=0;i<length;i++) a[i] = (i + 10);

  // NOTE(review): none of the CUDA calls below are checked for errors,
  // so a failed launch would go unnoticed (see the answer's advice).
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  min = out[0];
  // BUG (diagnosed in the answer): "<" keeps the LARGEST block result;
  // a minimum search needs "min > out[i]".
  for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];
  return 0;
}

解决方案

I'm not sure I agree with everything that @HubertApplebaum said, but I can agree with the suggestion to use proper cuda error checking. And as you mention in the code, warp synchronous programming can be considered to be deprecated but I cannot support the claim that it is broken (yet). However I don't wish to argue about that; it's not central to your question here.

Another useful debugging suggestion would be to follow the steps here to compile your code with -lineinfo and run your code with cuda-memcheck. If you did that, you would see many reports like this:

========= Invalid __shared__ read of size 4
=========     at 0x000001e0 in /home/bob/misc/t1074.cu:39:min_reduce(int*, int*, int)
=========     by thread (64,0,0) in block (24,0,0)
=========     Address 0x00000200 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
=========     Host Frame:./t1074 [0x16dc1]
=========     Host Frame:./t1074 [0x315d3]
=========     Host Frame:./t1074 [0x28f5]
=========     Host Frame:./t1074 [0x2623]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1074 [0x271d]

which would indicate both that a primary problem in your code is that you are incorrectly indexing into your __shared__ memory array as well as the specific line of code where that is taking place. Neat! (It's line 39 in my case, but it would be a different line probably in your case). If you then drill into that line, you will want to study this section of code:

  #define NumThread 128
  ...
  __shared__ int sdata[NumThread];
  ...
  if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64]; //line 39 in my case
    __syncthreads();
  }

You have defined NumThread at 128, and have statically allocated a shared memory array of that many int quantities. All well and good. What about the code in the if-statement? That if-condition will be satisfied, which means that all 128 threads in the block will execute the body of that if-statement. However, you are reading sdata[tid + 64] from shared memory, and for threads whose tid is greater than 63 (i.e. half of the threads in each block), this will generate an index into shared memory of greater than 127 (which is out-of-bounds, i.e. illegal).

The fix (for this specific code that you have shown) is fairly simple, just add another if-test:

  if (NumThread >= 128){
    if (tid < 64)
      if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64];
    __syncthreads();
  }

If you make that modification to your code, and rerun the cuda-memcheck test, you'll see that all the runtime-reported errors are gone. Yay!

But the code still doesn't produce the right answer yet. You've made another error here:

  for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];

If you want to find the minimum value, and think about that logic carefully, you'll realize you should have done this:

  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
                                 ^
                                 |
                              greater than

With those two changes, your code produces the correct result for me:

$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>


#include <limits.h>

#define NumThread 128
#define NumBlock 32

// Block-level minimum reduction.
// Launch config assumed: <<<NumBlock, NumThread>>>.
// Each thread strides through In[0..n) accumulating a private minimum,
// the block reduces the per-thread minima in shared memory, and thread 0
// writes the block result to Out[blockIdx.x].
__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  // Total threads in the grid (equals gridDim.x * blockDim.x for this launch).
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  // Grid-stride accumulation of a per-thread minimum.
  while (i < n){
    if(In[i] < myMin)
      myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();

  // Tree reduction in shared memory. Every stage is guarded by a tid test so
  // only the lower half reads sdata[tid + offset]; without the guard the
  // upper threads would read past the end of sdata (the original bug, and
  // the guards the answer recommends adding for the 512/256 stages).
  // The bogus "&& sdata[tid+128] != 0" test was also dropped: it has no
  // effect at NumThread == 128 (dead branch) and would be wrong otherwise,
  // since zero is a legitimate minimum.
  if (NumThread >= 1024){
    if (tid < 512)
      if(sdata[tid] > sdata[tid + 512]) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if (tid < 256)
      if(sdata[tid] > sdata[tid + 256]) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if (tid < 128)
      if(sdata[tid] > sdata[tid + 128]) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if (tid < 64)
      if(sdata[tid] > sdata[tid + 64]) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  // Final warp: warp-synchronous reduction through a volatile pointer.
  // This practice is deprecated -- it assumes implicit warp lockstep, which
  // no longer holds under independent thread scheduling (Volta and newer);
  // a modern rewrite would use __syncwarp()/__shfl_down_sync() instead.
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32]) smem[tid] = smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4]) smem[tid] = smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2]) smem[tid] = smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1]) smem[tid] = smem[tid+1];
  }
  // The tid+1 step above already folded sdata[1] into sdata[0], so sdata[0]
  // holds the block minimum; the original's extra compare was redundant.
  if (tid == 0)
    Out[blockIdx.x] = sdata[0];
}

// Host driver: builds a test array whose true minimum is 5 (a[10] = 5),
// runs the block-level reduction on the GPU, then reduces the per-block
// results on the host and prints the answer.
// Returns 0 on success, 1 if the kernel launch/execution failed.
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));
  for(i=0;i<length;i++) a[i] = (i + 10);  // values 10..length+9
  a[10]=5;                                // plant a known minimum
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  // Surface any launch/execution error instead of silently printing garbage
  // (the answer stresses proper CUDA error checking).
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess){
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  min = out[0];
  // ">" keeps the smaller value -- this is the host-side min of the blocks.
  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
  printf("min = %d\n", min);

  // Release device and host allocations (the original leaked all four).
  cudaFree(gpuA);
  cudaFree(gpuOut);
  free(a);
  free(out);
  return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$

Note that you already have the if-check in the 1024 threads case, you may want to add an appropriate if-check to the 512 and 256 threads case, just as I have added it for the 128 threads case above.

这篇关于最小减少cuda不工作的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆