Reduce not giving consistent results

Question

I am trying to implement my own reduce sum for big 1D arrays. I came up with a reduce kernel and a way to call the kernel several times for step-by-step reduction to reach a single value. Now I know that this is not the optimal way of computing this (if you see my code it may get to a point where a kernel call needs to be made to add 3 values), but let's set that aside for a moment and try to work with this.

Basically, in short: each call to the reduce kernel reduces by a factor of MAXTHREADS, in this case 1024, so the size of the array is divided by 1024 on each pass. When the size is smaller than 1024 it seems to work properly, but for bigger values it miserably fails to get the right sum.

This happens with all of the array sizes I tried. What am I missing?

I will also gladly accept comments about the quality of the code, but I am mainly interested in fixing it.

Below I have posted the whole kernel and a snippet of the kernel call. If I have missed some relevant part of the kernel call, please comment and I will fix it. The original code has error checks; all kernels always run and never return CUDA errors.
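
The error checks are not shown in the listing below; purely for reference, a minimal sketch of the kind of wrapper typically used for them (an assumption about what those checks look like, not the original code):

#include <cstdio>
#include <cstdlib>

// Wraps any CUDA runtime call and aborts with a readable message on failure.
#define cudaCheck(call) do {                                        \
    cudaError_t _err = (call);                                      \
    if (_err != cudaSuccess) {                                      \
        fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                cudaGetErrorString(_err), __FILE__, __LINE__);      \
        exit(EXIT_FAILURE);                                         \
    }                                                               \
} while (0)

// Usage:           cudaCheck(cudaMalloc(&d_image, total_pixels*sizeof(float)));
// After a launch:  cudaCheck(cudaGetLastError()); cudaCheck(cudaDeviceSynchronize());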

__global__ void  reduce6(float *g_idata, float *g_odata, unsigned int n){
    extern __shared__ float sdata[];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(MAXTREADS*2) + tid;
    unsigned int gridSize = MAXTREADS*2*gridDim.x;
    //blockSize==MAXTHREADS
    sdata[tid] = 0;
    float mySum = 0;   
    if (tid>=n) return; 
    if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
        while (i < n) { 
            sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS]; 
            i += gridSize; 
        }
    }else{
        sdata[tid] += g_idata[i];
    }
   __syncthreads();


    // do reduction in shared mem
   if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();

    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
     __syncthreads();

    if (tid <  64)
       sdata[tid]  += sdata[tid +  64];
    __syncthreads();


#if (__CUDA_ARCH__ >= 300 )
    if ( tid < 32 )
    {
        // Fetch final intermediate sum from 2nd warp
        mySum = sdata[tid]+ sdata[tid + 32];
        // Reduce final warp using shuffle
        for (int offset = warpSize/2; offset > 0; offset /= 2) 
            mySum += __shfl_down(mySum, offset);
    }
    sdata[0]=mySum;
#else

    // fully unroll reduction within a single warp
    if (tid < 32) {
        warpReduce(sdata,tid);
    }
#endif
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}



__device__ void warpReduce(volatile float *sdata,unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}



Call the kernel for an arbitrary size of total_pixels:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>    
#define MAXTREADS 1024
__global__ void initKernel(float * devPtr, const float val, const size_t nwords)
{
    //https://stackoverflow.com/questions/10589925/initialize-device-array-in-cuda
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (; tidx < nwords; tidx += stride)
        devPtr[tidx] = val;
} 


int main()
{

size_t total_pixels = 5000;
unsigned long int n = (unsigned long int)total_pixels;
float* d_image, *d_aux;
float sum;
cudaMalloc(&d_image, total_pixels*sizeof(float));
cudaMalloc(&d_aux, sizeof(float)*(n + MAXTREADS - 1) / MAXTREADS);


for (int i = 0; i < 10; i++){
    sum = 0;
    cudaMemset(&d_image, 1, total_pixels*sizeof(float));

    int dimblockRed = MAXTREADS;
    int dimgridRed = (total_pixels + MAXTREADS - 1) / MAXTREADS;
    int reduceCont = 0;
    initKernel << < dimgridRed, dimblockRed >> >(d_image, 1.0, total_pixels);

    while (dimgridRed > 1) {
        if (reduceCont % 2 == 0){
            reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
        }
        else{
            reduce6 << <dimgridRed, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
        }
        n = dimgridRed;
        dimgridRed = (n + MAXTREADS - 1) / MAXTREADS;
        reduceCont++;

    }
    if (reduceCont % 2 == 0){
        reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_image, d_aux, n);
        cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);

    }
    else{
        reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);
        cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
    }
    printf("%f ", sum);
}
cudaDeviceReset();
return 0;
}


Answer

There is a lot broken in both your host and device code here, and I am not going to attempt to go through all of the problems, but I can at least see:

extern __shared__ float sdata[]; // must be declared volatile for the warp reduct to work reliably

This accumulation code is broken in a lot of ways, but at least:

if (tid>=n) return; // undefined behaviour with __syncthreads() below
if ( (gridDim.x>0) & ((i+MAXTREADS)<n)){
    while (i < n) { 
        sdata[tid] += g_idata[i] + g_idata[i+MAXTREADS]; // out of bounds if i > n - MAXTREADS
        i += gridSize; 
    }
}else{
    sdata[tid] += g_idata[i]; // out of bounds if i > n
}
__syncthreads(); // potential deadlock 
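
A minimal sketch of one way to avoid both problems: keep every thread alive so they all reach the barrier, and accumulate with a bounds-checked grid-stride loop instead of the paired load (using i = blockIdx.x*blockDim.x + tid and gridSize = blockDim.x*gridDim.x, as in the corrected kernel further down):

float mySum = 0.0f;
for (unsigned int j = i; j < n; j += gridSize)
    mySum += g_idata[j];      // every load is guarded by the loop condition
sdata[tid] = mySum;           // threads past the end of the data simply contribute 0
__syncthreads();              // reached by all threads of the block, so no deadlock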

The shuffle based reduction is not correct either:

if ( tid < 32 )
{
    // Fetch final intermediate sum from 2nd warp
    mySum = sdata[tid]+ sdata[tid + 32];
    // Reduce final warp using shuffle
    for (int offset = warpSize/2; offset > 0; offset /= 2) 
        mySum += __shfl_down(mySum, offset);
}
sdata[0]=mySum; // must be conditionally executed only by thread 0, otherwise a memory race
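
A minimal fix for that race is to let only thread 0, which holds the complete warp sum after the shuffles, write the result out (the corrected kernel below takes the same approach):

if (tid == 0)
    g_odata[blockIdx.x] = mySum;   // only lane 0 of the first warp has the full sum after __shfl_down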

Your host code for calling the reduction kernels is a complete mystery. The reduction kernel only needs to be called at most twice, so the loop is redundant. Then, in the final phase of the reduction, you call the kernel like this:

reduce6 << <1, dimblockRed, MAXTREADS*sizeof(float) >> >(d_aux, d_image, n);

but d_aux only has at most dimgridRed entries, so that is a memory overflow as well.

I think what you really want is something like this:

#include <cstdio>    
#include <assert.h>
#define MAXTHREADS 1024    


__device__ void warpReduce(volatile float *sdata,unsigned int tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}


__global__ void mymemset(float* image, const float val, unsigned int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N) {
        image[tid] = val;
        tid += gridDim.x * blockDim.x;
    }
}

__global__ void  reduce6(float *g_idata, float *g_odata, unsigned int n){
    extern __shared__ volatile float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + tid;
    unsigned int gridSize = blockDim.x*gridDim.x;
    float mySum = 0;   
    while (i < n) { 
        mySum += g_idata[i]; 
        i += gridSize; 
    }
    sdata[tid] = mySum;
    __syncthreads();

   if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();

    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
     __syncthreads();

    if (tid <  64)
       sdata[tid]  += sdata[tid +  64];
    __syncthreads();


#if (__CUDA_ARCH__ >= 300)
    if ( tid < 32 )
    {
        mySum = sdata[tid] + sdata[tid + 32];
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            mySum += __shfl_down(mySum, offset);
        }
    }
#else
    if (tid < 32) {
        warpReduce(sdata,tid);
        mySum = sdata[0];
    }
#endif
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}


int main()
{
    size_t total_pixels = 8000;
    unsigned long int n = (unsigned long int)total_pixels;
    float* d_image, *d_aux;
    cudaMalloc(&d_image, total_pixels*sizeof(float));
    cudaMalloc(&d_aux, sizeof(float)*(n + MAXTHREADS - 1) / MAXTHREADS);

    for (int i = 0; i < 10000; i++){
        {
            dim3 bsz = dim3(1024);
            dim3 gsz = dim3(total_pixels / bsz.x + ((total_pixels % bsz.x > 0) ? 1: 0));
            mymemset<<<gsz, bsz>>>(d_image, 1.0f, total_pixels);
            cudaDeviceSynchronize();
        }

        int dimblockRed = MAXTHREADS;
        int dimgridRed = (n + MAXTHREADS - 1) / MAXTHREADS;
        float sum;
        reduce6<<<dimgridRed, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_image, d_aux, n);

        if (dimgridRed > 1) {
            reduce6<<<1, dimblockRed, MAXTHREADS*sizeof(float)>>>(d_aux, d_image, dimgridRed);
            cudaMemcpy(&sum, d_image, sizeof(float), cudaMemcpyDeviceToHost);
        } else {
            cudaMemcpy(&sum, d_aux, sizeof(float), cudaMemcpyDeviceToHost);
        }
        assert(sum == float(total_pixels));
    }
    cudaDeviceReset();
    return 0;
}

[for future reference, that is what an MCVE looks like]

But I am not going to waste any more time trying to decipher whatever the twisted logic in the accumulation phase of the kernel and the host code was supposed to be. There are other things that should be fixed (the grid size only needs to be as large as the maximum number of concurrent blocks that will fit on your device), but I leave that as an exercise to the reader.
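
As a starting point for that exercise, a sketch of how the launch grid could be capped to the number of blocks that can actually be resident on the device, using the runtime occupancy API (device 0 and the dynamic shared memory size used above are assumptions):

int device = 0, numSMs = 0, blocksPerSM = 0;
cudaGetDevice(&device);
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, reduce6,
                                              MAXTHREADS, MAXTHREADS*sizeof(float));
int maxResidentBlocks = blocksPerSM * numSMs;   // blocks that can run concurrently
int wantedBlocks = (int)((n + MAXTHREADS - 1) / MAXTHREADS);
int dimgridRed = wantedBlocks < maxResidentBlocks ? wantedBlocks : maxResidentBlocks;
// The grid-stride loop inside reduce6 then sweeps up any elements beyond the grid.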
