CUDA Maximum Reduction Algorithm Not Working

Problem Description

A previous question asked how to efficiently find the maximum value of an array in CUDA: Finding max value in CUDA. The top response provided a link to an NVIDIA presentation on optimizing reduction kernels.

If you are using Visual Studio, simply remove the sys/time.h header reference and everything between the CPU EXECUTION markers.

I set up a variant which finds the max, but it doesn't match what the CPU finds:

// Returns the maximum value of
// an array of size n
float GetMax(float *maxes, int n)
{
    int i = 0;
    float max = -100000;
    for(i = 0; i < n; i++)
    {
        if(maxes[i] > max)
            max = maxes[i];
    }

    return max;
}

// Too obvious...
__device__ float MaxOf2(float a, float b)
{
    if(a > b)   return a;
    else        return b;
}


__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(BLOCKSIZE*2) + tid;
    unsigned int gridSize = BLOCKSIZE*2*gridDim.x;

    sdata[tid] = 0;

    //MMX(index,i)
    //MMX(index,i+blockSize)
    // Final Optimized Kernel
    while (i < n) {
        sdata[tid] = MaxOf2(g_idata[i], g_idata[i+BLOCKSIZE]);
        i += gridSize;
    }
    __syncthreads();

    if (BLOCKSIZE >= 512) { if (tid < 256) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if (BLOCKSIZE >= 256) { if (tid < 128) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if (BLOCKSIZE >= 128) { if (tid < 64) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 64]); } __syncthreads(); }

    if (tid < 32) {
        if (BLOCKSIZE >= 64) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 32]);
        if (BLOCKSIZE >= 32) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
        if (BLOCKSIZE >= 16 ) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
        if (BLOCKSIZE >= 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
        if (BLOCKSIZE >= 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
        if (BLOCKSIZE >= 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 1]);
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

I have a giant setup to test this algorithm:

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include "book.h"

#define ARRAYSIZE 16384
#define GRIDSIZE 60
#define BLOCKSIZE 32
#define SIZEFLOAT 4

using namespace std;

// Function definitions
float GetMax(float *maxes, int n);
__device__ float MaxOf2(float a, float b);
__global__ void MaxReduction(int n, float *g_idata, float *g_odata);

// Returns random floating point number
float RandomReal(float low, float high)
{
    float d;

    d = (float) rand() / ((float) RAND_MAX + 1);
    return (low + d * (high - low));
}

int main()
{
    /*****************VARIABLE SETUP*****************/
    // Pointer to CPU numbers
    float *numbers;
    // Pointer to GPU numbers
    float *dev_numbers;
    // Counter
    int i = 0;

    // Randomize
    srand(time(0));

    // Timers
    // Kernel timers
    cudaEvent_t start_kernel, stop_kernel;
    float elapsedTime_kernel;
    HANDLE_ERROR(cudaEventCreate(&start_kernel));
    HANDLE_ERROR(cudaEventCreate(&stop_kernel));
    // cudaMalloc timers
    cudaEvent_t start_malloc, stop_malloc;
    float elapsedTime_malloc;
    HANDLE_ERROR(cudaEventCreate(&start_malloc));
    HANDLE_ERROR(cudaEventCreate(&stop_malloc));
    // CPU timers
    struct timeval start, stop;
    float elapsedTime = 0;
    /*****************VARIABLE SETUP*****************/


    /*****************CPU ARRAY SETUP*****************/
    // Setup CPU array
    HANDLE_ERROR(cudaHostAlloc((void**)&numbers, ARRAYSIZE * sizeof(float), cudaHostAllocDefault));
    for(i = 0; i < ARRAYSIZE; i++)
        numbers[i] = RandomReal(0, 50000.0);
    /*****************CPU ARRAY SETUP*****************/


    /*****************GPU ARRAY SETUP*****************/
    // Start recording cuda malloc time
    HANDLE_ERROR(cudaEventRecord(start_malloc,0));

    // Allocate memory to GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_numbers, ARRAYSIZE * sizeof(float)));
    // Transfer CPU array to GPU
    HANDLE_ERROR(cudaMemcpy(dev_numbers, numbers, ARRAYSIZE*sizeof(float), cudaMemcpyHostToDevice));
    // An array to temporarily store maximum values on the GPU
    float *dev_max;
    HANDLE_ERROR(cudaMalloc((void**)&dev_max, GRIDSIZE * sizeof(float)));
    // An array to hold the per-block maxes grabbed from the GPU
    float maxes[GRIDSIZE];
    /*****************GPU ARRAY SETUP*****************/

    /*****************KERNEL EXECUTION*****************/
    // Start recording kernel execution time
    HANDLE_ERROR(cudaEventRecord(start_kernel,0));
    // Run kernel
    MaxReduction<<<GRIDSIZE, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>> (ARRAYSIZE, dev_numbers, dev_max);
    // Transfer maxes over
    HANDLE_ERROR(cudaMemcpy(maxes, dev_max, GRIDSIZE * sizeof(float), cudaMemcpyDeviceToHost));
    // Print out the max
    cout << GetMax(maxes, GRIDSIZE) << endl;

    // Stop recording kernel execution time
    HANDLE_ERROR(cudaEventRecord(stop_kernel,0));
    HANDLE_ERROR(cudaEventSynchronize(stop_kernel));
    // Retrieve recording data
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_kernel, start_kernel, stop_kernel));
    // Stop recording cuda malloc time
    HANDLE_ERROR(cudaEventRecord(stop_malloc,0));
    HANDLE_ERROR(cudaEventSynchronize(stop_malloc));
    // Retrieve recording data
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_malloc, start_malloc, stop_malloc));
    // Print results
    printf("%5.3f\t%5.3f\n", elapsedTime_kernel,  elapsedTime_malloc);
    /*****************KERNEL EXECUTION*****************/


    /*****************CPU EXECUTION*****************/
    // Capture the start time
    gettimeofday(&start, NULL);
    // Find the maximum on the CPU
    cout << GetMax(numbers, ARRAYSIZE) << endl;
    // Capture the stop time
    gettimeofday(&stop, NULL);
    // Retrieve time elapsed in milliseconds
    long int elapsed_sec = stop.tv_sec - start.tv_sec;
    long int elapsed_usec = stop.tv_usec - start.tv_usec;
    elapsedTime = (float)(1000.0f * elapsed_sec) + (float)(0.001f * elapsed_usec);
    // Print results
    printf("%5.3f\n", elapsedTime);
    /*****************CPU EXECUTION*****************/

    // Free memory
    cudaFreeHost(numbers);
    cudaFree(dev_numbers);
    cudaFree(dev_max);
    cudaEventDestroy(start_kernel);
    cudaEventDestroy(stop_kernel);
    cudaEventDestroy(start_malloc);
    cudaEventDestroy(stop_malloc);

    // Exit program
    return 0;
}

I ran cuda-memcheck on this test program, with the -g and -G switches on, and it reports 0 problems. Can anyone spot the issue?
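
For reference, that check amounts to something like the following (the file name maxreduce.cu is just a placeholder):

nvcc -g -G -o maxreduce maxreduce.cu
cuda-memcheck ./maxreduce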

NOTE: Be sure to have book.h from the CUDA by Example book in your current directory when you compile the program. Source link here: http://developer.nvidia.com/cuda-example-introduction-general-purpose-gpu-programming Download the source code, and book.h will be under the common directory/folder.
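
If you don't have the book sources handy, a minimal stand-in for the HANDLE_ERROR macro used in the program above could look roughly like this (a sketch of the usual check-and-abort pattern, not the book's exact code):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal stand-in for book.h's HANDLE_ERROR: print the CUDA error
// string with its source location and abort on any failure.
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))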

Answer

Your kernel looks broken to me. Each pass through your while loop overwrites sdata[tid] instead of combining it with the previous value, so only the last pair of elements a thread visits survives. The thread-local search (before the shared memory reduction) should look something like this:

sdata[tid] = g_idata[i];
i += gridSize;

while (i < n) {
    sdata[tid] = MaxOf2(sdata[tid], g_idata[i]);
    i += gridSize;
}

shouldn't it?

Also note that if you run this on Fermi, the shared memory buffer should be declared volatile, and you will get a noticeable improvement in performance if the thread-local search is done with a register variable rather than in shared memory. There is about an 8x difference in effective bandwidth between the two.

EDIT: Here is a simplified, working version of your reduction kernel. You should note a number of differences compared to your original:

__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
    extern __shared__ volatile float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(BLOCKSIZE) + tid;
    unsigned int gridSize = BLOCKSIZE*gridDim.x;

    float val = g_idata[i];
    i += gridSize;
    while (i < n) {
        val = MaxOf2(g_idata[i],val);
        i += gridSize;
    }
    sdata[tid] = val;
    __syncthreads();

    // This version uses a single warp for the shared memory
    // reduction
#pragma unroll
    for(int i=(tid+32); ((tid<32)&&(i<BLOCKSIZE)); i+=32)
        sdata[tid] = MaxOf2(sdata[tid], sdata[i]);

    if (tid < 16) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
    if (tid < 8)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
    if (tid < 4)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
    if (tid < 2)  sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
    if (tid == 0) g_odata[blockIdx.x] = MaxOf2(sdata[tid], sdata[tid + 1]);
}

This code should also be safe on Fermi. You should also familiarise yourself with the CUDA math library, because there is an fmax(x,y) intrinsic which you should use in place of your MaxOf2 function.
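
For single precision that is fmaxf; the thread-local search above would then read something like this (a sketch using the same variables as the kernel):

// Thread-local search using the fmaxf intrinsic in place of MaxOf2;
// fmaxf maps onto the hardware floating-point max and avoids the branch.
float val = g_idata[i];
i += gridSize;
while (i < n) {
    val = fmaxf(g_idata[i], val);
    i += gridSize;
}
sdata[tid] = val;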
