CUDA中的原子操作失败 [英] Atomic Operation failed in CUDA

查看：111 发布时间：2021/4/27 20:08:24 cuda

本文介绍了CUDA中的原子操作失败的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

由于计算能力为2.1，所以 atomicAdd 和 atomicMax 操作不支持双精度，因此我根据堆栈溢出的一些答案定义了这两个函数.

As the compute ability is 2.1, the atomicAdd and atomicMax operations do not support double precision, then I define both functions based on some answers on stack overflow.

奇怪的是 atomicAdd 函数运行良好，但是 atomicMax 不起作用，这是我的代码.

It is strange that the atomicAdd function works well but the atomicMax doesn't work, here is my code.

我的代码的测试是在每个块上生成一个随机数，然后对每个块上的随机数求和，我们有块求和，我想测试 atomicAdd 和 atomicMax的总和.

The test of my code is to generate random number on each block, and then sum the random numbers on each block, we have block sum, I want to test the atomicAdd and atomicMax on the block sum.

#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>


#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4


__device__ double gsum[num_of_blocks];

__device__ double dev_sum;

__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomiMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx    = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx    = threadIdx.x;

    // creat shared memory to store seeds
    __shared__ curandState localState[tot_threads];

    __shared__ double srandnum[threads_per_block];

    // copy global seed to local
    localState[lidx]    = globalState[gidx];

    //synchronize the local threads writing to the local memory cache
    __syncthreads();

    // generate random number from normal distribution in shared memory
    srandnum[lidx]  = curand_normal(&localState[lidx]);
    __syncthreads();

    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];}   // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}      // copy the sums back to global memory

    __threadfence();

    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }

    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }

    if( gidx == 0){
        printf("Sum is: %lf\n", dev_sum);
    }

    if( gidx == 1){
        printf("Max is: %lf\n", dev_max);
    }
}


int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);

    // launch kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);
    double randnum[num_of_blocks];

    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);

    std::cout << "Sum of each block:\n";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }

    cudaFree(globalState);
    return 0;
}

我得到的结果是

Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303

从结果中，我知道 atomicAdd 函数有效，但是 atomicMax 函数不起作用，对此我一无所知.预先感谢.

From the result, I know that the atomicAdd function works but the atomicMax function doesn't work, I have no idea of this. Thanks beforehand.

CUDA中的原子操作失败 [英] Atomic Operation failed in CUDA

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

CUDA中的原子操作失败 [英] Atomic Operation failed in CUDA

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭