在一个 __device__ 变量上使用 cudaMemcpyFromSymbol [英] cudaMemcpyFromSymbol on a __device__ variable

查看:196
本文介绍了cudaMemcpyFromSymbol on一个__device__变量的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我试图在一个 __device__ 变量上应用内核函数；根据规范，该变量驻留在“全局内存”中。

#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>


double X [10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y [10] = {0};
__device__ double DEV_X [10];


int main(void) {
    checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X, 10*sizeof(double)));
    vector_projection<double><<<1,10>>>(DEV_X, 10);
    getLastCudaError("oops");
    checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
    return 0;
}

内核函数 vector_projection 在 my_helper.cuh 中定义如下：

template<typename T> __global__ void vector_projection(T *dx, int n) {
    int tid;
    tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        if (dx[tid] < 0)
            dx[tid] = (T) 0;
    }
}

正如你所看到的，我使用 cudaMemcpyToSymbol 和 cudaMemcpyFromSymbol 在主机与设备之间传输数据。但是，我收到以下错误：

CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure)
  "cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))"

Footnote：我当然可以避免使用 __device__ 变量，改用类似这样的、可以正常工作的写法；我只是想看看（如果可能的话）如何用 __device__ 变量做同样的事情。



更新: cuda-memcheck 的输出可在 http://pastebin.com/AW9vmjFs 。我得到的错误消息如下:

========= Invalid __global__ read of size 8
=========     at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
=========     by thread (9,0,0) in block (0,0,0)
=========     Address 0x000370e8 is out of bounds


解决方案

问题的根源在于：你不能在普通的主机（host）代码中获取设备（device）变量的地址：

vector_projection<double><<<1,10>>>(DEV_X, 10);
                                    ^

虽然这看起来编译正确，但传递的实际地址是垃圾。

要获取主机代码中设备变量的地址，我们可以使用 cudaGetSymbolAddress。



为我编译并正确运行的示例:

$ cat t577.cu
#include <stdio.h>

double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];

template<typename T> __global__ void vector_projection(T *dx, int n) {
    int tid;
    tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) {
        if (dx[tid] < 0)
            dx[tid] = (T) 0;
    }
}


int main(void) {
    cudaMemcpyToSymbol(DEV_X, X, 10*sizeof(double));
    double *my_dx;
    cudaGetSymbolAddress((void **)&my_dx, DEV_X);
    vector_projection<double><<<1,10>>>(my_dx, 10);
    cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double));
    for (int i = 0; i < 10; i++)
        printf("%d: %f\n", i, Y[i]);
    return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0:1.000000
1:0.000000
2:3.000000
3:0.000000
4:5.000000
5:0.000000
6:7.000000
7:0.000000
8:9.000000
9:0.000000
========= ERROR SUMMARY: 0 errors
$

这不是解决这个问题的唯一方法。在设备代码中获取设备变量的地址是合法的,因此您可以使用以下行来修改内核:

T *dx = DEV_X;

并放弃传递设备变量作为内核参数。根据评论中的建议,您还可以修改代码以使用统一内存



关于错误检查：如果你偏离了正确的 CUDA 错误检查方式，而且在偏离时又不够小心，结果可能会令人困惑。大多数 CUDA API 调用，除了由自身行为引起的错误之外，还会返回一些由之前的 CUDA 异步活动（通常是内核调用）产生的错误。


I am trying to apply a kernel function on a __device__ variable, which, according to the specs, resides "in global memory"

#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>


double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];


int main(void) {
    // Copy host array X into the __device__ symbol DEV_X (10 doubles).
    checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double)));
    // BUG (the subject of this Q&A): DEV_X is a __device__ symbol, so the
    // address host code passes here is not a valid device pointer — the
    // kernel then reads out of bounds (see the cuda-memcheck output below).
    vector_projection<double><<<1,10>>>(DEV_X, 10);
    getLastCudaError("oops");
    // The asynchronous launch failure surfaces at this next API call,
    // which is why the reported error points at cudaMemcpyFromSymbol.
    checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
    return 0;
}

The kernel function vector_projection is defined in my_helper.cuh as follows:

// Clamp negative elements of dx[0..n) to zero, one element per thread.
template<typename T> __global__ void vector_projection(T *dx, int n) {
    int tid;
    tid = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
    if (tid < n) {  // bounds guard: grid may be larger than n
        if (dx[tid] < 0)
            dx[tid] = (T) 0;
    }
}

As you can see, I use cudaMemcpyToSymbol and cudaMemcpyFromSymbol to transfer data to and from the device. However, I'm getting the following error:

CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure) 
  "cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))" 

Footnote: I can of course avoid to use __device__ variables and go for something like this which works fine; I just want to see how to do the same thing (if possible) with __device__ variables.

Update: The output of cuda-memcheck can be found at http://pastebin.com/AW9vmjFs. The error messages I get are as follows:

========= Invalid __global__ read of size 8
=========     at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
=========     by thread (9,0,0) in block (0,0,0)
=========     Address 0x000370e8 is out of bounds

解决方案

The root of the problem is that you are not allowed to take the address of a device variable in ordinary host code:

vector_projection<double><<<1,10>>>(DEV_X, 10);
                                    ^

Although this seems to compile correctly, the actual address passed is garbage.

To take the address of a device variable in host code, we can use cudaGetSymbolAddress

Here is a worked example that compiles and runs correctly for me:

$ cat t577.cu
#include <stdio.h>

double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];

// Projects the vector onto the non-negative orthant: each thread zeroes
// its element of dx[0..n) if that element is negative.
template<typename T> __global__ void vector_projection(T *dx, int n) {
    int tid;
    tid = threadIdx.x + blockIdx.x * blockDim.x;  // global thread id
    if (tid < n) {  // guard against threads past the end of the array
        if (dx[tid] < 0)
            dx[tid] = (T) 0;
    }
}



int main(void) {
    // Host X -> device symbol DEV_X.
    cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double));
    double *my_dx;
    // Key fix: resolve the __device__ symbol to a real device pointer
    // that is valid to pass from host code into a kernel launch.
    cudaGetSymbolAddress((void **)&my_dx, DEV_X);
    vector_projection<double><<<1,10>>>(my_dx, 10);
    // Copy the projected values back to host Y for printing.
    cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double));
    for (int i = 0; i < 10; i++)
      printf("%d: %f\n", i, Y[i]);
    return 0;
}
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0: 1.000000
1: 0.000000
2: 3.000000
3: 0.000000
4: 5.000000
5: 0.000000
6: 7.000000
7: 0.000000
8: 9.000000
9: 0.000000
========= ERROR SUMMARY: 0 errors
$

This is not the only way to address this. It is legal to take the address of a device variable in device code, so you could modify your kernel with a line something like this:

T *dx = DEV_X;

and forgo passing of the device variable as a kernel parameter. As suggested in the comments, you could also modify your code to use Unified Memory.

Regarding error checking, if you deviate from proper cuda error checking and are not careful in your deviations, the results may be confusing. Most cuda API calls can, in addition to errors arising from their own behavior, return an error that resulted from some previous CUDA asynchronous activity (usually kernel calls).

这篇关于cudaMemcpyFromSymbol on一个__device__变量的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆