cudaMemcpyFromSymbol on一个__device__变量 [英] cudaMemcpyFromSymbol on a __device__ variable
问题描述
我试图在 __device__
变量上应用内核函数,根据规范,它驻留在"全局内存"中
#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>
double X [10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y [10] = {0};
__device__ double DEV_X [10];
int main(void){
checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X, 10 * sizeof(double)));
vector_projection<double><<<1,10>>>(DEV_X, 10);
getLastCudaError("oops");
checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10 * sizeof(double)));
return 0;
}
内核函数 vector_projection
在 my_helper.cuh
中定义如下:
template< typename T> __global__ void vector_projection(T * dx,int n){
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< n){
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
正如你所看到的,我使用 cudaMemcpyToSymbol
和 cudaMemcpyFromSymbol
来传输数据到设备或从设备传输数据。但是,我收到以下错误:
CUDA错误在../src/vectorAdd.cu:19 code = 4 (cudaErrorLaunchFailure)
cudaMemcpyFromSymbol(Y,DEV_X,10 * sizeof(double))
Footnote:我当然可以避免使用 __ device __
变量,然后去处理喜欢这工作正常;我只想看看如何做同样的事情(如果可能的话)与 __ device __
变量。
更新: cuda-memcheck
的输出可在 http://pastebin.com/AW9vmjFs 。我得到的错误消息如下:
=========无效__global__读取大小8
========= at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection< double>(double *,int)
= ========通过线程(9,0,0)在块(0,0,0)
=========地址0x000370e8超出范围
问题的根源是你不允许在普通主机代码中使用设备变量的地址:
vector_projection<double><<<1,10>>>(DEV_X, 10);
^
虽然这看起来编译正确,但传递的实际地址是垃圾。
要获取主机代码中设备变量的地址,我们可以使用 cudaGetSymbolAddress
为我编译并正确运行的示例:
$ cat t577.cu
#include< stdio.h>
double X [10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y [10] = {0};
__device__ double DEV_X [10];
template< typename T> __global__ void vector_projection(T * dx,int n){
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< n){
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
int main(void){
cudaMemcpyToSymbol(DEV_X,X,10 * sizeof(double));
double * my_dx;
cudaGetSymbolAddress((void **)& my_dx,DEV_X);
vector_projection<double><<<1,10>>>(my_dx, 10);
cudaMemcpyFromSymbol(Y,DEV_X,10 * sizeof(double));
for (int i = 0; i < 10; i++)
printf("%d: %f\n", i, Y[i]);
return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0:1.000000
1:0.000000
2:3.000000
3:0.000000
4:5.000000
5:0.000000
6:7.000000
7:0.000000
8:9.000000
9:0.000000
=========错误摘要:0错误
$
这不是解决这个问题的唯一方法。在设备代码中获取设备变量的地址是合法的,因此您可以使用以下行来修改内核:
T * dx = DEV_X;
并放弃传递设备变量作为内核参数。根据评论中的建议,您还可以修改代码以使用统一内存。
关于错误检查,如果偏离正确的cuda错误检查, ,结果可能会令人困惑。大多数cuda API调用,除了由自己的行为引起的错误之外,还会返回一些由CUDA异步活动(通常是内核调用)产生的错误。
I am trying to apply a kernel function on a __device__
variable, which, according to the specs, resides "in global memory"
#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>
// Host-side input and output buffers.
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
// Device symbol residing in global memory; its address is NOT valid in host code.
__device__ double DEV_X[10];
int main(void) {
// Correct: copy host data into the device symbol by name.
checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double)));
// BUG (the subject of this question): passing DEV_X directly from host code
// hands the kernel a garbage address — see the answer below; the fix is
// cudaGetSymbolAddress (or taking the symbol's address in device code).
vector_projection<double><<<1,10>>>(DEV_X, 10);
getLastCudaError("oops");
// Reported failure point: cudaErrorLaunchFailure surfaces here because the
// kernel performed an out-of-bounds global read (per the cuda-memcheck output).
checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
return 0;
}
The kernel function vector_projection
is defined in my_helper.cuh
as follows:
// Clamp negative elements of dx[0..n) to zero, one element per thread.
// Expects a 1D launch with at least n total threads; the tid guard handles
// any excess threads in the last block.
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
// Flat global thread index across the 1D grid.
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
As you can see, I use cudaMemcpyToSymbol
and cudaMemcpyFromSymbol
to transfer data to and from the device. However, I'm getting the following error:
CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure)
"cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))"
Footnote: I can of course avoid to use __device__
variables and go for something like this which works fine; I just want to see how to do the same thing (if possible) with __device__
variables.
Update: The output of cuda-memcheck
can be found at http://pastebin.com/AW9vmjFs. The error messages I get are as follows:
========= Invalid __global__ read of size 8
========= at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
========= by thread (9,0,0) in block (0,0,0)
========= Address 0x000370e8 is out of bounds
The root of the problem is that you are not allowed to take the address of a device variable in ordinary host code:
vector_projection<double><<<1,10>>>(DEV_X, 10);
^
Although this seems to compile correctly, the actual address passed is garbage.
To take the address of a device variable in host code, we can use cudaGetSymbolAddress
Here is a worked example that compiles and runs correctly for me:
$ cat t577.cu
#include <stdio.h>
// Host-side input and output buffers.
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
// Device symbol residing in global memory.
__device__ double DEV_X[10];
// Clamp negative elements of dx[0..n) to zero, one element per thread
// (1D launch; the tid guard handles excess threads).
template<typename T> __global__ void vector_projection(T *dx, int n) {
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < n) {
if (dx[tid] < 0)
dx[tid] = (T) 0;
}
}
int main(void) {
cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double));
double *my_dx;
// Key fix: obtain a host-usable pointer to the device symbol instead of
// passing the symbol itself to the kernel launch.
cudaGetSymbolAddress((void **)&my_dx, DEV_X);
vector_projection<double><<<1,10>>>(my_dx, 10);
cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double));
for (int i = 0; i < 10; i++)
printf("%d: %f\n", i, Y[i]);
return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0: 1.000000
1: 0.000000
2: 3.000000
3: 0.000000
4: 5.000000
5: 0.000000
6: 7.000000
7: 0.000000
8: 9.000000
9: 0.000000
========= ERROR SUMMARY: 0 errors
$
This is not the only way to address this. It is legal to take the address of a device variable in device code, so you could modify your kernel with a line something like this:
T *dx = DEV_X;
and forgo passing of the device variable as a kernel parameter. As suggested in the comments, you could also modify your code to use Unified Memory.
Regarding error checking, if you deviate from proper cuda error checking and are not careful in your deviations, the results may be confusing. Most cuda API calls can, in addition to errors arising from their own behavior, return an error that resulted from some previous CUDA asynchronous activity (usually kernel calls).
这篇关于cudaMemcpyFromSymbol on一个__device__变量的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!