从 C 调用 CUDA 导致错误 [英] call cuda from c results in errors
问题描述
我收到的错误是:
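‘blockIdx’ was not declared in this scope
expected primary-expression before ‘<’ token
expected primary-expression before ‘>’ token
expected primary-expression before ‘<’ token
expected primary-expression before ‘>’ token
(其中的 "<,>" 指的是内核调用 <<<>>>)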
此外,在 main 函数中我得到:
error: cannot convert ‘float*’ to ‘float’ for argument ‘1’ to ‘void kernel_wrapper(float*, float*, int, int)’
cu 文件:
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <math.h>
#include <curand_kernel.h>
#include <cuda_runtime.h>
#include <cuda.h>
.....
__global__ void kernel(float* A,float *B, curandState* globalState, int Asize,int Bsize)
{
...
void kernel_wrapper(float* A_host,float* B_host, int Asize ,int Bsize)
{
...
//分配主机内存
A_host=(float*)malloc(Asize*sizeof(float));
B_host=(float*)malloc(Bsize*sizeof(float));
//分配设备内存
float* A_dev,*B_dev;
cudaMalloc((void**) &A_dev,Asize* sizeof(float));
cudaMalloc((void**) &B_dev,Bsize* sizeof(float));
....
kernel<<<1,1>>>(A_host,B_host, devStates,Asize,Bsize);
...
C 文件:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <string.h>
#include <assert.h>
#include <stdarg.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "solve.cu"
extern void kernel_wrapper(float* A,float* B, int Asize ,int Bsize);
...
int main()
{...
A = (float*)malloc(N*N*sizeof(float));
B = (float*)malloc(N*HS*sizeof(float));
...
kernel_wrapper(A,B,Asize ,Bsize);
...
我的编译命令如下:
g++ -o mycode myfile.c -I/usr/local/cuda-5.5/include -L/usr/local/cuda-5.5/lib64 -lcurand -lcutil -lcudpp -lcuda -lstdc+
解决方案
您不能在 .c 文件中包含含有设备代码(例如内核)的 solve.cu,然后再用 g++ 正确地编译它。设备代码必须由 nvcc 编译。
相反,您需要分别编译这两个文件,然后将它们链接在一起。
我建议将 myfile.c 重命名为 myfile.cpp。
另外,请从 myfile.cpp 中删除这一行:
#include "solve.cu"
然后这样编译:
nvcc -c solve.cu
g++ -c -I/usr/local/cuda-5.5/include myfile.cpp
g++ -o mycode solve.o myfile.o -L/usr/local/cuda-5.5/lib64 -lcudart -lcurand -lcutil -lcudpp -lcuda
对于最后一个问题,您传递的是双重指针(**):
kernel_wrapper(&A,&B,Asize ,Bsize);
而原型期望的是单重指针(*):
extern void kernel_wrapper(float* A,float* B, int Asize ,int Bsize);
A 和 B 已经是 float * 类型,所以在我看来您应该直接传递它们:
kernel_wrapper(A,B,Asize ,Bsize);
编辑:回应下面的一个问题。
问题在于指针 A_host 和 B_host(kernel_wrapper 的参数)是按值传递给内核包装函数的。内核包装函数为这些指针分配了存储空间,但反映所分配存储的、新修改后的指针并没有(也不可能)传回调用函数(即调用 kernel_wrapper 的函数)。您可以在调用函数中为 A_host 和 B_host 分配存储空间,然后传递指针(这样就不需要在 kernel_wrapper 中对这些指针调用 malloc),或者您可以按如下方式修改内核包装函数:
void kernel_wrapper(float** A_host,float** B_host, int Asize ,int Bsize)
{
...
//分配主机内存
*A_host=(float*)malloc(Asize*sizeof(float));
*B_host=(float*)malloc(Bsize*sizeof(float));
//分配设备内存
float* A_dev,*B_dev;
cudaMalloc((void**) &A_dev,Asize* sizeof(float));
cudaMalloc((void**) &B_dev,Bsize* sizeof(float));
....
cudaMemcpy(A_dev, *A_host, Asize*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(B_dev, *B_host, Bsize*sizeof(float), cudaMemcpyHostToDevice);
kernel<<<1,1>>>(A_dev,B_dev, devStates,Asize,Bsize);
...
然后您还需要修改 .cpp 文件中的调用行:
int main()
{...
float *A, *B;
int Asize = N*N;
int Bsize = N*NHS;
...
kernel_wrapper(&A,&B,Asize ,Bsize);
...
按照您现在贴出的代码,您对 A 和 B 各执行了两次 malloc 操作,这是没有必要的。
The errors I am receiving are:
‘blockIdx’ was not declared in this scope
expected primary-expression before ‘<’ token
expected primary-expression before ‘>’ token
expected primary-expression before ‘<’ token
expected primary-expression before ‘>’ token
(the "<,>" refers to the kernel call <<<>>>)
Also,in main function I receive:
error: cannot convert ‘float*’ to ‘float’ for argument ‘1’ to ‘void kernel_wrapper(float*, float*, int, int)’
cu file:
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <math.h>
#include <curand_kernel.h>
#include <cuda_runtime.h>
#include <cuda.h>
.....
__global__ void kernel(float* A,float *B, curandState* globalState, int Asize,int Bsize)
{
...
void kernel_wrapper(float* A_host,float* B_host, int Asize ,int Bsize)
{
...
//allocate host memory
A_host=(float*)malloc(Asize*sizeof(float));
B_host=(float*)malloc(Bsize*sizeof(float));
//allocate device memory
float* A_dev,*B_dev;
cudaMalloc((void**) &A_dev,Asize* sizeof(float));
cudaMalloc((void**) &B_dev,Bsize* sizeof(float));
....
kernel<<<1,1>>>(A_host,B_host, devStates,Asize,Bsize);
...
c file:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <string.h>
#include <assert.h>
#include <stdarg.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "solve.cu"
extern void kernel_wrapper(float* A,float* B, int Asize ,int Bsize);
...
int main()
{...
A = (float*)malloc(N*N*sizeof(float));
B = (float*)malloc(N*HS*sizeof(float));
...
kernel_wrapper(A,B,Asize ,Bsize);
...
I am compiling as:
g++ -o mycode myfile.c -I/usr/local/cuda-5.5/include -L/usr/local/cuda-5.5/lib64 -lcurand -lcutil -lcudpp -lcuda -lstdc+
解决方案
You can't include solve.cu (which contains device code, e.g. kernels) in a .c file and then compile it properly with g++. Device code has to be compiled by nvcc.
Instead, you will need to compile the two files separately, then link them together.
I would suggest renaming your myfile.c to myfile.cpp.
Also remove this line from your myfile.cpp:
#include "solve.cu"
Then compile with:
nvcc -c solve.cu
g++ -c -I/usr/local/cuda-5.5/include myfile.cpp
g++ -o mycode solve.o myfile.o -L/usr/local/cuda-5.5/lib64 -lcudart -lcurand -lcutil -lcudpp -lcuda
For the last issue, you are passing double pointers (**):
kernel_wrapper(&A,&B,Asize ,Bsize);
Where the prototype is expecting single pointers (*):
extern void kernel_wrapper(float* A,float* B, int Asize ,int Bsize);
A and B are already of type float *, so it looks to me like you should pass them directly:
kernel_wrapper(A,B,Asize ,Bsize);
EDIT: Responding to a question below.
The problem is that the pointers A_host and B_host (parameters to kernel_wrapper) are being passed by value to the kernel wrapper function. The kernel wrapper function allocates the storage for those pointers, but the newly modified pointer reflecting the allocated storage is not (and cannot be) passed back to the calling function (i.e. the function that called kernel_wrapper). You could allocate the storage for A_host and B_host in the calling function, and then pass the pointers (in which case there is no need to malloc those pointers in kernel_wrapper), or you could modify the kernel wrapper as follows:
void kernel_wrapper(float** A_host,float** B_host, int Asize ,int Bsize)
{
...
//allocate host memory
*A_host=(float*)malloc(Asize*sizeof(float));
*B_host=(float*)malloc(Bsize*sizeof(float));
//allocate device memory
float* A_dev,*B_dev;
cudaMalloc((void**) &A_dev,Asize* sizeof(float));
cudaMalloc((void**) &B_dev,Bsize* sizeof(float));
....
cudaMemcpy(A_dev, *A_host, Asize*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(B_dev, *B_host, Bsize*sizeof(float), cudaMemcpyHostToDevice);
kernel<<<1,1>>>(A_dev,B_dev, devStates,Asize,Bsize);
...
You would then also need to modify your calling line in the .cpp file:
int main()
{...
float *A, *B;
int Asize = N*N;
int Bsize = N*NHS;
...
kernel_wrapper(&A,&B,Asize ,Bsize);
...
The way your code is posted now, you are doing a malloc operation twice each for A and B, and that is not necessary.
这篇关于从 C 调用 CUDA 导致错误的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!