传递结构到CUDA内核 [英] Passing structs to CUDA kernels
问题描述
我是新的CUDA C,并正尝试通过一个typedef定义结构到一个内核。我的方法工作得很好,当我试图用只含整数一个结构,但是当我切换到彩车,我得到意义的数字早在结果。我认为这是同对齐,我想包括 __ __对齐
随着我的类型声明,但无济于事。有人可以给我如何做到这一点的例子,或提供另一种方法?我试图设定,让我可以轻松地添加或删除字段不改变任何东西比结构和内核等。我的code:
typedef结构__align __(8)
{
漂浮A,B;
} 点;
__global__无效testKernel(点* P)
{
INT I = blockIdx.x * blockDim.x + threadIdx.x;
P [I]。A = 1.1;
P [I] .B = 2.2;
}
INT主要(无效)
{
//点的设置数量
诠释为NumPoints = 16,
gpuBlockSize = 4,
的pointsize = sizeof的(点),
的numBytes =为NumPoints *的pointsize,
gpuGridSize =为NumPoints / gpuBlockSize;
//分配内存
点* cpuPointArray =新点[为NumPoints]
* gpuPointArray =新点[为NumPoints]
cpuPointArray =(点*)的malloc(的numBytes);
cudaMalloc((无效**)及gpuPointArray,的numBytes);
//启动内核
testKernel<<< gpuGridSize,gpuBlockSize>>>(gpuPointArray);
//检索结果
cudaMemcpy(cpuPointArray,gpuPointArray,的numBytes,cudaMemcpyDeviceToHost);
的printf(testKernel结果:\ N);
的for(int i = 0; I<为NumPoints ++ I)
{
的printf(point.a:%D,point.b数:%d \ N,cpuPointArray [I]。一,cpuPointArray [I] .B);
}
//释放内存
免费(cpuPointArray);
cudaFree(gpuPointArray);
返回0;
}
因为似乎没有要对如何做到这一点任何体面的文件,我想我会在这里发表最后,修订后的code。事实证明,在 __ __对齐
部分是不必要的,以及,试图打印花车当实际问题是在printf的使用%D的。
的#include< stdlib.h中>
#包括< stdio.h中>
typedef结构
{
漂浮A,B;
} 点;
__global__无效testKernel(点* P)
{
INT I = blockIdx.x * blockDim.x + threadIdx.x;
P [I]。A = 1.1;
P [I] .B = 2.2;
}
INT主要(无效)
{
//点的设置数量
诠释为NumPoints = 16,
gpuBlockSize = 4,
的pointsize = sizeof的(点),
的numBytes =为NumPoints *的pointsize,
gpuGridSize =为NumPoints / gpuBlockSize;
//分配内存
点* cpuPointArray,
* gpuPointArray;
cpuPointArray =(点*)的malloc(的numBytes);
cudaMalloc((无效**)及gpuPointArray,的numBytes);
//启动内核
testKernel<<< gpuGridSize,gpuBlockSize>>>(gpuPointArray);
//检索结果
cudaMemcpy(cpuPointArray,gpuPointArray,的numBytes,cudaMemcpyDeviceToHost);
的printf(testKernel结果:\ N);
的for(int i = 0; I<为NumPoints ++ I)
{
的printf(point.a:%F,point.b:%F \ N,cpuPointArray [I]。一,cpuPointArray [I] .B);
}
//释放内存
免费(cpuPointArray);
cudaFree(gpuPointArray);
返回0;
}
I'm new to CUDA C, and am trying to pass a typedef'd struct into a kernel. My method worked fine when I tried it with a struct containing only ints, but when I switch to floats I get meaningless numbers back as results. I assume this has to do with alignment, and I tried including __align__
along with my type declaration, but to no avail. Can someone give me an example of how this is done, or provide an alternative approach? I'm trying to set it up so that I can easily add or remove fields without changing anything other than the struct and the kernel. My code:
typedef struct __align__(8)
{
float a, b;
} point;
__global__ void testKernel(point *p)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
p[i].a = 1.1;
p[i].b = 2.2;
}
int main(void)
{
// set number of points
int numPoints = 16,
gpuBlockSize = 4,
pointSize = sizeof(point),
numBytes = numPoints * pointSize,
gpuGridSize = numPoints / gpuBlockSize;
// allocate memory
point *cpuPointArray = new point[numPoints],
*gpuPointArray = new point[numPoints];
cpuPointArray = (point*)malloc(numBytes);
cudaMalloc((void**)&gpuPointArray, numBytes);
// launch kernel
testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
// retrieve the results
cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
printf("testKernel results:\n");
for(int i = 0; i < numPoints; ++i)
{
printf("point.a: %d, point.b: %d\n",cpuPointArray[i].a,cpuPointArray[i].b);
}
// deallocate memory
free(cpuPointArray);
cudaFree(gpuPointArray);
return 0;
}
Since there doesn't appear to be any decent documentation on how to do this, I thought I'd post the final, revised code here. It turns out that the __align__
part was unnecessary as well, the actual problem was the use of %d in the printf when trying to print floats.
#include <stdlib.h>
#include <stdio.h>
typedef struct
{
float a, b;
} point;
__global__ void testKernel(point *p)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
p[i].a = 1.1;
p[i].b = 2.2;
}
int main(void)
{
// set number of points
int numPoints = 16,
gpuBlockSize = 4,
pointSize = sizeof(point),
numBytes = numPoints * pointSize,
gpuGridSize = numPoints / gpuBlockSize;
// allocate memory
point *cpuPointArray,
*gpuPointArray;
cpuPointArray = (point*)malloc(numBytes);
cudaMalloc((void**)&gpuPointArray, numBytes);
// launch kernel
testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
// retrieve the results
cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
printf("testKernel results:\n");
for(int i = 0; i < numPoints; ++i)
{
printf("point.a: %f, point.b: %f\n",cpuPointArray[i].a,cpuPointArray[i].b);
}
// deallocate memory
free(cpuPointArray);
cudaFree(gpuPointArray);
return 0;
}
这篇关于传递结构到CUDA内核的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!