传递结构到CUDA内核 [英] Passing structs to CUDA kernels

查看:115
本文介绍了传递结构到CUDA内核的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我是新的CUDA C,并正尝试通过一个typedef定义结构到一个内核。我的方法工作得很好,当我试图用只含整数一个结构,但是当我切换到彩车,我得到意义的数字早在结果。我认为这是同对齐,我想包括 __ __对齐随着我的类型声明,但无济于事。有人可以给我如何做到这一点的例子,或提供另一种方法?我试图设定,让我可以轻松地添加或删除字段不改变任何东西比结构和内核等。我的code:

  typedef结构__align __(8)
{
    漂浮A,B;
} 点;

__global__无效testKernel(点* P)
{
    INT I = blockIdx.x * blockDim.x + threadIdx.x;
    P [I]。A = 1.1;
    P [I] .B = 2.2;
}

INT主要(无效)
{
        //点​​的设置数量
    诠释为NumPoints = 16,
        gpuBlockSize = 4,
        的pointsize = sizeof的(点),
        的numBytes =为NumPoints *的pointsize,
        gpuGridSize =为NumPoints / gpuBlockSize;

        //分配内存
    点* cpuPointArray =新点[为NumPoints]
          * gpuPointArray =新点[为NumPoints]
    cpuPointArray =(点*)的malloc(的numBytes);
    cudaMalloc((无效**)及gpuPointArray,的numBytes);

        //启动内核
    testKernel<<< gpuGridSize,gpuBlockSize>>>(gpuPointArray);

        //检索结果
    cudaMemcpy(cpuPointArray,gpuPointArray,的numBytes,cudaMemcpyDeviceToHost);
    的printf(testKernel结果:\ N);
    的for(int i = 0; I<为NumPoints ++ I)
    {
        的printf(point.a:%D,point.b数:%d \ N,cpuPointArray [I]。一,cpuPointArray [I] .B);
    }

        //释放内存
    免费(cpuPointArray);
    cudaFree(gpuPointArray);

    返回0;
}
 

解决方案

因为似乎没有要对如何做到这一点任何体面的文件,我想我会在这里发表最后,修订后的code。事实证明,在 __ __对齐部分是不必要的,以及,试图打印花车当实际问题是在printf的使用%D的。

 的#include< stdlib.h中>
#包括< stdio.h中>

typedef结构
{
    漂浮A,B;
} 点;

__global__无效testKernel(点* P)
{
    INT I = blockIdx.x * blockDim.x + threadIdx.x;
    P [I]。A = 1.1;
    P [I] .B = 2.2;
}

INT主要(无效)
{
        //点​​的设置数量
    诠释为NumPoints = 16,
        gpuBlockSize = 4,
        的pointsize = sizeof的(点),
        的numBytes =为NumPoints *的pointsize,
        gpuGridSize =为NumPoints / gpuBlockSize;

        //分配内存
    点* cpuPointArray,
          * gpuPointArray;
    cpuPointArray =(点*)的malloc(的numBytes);
    cudaMalloc((无效**)及gpuPointArray,的numBytes);

        //启动内核
    testKernel<<< gpuGridSize,gpuBlockSize>>>(gpuPointArray);

        //检索结果
    cudaMemcpy(cpuPointArray,gpuPointArray,的numBytes,cudaMemcpyDeviceToHost);
    的printf(testKernel结果:\ N);
    的for(int i = 0; I<为NumPoints ++ I)
    {
        的printf(point.a:%F,point.b:%F \ N,cpuPointArray [I]。一,cpuPointArray [I] .B);
    }

        //释放内存
    免费(cpuPointArray);
    cudaFree(gpuPointArray);

    返回0;
}
 

I'm new to CUDA C, and am trying to pass a typedef'd struct into a kernel. My method worked fine when I tried it with a struct containing only ints, but when I switch to floats I get meaningless numbers back as results. I assume this has to do with alignment, and I tried including __align__ along with my type declaration, but to no avail. Can someone give me an example of how this is done, or provide an alternative approach? I'm trying to set it up so that I can easily add or remove fields without changing anything other than the struct and the kernel. My code:

typedef struct __align__(8)
{
    float a, b;
} point;

__global__ void testKernel(point *p)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    p[i].a = 1.1;
    p[i].b = 2.2;
}

int main(void)
{
        // set number of points 
    int numPoints    = 16,
        gpuBlockSize = 4,
        pointSize    = sizeof(point),
        numBytes     = numPoints * pointSize,
        gpuGridSize  = numPoints / gpuBlockSize;

        // allocate memory
    point *cpuPointArray = new point[numPoints],
          *gpuPointArray = new point[numPoints];
    cpuPointArray = (point*)malloc(numBytes);
    cudaMalloc((void**)&gpuPointArray, numBytes);

        // launch kernel
    testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);

        // retrieve the results
    cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
    printf("testKernel results:\n");
    for(int i = 0; i < numPoints; ++i)
    {
        printf("point.a: %d, point.b: %d\n",cpuPointArray[i].a,cpuPointArray[i].b);
    }

        // deallocate memory
    free(cpuPointArray);
    cudaFree(gpuPointArray);

    return 0;
}

解决方案

Since there doesn't appear to be any decent documentation on how to do this, I thought I'd post the final, revised code here. It turns out that the __align__ part was unnecessary as well, the actual problem was the use of %d in the printf when trying to print floats.

#include <stdlib.h>
#include <stdio.h>

typedef struct
{
    float a, b;
} point;

__global__ void testKernel(point *p)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    p[i].a = 1.1;
    p[i].b = 2.2;
}

int main(void)
{
        // set number of points 
    int numPoints    = 16,
        gpuBlockSize = 4,
        pointSize    = sizeof(point),
        numBytes     = numPoints * pointSize,
        gpuGridSize  = numPoints / gpuBlockSize;

        // allocate memory
    point *cpuPointArray,
          *gpuPointArray;
    cpuPointArray = (point*)malloc(numBytes);
    cudaMalloc((void**)&gpuPointArray, numBytes);

        // launch kernel
    testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);

        // retrieve the results
    cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
    printf("testKernel results:\n");
    for(int i = 0; i < numPoints; ++i)
    {
        printf("point.a: %f, point.b: %f\n",cpuPointArray[i].a,cpuPointArray[i].b);
    }

        // deallocate memory
    free(cpuPointArray);
    cudaFree(gpuPointArray);

    return 0;
}

这篇关于传递结构到CUDA内核的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆