Memory allocation on GPU for dynamic array of structs

Problem Description

I have a problem passing an array of structs to a GPU kernel. Based on this topic - cudaMemcpy segmentation fault - I wrote something like this:

#include <stdio.h>
#include <stdlib.h>

struct Test {
    char *array;
};

__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    for(int i = 0; i < n; i++)
        test[i].array = (char*)malloc(size * sizeof(char));

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);
    for(int i=0; i < n; i++) {
        cudaMalloc((void**)&(test[i].array), size * sizeof(char));
        cudaMemcpy(&(dev_test[i].array), &(test[i].array), size * sizeof(char), cudaMemcpyHostToDevice);
    }

    kernel<<<1, 1>>>(dev_test);
    cudaDeviceSynchronize();

    //  memory free
    return 0;
}

There is no error, but the values displayed in the kernel are incorrect. What am I doing wrong? Thanks in advance for any help.

Solution

  1. This is allocating a new pointer to host memory:

    test[i].array = (char*)malloc(size * sizeof(char));
    

  2. This is copying data to that region in host memory:

    memcpy(test[i].array, temp, size * sizeof(char));
    

  3. This is overwriting the previously allocated pointer to host memory (from step 1 above) with a new pointer to device memory:

    cudaMalloc((void**)&(test[i].array), size * sizeof(char));
    

After step 3, the data you set up in step 2 is entirely lost, and no longer accessible in any fashion. Referring to steps 3 and 4 in the question/answer you linked:

3. Create a separate int pointer on the host, let's call it myhostptr

4. cudaMalloc int storage on the device for myhostptr

You haven't done this. You did not create a separate pointer. You reused (erased, overwrote) an existing pointer, which was pointing to data you cared about on the host. This question/answer, also linked from the answer you linked, gives almost exactly the steps you need to follow, in code.
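
Per element, those steps boil down to keeping the host pointer and the device pointer separate. As a minimal sketch (not from the original answer; d_arr is an illustrative name, and test, dev_test, i and size are as in your code):

    char *d_arr;                               // step 3: a separate pointer on the host
    cudaMalloc(&d_arr, size * sizeof(char));   // step 4: allocate device storage for it
    // step 5: copy the device pointer value into the device-side struct member
    cudaMemcpy(&(dev_test[i].array), &d_arr, sizeof(char *), cudaMemcpyHostToDevice);
    // the characters are still reachable through test[i].array, so copy them now
    cudaMemcpy(d_arr, test[i].array, size * sizeof(char), cudaMemcpyHostToDevice);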

Here's a modified version of your code which properly implements the missing steps 3, 4, and 5 from the question/answer you linked (refer to the comments delineating steps 3, 4, and 5):

$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>

struct Test {
    char *array;
};

__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);
    for(int i = 0; i < n; i++)
        test[i].array = (char*)malloc(size * sizeof(char));

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);

    // Step 3:
    char *temp_data[n];
    // Step 4:
    for (int i=0; i < n; i++)
      cudaMalloc(&(temp_data[i]), size*sizeof(char));
    // Step 5:
    for (int i=0; i < n; i++)
      cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);
    // now copy the embedded data:
    for (int i=0; i < n; i++)
      cudaMemcpy(temp_data[i], test[i].array, size*sizeof(char), cudaMemcpyHostToDevice);

    kernel<<<1, 1>>>(dev_test);
    cudaDeviceSynchronize();

    //  memory free
    return 0;
}

$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$
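
For completeness, here is what the "//  memory free" placeholder in the code above could look like for this deep-copy version. This is a minimal sketch, not part of the original answer; it assumes the same variable names (test, dev_test, temp_data, n) are still in scope:

    for (int i = 0; i < n; i++)
        cudaFree(temp_data[i]);  // device buffers holding each embedded array
    cudaFree(dev_test);          // device array of Test structs
    for (int i = 0; i < n; i++)
        free(test[i].array);     // host buffers allocated with malloc
    free(test);                  // host array of Test structs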

Since the above methodology can be challenging for beginners, the usual advice is not to do it, but instead flatten your data structures. Flatten generally means to rearrange the data storage so as to remove the embedded pointers that have to be separately allocated.

A trivial example of flattening this data structure would be to use this instead:

struct Test {
    char array[5];
};

It's recognized of course that this particular approach would not serve every purpose, but it should illustrate the general idea/intent. With that modification, as an example, the code becomes much simpler:

$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>

struct Test {
    char array[5];
};

__global__ void kernel(Test *dev_test) {
    for(int i=0; i < 5; i++) {
        printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
    }
}

int main(void) {

    int n = 4, size = 5;
    Test *dev_test, *test;

    test = (Test*)malloc(sizeof(Test)*n);

    for(int i=0; i < n; i++) {
        char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
        memcpy(test[i].array, temp, size * sizeof(char));
    }

    cudaMalloc((void**)&dev_test, n * sizeof(Test));
    cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);

    kernel<<<1, 1>>>(dev_test);
    cudaDeviceSynchronize();

    //  memory free
    return 0;
}
$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$
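
As acknowledged above, a fixed-size char array[5] member will not serve every purpose. When each element's data really is variable-length, one common alternative is to flatten it into a single contiguous buffer plus an offsets array, so the device needs only two flat allocations instead of one per element. The following is only a sketch of that idea, not part of the original answer, and all names (h_data, h_offsets, and so on) are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Element i occupies data[offsets[i]] .. data[offsets[i+1]-1].
__global__ void kernel(const char *data, const int *offsets) {
    for (int i = offsets[0]; i < offsets[1]; i++)   // print element 0 only
        printf("Kernel[0][i]: %c \n", data[i]);
}

int main(void) {
    const int n = 4;
    // Host-side flat storage: here every element is "abcde" (5 chars),
    // but the per-element lengths could differ.
    int h_offsets[n + 1];
    for (int i = 0; i <= n; i++)
        h_offsets[i] = i * 5;
    char *h_data = (char*)malloc(h_offsets[n]);
    for (int i = 0; i < n; i++)
        memcpy(h_data + h_offsets[i], "abcde", 5);

    // Only two device allocations and two copies, regardless of n:
    char *d_data;
    int *d_offsets;
    cudaMalloc(&d_data, h_offsets[n]);
    cudaMalloc(&d_offsets, (n + 1) * sizeof(int));
    cudaMemcpy(d_data, h_data, h_offsets[n], cudaMemcpyHostToDevice);
    cudaMemcpy(d_offsets, h_offsets, (n + 1) * sizeof(int), cudaMemcpyHostToDevice);

    kernel<<<1, 1>>>(d_data, d_offsets);
    cudaDeviceSynchronize();

    cudaFree(d_offsets);
    cudaFree(d_data);
    free(h_data);
    return 0;
}

With this layout the kernel indexes element i as data[offsets[i] + j], and the whole dataset still moves to the device without any embedded pointers.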
