如何从C ++传递字符串矩阵到Cuda内核 [英] How to pass string matrix to Cuda kernel from C++

查看:99
本文介绍了如何从C ++传递字符串矩阵到Cuda内核的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

问题:



我有一个C ++中的字符串填充字符串,我想传递给cuda内核函数。我知道CUDA不能处理字符串,所以经过一些研究,我试过了一些解决方案如下所示。



尝试:




  1. 定义C ++中的指针数组,为每个单元格指定一个指针字符(为简单起见,tmp [i]被包含在前面引用的矩阵中的字符串填充) / p>

    C ++区




      char * tmp [3]; 
    int text_length,array_length;

    text_length = 4;
    array_length = 3;

    tmp [0] =(char *)malloc(text_length * sizeof(char));
    tmp [1] =(char *)malloc(text_length * sizeof(char));
    tmp [2] =(char *)malloc(text_length * sizeof(char));

    tmp [0] =some;
    tmp [1] =rand;
    tmp [2] =text;

    char * a [3];
    for(int i = 0; i {
    cudaMalloc((void **)& a [i],text_length * sizeof(char)
    cudaMemcpy(& a [i],& tmp [i],text_length * sizeof(char),cudaMemcpyHostToDevice);
    }

    func<<<<块blockPerGrid,threadsPerBlock>>(a);





      __global__ void func(char * a []){

    for = 0; i <3; i ++)
    printf(value [%d] =%s \\\
    ,i,a [i]
    }



      value [0] =(null)
    value [1] = )
    value [2] =(null)



  2. 将填充字符串的矩阵扩展到char指针,并将其传递给cuda内核,然后尝试检索字符串(再次用C ++简化的代码)



    > C ++区




      
    int index [6];

    a =somerandtext;
    index [0] = 0; // first word start
    index [1] = 3; // first word end
    index [2] = 4; //与第一个单词相同
    index [3] = 7;
    index [4] = 8;
    index [5] = 1;

    func<<<<块blockPerGrid,threadsPerBlock>>(a,索引);





      __global__ void func(char * a,int index []){

    int first_word_start = index [0];
    int first_word_end = index [1];

    //打印第一个字
    for(int i = first_word_start; i <= first_word_end; i ++)
    printf(%c,a [i]);
    }



     没有产生输出



我尝试了很多其他解决方案, ...问题也可以提出询问:我怎么能传递'n'字符串到cuda内核和打印(和比较)所有的他们(记住,我不能通过'n'变量)。

解决方案

你已经显示的代码已经完成,你遗漏的东西可能很重要。如果您显示完整的验证码,他人会更容易为您提供协助。此外,任何时候您在使用CUDA代码时,最好使用正确的cuda错误检查,这往往会指向你的工作(我怀疑这可能有助于你的第二次尝试)。此外,使用 cuda-memcheck 运行代码通常是有益的。



运行到CUDA和嵌套指针( a 指针指针数组)的经典问题。这个问题也发生在几乎任何时候有一个指针埋在一些其他数据结构。为了将这样的数据结构从主机复制到设备需要具有多个步骤的深度复制操作。要了解更多关于这一点,搜索CUDA 2D数组(我认为规范的答案是由talonmies 这里)或看看我的答案此处这里



另外请注意,对于CUDA 6,如果您能够使用统一内存



您的第二次尝试似乎是朝着展平您的2D或指针到ponter数组 char 的路径。这是深复制的问题的典型解决方案,导致更少的代码复杂性和可能还有更高的性能。这是一个完全工作的例子,混合了你的第一次和第二次尝试的想法,这似乎对我有用:

  $ cat t389。 cu 
#include< stdio.h>

__global__ void func(char * a,int * indexes,int num_strings){


for(int i = 0; i printf(string [%d]:,i);
for(int j = indexes [2 * i]; j< indexes [2 * i + 1]; j ++)
printf(%c,a [j]
printf(\\\
);
}
}

int main(){

int max_text_length,num_str;
num_str = 3;
char * tmp [num_str];
max_text_length = 12;

tmp [0] =(char *)malloc(max_text_length * sizeof(char));
tmp [1] =(char *)malloc(max_text_length * sizeof(char));
tmp [2] =(char *)malloc(max_text_length * sizeof(char));

tmp [0] =some text;
tmp [1] =rand txt;
tmp [2] =text;

int stridx [2 * num_str];
int * d_stridx;
stridx [0] = 0;
stridx [1] = 9;
stridx [2] = 9;
stridx [3] = 17;
stridx [4] = 17;
stridx [5] = 21;

char * a,* d_a;
a =(char *)malloc(num_str * max_text_length * sizeof(char));
// flatten
int subidx = 0;
for(int i = 0; i {
for(int j = stridx [2 * i]; j a [j] = tmp [i] [subidx ++];
subidx = 0;
}

cudaMalloc((void **)& d_a,num_str * max_text_length * sizeof(char));
cudaMemcpy(d_a,a,num_str * max_text_length * sizeof(char),cudaMemcpyHostToDevice);
cudaMalloc((void **)& d_stridx,num_str * 2 * sizeof(int));
cudaMemcpy(d_stridx,stridx,2 * num_str * sizeof(int),cudaMemcpyHostToDevice);


func<<< 1,1>>>(d_a,d_stridx,num_str);
cudaDeviceSynchronize();

}
$ nvcc -arch = sm_20 -o t389 t389.cu
$ cuda-memcheck ./t389
========= CUDA -MEMCHECK
string [0]:一些文本
string [1]:rand txt
string [2]:text
========= ERROR摘要: 0错误
$


Problem:

I've a matrix in C++ filled with strings and I want to pass it to cuda kernel function. I know that CUDA can't handle strings so after some research I've tried out with some solutions listed below.

Attempts:

  1. define an array of pointers in C++ containing for each cell a pointer chars (for simplicity tmp[i] is filled with the strings contained into the matrix previously cited)

    C++ section

     char *tmp[3];
     int text_length, array_length;
    
     text_length = 4;
     array_length = 3;
    
     tmp[0] = (char*) malloc(text_length*sizeof(char));
     tmp[1] = (char*) malloc(text_length*sizeof(char));
     tmp[2] = (char*) malloc(text_length*sizeof(char));
    
     tmp[0] = "some";
     tmp[1] = "rand";
     tmp[2] = "text";
    
     char *a[3];
     for(int i=0;i<array_length;i++)
     {
       cudaMalloc((void**) &a[i],text_length*sizeof(char));
       cudaMemcpy(&a[i],&tmp[i],text_length*sizeof(char),cudaMemcpyHostToDevice);
     }
    
     func<<<blocksPerGrid, threadsPerBlock>>>(a);
    

    CUDA section

     __global__ void func(char* a[]){
    
     for(int i=0;i<3;i++)
       printf("value[%d] = %s \n",i, a[i]);
     }
    

    Output

     value[0] = (null)
     value[1] = (null)
     value[2] = (null)
    

  2. spread the matrix filled with strings to a char pointer and pass it to cuda kernel and there try to retrieve the strings (again code simplified in C++)

    C++ section

     char *a;
     int index[6];
    
     a = "somerandtext";
     index[0] = 0; // first word start
     index[1] = 3; // first word end
     index[2] = 4; // same as first word 
     index[3] = 7;
     index[4] = 8;
     index[5] = 1;
    
     func<<<blocksPerGrid, threadsPerBlock>>>(a,index);
    

    CUDA section

     __global__ void func(char* a,int index[]){
    
     int first_word_start = index[0];
     int first_word_end = index[1];
    
     // print first word
     for(int i=first_word_start;i<=first_word_end;i++)
       printf("%c",a[i]);
     }
    

    Output

     no output produced
    

I've tried out with a lot of other solutions but no one works for me... The problem can also re proposed asking: how can i pass 'n' strings to a cuda kernel and print (and compare) all of them there ( keep in mind that I can't pass 'n' variables).

解决方案

Niether of the codes you've shown is complete, and the things you've left out may be important. You'll make it easier for others to help you if you show complete codes. Also, anytime you're struggling with CUDA codes, it's good practice to use proper cuda error checking which often times will point you at what is not working (I suspect this might have helped with your second attempt). Also, running your code with cuda-memcheck is often times instructive.

In your first attempt, you've run into a classic problem with CUDA and nested pointers (a is a pointer to an array of pointers). This problem occurs also pretty much any time there is a pointer buried in some other data structure. To copy such a data structure from host to device requires a "deep copy" operation, which has multiple steps. To understand more about this, search on "CUDA 2D array" (I consider the canonical answer to be the one given by talonmies here) or take a look at my answers here and here.

Also note that with CUDA 6, "deep copies" can be a lot easier conceptually for the programmer if you are able to use unified memory.

Your second attempt appears to be headed down a path of "flattening" your 2D or pointer-to-ponter array of char. That's a typical solution to the "problem" of deep-copying, resulting in less code complexity and probably also higher performance. Here's a fully worked example, blending ideas from your first and second attempt, which seems to work for me:

$ cat t389.cu
#include <stdio.h>

 __global__ void func(char* a, int *indexes, int num_strings){


 for(int i=0;i<num_strings;i++){
   printf("string[%d]: ", i);
   for (int j=indexes[2*i]; j < indexes[2*i+1]; j++)
     printf("%c", a[j]);
   printf("\n");
 }
}

int main(){

 int max_text_length, num_str;
 num_str = 3;
 char *tmp[num_str];
 max_text_length = 12;

 tmp[0] = (char*) malloc(max_text_length*sizeof(char));
 tmp[1] = (char*) malloc(max_text_length*sizeof(char));
 tmp[2] = (char*) malloc(max_text_length*sizeof(char));

 tmp[0] = "some text";
 tmp[1] = "rand txt";
 tmp[2] = "text";

 int stridx[2*num_str];
 int *d_stridx;
 stridx[0] = 0;
 stridx[1] = 9;
 stridx[2] = 9;
 stridx[3] = 17;
 stridx[4] = 17;
 stridx[5] = 21;

 char *a, *d_a;
 a = (char *)malloc(num_str*max_text_length*sizeof(char));
 //flatten
 int subidx = 0;
 for(int i=0;i<num_str;i++)
 {
   for (int j=stridx[2*i]; j<stridx[2*i+1]; j++)
     a[j] = tmp[i][subidx++];
   subidx = 0;
 }

 cudaMalloc((void**)&d_a,num_str*max_text_length*sizeof(char));
 cudaMemcpy(d_a, a,num_str*max_text_length*sizeof(char),cudaMemcpyHostToDevice);
 cudaMalloc((void**)&d_stridx,num_str*2*sizeof(int));
 cudaMemcpy(d_stridx, stridx,2*num_str*sizeof(int),cudaMemcpyHostToDevice);


 func<<<1,1>>>(d_a, d_stridx, num_str);
 cudaDeviceSynchronize();

}
$ nvcc -arch=sm_20 -o t389 t389.cu
$ cuda-memcheck ./t389
========= CUDA-MEMCHECK
string[0]: some text
string[1]: rand txt
string[2]: text
========= ERROR SUMMARY: 0 errors
$

这篇关于如何从C ++传递字符串矩阵到Cuda内核的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆