Function pointer (to other kernel) as kernel arg in CUDA


Problem Description

With dynamic parallelism in CUDA, you can launch kernels from the GPU side, starting from a certain compute capability. I have a wrapper function that takes a pointer to the kernel I want to use, and it launches that kernel either on the CPU for older devices or on the GPU for newer devices. The fallback path is fine; the GPU path is not, and reports that the memory alignment is incorrect.

Is there a way to do this in CUDA (7)? Are there some lower-level calls that will give me a pointer address that's correct on the GPU?

The code is below; the template "TFunc" is an attempt to get the compiler to do something different, but I've tried it strongly typed as well.

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 320)
    (*func)<< <1, 1 >> >(args...);
#else
    printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const systemInfo *sysInfo, int count, TArgs... args)
{
    if(sysInfo->getCurrentDevice()->compareVersion("3.2") > 0)
    {
        printf("Iterate on GPU\n");
        Test << <1, 1 >> >(kernel, count, args...);
    }
    else
    {
        printf("Iterate on CPU\n");
        Test << <1, 1 >> >(kernel, count, args...);
    }
}

Solution

EDIT: At the time that I originally wrote this answer, I believe the statements were correct: it was not possible to take a kernel address in host code. However, I believe something has changed in CUDA since then, so now (in CUDA 8, and maybe prior) it is possible to take a kernel address in host code (it is still not possible to take the address of a __device__ function in host code, however).
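
For illustration only, here is a minimal, untested sketch of what that would look like on CUDA 8 or newer, based on the EDIT above: the kernel address is taken directly in host code and passed to the launching kernel, so the __device__ pointer variables and the "extractor" kernel shown below would not be needed. The child/h_child names are mine, not from the original code, and the same -rdc=true -lcudadevrt build as in the examples below is assumed.

#include <stdio.h>

__global__ void child(){

  printf("hello from child\n");
}

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
    (*func)<<<1, 1>>>(args...);            // device-side launch through the passed pointer
#else
    printf("What are you doing here!?\n");
#endif
}

int main(){

  void (*h_child)() = child;               // kernel address taken directly in host code (CUDA 8+, per the EDIT)
  Test<<<1, 1>>>(h_child, 1);              // no extractor kernel or __device__ pointer variable needed
  cudaDeviceSynchronize();
  return 0;
}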

ORIGINAL ANSWER:

It seems like this question comes up from time to time, although the previous examples I can think of have to do with calling __device__ functions instead of __global__ functions.

In general it's illegal to take the address of a device entity (variable, function) in host code.
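
(As an aside, not part of the original answer: for a __device__ variable the runtime does provide a sanctioned way to obtain the device-side address from host code, e.g. cudaGetSymbolAddress; there is no equivalent host-side call for a plain __device__ function, which is why the answer below routes the function pointers through __device__ variables instead. A minimal sketch with a hypothetical variable name:)

#include <stdio.h>

__device__ int dev_counter;                // hypothetical __device__ variable

int main(){

  void *d_addr = NULL;
  // Legal: ask the runtime to resolve the symbol's device-side address.
  cudaGetSymbolAddress(&d_addr, dev_counter);
  printf("device address of dev_counter: %p\n", d_addr);
  // Not legal: taking &dev_counter directly in host code, per the statement above.
  return 0;
}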

One possible method to work around this (although the utility of this is not clear to me; it seems like there would be simpler dispatch mechanisms) is to extract the device address needed "in device code" and return that value to the host, for dispatch usage. In this case, I am creating a simple example that extracts the needed device addresses into __device__ variables, but you could also write a kernel to do this setup (i.e. to "give me a pointer address that's correct on the GPU" in your words).

Here's a rough worked example, building on the code you have shown:

$ cat t746.cu
#include <stdio.h>

__global__ void ckernel1(){

  printf("hello1\n");
}
__global__ void ckernel2(){

  printf("hello2\n");
}
__global__ void ckernel3(){

  printf("hello3\n");
}

__device__ void (*pck1)() = ckernel1;
__device__ void (*pck2)() = ckernel2;
__device__ void (*pck3)() = ckernel3;

template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
    (*func)<< <1, 1 >> >(args...);
#else
    printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
    if(sysInfo >= 350)
    {
        printf("Iterate on GPU\n");
        Test << <1, 1 >> >(kernel, count, args...);
    }
    else
    {
        printf("Iterate on CPU\n");
        Test << <1, 1 >> >(kernel, count, args...);
    }
}


int main(){

  void (*h_ckernel1)();
  void (*h_ckernel2)();
  void (*h_ckernel3)();
  cudaMemcpyFromSymbol(&h_ckernel1, pck1, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel2, pck2, sizeof(void *));
  cudaMemcpyFromSymbol(&h_ckernel3, pck3, sizeof(void *));
  Iterate(h_ckernel1, 350, 1);
  Iterate(h_ckernel2, 350, 1);
  Iterate(h_ckernel3, 350, 1);
  cudaDeviceSynchronize();
  return 0;
}

$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
Iterate on GPU
Iterate on GPU
hello1
hello2
hello3
========= ERROR SUMMARY: 0 errors
$

The above (__device__ variable) method probably can't be made to work with templated child kernels, but it might be possible to create a templated "extractor" kernel that returns the address of an (instantiated) templated child kernel. A rough idea of the "extractor" setup_kernel method is given in the previous answer I linked. Here's a rough example of the templated child kernel/extractor kernel method:

$ cat t746.cu
#include <stdio.h>

template <typename T>
__global__ void ckernel1(T *data){

  int my_val = (int)(*data+1);
  printf("hello: %d \n", my_val);
}
template <typename TFunc, typename... TArgs>
__global__ void Test(TFunc func, int count, TArgs... args)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
    (*func)<< <1, 1 >> >(args...);
#else
    printf("What are you doing here!?\n");
#endif
}

template <typename... TArgs>
__host__ void Iterate(void(*kernel)(TArgs...), const int sysInfo, int count, TArgs... args)
{
    if(sysInfo >= 350)
    {
        printf("Iterate on GPU\n");
        Test << <1, 1 >> >(kernel, count, args...);
    }
    else
    {
        printf("Iterate on CPU\n");
        Test << <1, 1 >> >(kernel, count, args...);
    }
}

template <typename T>
__global__ void extractor(void (**kernel)(T *)){

  *kernel = ckernel1<T>;
}

template <typename T>
void run_test(T init){

  void (*h_ckernel1)(T *);
  void (**d_ckernel1)(T *);
  T *d_data;
  cudaMalloc(&d_ckernel1, sizeof(void *));
  cudaMalloc(&d_data, sizeof(T));
  cudaMemcpy(d_data, &init, sizeof(T), cudaMemcpyHostToDevice);
  extractor<<<1,1>>>(d_ckernel1);
  cudaMemcpy((void *)&h_ckernel1, (void *)d_ckernel1, sizeof(void *), cudaMemcpyDeviceToHost);
  Iterate(h_ckernel1, 350, 1, d_data);
  cudaDeviceSynchronize();
  cudaFree(d_ckernel1);
  cudaFree(d_data);
  return;
}

int main(){

  run_test(1);
  run_test(2.0f);

  return 0;
}

$ nvcc -std=c++11 -arch=sm_35 -o t746 t746.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t746
========= CUDA-MEMCHECK
Iterate on GPU
hello: 2
Iterate on GPU
hello: 3
========= ERROR SUMMARY: 0 errors
$
