在CUDA NVRTC代码中包含C标准标头 [英] Including C standard headers in CUDA NVRTC code

查看:113
本文介绍了在CUDA NVRTC代码中包含C标准标头的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在编写一个使用NVRTC(带有NVRTC版本7.5的CUDA版本9.2)在运行时编译的CUDA内核,按顺序需要 stdint.h 标头拥有 int32_t 等类型。



如果我编写不包含include的内核源代码,则它可以正常工作。例如,内核

 外部 C __global__ void f(){...} 

编译为PTX代码,其中f定义为 .visible .entry f



但是如果内核源代码是

  #include< stdint .h> 
extern C __global__ void f(){...}

它报告没有执行空间注释的函数(__host __ / __ device __ / __ global__)被视为宿主函数,并且在JIT模式下不允许宿主函数。(也没有 extern C )。



通过 -default-device 生成PTX代码 .visible .func f ,因此无法从主机调用该函数。



有没有一种包含方法源代码中的标头,并且仍然具有 __ global __ 入口函数?或者,一种了解NVRTC编译器使用哪种整数大小约定的方法,以便可以手动定义 int32_t 等类型?



编辑:
显示问题的示例程序:

  #include< cstdlib> 
#include< string>
#include< vector>
#include< memory>
#include< cassert>
#include< iostream>

#include< cuda.h>
#include< cuda_runtime.h>
#include< nvrtc.h>

[[noreturn]]无效(const std :: string& msg,int码){
std :: cerr<< 错误:<< msg<< (<<代码<<’)'<< std :: endl;
std :: exit(EXIT_FAILURE);
}


std :: unique_ptr< char []> compile_to_ptx(const char * program_source){
nvrtcResult rv;

//创建nvrtc程序
nvrtcProgram prog;
rv = nvrtcCreateProgram(
& prog,
program_source,
program.cu,
0,
nullptr,
nullptr
);
if(rv!= NVRTC_SUCCESS)fail( nvrtcCreateProgram,rv);

//编译nvrtc程序
std :: vector< const char *>选项= {
--gpu-architecture = compute_30
};
//options.push_back(\"-default-device);
rv = nvrtcCompileProgram(prog,options.size(),options.data());
if(rv!= NVRTC_SUCCESS){
std :: size_t log_size;
rv = nvrtcGetProgramLogSize(prog,& log_size);
if(rv!= NVRTC_SUCCESS)fail( nvrtcGetProgramLogSize,rv);

自动日志= std :: make_unique< char []>(log_size);
rv = nvrtcGetProgramLog(prog,log.get());
if(rv!= NVRTC_SUCCESS)fail( nvrtcGetProgramLog,rv);
assert(log [log_size-1] ==‘\0’);

std :: cerr<< 编译错误; log:\n<< log.get()<< std :: endl;

fail( nvrtcCompileProgram,rv);
}

//获取ptx代码
std :: size_t ptx_size;
rv = nvrtcGetPTXSize(prog,& ptx_size);
if(rv!= NVRTC_SUCCESS)fail( nvrtcGetPTXSize,rv);

自动ptx = std :: make_unique< char []>(ptx_size);
rv = nvrtcGetPTX(prog,ptx.get());
if(rv!= NVRTC_SUCCESS)fail( nvrtcGetPTX,rv);
assert(ptx [ptx_size-1] ==‘\0’);

nvrtcDestroyProgram(& prog);

返回ptx;
}

const char program_source [] = R %%%(
//#include< stdint.h>
extern C __global__ void f (int * in,int * out){
out [threadIdx.x] = in [threadIdx.x];
}
)%%%;

int main(){
结果rv;

//初始化CUDA
rv = cuInit(0);
if(rv!= CUDA_SUCCESS)fail( cuInit,rv);

//将程序编译为ptx
auto ptx = compile_to_ptx(program_source);
std :: cout<< PTX代码:\n<< ptx.get()<< std :: endl;
}

//#include< stdint.h> ; 在内核源代码中已取消注释,它将不再编译。当 // options.push_back(-default-device); 不加注释时,它将编译但不标记函数 f .entry



CMakeLists.txt进行编译(需要CUDA驱动程序API + NVRTC)

  cmake_minimum_required(版本3.4)
项目(cudabug CXX)

find_package(需要CUDA)

设置(CMAKE_CXX_STANDARD 14)
设置(CMAKE_CXX_STANDARD_REQUIRED 14)

add_executable(cudabug cudabug.cc)
include_directories(SYSTEM $ {CUDA_INCLUDE_DIRS})
link_directories($ {CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC $ {CUDA_LIBRARIES} nvrtc cuda)




这里的问题似乎出在GNU标准头文件 features.h 上。被拉入 stdint.h ,最后定义了许多存根函数,这些存根函数具有默认的 __ host __ 编译空间,并且导致nvrtc崩溃。似乎 -default-device 选项将导致已解决的glibC编译器功能集使整个nvrtc编译器失败。



您可以通过预定义标准库的功能集(以不包括所有主机功能的方式)来克服(非常怪异的方式)。将您的JIT内核代码更改为

  const char program_source [] = R %%%(
#define __ASSEMBLER__
#定义__extension__
#include< stdint.h>
extern C __global__ void f(int32_t * in,int32_t * out){
out [threadIdx.x] = in [threadIdx.x];
}
)%%%;

让我知道这一点:

  $ nvcc -std = c ++ 14 -ccbin = g ++-7 jit_header.cu -o jitheader -lnvrtc -lcuda 
$ ./jitheader
PTX代码:
//
//由NVIDIA NVVM编译器
//
//编译器内部版本ID:CL-24330188
// Cuda编译工具,9.2版,V9.2.148
//基于LLVM 3.4svn
//

.version 6.2
.target sm_30
.address_size 64

/ / .globl f

.visible .entry f(
.param .u64 f_param_0,
.param .u64 f_param_1

{
.reg .b32%r 3;
.reg .b64%rd< 8> ;;


ld.param.u64%rd1,[f_param_0];
ld.param.u64%rd2,[f_param_1];
cvta.to.global.u64%rd3,%rd2;
cvta.to.global.u64%rd4,%rd1;
mov.u32%r1,%tid.x;
mul.wide.u32%rd5,%r1,4;
add.s64%rd6,%rd4,%rd5;
ld.global.u32%r2,[%rd6];
add.s64%rd7,%rd3,%rd5;
st.global.u32 [%rd7],%r2;
ret;
}

大警告:这在我尝试过的glibC系统上起作用。它可能无法与其他工具链或libC实现一起使用(如果确实存在此问题)。


I'm writing a CUDA kernel that is compiled at runtime using NVRTC (CUDA version 9.2 with NVRTC version 7.5), which needs the stdint.h header, in order to have the int32_t etc. types.

If I write the kernel source code without the include, it works correctly. For example the kernel

extern "C" __global__ void f() { ... }

Compiles to PTX code where f is defined as .visible .entry f.

But if the kernel source code is

#include <stdint.h>
extern "C" __global__ void f() { ... }

it reports A function without execution space annotations (__host__/__device__/__global__) is considered a host function, and host functions are not allowed in JIT mode. (also without extern "C").

Passing -default-device makes the PTX code .visible .func f, so the function cannot be called from the host.

Is there a way to include headers in the source code, and still have a __global__ entry function? Or alternately, a way to know which integer size convention is used on the by the NVRTC compiler, so that the int32_t etc. types can be manually defined?

Edit: Example program that shows the problem:

#include <cstdlib>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

[[noreturn]] void fail(const std::string& msg, int code) {
    std::cerr << "error: " << msg << " (" << code << ')' << std::endl;
    std::exit(EXIT_FAILURE);
}


std::unique_ptr<char[]> compile_to_ptx(const char* program_source) {
    nvrtcResult rv;

    // create nvrtc program
    nvrtcProgram prog;
    rv = nvrtcCreateProgram(
        &prog,
        program_source,
        "program.cu",
        0,
        nullptr,
        nullptr
    );
    if(rv != NVRTC_SUCCESS) fail("nvrtcCreateProgram", rv);

    // compile nvrtc program
    std::vector<const char*> options = {
        "--gpu-architecture=compute_30"
    };
    //options.push_back("-default-device");
    rv = nvrtcCompileProgram(prog, options.size(), options.data());
    if(rv != NVRTC_SUCCESS) {
        std::size_t log_size;
        rv = nvrtcGetProgramLogSize(prog, &log_size);
        if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLogSize", rv);

        auto log = std::make_unique<char[]>(log_size);
        rv = nvrtcGetProgramLog(prog, log.get());
        if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLog", rv);
        assert(log[log_size - 1] == '\0');

        std::cerr << "Compile error; log:\n" << log.get() << std::endl;

        fail("nvrtcCompileProgram", rv);
    }

    // get ptx code
    std::size_t ptx_size;
    rv = nvrtcGetPTXSize(prog, &ptx_size);
    if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTXSize", rv);

    auto ptx = std::make_unique<char[]>(ptx_size);
    rv = nvrtcGetPTX(prog, ptx.get());
    if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTX", rv);
    assert(ptx[ptx_size - 1] == '\0');

    nvrtcDestroyProgram(&prog);

    return ptx;
}

const char program_source[] = R"%%%(
//#include <stdint.h>
extern "C" __global__ void f(int* in, int* out) {
    out[threadIdx.x] = in[threadIdx.x];
}
)%%%";

int main() {
    CUresult rv;

    // initialize CUDA
    rv = cuInit(0);
    if(rv != CUDA_SUCCESS) fail("cuInit", rv);

    // compile program to ptx
    auto ptx = compile_to_ptx(program_source);
    std::cout << "PTX code:\n" << ptx.get() << std::endl;
}

When //#include <stdint.h> in the kernel source is uncommented it no longer compiles. When //options.push_back("-default-device"); is uncommented it compiles but does not mark the function f as .entry.

CMakeLists.txt to compile it (needs CUDA driver API + NVRTC)

cmake_minimum_required(VERSION 3.4)
project(cudabug CXX)

find_package(CUDA REQUIRED)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED 14)

add_executable(cudabug cudabug.cc)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC ${CUDA_LIBRARIES} nvrtc cuda)

解决方案

[Preface: this is a very hacky answer, and is specific to the GNU toolchain (although I suspect the problem in the question is also specific to the GNU toolchain)].

It would appear that the problem here is with the GNU standard header features.h which gets pulled into stdint.hand which winds up defining a lot of stub functions which have the default __host__ compilation space and cause nvrtc to blow up. It also seems that the -default-device option will result in a resolved glibC compiler feature set which makes the whole nvrtc compiler fail.

You can defeat this (in a very hacky way) by predefining a feature set for the standard library which excludes all the host functions. Changing your JIT kernel code to

const char program_source[] = R"%%%(
#define __ASSEMBLER__
#define __extension__
#include <stdint.h>
extern "C" __global__ void f(int32_t* in, int32_t* out) {
    out[threadIdx.x] = in[threadIdx.x];
}
)%%%";

got me this:

$ nvcc -std=c++14 -ccbin=g++-7 jit_header.cu -o jitheader -lnvrtc -lcuda
$ ./jitheader 
PTX code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24330188
// Cuda compilation tools, release 9.2, V9.2.148
// Based on LLVM 3.4svn
//

.version 6.2
.target sm_30
.address_size 64

    // .globl   f

.visible .entry f(
    .param .u64 f_param_0,
    .param .u64 f_param_1
)
{
    .reg .b32   %r<3>;
    .reg .b64   %rd<8>;


    ld.param.u64    %rd1, [f_param_0];
    ld.param.u64    %rd2, [f_param_1];
    cvta.to.global.u64  %rd3, %rd2;
    cvta.to.global.u64  %rd4, %rd1;
    mov.u32     %r1, %tid.x;
    mul.wide.u32    %rd5, %r1, 4;
    add.s64     %rd6, %rd4, %rd5;
    ld.global.u32   %r2, [%rd6];
    add.s64     %rd7, %rd3, %rd5;
    st.global.u32   [%rd7], %r2;
    ret;
}

Big caveat: This worked on the glibC system I tried it on. It probably won't work with other toolchains or libC implementations (if, indeed, they have this problem).

这篇关于在CUDA NVRTC代码中包含C标准标头的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆