在CUDA NVRTC代码中包含C标准标头 [英] Including C standard headers in CUDA NVRTC code

查看：113 发布时间：2020/10/13 0:38:36 cuda preprocessor stdint nvrtc

本文介绍了在CUDA NVRTC代码中包含C标准标头的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在编写一个使用NVRTC（带有NVRTC版本7.5的CUDA版本9.2）在运行时编译的CUDA内核，按顺序需要 stdint.h 标头拥有 int32_t 等类型。

如果我编写不包含include的内核源代码，则它可以正常工作。例如，内核

 外部 C __global__ void f（）{...}

编译为PTX代码，其中f定义为 .visible .entry f 。

但是如果内核源代码是

  #include< stdint .h> 
 extern C __global__ void f（）{...}

它报告没有执行空间注释的函数（__host __ / __ device __ / __ global__）被视为宿主函数，并且在JIT模式下不允许宿主函数。（也没有 extern C ）。

通过 -default-device 生成PTX代码 .visible .func f ，因此无法从主机调用该函数。

有没有一种包含方法源代码中的标头，并且仍然具有 __ global __ 入口函数？或者，一种了解NVRTC编译器使用哪种整数大小约定的方法，以便可以手动定义 int32_t 等类型？

编辑：
显示问题的示例程序：

  #include< cstdlib> 
 #include< string> 
 #include< vector> 
 #include< memory> 
 #include< cassert> 
 #include< iostream> 
 
 #include< cuda.h> 
 #include< cuda_runtime.h> 
 #include< nvrtc.h> 
 
 [[noreturn]]无效（const std :: string& msg，int码）{
 std :: cerr<< 错误：<< msg<< （<<代码<<’）'<< std :: endl; 
 std :: exit（EXIT_FAILURE）; 
} 
 
 
 std :: unique_ptr< char []> compile_to_ptx（const char * program_source）{
 nvrtcResult rv; 
 
 //创建nvrtc程序
 nvrtcProgram prog; 
 rv = nvrtcCreateProgram（
& prog，
 program_source，
 program.cu，
 0，
 nullptr，
 nullptr 
）; 
 if（rv！= NVRTC_SUCCESS）fail（ nvrtcCreateProgram，rv）; 
 
 //编译nvrtc程序
 std :: vector< const char *>选项= {
 --gpu-architecture = compute_30 
}; 
 //options.push_back(\"-default-device）; 
 rv = nvrtcCompileProgram（prog，options.size（），options.data（））; 
 if（rv！= NVRTC_SUCCESS）{
 std :: size_t log_size; 
 rv = nvrtcGetProgramLogSize（prog，& log_size）; 
 if（rv！= NVRTC_SUCCESS）fail（ nvrtcGetProgramLogSize，rv）; 
 
自动日志= std :: make_unique< char []>（log_size）; 
 rv = nvrtcGetProgramLog（prog，log.get（））; 
 if（rv！= NVRTC_SUCCESS）fail（ nvrtcGetProgramLog，rv）; 
 assert（log [log_size-1] ==‘\0’）; 
 
 std :: cerr<< 编译错误； log：\n<< log.get（）<< std :: endl; 
 
 fail（ nvrtcCompileProgram，rv）; 
} 
 
 //获取ptx代码
 std :: size_t ptx_size; 
 rv = nvrtcGetPTXSize（prog，& ptx_size）; 
 if（rv！= NVRTC_SUCCESS）fail（ nvrtcGetPTXSize，rv）; 
 
自动ptx = std :: make_unique< char []>（ptx_size）; 
 rv = nvrtcGetPTX（prog，ptx.get（））; 
 if（rv！= NVRTC_SUCCESS）fail（ nvrtcGetPTX，rv）; 
 assert（ptx [ptx_size-1] ==‘\0’）; 
 
 nvrtcDestroyProgram（& prog）; 
 
返回ptx； 
} 
 
 const char program_source [] = R %%%（
 //＃include< stdint.h> 
 extern C __global__ void f （int * in，int * out）{
 out [threadIdx.x] = in [threadIdx.x]; 
} 
）%%%; 
 
 int main（）{
结果rv; 
 
 //初始化CUDA 
 rv = cuInit（0）; 
 if（rv！= CUDA_SUCCESS）fail（ cuInit，rv）; 
 
 //将程序编译为ptx 
 auto ptx = compile_to_ptx（program_source）; 
 std :: cout<<  PTX代码：\n<< ptx.get（）<< std :: endl; 
}

当 //＃include< stdint.h> ; 在内核源代码中已取消注释，它将不再编译。当 // options.push_back（-default-device）; 不加注释时，它将编译但不标记函数 f 为 .entry 。

CMakeLists.txt进行编译（需要CUDA驱动程序API + NVRTC）

  cmake_minimum_required（版本3.4）
项目（cudabug CXX）
 
 find_package（需要CUDA）
 
设置（CMAKE_CXX_STANDARD 14）
设置（CMAKE_CXX_STANDARD_REQUIRED 14）
 
 add_executable（cudabug cudabug.cc）
 include_directories（SYSTEM $ {CUDA_INCLUDE_DIRS}）
 link_directories（$ {CUDA_LIBRARY_DIRS}）
 target_link_libraries（cudabug PUBLIC $ {CUDA_LIBRARIES} nvrtc cuda）

这里的问题似乎出在GNU标准头文件 features.h 上。被拉入 stdint.h ，最后定义了许多存根函数，这些存根函数具有默认的 __ host __ 编译空间，并且导致nvrtc崩溃。似乎 -default-device 选项将导致已解决的glibC编译器功能集使整个nvrtc编译器失败。

您可以通过预定义标准库的功能集（以不包括所有主机功能的方式）来克服（非常怪异的方式）。将您的JIT内核代码更改为

  const char program_source [] = R %%%（
 #define __ASSEMBLER__ 
＃定义__extension__ 
 #include< stdint.h> 
 extern C __global__ void f（int32_t * in，int32_t * out）{
 out [threadIdx.x] = in [threadIdx.x]; 
} 
）%%%；

让我知道这一点：

  $ nvcc -std = c ++ 14 -ccbin = g ++-7 jit_header.cu -o jitheader -lnvrtc -lcuda 
 $ ./jitheader 
 PTX代码：
 // 
 //由NVIDIA NVVM编译器
 // 
 //编译器内部版本ID：CL-24330188 
 // Cuda编译工具，9.2版，V9.2.148 
 //基于LLVM 3.4svn 
 // 
 
 .version 6.2 
 .target sm_30 
 .address_size 64 
 
 / / .globl f 
 
 .visible .entry f（
 .param .u64 f_param_0，
 .param .u64 f_param_1 
）
 {
 .reg .b32％r 3; 
 .reg .b64％rd< 8> ;; 
 
 
 ld.param.u64％rd1，[f_param_0]; 
 ld.param.u64％rd2，[f_param_1]; 
 cvta.to.global.u64％rd3，％rd2; 
 cvta.to.global.u64％rd4，％rd1; 
 mov.u32％r1，％tid.x; 
 mul.wide.u32％rd5，％r1，4; 
 add.s64％rd6，％rd4，％rd5; 
 ld.global.u32％r2，[％rd6]; 
 add.s64％rd7，％rd3，％rd5; 
 st.global.u32 [％rd7]，％r2; 
 ret； 
}

大警告：这在我尝试过的glibC系统上起作用。它可能无法与其他工具链或libC实现一起使用（如果确实存在此问题）。

I'm writing a CUDA kernel that is compiled at runtime using NVRTC (CUDA version 9.2 with NVRTC version 7.5), which needs the stdint.h header, in order to have the int32_t etc. types.

If I write the kernel source code without the include, it works correctly. For example the kernel

extern "C" __global__ void f() { ... }

Compiles to PTX code where f is defined as .visible .entry f.

But if the kernel source code is

#include <stdint.h>
extern "C" __global__ void f() { ... }

it reports A function without execution space annotations (__host__/__device__/__global__) is considered a host function, and host functions are not allowed in JIT mode. (also without extern "C").

Passing -default-device makes the PTX code .visible .func f, so the function cannot be called from the host.

Is there a way to include headers in the source code, and still have a __global__ entry function? Or alternately, a way to know which integer size convention is used on the by the NVRTC compiler, so that the int32_t etc. types can be manually defined?

Edit: Example program that shows the problem:

#include <cstdlib>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>

[[noreturn]] void fail(const std::string& msg, int code) {
    std::cerr << "error: " << msg << " (" << code << ')' << std::endl;
    std::exit(EXIT_FAILURE);
}


std::unique_ptr<char[]> compile_to_ptx(const char* program_source) {
    nvrtcResult rv;

    // create nvrtc program
    nvrtcProgram prog;
    rv = nvrtcCreateProgram(
        &prog,
        program_source,
        "program.cu",
        0,
        nullptr,
        nullptr
    );
    if(rv != NVRTC_SUCCESS) fail("nvrtcCreateProgram", rv);

    // compile nvrtc program
    std::vector<const char*> options = {
        "--gpu-architecture=compute_30"
    };
    //options.push_back("-default-device");
    rv = nvrtcCompileProgram(prog, options.size(), options.data());
    if(rv != NVRTC_SUCCESS) {
        std::size_t log_size;
        rv = nvrtcGetProgramLogSize(prog, &log_size);
        if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLogSize", rv);

        auto log = std::make_unique<char[]>(log_size);
        rv = nvrtcGetProgramLog(prog, log.get());
        if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLog", rv);
        assert(log[log_size - 1] == '\0');

        std::cerr << "Compile error; log:\n" << log.get() << std::endl;

        fail("nvrtcCompileProgram", rv);
    }

    // get ptx code
    std::size_t ptx_size;
    rv = nvrtcGetPTXSize(prog, &ptx_size);
    if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTXSize", rv);

    auto ptx = std::make_unique<char[]>(ptx_size);
    rv = nvrtcGetPTX(prog, ptx.get());
    if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTX", rv);
    assert(ptx[ptx_size - 1] == '\0');

    nvrtcDestroyProgram(&prog);

    return ptx;
}

const char program_source[] = R"%%%(
//#include <stdint.h>
extern "C" __global__ void f(int* in, int* out) {
    out[threadIdx.x] = in[threadIdx.x];
}
)%%%";

int main() {
    CUresult rv;

    // initialize CUDA
    rv = cuInit(0);
    if(rv != CUDA_SUCCESS) fail("cuInit", rv);

    // compile program to ptx
    auto ptx = compile_to_ptx(program_source);
    std::cout << "PTX code:\n" << ptx.get() << std::endl;
}

When //#include <stdint.h> in the kernel source is uncommented it no longer compiles. When //options.push_back("-default-device"); is uncommented it compiles but does not mark the function f as .entry.

CMakeLists.txt to compile it (needs CUDA driver API + NVRTC)

cmake_minimum_required(VERSION 3.4)
project(cudabug CXX)

find_package(CUDA REQUIRED)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED 14)

add_executable(cudabug cudabug.cc)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC ${CUDA_LIBRARIES} nvrtc cuda)

解决方案

[Preface: this is a very hacky answer, and is specific to the GNU toolchain (although I suspect the problem in the question is also specific to the GNU toolchain)].

It would appear that the problem here is with the GNU standard header features.h which gets pulled into stdint.hand which winds up defining a lot of stub functions which have the default __host__ compilation space and cause nvrtc to blow up. It also seems that the -default-device option will result in a resolved glibC compiler feature set which makes the whole nvrtc compiler fail.

You can defeat this (in a very hacky way) by predefining a feature set for the standard library which excludes all the host functions. Changing your JIT kernel code to

const char program_source[] = R"%%%(
#define __ASSEMBLER__
#define __extension__
#include <stdint.h>
extern "C" __global__ void f(int32_t* in, int32_t* out) {
    out[threadIdx.x] = in[threadIdx.x];
}
)%%%";

got me this:

$ nvcc -std=c++14 -ccbin=g++-7 jit_header.cu -o jitheader -lnvrtc -lcuda
$ ./jitheader 
PTX code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24330188
// Cuda compilation tools, release 9.2, V9.2.148
// Based on LLVM 3.4svn
//

.version 6.2
.target sm_30
.address_size 64

    // .globl   f

.visible .entry f(
    .param .u64 f_param_0,
    .param .u64 f_param_1
)
{
    .reg .b32   %r<3>;
    .reg .b64   %rd<8>;


    ld.param.u64    %rd1, [f_param_0];
    ld.param.u64    %rd2, [f_param_1];
    cvta.to.global.u64  %rd3, %rd2;
    cvta.to.global.u64  %rd4, %rd1;
    mov.u32     %r1, %tid.x;
    mul.wide.u32    %rd5, %r1, 4;
    add.s64     %rd6, %rd4, %rd5;
    ld.global.u32   %r2, [%rd6];
    add.s64     %rd7, %rd3, %rd5;
    st.global.u32   [%rd7], %r2;
    ret;
}

Big caveat: This worked on the glibC system I tried it on. It probably won't work with other toolchains or libC implementations (if, indeed, they have this problem).

这篇关于在CUDA NVRTC代码中包含C标准标头的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

在CUDA NVRTC代码中包含C标准标头 [英] Including C standard headers in CUDA NVRTC code

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

在CUDA NVRTC代码中包含C标准标头 [英] Including C standard headers in CUDA NVRTC code

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭