What does cudaSetDevice() do to a CUDA device's context stack?

Question

Suppose I have an active CUDA context associated with device i, and I now call cudaSetDevice(i). What happens?

  1. Nothing?
  2. Primary context replaces the top of the stack?
  3. Primary context is pushed onto the stack?

It actually seems to be inconsistent. I've written this program, running on a machine with a single device:

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>

int main()
{
        CUcontext ctx1, primary;
        cuInit(0);
        auto status = cuCtxCreate(&ctx1, 0, 0);
        assert (status == (CUresult) cudaSuccess);
        cuCtxPushCurrent(ctx1);
        status = cudaSetDevice(0);
        assert (status == cudaSuccess);
        void* ptr1;
        void* ptr2;
        cudaMalloc(&ptr1, 1024);
        assert (status == cudaSuccess);
        cuCtxGetCurrent(&primary);
        assert (status == (CUresult) cudaSuccess);
        assert(primary != ctx1);
        status = cuCtxPushCurrent(ctx1);
        assert (status == (CUresult) cudaSuccess);
        cudaMalloc(&ptr2, 1024);
        assert (status == (CUresult) cudaSuccess);
        cudaSetDevice(0);
        assert (status == (CUresult) cudaSuccess);
        int i = 0;
        while (true) {
                status = cuCtxPopCurrent(&primary);
                if (status != (CUresult) cudaSuccess) { break; }
                std::cout << "Next context on stack (" << i++ << ") is " << (void*) primary << '\n';
        }
}

and I get the following output:

context ctx1 is 0x563ec6225e30
primary context is 0x563ec61f5490
Next context on stack (0) is 0x563ec61f5490
Next context on stack (1) is 0x563ec61f5490
Next context on stack (2) is 0x563ec6225e30

This seems like the behavior is sometimes a replacement, and sometimes a push.

What's going on?

Solution

TL;DR: Based on the code you have provided, in both instances of your particular usage, it seems that cudaSetDevice() is replacing the context at the top of the stack.

Let's modify your code a bit, and then see what we can infer about the effect of each API call in your code on the context stack:

$ cat t1759.cu
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
void check(int j, CUcontext ctx1, CUcontext ctx2){
  CUcontext ctx0;
  int i = 0;
  while (true) {
                auto status = cuCtxPopCurrent(&ctx0);
                if (status != CUDA_SUCCESS) { break; }
                if (ctx0 == ctx1) std::cout << j << ":Next context on stack (" << i++ << ") is ctx1:" << (void*) ctx0 << '\n';
                else if (ctx0 == ctx2) std::cout << j << ":Next context on stack (" << i++ << ") is ctx2:" << (void*) ctx0 << '\n';
                else std::cout << j << ":Next context on stack (" << i++ << ") is unknown:" << (void*) ctx0 << '\n';
  }
}
void runtest(int i)
{
        CUcontext ctx1, primary = NULL;
        cuInit(0);
        auto dstatus = cuCtxCreate(&ctx1, 0, 0);    // checkpoint 1
        assert (dstatus == CUDA_SUCCESS);
        if (i == 1) {check(i,ctx1,primary); return;}// checkpoint 1
        dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 2
        assert (dstatus == CUDA_SUCCESS);
        if (i == 2) {check(i,ctx1,primary); return;}// checkpoint 2
        auto rstatus = cudaSetDevice(0);            // checkpoint 3
        assert (rstatus == cudaSuccess);
        if (i == 3) {check(i,ctx1,primary); return;}// checkpoint 3
        void* ptr1;
        void* ptr2;
        rstatus = cudaMalloc(&ptr1, 1024);          // checkpoint 4
        assert (rstatus == cudaSuccess);
        if (i == 4) {check(i,ctx1,primary); return;}// checkpoint 4
        dstatus = cuCtxGetCurrent(&primary);        // checkpoint 5
        assert (dstatus == CUDA_SUCCESS);
        assert(primary != ctx1);
        if (i == 5) {check(i,ctx1,primary); return;}// checkpoint 5
        dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 6
        assert (dstatus == CUDA_SUCCESS);
        if (i == 6) {check(i,ctx1,primary); return;}// checkpoint 6
        rstatus = cudaMalloc(&ptr2, 1024);          // checkpoint 7
        assert (rstatus == cudaSuccess);
        if (i == 7) {check(i,ctx1,primary); return;}// checkpoint 7
        rstatus = cudaSetDevice(0);                 // checkpoint 8
        assert (rstatus == cudaSuccess);
        if (i == 8) {check(i,ctx1,primary); return;}// checkpoint 8
        return;
}

int main(){
        for (int i = 1; i < 9; i++){
          cudaDeviceReset();
          runtest(i);}
}
$ nvcc -o t1759 t1759.cu -lcuda -std=c++11
$ ./t1759
1:Next context on stack (0) is ctx1:0x11087e0
2:Next context on stack (0) is ctx1:0x1741160
2:Next context on stack (1) is ctx1:0x1741160
3:Next context on stack (0) is unknown:0x10dc520
3:Next context on stack (1) is ctx1:0x1c5aa70
4:Next context on stack (0) is unknown:0x10dc520
4:Next context on stack (1) is ctx1:0x23eaa00
5:Next context on stack (0) is ctx2:0x10dc520
5:Next context on stack (1) is ctx1:0x32caf30
6:Next context on stack (0) is ctx1:0x3a44ed0
6:Next context on stack (1) is ctx2:0x10dc520
6:Next context on stack (2) is ctx1:0x3a44ed0
7:Next context on stack (0) is ctx1:0x41cfd90
7:Next context on stack (1) is ctx2:0x10dc520
7:Next context on stack (2) is ctx1:0x41cfd90
8:Next context on stack (0) is ctx2:0x10dc520
8:Next context on stack (1) is ctx2:0x10dc520
8:Next context on stack (2) is ctx1:0x4959c70
$

Based on the above, as we proceed through each API call in your code:

1.

        auto dstatus = cuCtxCreate(&ctx1, 0, 0);    // checkpoint 1
1:Next context on stack (0) is ctx1:0x11087e0

The context creation also pushes the newly created context onto the stack, as mentioned in the documentation for cuCtxCreate().
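
As a standalone illustration of that point (my own minimal sketch, not part of your code or mine above), the handle returned by cuCtxCreate() can be observed to be the current, top-of-stack context immediately after creation:

#include <cuda.h>
#include <cassert>

int main() {
        CUcontext created, current;
        cuInit(0);
        cuCtxCreate(&created, 0, 0);   // creates a context and pushes it onto this thread's stack
        cuCtxGetCurrent(&current);     // reads the top of the context stack
        assert (created == current);   // the freshly created context is already current
        cuCtxDestroy(created);
        return 0;
}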

2.

        dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 2
2:Next context on stack (0) is ctx1:0x1741160
2:Next context on stack (1) is ctx1:0x1741160

No surprise, pushing the same context on the stack creates another stack entry for it.

3.

        auto rstatus = cudaSetDevice(0);            // checkpoint 3
3:Next context on stack (0) is unknown:0x10dc520
3:Next context on stack (1) is ctx1:0x1c5aa70

The cudaSetDevice() call has replaced the top of the stack with an "unknown" context. (Only unknown at this point because we have not retrieved the handle value of the "other" context).

4.

        rstatus = cudaMalloc(&ptr1, 1024);          // checkpoint 4
4:Next context on stack (0) is unknown:0x10dc520
4:Next context on stack (1) is ctx1:0x23eaa00

No difference in stack configuration due to this call.

5.

        dstatus = cuCtxGetCurrent(&primary);        // checkpoint 5
5:Next context on stack (0) is ctx2:0x10dc520
5:Next context on stack (1) is ctx1:0x32caf30

No difference in stack configuration due to this call, but we now know that the top of stack context is the current context (and we can surmise it is the primary context).
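
To turn that surmise into an explicit check (my own addition; this fragment assumes it is inserted into t1759.cu right after checkpoint 5, reusing dstatus and primary from there), the driver API can hand back the primary context handle directly:

        CUcontext primary_check;
        dstatus = cuDevicePrimaryCtxRetain(&primary_check, 0); // handle to device 0's primary context
        assert (dstatus == CUDA_SUCCESS);
        assert (primary_check == primary);                     // the current context is the primary context
        cuDevicePrimaryCtxRelease(0);                          // balance the retain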

6.

        dstatus = cuCtxPushCurrent(ctx1);           // checkpoint 6
6:Next context on stack (0) is ctx1:0x3a44ed0
6:Next context on stack (1) is ctx2:0x10dc520
6:Next context on stack (2) is ctx1:0x3a44ed0

No real surprise here. We are pushing ctx1 on the stack, and so the stack has 3 entries, the first one being the driver API created context, and the next two entries being the same as the stack configuration from step 5, just moved down one stack location.

7.

        rstatus = cudaMalloc(&ptr2, 1024);          // checkpoint 7
7:Next context on stack (0) is ctx1:0x41cfd90
7:Next context on stack (1) is ctx2:0x10dc520
7:Next context on stack (2) is ctx1:0x41cfd90

Again, this call has no effect on stack configuration.

8.

        rstatus = cudaSetDevice(0);                 // checkpoint 8
8:Next context on stack (0) is ctx2:0x10dc520
8:Next context on stack (1) is ctx2:0x10dc520
8:Next context on stack (2) is ctx1:0x4959c70

Once again, we see that the behavior here is that the cudaSetDevice() call has replaced the top of stack context with the primary context.

The conclusion I draw from your test code is that I see no inconsistency in the behavior of the cudaSetDevice() call when it is intermixed with the various runtime and driver API calls in your code.
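
If it helps, the rule the test suggests can be captured in a self-contained form (this is my own sketch of inferred behavior on a single-device machine, not a contract stated in the runtime API documentation): with a driver API context on top of the stack, cudaSetDevice(dev) appears to replace that top entry with dev's primary context rather than pushing another entry:

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>

int main() {
        CUcontext ctx, top, below, primary;
        cuInit(0);
        cuCtxCreate(&ctx, 0, 0);                 // stack: [ctx]
        cuCtxPushCurrent(ctx);                   // stack: [ctx, ctx]
        cudaSetDevice(0);                        // observed: stack becomes [primary, ctx]
        cuDevicePrimaryCtxRetain(&primary, 0);   // handle to device 0's primary context
        cuCtxPopCurrent(&top);
        cuCtxPopCurrent(&below);
        assert (top == primary);                 // the top entry was replaced by the primary context
        assert (below == ctx);                   // the entry underneath was left alone
        cuDevicePrimaryCtxRelease(0);
        cuCtxDestroy(ctx);
        return 0;
}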

From my perspective, this sort of programming paradigm is insanity. I can't imagine why you would want to intermix driver API and runtime API code this way.
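
If the intent is mainly to let runtime API calls coexist with driver API context management, a less surprising pattern (again just a sketch, assuming a single device; this is not taken from the code above) is to skip cuCtxCreate() altogether and work against the device's primary context, which is the same context the runtime API uses:

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cassert>

int main() {
        CUcontext primary, popped;
        cuInit(0);
        cuDevicePrimaryCtxRetain(&primary, 0);   // same context cudaSetDevice(0) would make current
        cuCtxPushCurrent(primary);               // make it current for driver API work on this thread

        void *p = nullptr;
        cudaMalloc(&p, 1024);                    // runtime and driver API now agree on one context
        // ... driver API calls (cuMemAlloc, cuLaunchKernel, ...) can use the same context here ...
        cudaFree(p);

        cuCtxPopCurrent(&popped);                // restore the previous stack state
        cuDevicePrimaryCtxRelease(0);            // balance the retain
        return 0;
}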
