CUDA - Confusion about the Visual Profiler results of "branch" and "divergent branch" (2)


Problem Description


I use NVIDIA Visual Profiler to analyze my code. The test kernels are:

//////////////////////////////////////////////////////////////// Group 1
static __global__ void gpu_test_divergency_0(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
         a[tid] = tid;
    }
    else
    {
         b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_1(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
         a[tid] = tid;
    }
    else
    {
         b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_2(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
         a[tid] = tid;
    }
    else
    {
         b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_3(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
         a[tid] = tid;
    }
    else
    {
         b[tid] = tid;
    }
}
//////////////////////////////////////////////////////////////// Group 2
static __global__ void gpu_test_divergency_4(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
         a[tid] = tid + 1;
    }
    else
    {
         b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_5(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
         a[tid] = tid + 1;
    }
    else
    {
         b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_6(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
         a[tid] = tid + 1;
    }
    else
    {
         b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_7(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
         a[tid] = tid + 1;
    }
    else
    {
         b[tid] = tid + 2;
    }
}
//////////////////////////////////////////////////////////////// Group 3
static __global__ void gpu_test_divergency_8(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
         a[tid] = tid + 1.0;
    }
    else
    {
         b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_9(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
         a[tid] = tid + 1.0;
    }
    else
    {
         b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_10(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
         a[tid] = tid + 1.0;
    }
    else
    {
         b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_11(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
         a[tid] = tid + 1.0;
    }
    else
    {
         b[tid] = tid + 2.0;
    }
}
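
For context, a minimal host-side harness along the following lines could be used to launch these kernels under the profiler. This is only a sketch of my own: the array size, the absence of error checking, and placing it in the same .cu file as the kernels above (they are declared static) are assumptions, not part of the original post.

#include <cuda_runtime.h>

int main()
{
    const int Num_Blocks  = 1;
    const int Num_Threads = 32;   // the tests were also run with 64 threads per block
    const int N           = Num_Blocks * Num_Threads;

    // device buffers written by the test kernels
    float *d_a = NULL, *d_b = NULL;
    cudaMalloc((void**)&d_a, N * sizeof(float));
    cudaMalloc((void**)&d_b, N * sizeof(float));

    // launch one kernel under test, e.g. gpu_test_divergency_0, and wait for it to finish
    gpu_test_divergency_0<<<Num_Blocks, Num_Threads>>>(d_a, d_b);
    cudaDeviceSynchronize();

    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}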

When I launched the test kernels with <<< 1, 32 >>>, I got the following results from the profiler:

gpu_test_divergency_0 :  Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_1 :  Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_2 :  Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_3 :  Branch Efficiency = 100% branch = 1 divergent branch = 0

gpu_test_divergency_4 :  Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_5 :  Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_6 :  Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_7 :  Branch Efficiency = 100% branch = 3 divergent branch = 0

gpu_test_divergency_8 :  Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_9 :  Branch Efficiency = 75%  branch = 4 divergent branch = 1
gpu_test_divergency_10 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_11 : Branch Efficiency = 75%  branch = 4 divergent branch = 1

And when I launched the test kernels with <<< 1, 64 >>>, I got the following results from the profiler:

gpu_test_divergency_0 :  Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_1 :  Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_2 :  Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_3 :  Branch Efficiency = 100% branch = 2 divergent branch = 0

gpu_test_divergency_4 :  Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_5 :  Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_6 :  Branch Efficiency = 100% branch = 4 divergent branch = 0
gpu_test_divergency_7 :  Branch Efficiency = 100% branch = 5 divergent branch = 0

gpu_test_divergency_8 :  Branch Efficiency = 100%  branch = 6 divergent branch = 0
gpu_test_divergency_9 :  Branch Efficiency = 85.7% branch = 7 divergent branch = 1
gpu_test_divergency_10 : Branch Efficiency = 100%  branch = 4 divergent branch = 0
gpu_test_divergency_11 : Branch Efficiency = 83.3% branch = 6 divergent branch = 1

I use "GeForce GTX 570" with the CUDA Capability of 2.0 and NVIDIA Visual Profiler v4.2 on Linux. According to the documents:

"branch" - "Number of branches taken by threads executing a kernel. This counter will be incremented by one if at least one thread in a warp takes the branch."

"divergent branch" - "Number of divergent branches within a warp. This counter will be incremented by one if at least one tread in a warp diverges (that is, follows a different execution path) via a data dependent conditional branch."

But I am really confused by the results. Why are the "branch" counts different for each test group? And why does only the third test group seem to have the correct "divergent branch" count?

@JackOLantern: I compiled in release mode. I disassembled it the way you did. The result for "gpu_test_divergency_4" is exactly the same as yours, but the result for "gpu_test_divergency_0" is different:

    Function : _Z21gpu_test_divergency_0PfS_
/*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];
/*0008*/     /*0x94001c042c000000*/     S2R R0, SR_CTAid_X;
/*0010*/     /*0x84009c042c000000*/     S2R R2, SR_Tid_X;
/*0018*/     /*0x20009ca320044000*/     IMAD R2, R0, c [0x0] [0x8], R2;
/*0020*/     /*0xfc21dc23188e0000*/     ISETP.LT.AND P0, pt, R2, RZ, pt;
/*0028*/     /*0x0920de0418000000*/     I2F.F32.S32 R3, R2;
/*0030*/     /*0x9020204340004000*/     @!P0 ISCADD R0, R2, c [0x0] [0x24], 0x2;
/*0038*/     /*0x8020804340004000*/     @P0 ISCADD R2, R2, c [0x0] [0x20], 0x2;
/*0040*/     /*0x0000e08590000000*/     @!P0 ST [R0], R3;
/*0048*/     /*0x0020c08590000000*/     @P0 ST [R2], R3;
/*0050*/     /*0x00001de780000000*/     EXIT;

I guess, as you said, conversion instructions (I2F in this case) do not add an extra branch.

But I cannot see the relationship between this disassembled code and the Profiler results. I learned from another post (https://devtalk.nvidia.com/default/topic/463316/branch-divergent-branches/) that the divergent branch counter is computed from how the threads (warps) actually run on the SMs. So I guess we cannot deduce the branch divergence of each actual run just from this disassembled code. Am I right?

Solution

FOLLOW UP - USING VOTE INTRINSICS TO CHECK THREAD DIVERGENCE

I think the best way to check for thread divergence within a warp is to use the vote intrinsics, in particular the __ballot and __popc intrinsics. A good explanation of __ballot and __popc is available in Shane Cook's book CUDA Programming, Morgan Kaufmann.

The prototype of __ballot is the following

unsigned int __ballot(int predicate);

If predicate is nonzero, __ballot returns a value with the Nth bit set, where N is threadIdx.x.

On the other hand, __popc returns the number of bits set within a 32-bit parameter.
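
As a concrete example: if all 32 threads of a warp evaluate the predicate as nonzero, __ballot returns 0xFFFFFFFF and __popc(0xFFFFFFFF) is 32; if only the first four lanes do, __ballot returns 0x0000000F and __popc of that value is 4.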

So, by jointly using __ballot, __popc and atomicAdd, one can check if a warp is divergent or not.

To this end, I have set up the following code

#include <cuda.h>
#include <stdio.h>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

__device__ unsigned int __ballot_non_atom(int predicate)
{
    if (predicate != 0) return (1 << (threadIdx.x % 32));
    else return 0;
}

__global__ void gpu_test_divergency_0(unsigned int* d_ballot, int Num_Warps_per_Block)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    const unsigned int warp_num = threadIdx.x >> 5;

    atomicAdd(&d_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot_non_atom(tid > 2)));
    //  atomicAdd(&d_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid > 2)));

}

#include <conio.h>

int main(int argc, char *argv[])
{
    unsigned int Num_Threads_per_Block      = 64;
    unsigned int Num_Blocks_per_Grid        = 1;
    unsigned int Num_Warps_per_Block        = Num_Threads_per_Block/32;
    unsigned int Num_Warps_per_Grid         = (Num_Threads_per_Block*Num_Blocks_per_Grid)/32;

    unsigned int* h_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int));
    unsigned int* d_ballot; cudaMalloc((void**)&d_ballot, Num_Warps_per_Grid*sizeof(unsigned int));

    for (int i=0; i<Num_Warps_per_Grid; i++) h_ballot[i] = 0;

    cudaMemcpy(d_ballot, h_ballot, Num_Warps_per_Grid*sizeof(unsigned int), cudaMemcpyHostToDevice);

    gpu_test_divergency_0<<<Num_Blocks_per_Grid,Num_Threads_per_Block>>>(d_ballot,Num_Warps_per_Block);

    cudaMemcpy(h_ballot, d_ballot, Num_Warps_per_Grid*sizeof(unsigned int), cudaMemcpyDeviceToHost);

    for (int i=0; i<Num_Warps_per_Grid; i++) { 
        if ((h_ballot[i] == 0)||(h_ballot[i] == 32)) std::cout << "Warp " << i << " IS NOT divergent- Predicate true for " << h_ballot[i] << " threads\n";
            else std::cout << "Warp " << i << " IS divergent - Predicate true for " << h_ballot[i] << " threads\n";
    }

    getch();
    return EXIT_SUCCESS;
}
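
For what it's worth, tracing the posted code by hand: with __ballot_non_atom and the predicate tid > 2, a <<<1, 64>>> launch should report warp 0 as divergent (the predicate is true for 29 of its 32 threads) and warp 1 as not divergent (true for all 32 threads).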

Please note that I'm currently running the code on a compute capability 1.2 card, so in the example above I'm using __ballot_non_atom, which is a non-intrinsic equivalent of __ballot, since __ballot is available only for compute capability >= 2.0. In other words, if you have a card with compute capability >= 2.0, please uncomment the instruction using __ballot in the kernel function.

With the above code, you can experiment with all of the kernel functions above by simply changing the relevant predicate in the kernel function.

PREVIOUS ANSWER

I compiled your code for compute capability 2.0 in release mode, used -keep to retain the intermediate files, and used the cuobjdump utility to produce the disassembly of two of your kernels, namely:

static __global__ void gpu_test_divergency_0(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0) a[tid] = tid;
    else b[tid] = tid;
}

and

static __global__ void gpu_test_divergency_4(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0) a[tid] = tid + 1;
    else b[tid] = tid + 2;
}
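
(As a hedged aside, a disassembly like the one below can presumably be reproduced by compiling with something like nvcc -arch=sm_20 -keep and then running cuobjdump -sass on the retained .cubin file; the exact names of the intermediate files kept by -keep vary between CUDA versions.)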

The results are the following

gpu_test_divergency_0

/*0000*/        MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
/*0008*/        S2R R0, SR_CTAID.X;                    /* 0x2c00000094001c04 */
/*0010*/        S2R R2, SR_TID.X;                      /* 0x2c00000084009c04 */
/*0018*/        IMAD R2, R0, c[0x0][0x8], R2;          /* 0x2004400020009ca3 */
/*0020*/        ISETP.LT.AND P0, PT, R2, RZ, PT;       /* 0x188e0000fc21dc23 */
/*0028*/        I2F.F32.S32 R0, R2;                    /* 0x1800000009201e04 */
/*0030*/   @!P0 ISCADD R3, R2, c[0x0][0x24], 0x2;      /* 0x400040009020e043 */
/*0038*/    @P0 ISCADD R2, R2, c[0x0][0x20], 0x2;      /* 0x4000400080208043 */
/*0040*/   @!P0 ST [R3], R0;                           /* 0x9000000000302085 */
/*0048*/    @P0 ST [R2], R0;                           /* 0x9000000000200085 */
/*0050*/        EXIT ;                                 /* 0x8000000000001de7 */

and

gpu_test_divergency_4

/*0000*/        MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
/*0008*/        S2R R0, SR_CTAID.X;                    /* 0x2c00000094001c04 */   R0 = BlockIdx.x
/*0010*/        S2R R2, SR_TID.X;                      /* 0x2c00000084009c04 */   R2 = ThreadIdx.x
/*0018*/        IMAD R0, R0, c[0x0][0x8], R2;          /* 0x2004400020001ca3 */   R0 = R0 * c + R2
/*0020*/        ISETP.LT.AND P0, PT, R0, RZ, PT;       /* 0x188e0000fc01dc23 */   If statement
/*0028*/    @P0 BRA.U 0x58;                            /* 0x40000000a00081e7 */   Branch 1 - Jump to 0x58
/*0030*/   @!P0 IADD R2, R0, 0x2;                      /* 0x4800c0000800a003 */   Branch 2 - R2 = R0 + 2
/*0038*/   @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;      /* 0x4000400090002043 */   Branch 2 - Calculate gmem address
/*0040*/   @!P0 I2F.F32.S32 R2, R2;                    /* 0x180000000920a204 */   Branch 2 - R2 = R2 after int to float cast
/*0048*/   @!P0 ST [R0], R2;                           /* 0x900000000000a085 */   Branch 2 - gmem store
/*0050*/   @!P0 BRA.U 0x78;                            /* 0x400000008000a1e7 */   Branch 2 - Jump to 0x78 (exit)
/*0058*/    @P0 IADD R2, R0, 0x1;                      /* 0x4800c00004008003 */   Branch 1 - R2 = R0 + 1
/*0060*/    @P0 ISCADD R0, R0, c[0x0][0x20], 0x2;      /* 0x4000400080000043 */   Branch 1 - Calculate gmem address
/*0068*/    @P0 I2F.F32.S32 R2, R2;                    /* 0x1800000009208204 */   Branch 1 - R2 = R2 after int to float cast
/*0070*/    @P0 ST [R0], R2;                           /* 0x9000000000008085 */   Branch 1 - gmem store
/*0078*/        EXIT ;                                 /* 0x8000000000001de7 */

From the above disassemblies, I would expect the results of your branch divergence tests to be the same.

Are you compiling in debug or release mode?
