How to optimize this CUDA kernel


Question


I've profiled my model and it seems that this kernel accounts for about 2/3 of my total runtime. I was looking for suggestions to optimize it. The code is as follows.

__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    fluxes[idx]=knowles_flux(idx, concs);
    //fluxes[idx]=flux(idx, concs);
}

__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    if (r == ((maxlength)-1))
    {
        //Calculation type : "Max"
        flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
    }
    else if (r > ((nc)-1))
    {
        //Calculation type : "F"
        //arrSum3(conc, &frag_term, r+1, maxlength-1);
        for (int s = r+1; s < (maxlength); s++)
        {
            frag_term += conc[s];
        }
        flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
    }
    else if (r == ((nc)-1))
    {
        //Calculation type : "N"
        //arrSum3(conc, &frag_term, r+1, maxlength-1);
        for (int s = r+1; s < (maxlength); s++)
        {
            frag_term += conc[s];
        }
        flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
    }
    else if (r < ((nc)-1))
    {
    //Calculation type : "O"
        flux = 0;
    }
    return flux;
}
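
(km, ka, kn, nc, and maxlength are not defined in this snippet; presumably they are compile-time constants or __constant__ symbols in the full code linked in the EDIT below.)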

Just to give you an idea of why the for loop is an issue, this kernel is launched on an array of about maxlength = 9000 elements. For our purposes now, nc is in the range of 2-6. Here's an illustration of how this kernel processes the incoming array (conc). For this array, five different types of calculations need to be applied to different groups of elements.

Array element : 0 1 2 3 4 5 6 7 8 9 ... 8955 8956 8957 8958 8959 8960
Type of calc  : M O O O O O N F F F ...   F   F    F    F    F   Max

The potential problems I've been trying to deal with right now are branch divergence from the quadruple if-else and the for loop.

My idea for dealing with the branch divergence is to break this kernel down into four separate device functions or kernels that treat each region separately and all launch at the same time. I'm not sure this is significantly better than just letting the branch divergence take place, which, if I'm not mistaken, would cause the four calculation types to be run in serial.
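
A minimal sketch of that split, covering just the four cases in the posted kernel (the kernel names, grid shapes, and the assumption that km, ka, kn, nc, and maxlength are visible as constants are all illustrative, not from the question). Each kernel covers one contiguous region, so no warp inside it ever diverges:

__global__ void fluxMaxK(double *c, double *f)   // r == maxlength-1
{
    int r = maxlength - 1;
    f[r] = -km*r*c[r] + 2*ka*c[r-1]*c[0];
}

__global__ void fluxFK(double *c, double *f)     // nc-1 < r < maxlength-1
{
    int r = nc + blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= maxlength - 1) return;
    double frag_term = 0;
    for (int s = r + 1; s < maxlength; s++)
        frag_term += c[s];
    f[r] = -km*r*c[r] + 2*km*frag_term - 2*ka*c[r]*c[0] + 2*ka*c[r-1]*c[0];
}

__global__ void fluxNK(double *c, double *f)     // r == nc-1
{
    int r = nc - 1;
    double frag_term = 0;
    for (int s = r + 1; s < maxlength; s++)
        frag_term += c[s];
    f[r] = kn*pow(c[0], (double)nc) + 2*km*frag_term - 2*ka*c[r]*c[0];
}

__global__ void fluxOK(double *c, double *f)     // r < nc-1
{
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < nc - 1) f[r] = 0;
}

void launchAllRegions(double *c, double *f)
{
    cudaStream_t s[4];
    for (int i = 0; i < 4; i++) cudaStreamCreate(&s[i]);
    fluxMaxK<<<1, 1, 0, s[0]>>>(c, f);
    fluxFK  <<<(maxlength + 255)/256, 256, 0, s[1]>>>(c, f);
    fluxNK  <<<1, 1, 0, s[2]>>>(c, f);
    fluxOK  <<<1, 64, 0, s[3]>>>(c, f);   // nc <= 6, so one small block suffices
    for (int i = 0; i < 4; i++) cudaStreamDestroy(s[i]);
}

One thing worth noting: divergence only costs cycles within a 32-thread warp, and with ~9000 contiguous elements only the few warps straddling region boundaries actually diverge, so whether the split beats the single divergent kernel is an empirical question.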

To deal with the for loop, you'll notice that there's a commented-out arrSum3 function, which I wrote based on my previously (and probably poorly) written parallel reduction kernel. Using it in place of the for loop drastically increased my runtime. I feel like there's a clever way to accomplish what I'm trying to do with the for loop, but I'm just not that smart and my advisor is tired of me "wasting time" thinking about it.
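
Separately, note that frag_term is really a suffix sum of conc (frag_term(r) = conc[r+1] + ... + conc[maxlength-1]), so it can be precomputed for all r at once with a single scan pass; this suggestion is editorial, not something proposed in the thread. A sketch with Thrust, where conc_d and suffix_d are hypothetical device vectors:

#include <thrust/device_vector.h>
#include <thrust/scan.h>

// suffix_d[r] ends up holding conc[r+1] + ... + conc[maxlength-1],
// i.e. exactly the frag_term the kernel needs, in O(n) total work
// instead of the O(n^2) of every thread running its own loop.
void buildFragTerms(thrust::device_vector<double> &conc_d,
                    thrust::device_vector<double> &suffix_d)
{
    // Scanning the reversed sequence turns a prefix scan into a suffix scan;
    // the exclusive variant skips conc[r] itself, matching "s = r+1".
    thrust::exclusive_scan(conc_d.rbegin(), conc_d.rend(), suffix_d.rbegin());
}

The kernel would then take a raw pointer to suffix_d (via thrust::raw_pointer_cast) and read frag_term = suffix[r] instead of looping.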

Appreciate any help.

EDIT

Full code is located here: Nvidia Visual Profiler timeline misses large chunk of program execution

Solution

Assuming sgn() and abs() are not implemented with "if"s and "else"s:
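
Note that sgn() is not a CUDA built-in; a branch-free definition such as the following (an assumption about what the answer intends, not code from it) keeps the whole scheme divergence-free:

// Branch-free sign function: returns -1, 0, or +1.
// The comparisons compile to predicate/set instructions, not branches.
__device__ int sgn(double x)
{
    return (x > 0.0) - (x < 0.0);
}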

__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    double fluxA = 0, fluxB = 0, fluxC = 0;

    //Calculation type : "Max"
    //no divergence: every thread pays 20-30 extra cycles instead of a branch
    //(may not be a good trade-off on a CPU)
    fluxA = (1-abs(sgn(r-(maxlength-1)))) * (-km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0]);
    //mask is zero unless r == maxlength-1

    //always compute this sum so the work is equal for all cores: no divergence

    // you should really divide the kernel into several pieces and do a reduction,
    // but if you don't want that, then you can try:
    // (loop over the full range; the masks select exactly s in [r+1, maxlength-1])
    for (int s = 0; s < maxlength; s++) // all threads count the same number of cycles, so no divergence
    {
        frag_term += conc[s] * (   abs(sgn( s-maxlength ))*sgn(1- sgn( s-maxlength ))  )* (      sgn(1+sgn(s-(r+1)))  );
    }
    // But you can make this easier using an "add and assign" operation
    // in local memory (__shared__ in CUDA):
    //   copy global conc[] to local concL[] (using all cores) (100 cycles)
    //   for (others from zero to upper_limit)
    //   if (localID == 0)
    //   {
    //       frag_termL[0] += concL[s];             // local to local (10 cycles/assign.)
    //       frag_termL[0+others] = frag_termL[0];  // local to local (10 cycles/assign.)
    //   }  -----> uses nearly the same number of cycles but much less energy,
    //   using a single core (2000 instr. with one core vs 1000 instr. with 2k cores)
    //   in local memory, then copy it to private registers accordingly using all cores

    //Calculation type : "F"
    // the extra abs(sgn(r-(maxlength-1))) factor excludes r == maxlength-1,
    // which the "Max" term above already covers; otherwise both masks would
    // fire there and double-count
    fluxB = (  abs(sgn(r-(nc-1)))*sgn(1+sgn(r-(nc-1)))  ) * abs(sgn(r-(maxlength-1))) * (-(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0]);
    //mask is zero unless r > nc-1

    //Calculation type : "N"
    fluxC = (   1-abs(sgn(r-(nc-1)))   )*((kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0]);
    //mask is zero unless r == nc-1

    flux = fluxA + fluxB + fluxC; //at most one of these is nonzero

    //Calculation type : "O" (r < nc-1) needs no term of its own:
    //none of the masks above fire there, so flux is already zero

    return flux;
}

Okay, let me unpack this a bit:

if(a>b) x+=y;

can be taken as:

if a-b is negative, sgn(a-b) is -1;
adding 1 to that -1 gives zero ==> this satisfies the lower part of the comparison (a<b):
x += y*(sgn(a-b)+1) adds zero when a<b (not a>b), so x is unchanged

if a-b is zero, sgn(a-b) is zero,
so we should multiply the solution above by sgn(a-b) too!
x += y*(sgn(a-b)+1)*sgn(a-b)
means
x += y*(0+1)*0 = 0, so a==b is satisfied too!

let's check what happens if a>b:
x += y*(sgn(a-b)+1)*sgn(a-b)
x += y*(1+1)*1  ==> y*2 is not acceptable; it needs another sgn on the outside

x += y*sgn((sgn(a-b)+1)*sgn(a-b))

x += y*sgn((1+1)*1)

x += y*sgn(2)

x += y only when a is greater than b
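
A quick host-side sanity check of that identity (a throwaway test, not from the answer):

#include <stdio.h>

int sgn(int v) { return (v > 0) - (v < 0); }

int main()
{
    // Verify y*sgn((sgn(a-b)+1)*sgn(a-b)) == (a > b ? y : 0) for all sign cases.
    int y = 7;
    for (int a = -2; a <= 2; a++)
        for (int b = -2; b <= 2; b++)
        {
            int branchless = y * sgn((sgn(a - b) + 1) * sgn(a - b));
            int branched   = (a > b) ? y : 0;
            if (branchless != branched)
                printf("mismatch at a=%d b=%d\n", a, b);
        }
    printf("done\n");
    return 0;
}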

When a subexpression such as

abs(sgn(r-(nc-1)))

appears many times, you can compute it once and re-use it:

tmp = abs(sgn(r-(nc-1)));

.....  *tmp*(tmp-1) ....
...... +tmp*zxc[s] .....
......  ......

to decrease total cycles even more! Register access bandwidth is on the order of terabytes per second, so it shouldn't be a problem. The same trick works for a global access:

tmpGlobal = conc[r];

...... tmpGlobal * tmp .....
.... tmpGlobal + x - y ....

All of this work then happens in private registers at terabytes per second.
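
Applied to knowles_flux above, the re-use might look like this (an illustration; the expressions come straight from the masked version):

double s1  = sgn(r-(nc-1));  // computed once
double tmp = abs(s1);        // 1 when r != nc-1, 0 when r == nc-1
// tmp and (1-tmp) give the "F"-side and "N"-side masks without
// re-evaluating the sgn chain in every expression:
fluxB = tmp * sgn(1+s1) * abs(sgn(r-(maxlength-1)))
      * (-(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0]);
fluxC = (1-tmp)
      * ((kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0]);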

Warning: reading from conc[-1] shouldn't cause any faults as long as the result is multiplied by zero, provided the real address of conc[0] is not literally zero already. But writing out of bounds is hazardous.

If you need to avoid conc[-1] anyway, you can absolute-value the index too. See:

tmp = conc[i-1] becomes tmp = conc[abs(i-1)], which always reads from a non-negative index; the value will be multiplied by zero later anyway. That is the lower-bound protection.
You can apply an upper-bound protection too; it just adds even more cycles.
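
A sketch of both guards together (using min() as the upper clamp is an addition here, mirroring the abs() lower clamp):

// abs() folds the r == 0 case's -1 index back to a legal address;
// min() clamps r+1 at the last valid element. Both loads are multiplied
// by a zero mask in those cases anyway, so the clamped values never
// reach the result; the clamps only keep the addresses in bounds.
double left  = conc[abs(r-1)];
double right = conc[min(r+1, maxlength-1)];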

Think about using vector-shuffle operations if working on pure scalar values is not fast enough when accessing conc[r-1] and conc[r+1]. A shuffle between a vector's elements is faster than copying them through local memory to another core/thread.
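
In CUDA the closest tool is a warp shuffle. A minimal sketch, assuming one thread per element (this is a translation of the idea, not code from the answer):

double mine  = conc[r];
// __shfl_up_sync / __shfl_down_sync read a neighboring lane's register
// directly, with no round trip through shared or global memory.
double left  = __shfl_up_sync(0xffffffff, mine, 1);   // conc[r-1] within the warp
double right = __shfl_down_sync(0xffffffff, mine, 1); // conc[r+1] within the warp
// The first and last lanes of each warp get their own value back,
// so those threads must fall back to an ordinary conc[] load.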
