Kernel for processing a 4D tensor in CUDA


Problem description

I want to write a kernel to perform computations that depend on all the unique quartets of indices (ij|kl). The code that generates all the unique quartets on the host is as follows:

#include <iostream>


using namespace std;

int main(int argc, char** argv)
{
    unsigned int i,j,k,l,ltop;
    unsigned int nao=7;
    for(i=0;i<nao;i++)
    {
        for(j=0;j<=i;j++)
        {
            for(k=0;k<=i;k++)
            {
                ltop=k;
                if(i==k)
                {
                    ltop=j;
                }
                for(l=0;l<=ltop; l++)
                {
                    printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
                }
            }    
        }
    }

    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl;

    return 0;
}

output:

computing the ERI (0,0|0,0) 
computing the ERI (1,0|0,0) 
computing the ERI (1,0|1,0) 
computing the ERI (1,1|0,0) 
computing the ERI (1,1|1,0) 
computing the ERI (1,1|1,1) 
computing the ERI (2,0|0,0) 
computing the ERI (2,0|1,0) 
computing the ERI (2,0|1,1) 
computing the ERI (2,0|2,0) 
computing the ERI (2,1|0,0) 
computing the ERI (2,1|1,0) 
computing the ERI (2,1|1,1) 
computing the ERI (2,1|2,0) 
computing the ERI (2,1|2,1) 
computing the ERI (2,2|0,0) 
computing the ERI (2,2|1,0) 
computing the ERI (2,2|1,1) 
computing the ERI (2,2|2,0) 
computing the ERI (2,2|2,1) 
computing the ERI (2,2|2,2) 
computing the ERI (3,0|0,0) 
computing the ERI (3,0|1,0) 
computing the ERI (3,0|1,1) 
computing the ERI (3,0|2,0) 
computing the ERI (3,0|2,1) 
computing the ERI (3,0|2,2) 
computing the ERI (3,0|3,0) 
computing the ERI (3,1|0,0) 
computing the ERI (3,1|1,0) 
computing the ERI (3,1|1,1) 
computing the ERI (3,1|2,0) 
computing the ERI (3,1|2,1) 
computing the ERI (3,1|2,2) 
computing the ERI (3,1|3,0) 
computing the ERI (3,1|3,1) 
computing the ERI (3,2|0,0) 
computing the ERI (3,2|1,0) 
computing the ERI (3,2|1,1) 
computing the ERI (3,2|2,0) 
computing the ERI (3,2|2,1) 
computing the ERI (3,2|2,2) 
computing the ERI (3,2|3,0) 
computing the ERI (3,2|3,1) 
computing the ERI (3,2|3,2) 
computing the ERI (3,3|0,0) 
computing the ERI (3,3|1,0) 
computing the ERI (3,3|1,1) 
computing the ERI (3,3|2,0) 
computing the ERI (3,3|2,1) 
computing the ERI (3,3|2,2) 
computing the ERI (3,3|3,0) 
computing the ERI (3,3|3,1) 
computing the ERI (3,3|3,2) 
computing the ERI (3,3|3,3)

I want to recover the same set of quartets using a CUDA kernel, but I cannot get it to work. The CUDA code I have so far is as follows:

#include <iostream>
#include <stdio.h>

using namespace std;

#define ABS(x)  (x<0)?-x:x

__global__
void test_kernel(int basis_size)
{
    unsigned int i_idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int j_idx = threadIdx.y + blockIdx.y * blockDim.y;


    // Building the quartets                                                                                                                                                  

    unsigned int i_orbital, j_orbital, k_orbital, l_orbital, i__1,j__1;
    unsigned int i_primitive, j_primitive, k_primitive, l_primitive;

    i_orbital = (i_idx + 1)%basis_size /*+1*/;
    j_orbital = (i__1 = (i_idx) / basis_size, ABS(i__1)) /*+ 1*/;

    k_orbital = (j_idx+1)%basis_size /*+1*/;
    l_orbital = (j__1 = (j_idx) / basis_size, ABS(j__1)) /*+ 1*/;
    unsigned int ltop;
    ltop=k_orbital;
    if(i_orbital==k_orbital)
    {
        ltop=j_orbital;
    }
    if(i_orbital<basis_size && j_orbital<=i_orbital && k_orbital<=i_orbital && l_orbital<=ltop)
        printf("computing the ERI (%d,%d|%d,%d)   \n", i_orbital, j_orbital,k_orbital,l_orbital);
}

int main(int argc, char *argv[])
{
    int nao = 7;
    cudaDeviceReset();
    /* partitioning from blocks to grids */
    int dimx = 8;
    int dimy = 8;
    dim3 block(dimx, dimy);   // we will try blocks of 8x8 threads                                                                                                            
    dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);   // The grids are shaped accordingly                                                                        

    /* Launch the kernel */
    test_kernel<<<grid,block>>>(nao);
    cudaDeviceReset();

    return 0;
}

output:

computing the ERI (2,0|1,1)    
computing the ERI (3,1|3,1)    
computing the ERI (3,0|1,1)    
computing the ERI (1,1|1,1)    
computing the ERI (2,1|1,1)    
computing the ERI (3,1|1,1)    
computing the ERI (3,0|2,1)    
computing the ERI (2,1|2,1)    
computing the ERI (3,1|2,1)    
computing the ERI (1,0|1,0)    
computing the ERI (2,0|1,0)    
computing the ERI (3,0|1,0)    
computing the ERI (1,1|1,0)    
computing the ERI (2,1|1,0)    
computing the ERI (3,1|1,0)    
computing the ERI (2,0|2,0)    
computing the ERI (3,0|3,0)    
computing the ERI (3,0|2,0)    
computing the ERI (2,1|2,0)    
computing the ERI (3,1|3,0)    
computing the ERI (3,1|2,0)    
computing the ERI (1,0|0,0)    
computing the ERI (2,0|0,0)    
computing the ERI (3,0|0,0)    
computing the ERI (0,0|0,0)    
computing the ERI (1,1|0,0)    
computing the ERI (2,1|0,0)    
computing the ERI (3,1|0,0)

The quartets will drive the computation of repulsion integrals whose input parameters are stored in an array of size nao with 3

Solution

As presented, your code doesn't do "much". It generates indices and prints them out. Furthermore, it's not clear whether nao=7 is actually descriptive of your final problem size, or if it is just for demonstration purposes.

Purely from the standpoint of indices generation, there are a large number of ways to solve your problem. Some of these may naturally lend themselves to efficient usage of the GPU, some may not. But efficient usage of the GPU cannot really be ascertained from the code you have shown so far. For example, one of the desirable goals of a CUDA program is coalesced access from global memory. Since your code shows none of that, it's a bit risky to propose solutions without knowing what the generated access patterns might be.

Therefore it's possible that the "real problem" you are trying to solve may have some considerations that should be contemplated before a final decision is made about index generation (which effectively amounts to "assignment of work to a given thread"). I'll try to mention some of these.

Some criticisms of your code:

  1. Your proposed realization suggests use of a 2D threadblock and grid structure to effectively provide the i,j indices that are used by your program. However, from inspection of the original C source code, we see that the j index only covers a space up to the i index:

    for(i=0;i<nao;i++)
    {
        for(j=0;j<=i;j++)
    

    However, the replacement of such a structure with a 2D grid/threadblock structure creates a rectangular space, which is not implied by your nested for-loops in the original C code. Therefore launch of such a space/grid would lead to creation of i,j indices that are out of range; those threads would need to do no work. We could work around this (perhaps that is what your code is trying to do) by "re-purposing" out-of-range threads to cover some of the additional space, but this leads to fairly complex, hard-to-read arithmetic; see also the next criticism.

  2. Your kernel has no loop in it, so it seems evident that each thread can only generate one line of printout. Therefore each thread can only be responsible for one ERI entry. Based on the C source code you have supplied, for a nao size of 7, you expect 406 ERI entries (your posted printout appears to be incomplete for the code you have shown, BTW). If we have one printout per thread, and we need to cover a space of 406 ERI entries, we had better have at least 406 threads, or our proposed solution simply cannot work. If we inspect your code to determine the size of the grid in terms of threads:

    int dimx = 8;
    int dimy = 8;
    dim3 block(dimx, dimy);   // we will try blocks of 8x8 threads                                                                                                            
    dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);   // The grids are shaped accordingly                                                                        
    

    we arrive at the conclusion that you are simply not launching enough threads. If you work through the above calculations (you may want to print out the block.x,.y and grid.x,.y values if unsure), you will discover that you are launching one 8x8 threadblock, i.e. 64 threads total, for nao of 7. 64 threads, at one printout per thread, cannot cover an ERI space of 406 entries. In fact, given that your printf is conditioned by an if statement, you may get less than one printout per thread. A quick host-side check of this arithmetic appears right after this list.
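To make that arithmetic concrete, a small host-side sketch (purely illustrative) reproduces the posted launch configuration and compares the resulting thread count against the number of ERI entries that must be covered:

#include <cstdio>

int main(){
    int nao = 7;
    int dimx = 8, dimy = 8;                       // block dimensions from the posted code
    int gridx = (nao+dimx-1)/dimx;                // = 1
    int gridy = (nao+dimy-1)/dimy;                // = 1
    int total_threads = gridx*gridy*dimx*dimy;    // 1 x 1 blocks of 8x8 = 64 threads
    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;                    // = 406 ERI entries
    printf("threads launched: %d, ERI entries needed: %d\n", total_threads, eris_size);
    return 0;
}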

Possible solution ideas:

  1. A fairly straightforward approach would simply be to map the outer two for-loops in your C source code to x,y indices of a 2D grid. We would then retain, more or less intact, the inner for-loop structure of your C source code, as our kernel code. This is fairly easy to write. It will have the drawback that it will launch some threads that will do nothing (we must check for the condition where j>i and condition such threads to do nothing), but this may be a minor consideration. A bigger issue with this might be how many actual threads we are generating. For your given nao of 7, this would launch 7x7 = 49 threads (some of which do no work). A problem size of 49 threads, or even 406, is tiny for a GPU, and it will not be able to achieve peak performance due to a limited number of threads. The "triangular" nature of this problem (j<=i) means that a further improvement of this method would be to launch a linear threadblock and then use a linear-to-triangular index mapping for i,j from the linear threadblock index, which would result in no "wasted" threads (a sketch of such a mapping appears after this list). However, a more important consideration than ~50% wasted threads would be the coalescing of global access, as already discussed.

  2. Another possible approach would be to use the one suggested by @AngryLettuce in a comment on one of your previous questions (now deleted, perhaps). Specifically, generate a 1D grid and a 1D globally unique thread index, and then compute the necessary sub-indices (i,j,k,l) using arithmetic to convert the 1D index into the sub-indices. This has the advantage that for small problem sizes, we can directly launch larger kernels. For example, for your problem space of 406 ERI entries, we would generate (at least) 406 threads, with globally unique indices of 0..405, and convert that 1-D index into 4 sub-indices (i,j,k,l). Looking at your kernel code, this may be what you are trying to do. However, since your space is oddly-shaped, the arithmetic to convert from a linear index (or any set of rectangular indices) to an oddly-shaped space will be complicated, in my opinion.
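For the linear-to-triangular i,j mapping mentioned in idea 1, a rough device-side sketch could look like the following (the helper name linear_to_ij is just for illustration; it inverts the standard triangular-number enumeration t = i*(i+1)/2 + j):

// Map a linear index t (0 <= t < nao*(nao+1)/2) to (i,j) with j <= i.
// The sqrtf estimate is corrected afterwards to guard against floating-point
// rounding near triangular-number boundaries.
__device__ void linear_to_ij(unsigned t, unsigned *i, unsigned *j)
{
    unsigned ii = (unsigned)((sqrtf(8.0f*t + 1.0f) - 1.0f)*0.5f);
    while (ii*(ii+1)/2 > t) ii--;
    while ((ii+1)*(ii+2)/2 <= t) ii++;
    *i = ii;
    *j = t - ii*(ii+1)/2;
}

With such a mapping, a 1D launch of nao*(nao+1)/2 threads covers exactly the (i,j) pairs with j <= i, and the inner k,l loops of method 1 below can be kept unchanged.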

If your real problem space is large (nao much larger than 7), then I personally would choose the first method, since it will be more readable code (IMO) and is easy to write/maintain. For small problem spaces, the second method may possibly be better, for the reasons already discussed. For either of the above methods, we would want to study the generated global memory access patterns that would result. This will depend on your data organization which you haven't shown. The first method might be easier to write to generate coalesced access, but careful use of the index generation arithmetic in the second method should (in theory) allow you to achieve whatever access patterns you wish as well.

Here's a complete code of method 1:

#include <stdio.h>
#include <iostream>

using namespace std;

__global__ void kernel(unsigned int nao)
{
    unsigned i = threadIdx.x+blockDim.x*blockIdx.x;
    unsigned j = threadIdx.y+blockDim.y*blockIdx.y;
    if (i >= nao) return;
    if (j > i) return; // modify if use of __syncthreads() after this

    unsigned int k,l,ltop;
            for(k=0;k<=i;k++)
            {
                ltop=k;
                if(i==k)
                {
                    ltop=j;
                }
                for(l=0;l<=ltop; l++)
                {
                    printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
                }
            }
}


int main(){
    unsigned int nao = 7;
    dim3 block(4,4);
    dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);
    kernel<<<grid,block>>>(nao);
    cudaDeviceSynchronize();
    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl;

    return 0;
}

Note that I have used conditional return statements for ease of demonstration in this code (for the threads doing no work). However if you need to use __syncthreads() in your kernel, you will want to modify the kernel structure to avoid the conditional returns, and instead allow the "out of bounds" threads to continue through the kernel code while doing no work. This concept is covered in many other SO questions on __syncthreads() usage. Also note that kernel printout can occur in any order (just as threads can execute in any order) so I have only verified that the above method seems to work and produces the desired 406 lines of printout. A better verification approach would be to avoid use of printf and use a tally matrix instead.
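One possible form of such a tally-based check (a rough sketch; the name kernel_tally and the nao^4 counter layout are just for illustration) is to replace the printf in the method-1 kernel with an atomic increment into a counter array:

// 'tally' is a zero-initialized device array of nao*nao*nao*nao unsigned ints.
__global__ void kernel_tally(unsigned int nao, unsigned int *tally)
{
    unsigned i = threadIdx.x+blockDim.x*blockIdx.x;
    unsigned j = threadIdx.y+blockDim.y*blockIdx.y;
    if (i >= nao) return;
    if (j > i) return;

    unsigned int k, l, ltop;
    for (k = 0; k <= i; k++){
        ltop = (i == k) ? j : k;                             // same ltop rule as the original loops
        for (l = 0; l <= ltop; l++)
            atomicAdd(&tally[((i*nao+j)*nao+k)*nao+l], 1u);  // count this quartet once
    }
}

After copying the tally array back to the host, you can run the original quadruple loop and confirm that every expected quartet has a count of exactly 1, all other entries are 0, and the number of nonzero entries is 406 for nao = 7.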

For the second method, the conversion from a single linear index to a multidimensional space (with 1:1 mapping and no wasted/unused indices) is considerably complicated, as already mentioned, by the fact that your multi-dimensional space is "oddly shaped". I don't have a "great" method to convert from linear index to sub-indices. I had to map out the various problem space dimension sizes for the l loop based on the i,j,k indices:

i=0, j=0, k=0, l=0  (total 1 iteration for i=0)

l loop limit, for i=1: (total 5 iterations for i=1)
     j
    0 1
k 0 0 0
  1 0 1

l loop limit, for i=2: (total 15 iterations for i=2)
     j
    0 1 2
k 0 0 0 0
  1 1 1 1
  2 0 1 2

l loop limit, for i=3:  (total 34 iterations for i=3)
     j
    0 1 2 3
k 0 0 0 0 0
  1 1 1 1 1
  2 2 2 2 2
  3 0 1 2 3

 etc.

(Note that an alternative mapping method would be to treat the last row of each matrix above as constant equal to k, just like the other rows, and this would result in some wasted threads but considerably simplified index calculation below. We will discuss briefly the no-wasted-threads method, but ultimately present an example using the wasted threads method, because the indexing calculations are considerably simplified in both arithmetic and time complexity).

Studying the above, we see that the number of iterations mapped to a given i loop iteration is:

[((i+2)*(i+1)/2)*(i+1)] - (i+1)*i/2

or:

(((i+2)*(i+1)-i)/2)*(i+1)

The above calculation can be used to determine the indexing for i based on a linear index. The only way I know of to dissect a linear index using the above breakpoints would be via a binary search.

We would then repeat the above approach (determining the number of iterations per "level", and then using a binary search to map the linear index to a level index) to compute the next indices j and k. The remaining quantity would be the l-index.

To simplify the remainder of the discussion, I have switched to using the modified mapping version:

l loop limit, for i=1: (total 6 iterations for i=1)
     j
    0 1
k 0 0 0
  1 1 1

l loop limit, for i=2: (total 18 iterations for i=2)
     j
    0 1 2
k 0 0 0 0
  1 1 1 1
  2 2 2 2

l loop limit, for i=3:  (total 40 iterations for i=3)
     j
    0 1 2 3
k 0 0 0 0 0
  1 1 1 1 1
  2 2 2 2 2
  3 3 3 3 3

 etc.

from which we will have some wasted threads which we will handle with an if condition in our kernel.

The number of iterations per i-level above is just:

(i+2)*(i+1)*(i+1)/2

Once we have computed the i-index from the above formula given a linear index (using a binary search on the summation of the above polynomial), we can compute the next index (j) quite easily, by dividing the remaining space within that level into i+1 equal-sized pieces (one per j value). The next k index can be found using a triangular mapping method, and the remaining space then becomes our l loop extent. However, we must remember to condition this l index as discussed previously.
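As a quick sanity check on that closed form, the summation (my_func in the method 2 code below) can be compared on the host against a direct accumulation of the per-level counts; this is just an illustrative sketch:

#include <cstdio>

// closed-form summation of (i+2)*(i+1)*(i+1)/2 from 0 to i, as used in method 2
unsigned my_func(unsigned i){ return 1+(2*i+(((i*(i+1))/2)*((i*(i+1))/2)) + (i*(i+1)*((2*i)+1)*2)/3 + ((5*i*(i+1))/2))/2; }

int main(){
    unsigned running = 0;
    for (unsigned i = 0; i < 7; i++){              // the i-levels for nao = 7
        running += (i+2)*(i+1)*(i+1)/2;            // direct per-level count
        printf("i=%u  direct sum=%u  my_func=%u\n", i, running, my_func(i));
    }
    return 0;
}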

Here's a complete code for method 2:

#include <stdio.h>
#include <iostream>

// the closed-form summation of the polynomial (i+2)*(i+1)*(i+1)/2 from 0 to n
__host__ __device__ unsigned my_func(unsigned i){ return 1+(2*i+(((i*(i+1))/2)*((i*(i+1))/2)) + (i*(i+1)*((2*i)+1)*2)/3 + ((5*i*(i+1))/2))/2; }

// binary search
__host__ __device__ unsigned bsearch_functional(const unsigned key, const unsigned len_a, unsigned (*func)(unsigned)){
  unsigned lower = 0;
  unsigned upper = len_a;
  unsigned midpt;
  while (lower < upper){
    midpt = (lower + upper)>>1;
    if (func(midpt) <= key) lower = midpt +1;
    else upper = midpt;
    }
  return lower;
}

// conversion of linear index to triangular matrix (row,col) index
__host__ __device__ void linear_to_triangular(const unsigned i, const unsigned n, unsigned *trow, unsigned *tcol)
{
    int c = i;
    int row = c/(n-1);
    int col = c%(n-1);

    if (col < row) {
      col = (n-2)-col;
      row = (n-1)-row;
    }

    *tcol = col+1;
    *trow = row;
}



__global__ void kernel(unsigned int nao, unsigned int eris_size)
{
    unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < eris_size){
    // compute i-index via binary search
    unsigned int i = bsearch_functional(idx, nao, my_func);
    // compute j-index via division of the space by i;
    unsigned int j = (idx==0)?0:(idx-my_func(i-1))/((my_func(i)-my_func(i-1))/(i+1));
    unsigned k,l;
    linear_to_triangular((idx-my_func(i-1)-(j *((my_func(i)-my_func(i-1))/(i+1)))), (i+2), &k, &l);
    k = i-k;
    l = (i+1)-l;
    if (idx == 0) {k=0; l=0;}
    if (l <= ((i==k)?j:k))
      printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
    }
}

int main(){
    unsigned int nao = 7;
    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    const int nTPB = 64;
    std::cout<<"The total size of the sack of ERIs is: "<<eris_size<<std::endl;
    int mod_eris_size = my_func(nao-1);
    kernel<<<(mod_eris_size+nTPB-1)/nTPB, nTPB>>>(nao, mod_eris_size); // round the block count up
    cudaDeviceSynchronize();
    return 0;
}

To be clear, I don't know exactly what your task is, and I am not guaranteeing these examples are bug-free for any given usage. My intent is not to give you a black-box, but to explain some possible programming approaches. I didn't do rigorous verification but merely observed that each method produced 406 lines of ERI output, same as your original C code.

Finally, I have omitted proper cuda error checking for brevity of presentation, but any time you are having trouble with a cuda code, I recommend using it and running your code with cuda-memcheck.
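A minimal sketch of such error checking (the macro name cudaCheck is just for illustration) might look like this:

#include <cstdio>
#include <cstdlib>

// wrap CUDA runtime calls; print the error string and abort on failure
#define cudaCheck(call)                                                    \
    do {                                                                   \
        cudaError_t err = (call);                                          \
        if (err != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                    cudaGetErrorString(err), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// usage sketch after a kernel launch:
//   kernel<<<grid,block>>>(nao);
//   cudaCheck(cudaGetLastError());       // catches launch/configuration errors
//   cudaCheck(cudaDeviceSynchronize());  // catches errors during kernel execution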
