Cuda Shared Memory array variable


Problem Description


I am trying to declare a variable for matrix multiplication as follows:

__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

I am trying to make it so the user can input the size of the matrix to calculate; however, that would mean changing BLOCK_SIZE. I changed it, but I am getting a compiler error: "error: constant value is not known". I've looked into it, and it's similar to this thread (http://stackoverflow.com/questions/3220553/define-variable-size-on-array-in-local-memory-using-cuda). So I tried:

__shared__ int buf[];

But then I get: "error: incomplete type is not allowed".
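
For reference, a fixed-size version like the sketch below does compile when BLOCK_SIZE is a #define; the kernel here is just a minimal illustration, not my real code:

#define BLOCK_SIZE 16   // compile-time constant, so the static form is legal

__global__ void staticSharedDemo(float* out)
{
    // Legal: the array dimensions are known to the compiler
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    As[ty][tx] = (float)(ty * BLOCK_SIZE + tx);
    __syncthreads();
    out[ty * BLOCK_SIZE + tx] = As[ty][tx];
}

// launch: staticSharedDemo<<<1, dim3(BLOCK_SIZE, BLOCK_SIZE)>>>(d_out);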

Thanks, Dan

Update with code (pretty much followed this guide and the starting out with CUDA guide): The block size is passed in by asking the user for the size of the matrix. They enter x and y. The block size is only x, and right now it has to accept the same size for x and y.

__global__ void matrixMul(float* C, float* A, float* B, int wA, int wB, size_t block_size)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed 
    // by the block
    int aBegin = wA * block_size * by;

    // Index of the last sub-matrix of A processed 
    // by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = block_size;

    // Index of the first sub-matrix of B processed 
    // by the block
    int bBegin = block_size * bx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = block_size * wB;
    float Csub=0;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) 
    {
        // Declaration of the shared memory array As 
        // used to store the sub-matrix of A

        extern __shared__ float As[];

        // Declaration of the shared memory array Bs 
        // used to store the sub-matrix of B
        extern __shared__ float Bs[];
        extern __shared__ float smem[];

        // Load the matrices from global memory
        // to shared memory; each thread loads
        // one element of each matrix
        smem[ty*block_size+tx] = A[a + wA * ty + tx];
        //cuPrintf("\n\nWhat are the memory locations?\n");
        //cuPrintf("The shared memory(A) is: %.2f\n",smem[ty*block_size+tx]);
        smem[block_size*block_size+ty*block_size+tx]  = B[b + wB * ty + tx];
        //cuPrintf("The shared memory(B) is: %.2f\n",smem[block_size*block_size+ty*block_size+tx]);
        // Synchronize to make sure the matrices 
        // are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < block_size; ++k)
        {

            Csub += smem[ty*block_size+k] * smem[block_size*block_size+k*block_size+tx] ;
            //cuPrintf("Csub is currently: %.2f\n",Csub);
        }
        //cuPrintf("\n\n\n");
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        //cuPrintf("the results are csub: %.2f\n",Csub);
        __syncthreads();
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}

Solution

extern __shared__ int buf[];

When you launch the kernel, you should launch it this way:

kernel<<<blocks,threads,numbytes_for_shared>>>(...);

If you have multiple extern declarations of shared memory arrays:

extern __shared__ float As[];

extern __shared__ float Bs[];

this will lead to As pointing to the same address as Bs, because every extern __shared__ declaration refers to the start of the same dynamic shared memory allocation.
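
A minimal sketch that makes the aliasing visible (the kernel name is illustrative; requires a device that supports device-side printf):

#include <cstdio>

__global__ void aliasDemo()
{
    extern __shared__ float As[];
    extern __shared__ float Bs[];

    // Both extern declarations name the start of the same dynamic
    // shared memory allocation, so a write through As is visible
    // through Bs.
    if (threadIdx.x == 0) {
        As[0] = 42.0f;
        printf("As=%p Bs=%p Bs[0]=%f\n", (void*)As, (void*)Bs, Bs[0]);
    }
}

// launch: aliasDemo<<<1, 32, sizeof(float)>>>();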

You will need to keep As and Bs inside a single 1D array:

extern __shared__ float smem[];

When calling the kernel, you should launch it with 2*BLOCK_SIZE*BLOCK_SIZE*sizeof(float) bytes of dynamic shared memory.
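
For example, a host-side launch of the matrixMul kernel above could look like this (the grid dimensions and the d_A/d_B/d_C device pointers are illustrative; it assumes square matrices whose sizes divide evenly by block_size, as in the question):

size_t block_size = 16;                        // chosen at run time by the user
dim3 threads(block_size, block_size);
dim3 grid(wB / block_size, wA / block_size);   // assumes even division

// Third launch parameter: bytes of dynamic shared memory per block,
// enough for both the A tile and the B tile
size_t numbytes_for_shared = 2 * block_size * block_size * sizeof(float);
matrixMul<<<grid, threads, numbytes_for_shared>>>(d_C, d_A, d_B, wA, wB, block_size);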

When indexing into As, use smem[y*BLOCK_SIZE+x], and when indexing into Bs, use smem[BLOCK_SIZE*BLOCK_SIZE+y*BLOCK_SIZE+x].
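
Equivalently, you can carve named pointers out of the single extern array inside the kernel, which keeps the As/Bs naming while still using one allocation. A sketch of the question's kernel rewritten this way (not part of the original answer):

__global__ void matrixMulTiled(float* C, float* A, float* B,
                               int wA, int wB, size_t block_size)
{
    extern __shared__ float smem[];
    float* As = smem;                             // first block_size*block_size floats
    float* Bs = smem + block_size * block_size;   // next block_size*block_size floats

    int tx = threadIdx.x, ty = threadIdx.y;
    int aBegin = wA * block_size * blockIdx.y;    // first sub-matrix of A for this block
    int aEnd   = aBegin + wA - 1;                 // last sub-matrix of A for this block
    int bBegin = block_size * blockIdx.x;         // first sub-matrix of B for this block

    float Csub = 0.0f;
    for (int a = aBegin, b = bBegin; a <= aEnd;
         a += block_size, b += block_size * wB)
    {
        // Each thread loads one element of each tile
        As[ty * block_size + tx] = A[a + wA * ty + tx];
        Bs[ty * block_size + tx] = B[b + wB * ty + tx];
        __syncthreads();

        for (int k = 0; k < block_size; ++k)
            Csub += As[ty * block_size + k] * Bs[k * block_size + tx];
        __syncthreads();
    }

    // Each thread writes one element of the block sub-matrix
    int c = wB * block_size * blockIdx.y + block_size * blockIdx.x;
    C[c + wB * ty + tx] = Csub;
}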
