共享内存矩阵乘法内核 [英] Shared memory matrix multiplication kernel

查看:402
本文介绍了共享内存矩阵乘法内核的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试实现基于共享内存的矩阵乘法内核,如CUDA C编程指南中所述。以下是内核:

  __global__ void matrixMultiplyShared(float * A,float * B,float * C,
int ARows,int AColumns,
int BRows,int BColumns,
int CRows,int CColumns){
float * CSub =& C [CColumns * 16 * blockIdx.y + 16 * blockIdx 。X];
float CValue = 0;
for(int k = 0; k <(AColumns / 16); ++ k){
float * ASub =& A [AColumns * 16 * blockIdx.y + 16 * k]
float * BSub =& B [AColumns * 16 * k + 16 * blockIdx.y];
__shared__ float As [16] [16];
__shared__ float Bs [16] [16];
As [threadIdx.y] [threadIdx.x] = ASub [threadIdx.y * AColumns + threadIdx.x];
Bs [threadIdx.y] [threadIdx.x] = BSub [threadIdx.y * AColumns + threadIdx.x];
__syncthreads();
for(int n = 0; n <16; ++ n)
CValue + = As [threadIdx.y] [n] * Bs [n] [threadIdx.x];
__syncthreads();
}
CSub [threadIdx.x * CColumns + threadIdx.y] = CValue;
}

以下是对内核的调用:

  dim3 dimBlock(16,16,1); 
dim3 dimGrid;
dimGrid.x =(CColumns + dimBlock.x - 1)/dimBlock.x;
dimGrid.y =(CRows + dimBlock.y - 1)/dimBlock.y;
matrixMultiplyShared<<< dimGrid,dimBlock>>(deviceA,deviceB,deviceC,ARows,AColumns,BRows,BColumns,CRows,CColumns);

不幸的是,这似乎会产生不正确的结果。


解决方案

您的内核中至少有2个基本错误, 。您有这个的地方:

  float * BSub =& B [AColumns * 16 * k + 16 * blockIdx.y]; 

您应该使用这个:

  float * BSub =& B [AColumns * 16 * k + 16 * blockIdx.x]; 

您在此处有:

  CSub [threadIdx.x * CColumns + threadIdx.y] = CValue; 

您应该使用:

  CSub [threadIdx.y * CColumns + threadIdx.x] = CValue; 

这将允许您在以下条件下获得基本正确性:


  1. 方形矩阵

  2. 矩形维度可按瓦片尺寸平均分割

修复方矩阵限制并不困难。修复瓷砖尺寸的尺寸限制需要对内核进行大量更改,以便:


  1. 不处理超出范围的元素

  2. 使用适合边框区域的值正确填充您的共享内存区域

由于你的代码不能理解这些,我不知道你是否问这个问题,并选择不专门解决这些问题。



得到以下适应你的代码工作作为一个基本的例子:
(注意,为了减少代码大小的好处,我已经免除了通常 CUDA错误检查。请不要将其用作代表例如良好的编码,做正确的错误检查。我的答案的要点不是解释好的CUDA错误检查,而是显示一个算法正确的例子。)

  #include< stdio.h> 
#include< math.h>
#define TILE_DIM 16
#define DIMX 256
#define DIMY 256
#define RES 0.1

__global__ void matrixMultiplyShared(float * A,float * B,float * C,
int ARows,int AColumns,
int BRows,int BColumns,
int CRows,int CColumns){
float CValue = 0;
if(((blockIdx.y * blockDim.y + threadIdx.y)< CRows)&&((blockIdx.x * blockDim.x + threadIdx.x)< CColumns)){
for(int k = 0; k<(AColumns / TILE_DIM); ++ k){
float * ASub =& A [AColumns * TILE_DIM * blockIdx.y + TILE_DIM * k]
float * BSub =& B [AColumns * TILE_DIM * k + TILE_DIM * blockIdx.x];
__shared__ float As [TILE_DIM] [TILE_DIM];
__shared__ float Bs [TILE_DIM] [TILE_DIM];
As [threadIdx.y] [threadIdx.x] = ASub [threadIdx.y * AColumns + threadIdx.x];
Bs [threadIdx.y] [threadIdx.x] = BSub [threadIdx.y * AColumns + threadIdx.x];
__syncthreads();
for(int n = 0; n CValue + = As [threadIdx.y] [n] * Bs [n] [threadIdx.x];
__syncthreads();
}
C [((blockIdx.y * blockDim.y + threadIdx.y)* CColumns)+(blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
}
}


void matrixMultiplyCPU(float * A,float * B,float * C,
int ARows,int AColumns,
int BRows,int BColumns,
int CRows,int CColumns){
for(int i = 0; i for(int j = 0; j float Ctemp = 0.0;
for(int k = 0; k Ctemp + = A [i * AColumns + k] * B [k * BColumns + j]
C [i * CColumns + j] = Ctemp;
}

}
int main(){
int CColumns = DIMY,CRows = DIMX,AColumns = DIMY,ARows = DIMX,BColumns = DIMY, DIMX;
dim3 dimBlock(TILE_DIM,TILE_DIM,1);
dim3 dimGrid;
dimGrid.x =(CColumns + dimBlock.x - 1)/dimBlock.x;
dimGrid.y =(CRows + dimBlock.y - 1)/dimBlock.y;
float * deviceA,* deviceB,* deviceC;
float hostA [DIMY] [DIMX];
float hostB [DIMY] [DIMX];
float hostC [DIMY] [DIMX];
float hostCp [DIMY] [DIMX];
for(int x = 0; x for(int y = 0; y< DIMY; y ++){
hostA [y] /(float)RAND_MAX;
hostB [y] [x] = rand()/(float)RAND_MAX;
}
cudaMalloc((void **)& deviceA,DIMX * DIMY * sizeof(float));
cudaMalloc((void **)& deviceB,DIMX * DIMY * sizeof(float));
cudaMalloc((void **)& deviceC,DIMX * DIMY * sizeof(float));
cudaMemcpy(deviceA,hostA,DIMX * DIMY * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceB,hostB,DIMX * DIMY * sizeof(float),cudaMemcpyHostToDevice);
matrixMultiplyShared<<< dimGrid,dimBlock>>(deviceA,deviceB,deviceC,ARows,AColumns,BRows,BColumns,CRows,CColumns);
cudaMemcpy(hostC,deviceC,DIMX * DIMY * sizeof(float),cudaMemcpyDeviceToHost);
matrixMultiplyCPU(&(hostA [0] [0]),&(hostB [0] [0]),&(hostCp [0] [0]),ARows,AColumns,BRows,BColumns, CRows,CColumns);

for(int y = 0; y for(int x = 0; x if(fabs(hostCp [y] [x] - hostC [y] [x])> RES)
{
printf(offset at y =%d,x =%d,CPU =%f,GPU =%f \ n,y,x,hostCp [y] [x],hostC [y] [x]);
return 1;
}
printf(Finished!\\\
);
return 0;
}


I am attempting to implement a shared memory based matrix multiplication kernel as outlined in the CUDA C Programming Guide. The following is the kernel:

 __global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
     float * CSub = &C[CColumns * 16 * blockIdx.y + 16 * blockIdx.x];
     float CValue = 0;
 for (int k = 0; k < (AColumns / 16); ++k) {
         float * ASub =  &A[AColumns * 16 * blockIdx.y + 16 * k];
         float * BSub = &B[AColumns*16*k + 16*blockIdx.y];
         __shared__ float As[16][16];
         __shared__ float Bs[16][16];
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         __syncthreads();
         for (int n = 0; n < 16; ++n)
        CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
     }
     CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;
 }

While the following is the call to the kernel:

 dim3 dimBlock(16, 16, 1);
 dim3 dimGrid;
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

Unfortunately this seems to produce incorrect results.

Any assistance/explanations would be greatly appreciated.

解决方案

There are at least 2 basic errors in your kernel, both relatively trivial. Where you have this:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.y];

You should use this:

     float * BSub = &B[AColumns*16*k + 16*blockIdx.x];

And where you have this:

 CSub[threadIdx.x*CColumns+threadIdx.y]=CValue;

You should use this:

 CSub[threadIdx.y*CColumns+threadIdx.x]=CValue;

This should allow you to get basic correctness under the following conditions:

  1. square matrices
  2. matrix dimensions evenly divisible by tile dimension

Fixing the square matrix limitation is not difficult. Fixing the dimension limitation on the tile dimension involves considerable changes to the kernel, in order to:

  1. not process out-of-range elements
  2. properly populate your shared memory area with values that are appropriate in the "border" regions

Since your code doesn't comprehend any of this, I wasn't sure if you're asking about it and chose not to address those issues specifically.

I was able to get the following adaptation of your code working as a basic example: (note that for the benefit of reduced code size to look at, I have dispensed with usual CUDA error checking. Please don't use this as a representative example of good coding. Do proper error checking. The point of my answer is not to explain good CUDA error checking but to show an algorithmically correct example.)

#include <stdio.h>
#include <math.h>
#define TILE_DIM 16
#define DIMX 256
#define DIMY 256
#define RES 0.1

__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
     float CValue = 0;
     if (((blockIdx.y * blockDim.y + threadIdx.y)< CRows) && ((blockIdx.x * blockDim.x + threadIdx.x) < CColumns)) {
       for (int k = 0; k < (AColumns / TILE_DIM); ++k) {
         float * ASub =  &A[AColumns * TILE_DIM * blockIdx.y + TILE_DIM * k];
         float * BSub = &B[AColumns*TILE_DIM*k + TILE_DIM*blockIdx.x];
         __shared__ float As[TILE_DIM][TILE_DIM];
         __shared__ float Bs[TILE_DIM][TILE_DIM];
         As[threadIdx.y][threadIdx.x] = ASub[threadIdx.y*AColumns+threadIdx.x];
         Bs[threadIdx.y][threadIdx.x] = BSub[threadIdx.y*AColumns+threadIdx.x];
         __syncthreads();
         for (int n = 0; n < TILE_DIM; ++n)
         CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];
         __syncthreads();
       }
       C[((blockIdx.y * blockDim.y + threadIdx.y)*CColumns)+(blockIdx.x*blockDim.x)+threadIdx.x]=CValue;
     }
 }


void matrixMultiplyCPU(float * A, float * B, float * C,
                     int ARows, int AColumns,
                     int BRows, int BColumns,
                     int CRows, int CColumns) {
  for (int i = 0; i<ARows; i++)
    for (int j=0; j<BColumns; j++){
      float Ctemp = 0.0;
      for (int k=0; k<AColumns; k++)
        Ctemp += A[i*AColumns + k] * B[k*BColumns+j];
      C[i*CColumns+j] = Ctemp;
      }

}
int main(){
 int CColumns = DIMY, CRows=DIMX, AColumns=DIMY, ARows=DIMX, BColumns=DIMY, BRows=DIMX;
 dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
 dim3 dimGrid;
 dimGrid.x = (CColumns + dimBlock.x - 1)/dimBlock.x;
 dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
 float *deviceA, *deviceB, *deviceC;
 float hostA[DIMY][DIMX];
 float hostB[DIMY][DIMX];
 float hostC[DIMY][DIMX];
 float hostCp[DIMY][DIMX];
 for (int x = 0; x<DIMX; x++)
   for (int y = 0; y<DIMY; y++) {
     hostA[y][x] = rand()/(float)RAND_MAX;
     hostB[y][x] = rand()/(float)RAND_MAX;
     }
  cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float));
  cudaMalloc((void **)&deviceB, DIMX*DIMY*sizeof(float));
  cudaMalloc((void **)&deviceC, DIMX*DIMY*sizeof(float));
  cudaMemcpy(deviceA, hostA, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(deviceB, hostB, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
  matrixMultiplyShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , AColumns, BRows ,BColumns , CRows , CColumns);
  cudaMemcpy(hostC, deviceC, DIMX*DIMY*sizeof(float), cudaMemcpyDeviceToHost);
  matrixMultiplyCPU(&(hostA[0][0]) , &(hostB[0][0]) , &(hostCp[0][0]) , ARows , AColumns, BRows ,BColumns , CRows , CColumns);

 for (int y = 0; y<DIMY; y++)
   for (int x = 0; x<DIMX; x++)
     if (fabs(hostCp[y][x] - hostC[y][x]) > RES)
       {
       printf("Error at offset y=%d,x=%d, CPU = %f, GPU = %f\n", y, x, hostCp[y][x], hostC[y][x]);
       return 1;
       }
 printf("Finished!\n");
 return 0;
}

这篇关于共享内存矩阵乘法内核的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆