使用两块显卡进行 CUDA C 编程 [英] CUDA C programming with 2 video cards
问题描述
我刚接触 CUDA 编程,正在阅读 NVIDIA 提供的《CUDA C 编程指南》。
(<a href="http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf">http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf</a>)
在第 25 页有下面这段做矩阵乘法的 C 代码。您能告诉我怎样让这段代码在两个设备上运行吗?(假设我的电脑里装有两块支持 CUDA 的 NVIDIA 显卡。)能否请您用一个例子来说明?
// 矩阵按行主序存储:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
// 获取一个矩阵元素
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// 设置一个矩阵元素
__device__ void SetElement(Matrix A, int row, int col, float value)
{
A.elements[row * A.stride + col] = value;
}
// 获取 A 的 BLOCK_SIZExBLOCK_SIZE 子矩阵 Asub,
// 它位于从 A 的左上角起向右第 col 个、
// 向下第 row 个子矩阵的位置
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
return Asub;
}
// 线程块的大小
#define BLOCK_SIZE 16
// 矩阵乘法内核的前向声明
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// 矩阵乘法 - 主机代码
// 假定矩阵的各维尺寸都是 BLOCK_SIZE 的倍数
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// 把 A 和 B 加载到设备内存
Matrix d_A;
d_A.width = d_A.stride = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
// 在设备内存中为 C 分配空间
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc(&d_C.elements, size);
// 调用内核
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// 从设备内存读回 C
cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
// 释放设备内存
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// 由 MatMul() 调用的矩阵乘法内核
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// 线程块所在的块行和块列
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// 每个线程块计算 C 的一个子矩阵 Csub
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// 每个线程计算 Csub 的一个元素,
// 并把结果累加到 Cvalue 中
float Cvalue = 0;
// 线程在 Csub 内的行和列
int row = threadIdx.y;
int col = threadIdx.x;
// 遍历计算 Csub 所需的 A 和 B 的所有子矩阵,
// 把每一对子矩阵相乘,
// 并累加结果
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m)
{
// 获取 A 的子矩阵 Asub
Matrix Asub = GetSubMatrix(A, blockRow, m);
// 获取 B 的子矩阵 Bsub
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// 分别用于存放 Asub 和 Bsub 的共享内存
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// 把 Asub 和 Bsub 从设备内存加载到共享内存,
// 每个线程加载每个子矩阵的一个元素
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
// 同步,确保在开始计算之前
// 子矩阵已经加载完毕
__syncthreads();
// 把 Asub 和 Bsub 相乘
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
// 同步,确保在下一次迭代加载 A 和 B 的
// 两个新子矩阵之前,
// 前面的计算已经完成
__syncthreads();
}
// 把 Csub 写回设备内存,
// 每个线程写入一个元素
SetElement(Csub, row, col, Cvalue);
}
没有“自动”的方法可以在多个 GPU 上运行同一个 CUDA 内核。
您需要设计一种方法,把矩阵乘法问题分解成可以并行运行的相互独立的操作(这样每个 GPU 上并行运行一个)。举个简单的例子:
C = A.B
等价于 C = [A].[B1|B2] = [A.B1|A.B2]
,其中 B1
和 B2
是由矩阵 B 的若干列组成的、尺寸合适的子矩阵
,而 |
表示按列拼接(concatenation)。您可以把 A.B1
和 A.B2
作为相互独立的矩阵乘法运算来计算,然后在把结果子矩阵复制回主机内存时完成拼接。
一旦有了合适的分解方案,就可以使用 CUDA 4.x API 中标准的多 GPU 功能来实现它。关于使用 CUDA API 进行多 GPU 编程的精彩概述,我推荐观看 Paulius Micikevicius 在 GTC 2012 上的出色演讲,其流媒体视频和 PDF 可以在<a href="http://www.gputechconf.com/gtcnew/on-demand-gtc.php?sessionTopic=999&searchByKeyword=&submit=&select=%20&sessionEvent=&sessionYear=2012&sessionFormat=#1451" rel="nofollow">这里</a>获取。
I am very new to CUDA programming and was reading the 'CUDA C Programming Guide' provided by NVIDIA. (http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf)
On page 25, it has the following C code that does matrix multiplication. Can you please tell me how I can make that code run on two devices (assuming I have two CUDA-capable NVIDIA cards installed in my computer)? Can you please show me with an example?
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
//
// Matrix is a lightweight descriptor, not an owner: it is passed by value to
// host functions and kernels, and `elements` may refer to host memory or to
// device memory depending on where the descriptor is used.
typedef struct {
// Number of columns.
int width;
// Number of rows.
int height;
// Distance, in elements, between the first element of one row and the first
// element of the next; a sub-matrix view keeps its parent's stride.
int stride;
// Pointer to element (0, 0); the storage is not owned by the descriptor.
float* elements;
} Matrix;
// Read the element at (row, col) of A.
// The flat position follows the row-major layout: whole rows are skipped
// `stride` elements at a time, then `col` selects within the row.
// No bounds checking is performed.
__device__ float GetElement(const Matrix A, int row, int col)
{
    const int offset = row * A.stride + col;
    return *(A.elements + offset);
}
// Store `value` at (row, col) of A.
// A is passed by value, but the write goes through A.elements and therefore
// lands in the storage the descriptor points at. No bounds checking is
// performed.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    const int offset = row * A.stride + col;
    *(A.elements + offset) = value;
}
// Thread block size
// Also the edge length of the square tiles the matrices are cut into.
// A macro is only visible from its #define onwards, so this definition must
// precede GetSubMatrix(), which is the first code to use it.
#define BLOCK_SIZE 16
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
//
// The result is a view, not a copy: it points into A's storage and keeps A's
// stride, so element (r, c) of the view is element
// (row * BLOCK_SIZE + r, col * BLOCK_SIZE + c) of A.
// No check is made that the requested tile lies inside A.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    // Rows of the tile are still a full parent row apart in memory.
    Asub.stride = A.stride;
    // Skip `row` bands of BLOCK_SIZE full rows, then `col` tiles across.
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}
// Prototype of the matrix multiplication kernel, so that the host wrapper
// can launch it before the kernel's definition appears further down the file.
__global__ void MatMulKernel(const Matrix A, const Matrix B, Matrix C);
// Allocate `bytes` bytes of device memory into *devPtr and, when hostSrc is
// non-null, fill the new buffer from host memory.
// On allocation failure *devPtr is reset to null so the caller may hand it to
// cudaFree() unconditionally.
static cudaError_t StageOnDevice(float** devPtr, const float* hostSrc, size_t bytes)
{
    cudaError_t status = cudaMalloc((void**)devPtr, bytes);
    if (status != cudaSuccess) {
        *devPtr = 0;
        return status;
    }
    if (hostSrc != 0)
        status = cudaMemcpy(*devPtr, hostSrc, bytes, cudaMemcpyHostToDevice);
    return status;
}

// Matrix multiplication - Host code
// Computes C = A * B: copies A and B to device memory, launches MatMulKernel
// with one BLOCK_SIZE x BLOCK_SIZE thread block per tile of C, and copies the
// product back into C.elements.
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
// The host matrices are copied as width * height contiguous floats, i.e. they
// are treated as densely packed (their own stride field is not consulted).
//
// Error handling: every CUDA runtime call is checked. After the first failure
// the remaining steps are skipped, all device buffers allocated so far are
// released, and the function returns; in that case C.elements must not be
// relied on. The signature has no way to return a status, so the error is
// deliberately left un-cleared in the runtime (cudaPeekAtLastError is used
// rather than cudaGetLastError) and the caller can inspect it with
// cudaGetLastError() after the call.
// NOTE(review): an error that was already pending before this call is also
// seen by the post-launch check and will cause the read-back to be skipped.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Byte counts. The first factor is widened to size_t so the element
    // count is not computed (and possibly overflowed) in int arithmetic.
    const size_t bytesA = (size_t)A.width * A.height * sizeof(float);
    const size_t bytesB = (size_t)B.width * B.height * sizeof(float);
    const size_t bytesC = (size_t)C.width * C.height * sizeof(float);

    // Device-side descriptors: densely packed copies (stride == width).
    // Element pointers start out null so the cleanup below is always safe.
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    d_A.elements = 0;
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    d_B.elements = 0;
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    d_C.elements = 0;

    // Load A and B to device memory, then allocate C in device memory
    // (C needs no upload, hence the null source).
    cudaError_t status = StageOnDevice(&d_A.elements, A.elements, bytesA);
    if (status == cudaSuccess)
        status = StageOnDevice(&d_B.elements, B.elements, bytesB);
    if (status == cudaSuccess)
        status = StageOnDevice(&d_C.elements, 0, bytesC);

    // Invoke kernel
    if (status == cudaSuccess) {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // A launch returns no status; an invalid configuration (for example
        // a zero-sized grid) is only reported through the last-error state.
        status = cudaPeekAtLastError();
    }

    // Read C from device memory. This blocking copy waits for the kernel and
    // also reports errors raised while the kernel was executing.
    if (status == cudaSuccess)
        status = cudaMemcpy(C.elements, d_C.elements, bytesC, cudaMemcpyDeviceToHost);
    (void)status;  // nothing further to do with it; see header comment

    // Free device memory (null pointers are accepted by cudaFree)
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Tiled matrix multiplication kernel launched by MatMul(): C = A * B.
//
// Launch layout: a 2-D grid in which blockIdx.x selects the tile column and
// blockIdx.y the tile row of C, with BLOCK_SIZE x BLOCK_SIZE threads per
// block (threadIdx.x = column, threadIdx.y = row inside the tile). Each
// thread produces exactly one element of C.
// Shared memory: two static BLOCK_SIZE x BLOCK_SIZE float tiles per block.
// Preconditions: matrix dimensions are multiples of BLOCK_SIZE; there are no
// bounds checks, and every thread of the block reaches both barriers.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Staging tiles for the current pair of sub-matrices of A and B.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Which tile of C this block owns, and which element of that tile this
    // thread owns.
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;
    const int ty = threadIdx.y;
    const int tx = threadIdx.x;

    // Number of tile pairs along the shared dimension (A's width).
    const int tileCount = A.width / BLOCK_SIZE;

    // Running dot product for this thread's element of C.
    float acc = 0.0f;

    for (int t = 0; t < tileCount; ++t) {
        // Tile t of A's tile row and tile t of B's tile column.
        const Matrix aTile = GetSubMatrix(A, tileRow, t);
        const Matrix bTile = GetSubMatrix(B, t, tileCol);

        // Each thread copies one element of each tile into shared memory.
        As[ty][tx] = GetElement(aTile, ty, tx);
        Bs[ty][tx] = GetElement(bTile, ty, tx);

        // Both tiles must be completely loaded before anyone reads them.
        __syncthreads();

        // Partial dot product contributed by this pair of tiles.
        for (int k = 0; k < BLOCK_SIZE; ++k)
            acc += As[ty][k] * Bs[k][tx];

        // Nobody may overwrite the tiles in the next iteration while another
        // thread is still reading them.
        __syncthreads();
    }

    // Write this thread's element into the block's tile of C.
    SetElement(GetSubMatrix(C, tileRow, tileCol), ty, tx, acc);
}
There is no "automatic" way to run a CUDA kernel on multiple GPUs.
You will need to devise a way to decompose the matrix multiplication problem into independent operations that can be run in parallel (so one on each GPU in parallel). As a simple example:
C = A.B
is equivalent to C = [A].[B1|B2] = [A.B1|A.B2]
where B1
and B2
are suitably sized matrices containing the columns of the matrix B
and |
denotes columnwise concatenation. You can calculate A.B1
and A.B2
as separate matrix multiplication operations, and then perform the concatenation when copying the resulting submatrices back to host memory.
Once you have a suitable decomposition scheme, you then implement it using the standard multi-GPU facilities in the CUDA 4.x API. For a great overview of multi-GPU programming using the CUDA APIs, I recommend watching Paulius Micikevicius' excellent talk from GTC 2012, which is available as a streaming video and PDF here.
这篇关于CUDA C语言编程与2显卡的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!