CUDA C programming with 2 video cards


Question


I am very new to CUDA programming and have been reading the 'CUDA C Programming Guide' provided by NVIDIA. (http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf)

On page 25, it has the following C code that does matrix multiplication. Can you please tell me how I can make that code run on two devices (if I have two NVIDIA CUDA-capable cards installed in my computer)? Can you please show me with an example?

// Thread block size
#define BLOCK_SIZE 16

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col) 
typedef struct { 
    int width; 
    int height; 
    int stride; 
    float* elements; 
} Matrix; 

// Get a matrix element 
__device__ float GetElement(const Matrix A, int row, int col) 
{ 
    return A.elements[row * A.stride + col]; 
} 

// Set a matrix element 
__device__ void SetElement(Matrix A, int row, int col, float value) 
{ 
    A.elements[row * A.stride + col] = value; 
} 

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is 
// located col sub-matrices to the right and row sub-matrices down 
// from the upper-left corner of A 
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) 
{ 
    Matrix Asub; 
    Asub.width = BLOCK_SIZE; 
    Asub.height = BLOCK_SIZE; 
    Asub.stride = A.stride; 
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col]; 
    return Asub;
}

// Forward declaration of the matrix multiplication kernel 
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix); 

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE 
void MatMul(const Matrix A, const Matrix B, Matrix C) 
{ 
    // Load A and B to device memory 
    Matrix d_A; 
    d_A.width = d_A.stride = A.width; d_A.height = A.height; 
    size_t size = A.width * A.height * sizeof(float); 
    cudaMalloc(&d_A.elements, size); 
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice); 
    Matrix d_B; 
    d_B.width = d_B.stride = B.width; d_B.height = B.height; 
    size = B.width * B.height * sizeof(float); 
    cudaMalloc(&d_B.elements, size); 
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice); 

    // Allocate C in device memory 
    Matrix d_C; 
    d_C.width = d_C.stride = C.width; d_C.height = C.height; 
    size = C.width * C.height * sizeof(float); 
    cudaMalloc(&d_C.elements, size); 

    // Invoke kernel 
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y); 
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C); 

    // Read C from device memory 
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost); 

    // Free device memory 
    cudaFree(d_A.elements); 
    cudaFree(d_B.elements); 
    cudaFree(d_C.elements); 
} 

// Matrix multiplication kernel called by MatMul() 
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) 
{ 
    // Block row and column 
    int blockRow = blockIdx.y; 
    int blockCol = blockIdx.x; 

    // Each thread block computes one sub-matrix Csub of C 
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Each thread computes one element of Csub 
    // by accumulating results into Cvalue 
    float Cvalue = 0; 

    // Thread row and column within Csub 
    int row = threadIdx.y; 
    int col = threadIdx.x; 

    // Loop over all the sub-matrices of A and B that are 
    // required to compute Csub 
    // Multiply each pair of sub-matrices together 
    // and accumulate the results 
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) 
    { 
        // Get sub-matrix Asub of A 
        Matrix Asub = GetSubMatrix(A, blockRow, m); 
        // Get sub-matrix Bsub of B 
        Matrix Bsub = GetSubMatrix(B, m, blockCol); 

        // Shared memory used to store Asub and Bsub respectively 
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 

        // Load Asub and Bsub from device memory to shared memory 
        // Each thread loads one element of each sub-matrix 
        As[row][col] = GetElement(Asub, row, col); 
        Bs[row][col] = GetElement(Bsub, row, col); 

        // Synchronize to make sure the sub-matrices are loaded 
        // before starting the computation 
        __syncthreads(); 

        // Multiply Asub and Bsub together 
        for (int e = 0; e < BLOCK_SIZE; ++e) 
            Cvalue += As[row][e] * Bs[e][col]; 

        // Synchronize to make sure that the preceding 
        // computation is done before loading two new 
        // sub-matrices of A and B in the next iteration 
        __syncthreads(); 
    } 

    // Write Csub to device memory 
    // Each thread writes one element 
    SetElement(Csub, row, col, Cvalue); 
}

Solution

There is no "automatic" way to run a CUDA kernel on multiple GPUs.

You will need to devise a way to decompose the matrix multiplication problem into independent operations that can run in parallel (one on each GPU). As a simple example:

C = A.B is equivalent to C = [A].[B1|B2] = [A.B1|A.B2], where B1 and B2 are suitably sized matrices containing the columns of B and | denotes column-wise concatenation. You can compute A.B1 and A.B2 as separate matrix multiplication operations, and then perform the concatenation when copying the resulting sub-matrices back to host memory.
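For illustration only, here is one way the column split could look on the host. It assumes the row-major Matrix struct from the question and an even number of columns in B; the helper name SliceColumns is my own invention for this sketch, not something from the guide:

// Hypothetical host-side helper (not from the guide): copy columns
// [colStart, colStart + width) of src into a newly allocated host matrix.
// Requires <stdlib.h> and <string.h>; uses the Matrix struct above.
Matrix SliceColumns(const Matrix src, int colStart, int width)
{
    Matrix slice;
    slice.width = slice.stride = width;
    slice.height = src.height;
    slice.elements = (float*)malloc((size_t)width * src.height * sizeof(float));
    for (int row = 0; row < src.height; ++row)
        memcpy(&slice.elements[row * width],
               &src.elements[row * src.stride + colStart],
               width * sizeof(float));
    return slice;
}

// Example usage (inside the host code):
// B1 = left half of B's columns, B2 = right half (assumes B.width is even)
Matrix B1 = SliceColumns(B, 0, B.width / 2);
Matrix B2 = SliceColumns(B, B.width / 2, B.width / 2);

A row-wise split of A into [A1; A2] would work equally well and is even simpler to copy, since whole rows are contiguous in row-major storage; the column split of B just mirrors the decomposition described above.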

Once you have a suitable decomposition scheme, you can then implement it using the standard multi-GPU facilities in the CUDA 4.x API. For a good overview of multi-GPU programming using the CUDA APIs, I recommend watching Paulius Micikevicius' excellent talk from GTC 2012, which is available as a streaming video and PDF here (http://www.gputechconf.com/gtcnew/on-demand-gtc.php?sessionTopic=999&searchByKeyword=&submit=&select=%20&sessionEvent=&sessionYear=2012&sessionFormat=#1451).
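To make the device-selection part concrete, here is a minimal sketch assuming B has already been split into B1 and B2 as above; MatMulOnDevice is a hypothetical variant of the MatMul host routine from the question that allocates, launches MatMulKernel, and copies back on whichever GPU is currently selected, and C1 and C2 are host matrices of size A.height x (B.width / 2):

// Sketch only: compute C1 = A.B1 on GPU 0 and C2 = A.B2 on GPU 1.
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);   // should report at least 2 on this machine

cudaSetDevice(0);                   // subsequent CUDA calls target device 0
MatMulOnDevice(A, B1, C1);

cudaSetDevice(1);                   // switch the calling host thread to device 1
MatMulOnDevice(A, B2, C2);

// Concatenate C1 and C2 column-wise on the host to form C = [A.B1 | A.B2].

Note that, written this way, the two products still run one after the other, because each call contains blocking cudaMemcpy operations; getting both GPUs to work concurrently requires asynchronous copies and streams (or one host thread per device), which is exactly what the talk linked above covers.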
