在CUDA中乘以矩形矩阵 [英] Multiply Rectangular Matrices in CUDA

查看：134 发布时间：2017/3/4 11:43:01 cuda

本文介绍了在CUDA中乘以矩形矩阵的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

在这个家庭作业中，我需要补全代码，使用 CUDA C 将两个矩形矩阵相乘。完成代码并提交后，当矩阵为方阵时，数据集上的结果是正确的；而当矩阵不是方阵时，结果与预期值不匹配。

In this homework I need to complete the code to multiply two rectangular matrices using CUDA C. After I completed the code, I submitted it, and the solution was correct for the data sets where the matrices were square, but the result did not match the expected value when the matrices were not square.

这是我添加缺少的部分后的代码：

Here is the code after I added the missing parts:

#include    <wb.h>

// Runs a CUDA runtime call and, if it does not return cudaSuccess, logs the
// text of the failing statement and makes the *enclosing function* return -1.
// Because it expands to a bare `return -1;` it is only usable inside functions
// that return an int (such as main below).
// NOTE(review): the macro is defined but no call in this listing is wrapped in it.
#define wbCheck(stmt) do {                             \
    cudaError_t err = stmt;                            \
    if (err != cudaSuccess) {                          \
        wbLog(ERROR, "Failed to run stmt ", #stmt);    \
        return -1;                                     \
    }                                                  \
} while(0)

// Compute C = A * B
//
// This is the kernel exactly as posted in the question (the version that fails
// for non-square inputs); it is kept unchanged on purpose.
// Each thread derives one (Row, Col) pair from its 2D block/thread indices and
// accumulates the dot product of row `Row` of A with column `Col` of B.
// A is indexed with a row stride of numAColumns, i.e. as a row-major array.
// numCRows and numCColumns are accepted but never read in this version.
__global__ void matrixMultiply(float * A, float * B, float * C,
               int numARows, int numAColumns,
               int numBRows, int numBColumns,
               int numCRows, int numCColumns) {
   //@@ Insert code to implement matrix multiplication here
   // The y dimension of the launch selects the row, the x dimension the column.
   int Row = blockIdx.y * blockDim.y + threadIdx.y;
   int Col = blockIdx.x * blockDim.x + threadIdx.x;
   // Inner dimensions disagree: every thread exits without writing to C.
   if (numAColumns != numBRows) return ;
   // Threads that fall outside the output (partial blocks) do nothing.
   if ((Row < numARows) && (Col < numBColumns)){
       float Cvalue = 0;
       for (int k = 0 ; k < numAColumns ; ++k )
       // NOTE(review): B is stepped with a row stride of numBRows. If B is
       // row-major like A, the stride would be numBColumns; the two only
       // coincide when B is square -- likely part of the reported failure.
       Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
       // NOTE(review): C is written with a row stride of numAColumns, whereas
       // the host allocates C as numCRows x numCColumns (= numBColumns wide).
       // These match only when numAColumns == numBColumns.
       C[Row*numAColumns + Col] = Cvalue;
     }

    }



// Host driver as posted in the question (kept unchanged on purpose).
// Flow: import A and B via the wb helpers, size C as numARows x numBColumns,
// allocate and fill device buffers, launch matrixMultiply with 8x8 blocks,
// copy C back, hand it to wbSolution, and release all buffers.
// NOTE(review): none of the CUDA API return codes below are inspected.
int main(int argc, char ** argv) {
   wbArg_t args;
   float * hostA; // The A matrix
   float * hostB; // The B matrix
   float * hostC; // The output C matrix
   float * deviceA;
   float * deviceB;
   float * deviceC;
   int numARows; // number of rows in the matrix A
   int numAColumns; // number of columns in the matrix A
   int numBRows; // number of rows in the matrix B
   int numBColumns; // number of columns in the matrix B
   int numCRows; // number of rows in the matrix C (you have to set this)
   int numCColumns; // number of columns in the matrix C (you have to set this)

   args = wbArg_read(argc, argv);

   wbTime_start(Generic, "Importing data and creating memory on host");
   // wbImport fills in the row/column counts through the two out-pointers.
   hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
   hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
   //@@ Set numCRows and numCColumns  
   // The zero assignments are immediately overwritten by the two lines after them.
   numCRows = 0;
   numCColumns = 0;
   numCRows = numARows;
   numCColumns = numBColumns;  
   //@@ Allocate the hostC matrix
   hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);  
   wbTime_stop(Generic, "Importing data and creating memory on host");

   wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
   wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

   wbTime_start(GPU, "Allocating GPU memory.");
   //@@ Allocate GPU memory here
   // One device buffer per matrix, each sized rows * columns floats.
   cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
   cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
   cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);  

   wbTime_stop(GPU, "Allocating GPU memory.");

   wbTime_start(GPU, "Copying input memory to the GPU.");
   //@@ Copy memory to the GPU here

   cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
   cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
   wbTime_stop(GPU, "Copying input memory to the GPU.");

   //@@ Initialize the grid and block dimensions here

   // NOTE(review): two things to confirm here.
   //  1. Integer division truncates, so any dimension that is not a multiple
   //     of 8 leaves its trailing rows/columns without threads (and a
   //     dimension below 8 yields a grid extent of 0).
   //  2. The kernel maps blockIdx.x to the column and blockIdx.y to the row,
   //     but here the x extent is derived from numARows and the y extent from
   //     numBColumns, which only lines up when the output is square.
   dim3 DimGrid(numARows / 8 , numBColumns / 8, 1);
   dim3 DimBlock(8 , 8, 1);

   wbTime_start(Compute, "Performing CUDA computation");

   //@@ Launch the GPU Kernel here
   matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);  

   // Block the host until the kernel has finished so the timer covers the
   // computation rather than just the launch.
   cudaThreadSynchronize();
   wbTime_stop(Compute, "Performing CUDA computation");

   wbTime_start(Copy, "Copying output memory to the CPU");
   //@@ Copy the GPU memory back to the CPU here
   cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);  

   wbTime_stop(Copy, "Copying output memory to the CPU");

   wbTime_start(GPU, "Freeing GPU Memory");
   //@@ Free the GPU memory here

   cudaFree(deviceA);
   cudaFree(deviceB);
   cudaFree(deviceC);
   wbTime_stop(GPU, "Freeing GPU Memory");

   // Hand the computed matrix and its dimensions to the course harness.
   wbSolution(args, hostC, numCRows, numCColumns);

   free(hostA);
   free(hostB);
   free(hostC);

   return 0;
}

我希望你能帮我找到哪个部分不正确。

I hope you can help me to find which part is incorrect.

推荐答案

在Ira，Ahmad，ram和Oli Fly的帮助下，我得到了如下的正确答案：

After the help of Ira, Ahmad, ram, and Oli Fly, I got the correct answer as follows:

#include    <wb.h>

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, logs the text of the failing statement together with the
// runtime's human-readable error string, then makes the *enclosing function*
// return -1. Because it expands to a bare `return -1;` it may only be used
// inside functions that return an int.
// `stmt` is parenthesised so that any expression can be passed safely.
#define wbCheck(stmt) do {                                                  \
        cudaError_t err = (stmt);                                           \
        if (err != cudaSuccess) {                                           \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                     \
            wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));  \
            return -1;                                                      \
        }                                                                   \
    } while(0)

// Compute C = A * B
//
// A is numARows x numAColumns, B is numBRows x numBColumns and C is
// numCRows x numCColumns; all three are indexed as dense row-major arrays
// (row stride == number of columns).
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension spans the
// columns of C and the y dimension spans its rows. Each thread produces at
// most one element of C; surplus threads in partial blocks do nothing.
// No shared memory is used.
//
// If the inner dimensions disagree (numAColumns != numBRows) the kernel
// returns without touching C.
__global__ void matrixMultiply(float * A, float * B, float * C,
                   int numARows, int numAColumns,
                   int numBRows, int numBColumns,
                   int numCRows, int numCColumns) {
    // The product is undefined when the inner dimensions differ.
    if (numAColumns != numBRows) return;

    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the mathematical result have nothing to compute.
    if (row >= numARows || col >= numBColumns) return;
    // Never write outside the buffer the caller described for C, even if the
    // C dimensions passed in are smaller than numARows x numBColumns.
    if (row >= numCRows || col >= numCColumns) return;

    // Offsets are computed in size_t so that row * columns cannot overflow
    // a 32-bit int for large matrices.
    const size_t aRowStart = (size_t)row * (size_t)numAColumns;

    float acc = 0.0f;
    for (int k = 0; k < numAColumns; ++k) {
        acc += A[aRowStart + k] * B[(size_t)k * (size_t)numBColumns + col];
    }

    C[(size_t)row * (size_t)numCColumns + col] = acc;
}

// Host driver: imports A and B through the wb helpers, computes C = A * B on
// the GPU with matrixMultiply, and hands the result to wbSolution.
//
// Returns 0 on success and -1 when the input shapes are incompatible, the
// host allocation fails, or any CUDA call reports an error (via wbCheck).
// On those early-exit paths device buffers are not released explicitly; the
// process is terminating and the driver reclaims them.
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)

    // Threads per block along each axis; the grid below is derived from it.
    const int tileWidth = 8;

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    // wbImport fills in the row/column counts through the two out-pointers.
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    numCRows = numARows;
    numCColumns = numBColumns;

    // Buffer sizes in bytes; sizeof() comes first so the arithmetic is size_t.
    const size_t bytesA = sizeof(float) * numARows * numAColumns;
    const size_t bytesB = sizeof(float) * numBRows * numBColumns;
    const size_t bytesC = sizeof(float) * numCRows * numCColumns;

    //@@ Allocate the hostC matrix
    hostC = (float*) malloc(bytesC);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    // The kernel silently writes nothing when the inner dimensions differ,
    // which would leave C uninitialised. Reject that case up front instead.
    if (numAColumns != numBRows) {
        wbLog(ERROR, "Inner dimensions of A and B do not match");
        free(hostA);
        free(hostB);
        free(hostC);
        return -1;
    }

    // malloc(0) may legitimately return NULL, so only treat NULL as a failure
    // when a non-empty buffer was requested.
    if (bytesC > 0 && hostC == NULL) {
        wbLog(ERROR, "Failed to allocate host memory for C");
        free(hostA);
        free(hostB);
        return -1;
    }

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void**)&deviceA, bytesA));
    wbCheck(cudaMalloc((void**)&deviceB, bytesB));
    wbCheck(cudaMalloc((void**)&deviceC, bytesC));

    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here

    wbCheck(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here

    // Round up so partial tiles at the right/bottom edge still get a block.
    // x covers the columns of C and y its rows, matching the kernel's mapping.
    dim3 DimGrid((numCColumns - 1) / tileWidth + 1, (numCRows - 1) / tileWidth + 1, 1);
    dim3 DimBlock(tileWidth, tileWidth, 1);

    wbTime_start(Compute, "Performing CUDA computation");

    //@@ Launch the GPU Kernel here
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);
    // A launch does not return a status; an invalid configuration only shows
    // up through cudaGetLastError().
    wbCheck(cudaGetLastError());

    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize. It
    // blocks until the kernel finishes and reports any fault raised inside it.
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost));

    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here

    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");

    // Hand the computed matrix and its dimensions to the course harness.
    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}

这篇关于在CUDA中乘以矩形矩阵的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

在CUDA中乘以矩形矩阵 [英] Multiply Rectangular Matrices in CUDA

问题描述

推荐答案

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录关闭

在CUDA中乘以矩形矩阵 [英] Multiply Rectangular Matrices in CUDA

问题描述

推荐答案

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录 关闭

登录关闭