无法在CUDA中执行设备内核 [英] Unable to execute device kernel in CUDA

查看：200 发布时间：2017/3/4 14:07:28 cuda nvidia matrix-multiplication gpu-programming

本文介绍了无法在CUDA中执行设备内核的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我试图在全局内核中调用设备内核。我的全局内核是一个矩阵乘法，我的设备内核在乘积矩阵的每一列中找到最大值和索引。以下是代码：

I am trying to call a device kernel within a global kernel. My global kernel is a Matrix Multiplication and my device kernel is finding the maximum value and the index in each column of the product matrix. Following is the code :

__device__ void MaxFunction(float* Pd, float* max)
{
  int x = (threadIdx.x + blockIdx.x * blockDim.x);  
  int y = (threadIdx.y + blockIdx.y * blockDim.y); 
  int k = 0;
  int temp = 0; int temp_idx = 0;
  for (k = 0; k < wB; ++k) {
   if(Pd[x*wB + y] > temp){
    temp = Pd[x*wB + y];
    temp_idx = x*wB + y;
   }
       max[y*2 + 0] = temp;
       max[y*2 + 1] = temp_idx;
  }
}

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
  // declare cache in the shared memory
  __shared__ float Mds[blockD][blockD];
  __shared__ float Nds[blockD][blockD];

  float Pvalue = 0;
  // Loop over the Md and Nd block dimension required to compute the Pd element
  for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x); 
                            m < ((wA * blockD * blockIdx.y)+wA-1); 
                                        m += blockD, n += (blockD*hB)){

    // collaboratively loading of Md and Nd blocks into shared memory    
    Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
    Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
    __syncthreads();

    // keep track of the running sum    
    for (int k = 0; k < blockD; k++)
      Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
    __syncthreads();
  }

  // write back to the global memory
  int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
  Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
  __syncthreads();

  MaxFunction(Pd, max);

}

主代码：

#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>

#define blockD 32


const int wA = 128;
const int hA = 1024;

const int wB = 128;
const int hB = wA;

main(void){

    void MatrixMultiplication(float *, float *, float *, float *);

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *M, *N, *P, *C;   


    // allocate memory on the CPU
    M = (float*)malloc(size_A);
    N = (float*)malloc(size_B);
    P = (float*)malloc(size_max);
    C = (float*)malloc(size_C);

    // initialize the matrices
    for (int y=0; y < hA; y++) {
        for (int x=0; x < wA; x++){
            M[y*wA + x] = x;
       }
    }

    for (int y=0; y<hB; y++) {
        for (int x=0; x<wB; x++){
            N[y*wB + x] = x;
       }
    }

    MatrixMultiplication(M, N, P, C);

    //Write
    FILE *f1;
    int i, j;
    f1 = fopen("max_val.txt","w");
    for(i=0; i < (wB * 2); i+=2){
    fprintf(f1,"%d\t%d\n",int(P[i]),int(P[i+1]));
    }
    fclose(f1);

    f1 = fopen("Prod_mat.txt","w");
    for(i=0; i < 2; i++){
    for(j=0; j < wB; j++){
        fprintf(f1,"%d\t",int(C[i*wB + j]));
    }
    fprintf(f1,"\n");
    }
    fclose(f1);

    free( M );
    free( N );
    free( P ); 
            free( C );

    cudaDeviceReset();
    return 0;
}


void MatrixMultiplication(float *M, float *N, float *P, float *C) {

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *Md, *Nd, *Pd, *max; 

    // allocate memory on the GPU
    cudaMalloc((void**)&Md, size_A);
    cudaMalloc((void**)&Nd, size_B);
    cudaMalloc((void**)&Pd, size_C);
    cudaMalloc((void**)&max, size_max);

    // transfer M and N to device memory
    cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);

    // kernel invocation code
    dim3 dimBlock(blockD, blockD);
    dim3 dimGrid(wA/blockD, hB/blockD);

    //Execute Kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);

    // transfer P from device    
    cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);

    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    cudaFree(max);
}

矩阵乘法结果很好（使用Matlab验证），但我不是能够获得最大值及其相应的索引。我很感激，如果有人可以指出我做错了什么。当我运行上面的代码时，max变量只有垃圾。

The Matrix Multiplication result is fine (Verified using Matlab), but I am not able to get the max values and their corresponding index. I would appreciate if anyone can kindly point out at what I am doing wrong. The max variable has only garbage when I run the above code.

无法在CUDA中执行设备内核 [英] Unable to execute device kernel in CUDA

问题描述

推荐答案

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录关闭

无法在CUDA中执行设备内核 [英] Unable to execute device kernel in CUDA

问题描述

推荐答案

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录 关闭

登录关闭