将指针数组复制到设备内存并返回(CUDA) [英] Copying array of pointers into device memory and back (CUDA)

查看:24
本文介绍了将指针数组复制到设备内存并返回(CUDA)的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试在我的玩具示例中使用 cublas 函数 cublasSgemmBatched.在本例中,我首先分配二维数组:h_AA, h_BB 大小为 [6][5] 和 h_CC 的大小为 [6][1].之后我将它复制到设备,执行 cublasSgemmBatched 并尝试将数组 d_CC 复制回主机数组 h_CC.但是,我在设备到主机复制时遇到错误 (cudaErrorLaunchFailure),我不确定我是否将数组正确复制到设备中:

int main(){cublasHandle_t 句柄;cudaError_t cudaerr;cudaEvent_t 开始,停止;cublasStatus_t 统计;常量浮动阿尔法= 1.0f;常量浮动 beta = 0.0f;浮动**h_AA,**h_BB,**h_CC;h_AA = 新浮点数*[6];h_BB = 新浮点数*[6];h_CC = 新浮点数*[6];for (int i = 0; i < 6; i++){h_AA[i] = 新浮点数[5];h_BB[i] = 新浮点数[5];h_CC[i] = 新浮点数[1];for (int j = 0; j < 5; j++){h_AA[i][j] = j;h_BB[i][j] = j;}h_CC[i][0] = 1;}浮动**d_AA,**d_BB,**d_CC;cudaMalloc(&d_AA, 6 * sizeof(float*));cudaMalloc(&d_BB, 6 * sizeof(float*));cudaMalloc(&d_CC, 6 * sizeof(float*));cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);stat = cublasCreate(&handle);stat = cublasSgemmBatched(句柄, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);cublasDestroy(句柄);}

所以这段代码可以工作,但是最后一个 cudaerr 返回 cudaErrorLaunchFailure.我试图在 Github 上遵循此示例代码.p>

谢谢

附:我不明白的是, sizeof(float*) 是什么以及 cudaMalloc 如何知道每个数组需要多少内存(就像这里我确定一维的大小仅).

更新:我做到了!:

cublasHandle_t 句柄;cudaError_t cudaerr;cudaEvent_t 开始,停止;cublasStatus_t 统计;常量浮动阿尔法= 1.0f;常量浮动 beta = 0.0f;浮动 *h_A = 新浮动 [5];浮动 *h_B = 新浮动 [5];浮动 *h_C = 新浮动 [6];for (int i = 0; i <5; i++){h_A[i] = i;h_B[i] = i;}浮动**h_AA,**h_BB,**h_CC;h_AA = (float**)malloc(6* sizeof(float*));h_BB = (float**)malloc(6 * sizeof(float*));h_CC = (float**)malloc(6 * sizeof(float*));for (int i = 0; i < 6; i++){cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));cudaMalloc((void **)&h_CC[i], sizeof(float));cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);}浮动**d_AA,**d_BB,**d_CC;cudaMalloc(&d_AA, 6 * sizeof(float*));cudaMalloc(&d_BB, 6 * sizeof(float*));cudaMalloc(&d_CC, 6 * sizeof(float*));cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);stat = cublasCreate(&handle);stat = cublasSgemmBatched(句柄, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);for (int i = 0; i <6;i++)cudaMemcpy(h_C+i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);cublasDestroy(句柄);

解决方案

所以,我找到了答案(感谢@Robert Crovella):为了创建 device array of pointers to device array(对于批处理函数),应首先创建指向设备数组的指针的主机数组,然后将其复制到指向设备数组的指针的设备数组.传回主机也是如此:应该使用中间指向设备数组的指针的主机数组.

cublasHandle_t 句柄;cudaError_t cudaerr;cudaEvent_t 开始,停止;cublasStatus_t 统计;常量浮动阿尔法= 1.0f;常量浮动 beta = 0.0f;浮动 *h_A = 新浮动 [5];浮动 *h_B = 新浮动 [5];浮动 *h_C = 新浮动 [6];for (int i = 0; i <5; i++){h_A[i] = i;h_B[i] = i;}浮动**h_AA,**h_BB,**h_CC;h_AA = (float**)malloc(6* sizeof(float*));h_BB = (float**)malloc(6 * sizeof(float*));h_CC = (float**)malloc(6 * sizeof(float*));for (int i = 0; i < 6; i++){cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));cudaMalloc((void **)&h_CC[i], sizeof(float));cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);}浮动**d_AA,**d_BB,**d_CC;cudaMalloc(&d_AA, 6 * sizeof(float*));cudaMalloc(&d_BB, 6 * sizeof(float*));cudaMalloc(&d_CC, 6 * sizeof(float*));cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);stat = cublasCreate(&handle);stat = cublasSgemmBatched(句柄, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);for (int i = 0; i <6;i++)cudaMemcpy(h_C+i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);cublasDestroy(句柄);

I am trying to use cublas function cublasSgemmBatched in my toy example. In this example I first allocate 2D arrays: h_AA, h_BB of the size [6][5] and h_CC of the size [6][1]. After that I copied it to the device, performed cublasSgemmBatched and tried to copy array d_CC back to the host array h_CC. However, I got a error (cudaErrorLaunchFailure) with device to host copying and I am not sure that I copied arrays into the device correctly:

int main(){
    cublasHandle_t handle;
    cudaError_t cudaerr;
    cudaEvent_t start, stop;
    cublasStatus_t stat;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    float **h_AA, **h_BB, **h_CC;
    h_AA = new float*[6];
    h_BB = new float*[6];
    h_CC = new float*[6];
    for (int i = 0; i < 6; i++){
        h_AA[i] = new float[5];
        h_BB[i] = new float[5];
        h_CC[i] = new float[1];
        for (int j = 0; j < 5; j++){
            h_AA[i][j] = j;
            h_BB[i][j] = j;
        }
        h_CC[i][0] = 1;
    }
    float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));
    cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
             (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);
}

So this code works, however the last cudaerr returns cudaErrorLaunchFailure. I was trying to follow this sample code on Github.

Thanks

P.S. What I don't understand, what is the sizeof(float*) and how cudaMalloc knows how many memory required for each array (like here I determine the size of 1 dimension only).

UPDATE: I did it!!:

cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;

float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}



float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6* sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha, 
             (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 6;i++)
        cudaMemcpy(h_C+i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);

解决方案

So, I figured out the answer (thanks to @Robert Crovella): in order to create device array of pointers to device arrays (for batched functions), one should first create host array of pointers to device arrays, and after that copy it into device array of pointers to device arrays. The same is true about transfering back to host: one should use intermediate host array of pointers to device arrays.

cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;

float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}



float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6* sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha, 
             (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 6;i++)
        cudaMemcpy(h_C+i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);

这篇关于将指针数组复制到设备内存并返回(CUDA)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆