将指针数组复制到设备内存并返回(CUDA) [英] Copying array of pointers into device memory and back (CUDA)
问题描述
我正在尝试在我的玩具示例中使用 cublas
函数 cublasSgemmBatched
.在本例中,我首先分配二维数组:h_AA, h_BB
大小为 [6][5] 和 h_CC 的大小为 [6][1
].之后我将它复制到设备,执行 cublasSgemmBatched
并尝试将数组 d_CC
复制回主机数组 h_CC
.但是,我在设备到主机复制时遇到错误 (cudaErrorLaunchFailure
),我不确定我是否将数组正确复制到设备中:
int main(){
    cublasHandle_t handle;
    cudaError_t cudaerr;
    cudaEvent_t start, stop;
    cublasStatus_t stat;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    float **h_AA, **h_BB, **h_CC;
    h_AA = new float*[6];
    h_BB = new float*[6];
    h_CC = new float*[6];
    for (int i = 0; i < 6; i++){
        h_AA[i] = new float[5];
        h_BB[i] = new float[5];
        h_CC[i] = new float[1];
        for (int j = 0; j < 5; j++){
            h_AA[i][j] = j;
            h_BB[i][j] = j;
        }
        h_CC[i][0] = 1;
    }
    float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));
    cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
        (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);
}
所以这段代码可以工作,但是最后一个 cudaerr
返回 cudaErrorLaunchFailure
.我试图遵循 GitHub 上的此示例代码.
谢谢
附:我不明白的是, sizeof(float*)
是什么以及 cudaMalloc
如何知道每个数组需要多少内存（比如这里我只指定了一个维度的大小）.
更新:我做到了!:
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
    (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
所以,我找到了答案(感谢@Robert Crovella):为了创建 device array of pointers to device array
(对于批处理函数),应首先创建指向设备数组的指针的主机数组
,然后将其复制到指向设备数组的指针的设备数组
.传回主机也是如此:应该使用中间指向设备数组的指针的主机数组
.
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
    (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
I am trying to use cublas
function cublasSgemmBatched
in my toy example. In this example I first allocate 2D arrays: h_AA, h_BB
of the size [6][5] and h_CC of the size [6][1
]. After that I copied it to the device, performed cublasSgemmBatched
and tried to copy array d_CC
back to the host array h_CC
. However, I got an error (cudaErrorLaunchFailure
) with device to host copying and I am not sure that I copied arrays into the device correctly:
// Buggy original from the question: attempts a batched 1x5 * 5x1 GEMM.
// BUG: h_AA/h_BB/h_CC hold HOST pointers (from new[]), and those host
// addresses are copied verbatim into the device pointer tables below.
// cublasSgemmBatched then dereferences host addresses on the device, which
// faults; the asynchronous failure surfaces as cudaErrorLaunchFailure at the
// next synchronizing call (the final blocking cudaMemcpy).
int main(){
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop; // declared but never used in this snippet
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
// Host array-of-pointers; every element points into HOST heap memory.
float **h_AA, **h_BB, **h_CC;
h_AA = new float*[6];
h_BB = new float*[6];
h_CC = new float*[6];
for (int i = 0; i < 6; i++){
h_AA[i] = new float[5];
h_BB[i] = new float[5];
h_CC[i] = new float[1];
for (int j = 0; j < 5; j++){
h_AA[i][j] = j;
h_BB[i][j] = j;
}
h_CC[i][0] = 1;
}
// Device pointer tables: room for 6 pointers each, NOT for the row data.
// (This answers the P.S.: cudaMalloc allocates exactly the byte count it is
// given -- 6 * sizeof(float*) -- and knows nothing about what the pointers
// are supposed to reference.)
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
// BUG: copies the HOST row pointers into device memory; the device now holds
// addresses it cannot legally dereference. The row data itself never moves.
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
// Batched GEMM: C(1x1) = A(1x5) * B(5x1), batchCount = 6 (column-major).
// Faults on the device because the pointer tables contain host addresses.
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
// The earlier launch failure is reported by this blocking call
// (cudaErrorLaunchFailure). Even if the GEMM had worked, this would copy
// back 6 device POINTERS over the host pointer table, not the 6 results.
cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
}
So this code works, however the last cudaerr
returns cudaErrorLaunchFailure
. I was trying to follow this sample code on Github.
Thanks
P.S. What I don't understand, what is the sizeof(float*)
and how cudaMalloc
knows how many memory required for each array (like here I determine the size of 1 dimension only).
UPDATE: I did it!!:
// Working approach from the asker's UPDATE: build a HOST array whose elements
// are DEVICE pointers, then copy that pointer table into a device array for
// cublasSgemmBatched.
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop; // declared but never used in this snippet
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
// Host source vectors (reused for every batch entry) and result buffer.
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
h_A[i] = i;
h_B[i] = i;
}
// h_AA/h_BB/h_CC live on the HOST, but each element is a DEVICE pointer
// returned by cudaMalloc below.
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6* sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
cudaMalloc((void **)&h_CC[i], sizeof(float));
cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
// Device-side pointer tables, filled from the host tables above. Now the
// device holds pointers it can legally dereference.
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
// C(1x1) = A(1x5) * B(5x1), batchCount = 6 (cuBLAS is column-major).
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
// NOTE(review): this copy is spurious -- it writes sizeof(float) (4) bytes of
// the device pointer table over the start of h_CC[0]. It only appears
// harmless because d_CC already mirrors h_CC; it should be removed.
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
// Fetch each scalar result through the HOST copy of the device pointers.
for (int i = 0; i < 6;i++)
cudaMemcpy(h_C+i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
// NOTE(review): all device buffers and host allocations leak here; real code
// should cudaFree the per-batch buffers and tables, free() the host tables,
// and delete[] h_A/h_B/h_C.
cublasDestroy(handle);
So, I figured out the answer (thanks to @Robert Crovella): in order to create device array of pointers to device arrays
(for batched functions), one should first create host array of pointers to device arrays
, and after that copy it into device array of pointers to device arrays
. The same is true about transfering back to host: one should use intermediate host array of pointers to device arrays
.
// Final answer: batched C(1x1) = A(1x5) * B(5x1), batchCount = 6, via
// cublasSgemmBatched. The batched API expects a DEVICE array of DEVICE
// pointers, so we first build a HOST array of device pointers, then copy that
// table to the device. Transfer back goes through the same host-side table.
// Fixes vs. the asker's version: removed the spurious sizeof(float) memcpy
// from d_CC into h_CC (it clobbered the low bytes of h_CC[0]), removed the
// unused cudaEvent_t locals, and released all allocations at the end.
cublasHandle_t handle;
cudaError_t cudaerr;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;

// Host source vectors (reused for every batch entry) and result buffer.
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}

// Host arrays whose elements are DEVICE pointers (one buffer per batch entry).
float **h_AA = (float**)malloc(6 * sizeof(float*));
float **h_BB = (float**)malloc(6 * sizeof(float*));
float **h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}

// Device-side pointer tables, filled from the host tables above.
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);

stat = cublasCreate(&handle);
// C = A * B per batch entry; m=1, n=1, k=5 (cuBLAS is column-major).
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
    (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);

// Copy each scalar result back through the HOST copy of the device pointers;
// these blocking copies also synchronize with the batched GEMM.
for (int i = 0; i < 6; i++)
    cudaerr = cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);

cublasDestroy(handle);

// Release device buffers, pointer tables, and host allocations.
for (int i = 0; i < 6; i++){
    cudaFree(h_AA[i]);
    cudaFree(h_BB[i]);
    cudaFree(h_CC[i]);
}
cudaFree(d_AA);
cudaFree(d_BB);
cudaFree(d_CC);
free(h_AA);
free(h_BB);
free(h_CC);
delete[] h_A;
delete[] h_B;
delete[] h_C;
这篇关于将指针数组复制到设备内存并返回(CUDA)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!