将指针数组复制到设备内存并复制回来(CUDA) [英] Copying array of pointers into device memory and back (CUDA)
问题描述
我想在我的玩具示例中使用 CUBLAS 函数 cublasSgemmBatched。
在这个例子中,我首先分配了二维数组:大小为 [6][5] 的 h_AA、h_BB,
以及大小为 [6][1] 的 h_CC。
之后,我把它们复制到设备上,执行 cublasSgemmBatched,
并试图把数组 d_CC 复制回主机数组 h_CC。
但是,在设备到主机的复制中我得到了一个错误(cudaErrorLaunchFailure),
而且我不确定我是否正确地把数组复制到了设备上:
int main(){
    cublasHandle_t handle;
    cudaError_t cudaerr;
    cudaEvent_t start, stop;
    cublasStatus_t stat;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    float **h_AA, **h_BB, **h_CC;
    h_AA = new float*[6];
    h_BB = new float*[6];
    h_CC = new float*[6];
    for (int i = 0; i < 6; i++){
        h_AA[i] = new float[5];
        h_BB[i] = new float[5];
        h_CC[i] = new float[1];
        for (int j = 0; j < 5; j++){
            h_AA[i][j] = j;
            h_BB[i][j] = j;
        }
        h_CC[i][0] = 1;
    }
    float **d_AA, **d_BB, **d_CC;
    cudaMalloc(&d_AA, 6 * sizeof(float*));
    cudaMalloc(&d_BB, 6 * sizeof(float*));
    cudaMalloc(&d_CC, 6 * sizeof(float*));
    cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
    stat = cublasCreate(&handle);
    stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
               (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
    cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
    cublasDestroy(handle);
}
这段代码可以运行,但最后的 cudaerr
返回 cudaErrorLaunchFailure。
我试图遵循 Github 上的这个示例代码。
感谢
P.S. 我不明白的是,sizeof(float*) 是什么,
以及 cudaMalloc
如何知道每个数组需要多少内存(比如这里我只确定了一个维度的大小)。
更新:我做到了!
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
           (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
所以,我找到了答案(感谢 @Robert Crovella):为了创建指向设备数组的指针的设备数组
(用于批处理函数),应该先创建指向设备数组的指针的主机数组,
然后把它复制到指向设备数组的指针的设备数组中。
传回主机时也是如此:应该使用中间的指向设备数组的指针的主机数组。
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop;
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6];
for (int i = 0; i < 5; i++)
{
    h_A[i] = i;
    h_B[i] = i;
}
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
    cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
    cudaMalloc((void **)&h_CC[i], sizeof(float));
    cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
           (const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
cudaerr = cudaMemcpy(h_CC, d_CC, sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 6; i++)
    cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
I am trying to use cublas
function cublasSgemmBatched
in my toy example. In this example I first allocate 2D arrays: h_AA, h_BB
of the size [6
][5
] and h_CC
of the size [6
][1
]. After that I copied it to the device, performed cublasSgemmBatched
and tried to copy array d_CC
back to the host array h_CC
. However, I got a error (cudaErrorLaunchFailure
) with device to host copying and I am not sure that I copied arrays into the device correctly:
// NOTE(review): this is the FAILING version from the question, kept byte-for-byte
// as illustration of the bug. h_AA/h_BB/h_CC are HOST arrays whose elements are
// HOST addresses (allocated with new). Copying those addresses into
// d_AA/d_BB/d_CC makes cublasSgemmBatched dereference host memory on the GPU,
// so the batched kernel faults; the fault is reported asynchronously as
// cudaErrorLaunchFailure by the final (synchronizing) cudaMemcpy.
int main(){
// cuBLAS library context; must be created before any cublas* call.
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop; // unused in this example
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
// Host array-of-pointers; each h_XX[i] will point to HOST heap memory.
float **h_AA, **h_BB, **h_CC;
h_AA = new float*[6];
h_BB = new float*[6];
h_CC = new float*[6];
for (int i = 0; i < 6; i++){
h_AA[i] = new float[5]; // HOST buffers — this is the root of the bug
h_BB[i] = new float[5];
h_CC[i] = new float[1];
for (int j = 0; j < 5; j++){
h_AA[i][j] = j;
h_BB[i][j] = j;
}
h_CC[i][0] = 1;
}
float **d_AA, **d_BB, **d_CC;
// Only the 6-entry pointer TABLES live on the device; no device data buffers
// are ever allocated, so sizeof(float*) here covers one pointer, not a row.
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
// BUG: these copy host ADDRESSES (not the float data) into the device tables.
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
// Batched GEMM of six (1x5)x(5x1) products; the kernel dereferences the host
// pointers stored in d_AA/d_BB/d_CC and faults.
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
// This synchronizing copy surfaces the asynchronous kernel fault as
// cudaErrorLaunchFailure; it would also only fetch pointers, not results.
cudaerr = cudaMemcpy(h_CC, d_CC, 6 * sizeof(float*), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
}
So this code works, however the last cudaerr
returns cudaErrorLaunchFailure
. I was trying to follow this sample code on Github.
Thanks
P.S. What I don't understand, what is the sizeof(float*)
and how cudaMalloc
knows how many memory required for each array (like here I determine the size of 1 dimension only).
UPDATE: I did it!!:
// Working version: six independent (1x5)·(5x1) products via cublasSgemmBatched.
// Layout: h_AA/h_BB/h_CC are HOST arrays whose elements are DEVICE pointers;
// d_AA/d_BB/d_CC are DEVICE copies of those pointer tables, which is what the
// batched cuBLAS API requires.
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop; // unused; kept for compatibility with the original
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
// Host-side source data: one A row and one B column, reused for all 6 batches.
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6]; // receives the six scalar results
for (int i = 0; i < 5; i++)
{
h_A[i] = i;
h_B[i] = i;
}
// Host arrays of device pointers.
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
// Each batch gets its own device buffers; the pointers themselves stay on host.
cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
cudaMalloc((void **)&h_CC[i], sizeof(float));
cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
// Device copies of the pointer tables (the arrays cublasSgemmBatched consumes).
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
// C_i (1x1) = A_i (1x5, lda=1) * B_i (5x1, ldb=5), batchCount = 6.
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
// FIX: the original did cudaMemcpy(h_CC, d_CC, sizeof(float), D2H) here, which
// copied 4 bytes of POINTER data over the low bytes of h_CC[0] -- redundant at
// best (the device table holds identical pointer values) and wrong in intent.
// Synchronize instead, so any asynchronous launch failure lands in cudaerr
// before the results are read back.
cudaerr = cudaDeviceSynchronize();
// Gather the six scalar results through the host-side device-pointer table.
for (int i = 0; i < 6; i++)
cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
// Release everything (the original leaked all allocations).
for (int i = 0; i < 6; i++){
cudaFree(h_AA[i]); cudaFree(h_BB[i]); cudaFree(h_CC[i]);
}
cudaFree(d_AA); cudaFree(d_BB); cudaFree(d_CC);
free(h_AA); free(h_BB); free(h_CC);
delete[] h_A; delete[] h_B; delete[] h_C;
So, I figured out the answer (thanks to @Robert Crovella): in order to create device array of pointers to device arrays
(for batched functions), one should first create host array of pointers to device arrays
, and after that copy it into device array of pointers to device arrays
. The same is true about transfering back to host: one should use intermediate host array of pointers to device arrays
.
// Working version: six independent (1x5)·(5x1) products via cublasSgemmBatched.
// Layout: h_AA/h_BB/h_CC are HOST arrays whose elements are DEVICE pointers;
// d_AA/d_BB/d_CC are DEVICE copies of those pointer tables, which is what the
// batched cuBLAS API requires.
cublasHandle_t handle;
cudaError_t cudaerr;
cudaEvent_t start, stop; // unused; kept for compatibility with the original
cublasStatus_t stat;
const float alpha = 1.0f;
const float beta = 0.0f;
// Host-side source data: one A row and one B column, reused for all 6 batches.
float *h_A = new float[5];
float *h_B = new float[5];
float *h_C = new float[6]; // receives the six scalar results
for (int i = 0; i < 5; i++)
{
h_A[i] = i;
h_B[i] = i;
}
// Host arrays of device pointers.
float **h_AA, **h_BB, **h_CC;
h_AA = (float**)malloc(6 * sizeof(float*));
h_BB = (float**)malloc(6 * sizeof(float*));
h_CC = (float**)malloc(6 * sizeof(float*));
for (int i = 0; i < 6; i++){
// Each batch gets its own device buffers; the pointers themselves stay on host.
cudaMalloc((void **)&h_AA[i], 5 * sizeof(float));
cudaMalloc((void **)&h_BB[i], 5 * sizeof(float));
cudaMalloc((void **)&h_CC[i], sizeof(float));
cudaMemcpy(h_AA[i], h_A, 5 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(h_BB[i], h_B, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
// Device copies of the pointer tables (the arrays cublasSgemmBatched consumes).
float **d_AA, **d_BB, **d_CC;
cudaMalloc(&d_AA, 6 * sizeof(float*));
cudaMalloc(&d_BB, 6 * sizeof(float*));
cudaMalloc(&d_CC, 6 * sizeof(float*));
cudaerr = cudaMemcpy(d_AA, h_AA, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_BB, h_BB, 6 * sizeof(float*), cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(d_CC, h_CC, 6 * sizeof(float*), cudaMemcpyHostToDevice);
stat = cublasCreate(&handle);
// C_i (1x1) = A_i (1x5, lda=1) * B_i (5x1, ldb=5), batchCount = 6.
stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, 5, &alpha,
(const float**)d_AA, 1, (const float**)d_BB, 5, &beta, d_CC, 1, 6);
// FIX: the original did cudaMemcpy(h_CC, d_CC, sizeof(float), D2H) here, which
// copied 4 bytes of POINTER data over the low bytes of h_CC[0] -- redundant at
// best (the device table holds identical pointer values) and wrong in intent.
// Synchronize instead, so any asynchronous launch failure lands in cudaerr
// before the results are read back.
cudaerr = cudaDeviceSynchronize();
// Gather the six scalar results through the host-side device-pointer table.
for (int i = 0; i < 6; i++)
cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
// Release everything (the original leaked all allocations).
for (int i = 0; i < 6; i++){
cudaFree(h_AA[i]); cudaFree(h_BB[i]); cudaFree(h_CC[i]);
}
cudaFree(d_AA); cudaFree(d_BB); cudaFree(d_CC);
free(h_AA); free(h_BB); free(h_CC);
delete[] h_A; delete[] h_B; delete[] h_C;
这篇关于将指针数组复制到设备内存并复制回来(CUDA)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!