Concurrency in CUDA multi-GPU executions


Problem Description

I'm running a CUDA kernel function on a multi-GPU system with 4 GPUs. I expected the kernels to be launched concurrently, but they are not: I measure the starting time of each kernel, and the second kernel starts only after the first one has finished its execution. So launching the kernel on 4 GPUs is not faster than on a single GPU.

How can I make them work concurrently?

This is my code:

cudaSetDevice(0);
GPU_kernel<<< gridDim, threadsPerBlock >>>(d_result_0, parameterA +(0*rateA), parameterB + (0*rateB));
cudaMemcpyAsync(h_result_0, d_result_0, mem_size_result, cudaMemcpyDeviceToHost);

cudaSetDevice(1);
GPU_kernel<<< gridDim, threadsPerBlock >>>(d_result_1, parameterA +(1*rateA), parameterB + (1*rateB));
cudaMemcpyAsync(h_result_1, d_result_1, mem_size_result, cudaMemcpyDeviceToHost);

cudaSetDevice(2);
GPU_kernel<<< gridDim, threadsPerBlock >>>(d_result_2, parameterA +(2*rateA), parameterB + (2*rateB));
cudaMemcpyAsync(h_result_2, d_result_2, mem_size_result, cudaMemcpyDeviceToHost);

cudaSetDevice(3);
GPU_kernel<<< gridDim, threadsPerBlock >>>(d_result_3, parameterA +(3*rateA), parameterB + (3*rateB));
cudaMemcpyAsync(h_result_3, d_result_3, mem_size_result, cudaMemcpyDeviceToHost);

Solution

I have done some experiments on achieving concurrent execution on a cluster of 4 Kepler K20c GPUs. I have considered 8 test cases, whose corresponding codes along with the profiler timelines are reported below.
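All the test cases below include "Utilities.cuh", which is not reported in this answer. A minimal sketch of the gpuErrchk macro and iDivUp helper they rely on (an assumption about what that header provides, not its actual contents) is:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// --- Assumed error-checking wrapper: print and abort on the first failing CUDA call
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// --- Assumed integer ceiling division, used to size the kernel grids
inline int iDivUp(int a, int b) { return (a + b - 1) / b; }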

Test case #1 - "Breadth-first" approach - synchronous copy

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
    T *d_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    double *inputMatrices = (double *)malloc(N * sizeof(double));

    // --- "Breadth-first" approach - no async
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpy(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpy(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

As can be seen, the use of cudaMemcpy does not enable concurrency in the copies, but concurrency is achieved in the kernel executions.

Test case #2 - "Depth-first" approach - synchronous copy

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
    T *d_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    double *inputMatrices = (double *)malloc(N * sizeof(double));

    // --- "Depth-first" approach - no async
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpy(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
        gpuErrchk(cudaMemcpy(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

This time, concurrency is achieved neither within the memory copies nor within the kernel executions: the blocking cudaMemcpy calls force each loop iteration to complete before any work is issued to the next device.

Test case #3 - "Depth-first" approach - asynchronous copy with streams

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
    T               *d_data;
    T               *h_data;
    cudaStream_t    stream;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
    gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
    gpuErrchk(cudaStreamCreate(&plan.stream));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

     // --- "Depth-first" approach - async
    for (int k = 0; k < numGPUs; k++)
    {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice, plan[k].stream));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE, 0, plan[k].stream>>>(plan[k].d_data, NperGPU);
        gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost, plan[k].stream));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

Concurrency is achieved, as expected.
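The snippet above issues the copies and kernels and then calls cudaDeviceReset() without explicitly waiting on the streams. A possible epilogue (my addition, not part of the original code) that waits on each device and releases the per-plan resources before the reset could be:

for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    gpuErrchk(cudaStreamSynchronize(plan[k].stream));   // plan[k].h_data now holds valid results
    gpuErrchk(cudaStreamDestroy(plan[k].stream));
    gpuErrchk(cudaFree(plan[k].d_data));
    gpuErrchk(cudaFreeHost(plan[k].h_data));
}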

Test case #4 - "Depth-first" approach - asynchronous copy within default streams

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
    T               *d_data;
    T               *h_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
    gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Depth-first" approach - no stream
    for (int k = 0; k < numGPUs; k++)
    {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
        gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

Despite using the default stream, concurrency is achieved: the default stream is a per-device entity, so commands issued to different devices do not serialize against each other.

Test case #5 - "Depth-first" approach - asynchronous copy within the default stream and a unique cudaMallocHost'ed host vector

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
    T               *d_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Depth-first" approach - no stream
    double *inputMatrices;   gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
    for (int k = 0; k < numGPUs; k++)
    {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
        gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

Concurrency is achieved once again.

Test case #6 - "Breadth-first" approach - asynchronous copy with streams

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
    T               *d_data;
    T               *h_data;
    cudaStream_t    stream;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
    gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
    gpuErrchk(cudaStreamCreate(&plan.stream));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Breadth-first" approach - async
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice, plan[k].stream));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE, 0, plan[k].stream>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost, plan[k].stream));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

Concurrency is achieved, as in the corresponding "depth-first" approach.

Test case #7 - "Breadth-first" approach - asynchronous copy within default streams

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
    T               *d_data;
    T               *h_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
    gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Breadth-first" approach - async
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

Concurrency is achieved, as in the corresponding "depth-first" approach.

Test case #8 - "Breadth-first" approach - asynchronous copy within the default stream and a unique cudaMallocHost'ed host vector

- Code -

#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
    T               *d_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs   = 4;
    const int NperGPU   = 500000;
    const int N         = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Breadth-first" approach - async
    double *inputMatrices;   gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}

- Profiler timeline -

Concurrency is achieved, as in the corresponding "depth-first" approach.

Conclusion: Using asynchronous copies guarantees concurrent execution, whether purposely created streams or the default stream is used.

Note: In all the above examples, I have taken care to provide the GPUs with enough work to do, both in terms of copies and of computing tasks. Failing to provide enough work to the cluster may prevent observing concurrent execution.
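Coming back to the snippet in the question, a "breadth-first" asynchronous restructuring could look like the sketch below. It rests on assumptions not stated in the question: the per-GPU pointers are gathered into d_result[] and h_result[] arrays, each d_result[k] is taken to have been cudaMalloc'ed with device k current, and each h_result[k] is taken to be pinned with cudaMallocHost so that cudaMemcpyAsync can actually overlap; a per-device synchronization is added before the results are consumed.

for (int k = 0; k < 4; k++) {
    cudaSetDevice(k);
    // Kernel launches are asynchronous, so the host moves on to the next GPU immediately
    GPU_kernel<<<gridDim, threadsPerBlock>>>(d_result[k], parameterA + k * rateA, parameterB + k * rateB);
    cudaMemcpyAsync(h_result[k], d_result[k], mem_size_result, cudaMemcpyDeviceToHost);
}

for (int k = 0; k < 4; k++) {
    cudaSetDevice(k);
    cudaDeviceSynchronize();    // wait for every GPU before touching h_result[]
}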
