cuda cpu function - gpu kernel overlap


Question

I am having problems with concurrency in a CUDA application that I am developing in order to practice CUDA. I want to share the work between the GPU and CPU by using the asynchronous behavior of cudaMemcpyAsync and CUDA kernels, but I cannot successfully overlap CPU execution and GPU execution.

It overlaps with the host-to-device data transfer, but kernel execution does not overlap. It basically waits for the CPU to finish and call the synchronization function, and only then does the kernel start executing on the device. I couldn't understand this behavior; aren't kernels always asynchronous to the CPU thread?

My GPU is an Nvidia GeForce GT 550M (Fermi architecture with 1 copy engine and 1 compute engine).
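
For reference, the copy-engine configuration can be queried at runtime from cudaDeviceProp; a minimal standalone sketch (separate from the program below):

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, 0);
    // asyncEngineCount is the number of copy engines: 1 means a single
    // memcpy can overlap kernel execution; 2 means H2D and D2H copies
    // can additionally overlap each other.
    printf("%s: asyncEngineCount = %d\n", props.name, props.asyncEngineCount);
    return 0;
}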

I use CUDA 6.0 and Nsight 4.0.

Here is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdlib.h>
#include <stdio.h>

#include <iostream>
#include <thread>
#include <chrono>
using namespace std;

struct point4D 
{
    float x;
    float y;
    float z;
    float w;
};

void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC);

bool correct_output(point4D * data, unsigned int size);
void flush_buffer(point4D * data, unsigned int size);
void initialize_input(point4D *& data, unsigned int size);
void cudaCheckError(cudaError_t cudaStatus, const char* err);

// Implements the cross product for a 4D point on the GPU side.
__global__ void gpu_kernel(point4D * d_ptrData, point4D * d_out, point4D pB, point4D pC)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    point4D pA = d_ptrData[index];
    point4D out; out.x = 0; out.y = 0; out.z = 0; out.w = 0;

    out.x +=  pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
    out.y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
    out.z +=  pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
    out.w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);

    d_out[index] = out;
}

// Implements the cross product for a 4D point on the CPU side.
void cpu_function(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    for(unsigned int index = 0; index < h_dataSize; index++)
    {
        h_out[index].x = 0; h_out[index].y = 0; h_out[index].z = 0; h_out[index].w = 0;

        point4D pA = h_ptrData[index];

        h_out[index].x +=  pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
        h_out[index].y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
        h_out[index].z +=  pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
        h_out[index].w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);
    }   
}


int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    int device_count;
    cudaCheckError(cudaGetDeviceCount(&device_count), "Couldn't get device count!");

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    devID = 0;
    cudaCheckError(cudaSetDevice(devID), "Couldn't set device!");
    cudaCheckError(cudaGetDeviceProperties(&deviceProps, devID), "Couldn't get Device Properties");
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProps.name, deviceProps.major, deviceProps.minor);

    cudaDeviceReset();

    const unsigned int DATA_SIZE = 30000000;
    bool bFinalResults = true;

    // Input Data Initialization
    point4D pointB;
    pointB.x = 1; pointB.y = 1; pointB.z = 0; pointB.w = 0;

    point4D pointC;
    pointC.x = 1; pointC.y = 1; pointC.z = 1; pointC.w = 0;

    point4D * data = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    point4D * out_points = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    initialize_input(data, DATA_SIZE);
    //

    flush_buffer(out_points, DATA_SIZE);
    cout << endl << endl;

    // 1+way
    heterogenous_1way_plus(data, DATA_SIZE, out_points, pointB, pointC);
    bFinalResults &= correct_output(out_points, DATA_SIZE); // checking correctness

    free(out_points);
    free(data);

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
    return 0;
}

void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    cout << "1-way_plus: STARTS!!!" << endl;

    // Run 25% of the data on the CPU; the rest will be executed on the GPU
    unsigned int ratioPercentCPUtoGPU = 25;
    unsigned int d_dataSize = (h_dataSize * (100 - ratioPercentCPUtoGPU))/100;
    h_dataSize = (h_dataSize * ratioPercentCPUtoGPU)/100;
    size_t memorySize = d_dataSize * sizeof(point4D);

    cout << "Data Ratio Between CPU and GPU:" << (float)ratioPercentCPUtoGPU/100 << endl;
    cout << "CPU will process " << h_dataSize << " data." << endl;
    cout << "GPU will process " << d_dataSize << " data." << endl;

    // register host memory as page-locked (required for cudaMemcpyAsync to be truly asynchronous)
    cudaCheckError(cudaHostRegister(h_ptrData, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");
    cudaCheckError(cudaHostRegister(h_out, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");

    // allocate device memory
    point4D * d_in = 0; point4D * d_out = 0;
    cudaCheckError(cudaMalloc( (void **)&d_in, memorySize), "cudaMalloc failed!");
    cudaCheckError(cudaMalloc( (void **)&d_out, memorySize), "cudaMalloc failed!");

    // set kernel launch configuration
    dim3 nThreads = dim3(1000,1);
    dim3 nBlocks = dim3(d_dataSize / nThreads.x,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t stream;
    cudaCheckError(cudaStreamCreate(&stream), "cudaStreamCreate failed!");

    // create cuda event handles
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start), "cudaEventCreate failed!");
    cudaCheckError(cudaEventCreate(&stop), "cudaEventCreate failed!");

    // main thread waits for device
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");
    float gpu_time = 0.0f;
    cudaEventRecord(start, stream);

    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);       
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);

    cudaEventRecord(stop, stream);

    // The CPU works on the portion of the buffers that comes after the GPU's portion.
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);       

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");

    cudaCheckError(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime failed!");

    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    // release resources
    cudaCheckError(cudaEventDestroy(start), "cudaEventDestroy failed!");
    cudaCheckError(cudaEventDestroy(stop), "cudaEventDestroy failed!");
    cudaCheckError(cudaHostUnregister(h_ptrData), "cudaHostUnregister failed!");
    cudaCheckError(cudaHostUnregister(h_out), "cudaHostUnregister failed!");
    cudaCheckError(cudaFree(d_in), "cudaFree failed!");
    cudaCheckError(cudaFree(d_out), "cudaFree failed!");
    cudaCheckError(cudaStreamDestroy(stream), "cudaStreamDestroy failed!");

    cudaDeviceReset();    

    cout << "Execution of GPU: " << gpu_time << "ms" << endl;
    cout << "1-way_plus: ENDS!!!" << endl;        
}

// Checks correctness of outputs
bool correct_output(point4D * data, unsigned int size)
{ 
    const static float x = 0, y = 0, z = 0, w = -1;

    for (unsigned int i = 0; i < size; i++)
    {
        if (data[i].x != x || data[i].y != y ||
            data[i].z != z || data[i].w != w)
        {
            printf("Error! data[%d] = [%f, %f, %f, %f], ref = [%f, %f, %f, %f]\n",
            i, data[i].x, data[i].y, data[i].z, data[i].w, x, y, z, w);

            return 0;
        }
    }
    return 1;
}

// Refresh the output buffer
void flush_buffer(point4D * data, unsigned int size)
{
    for(unsigned int i = 0; i < size; i++)
    {
        data[i].x = 0; data[i].y = 0; data[i].z = 0; data[i].w = 0;
    }
}

// Initialize the input data to feed the system for simulation
void initialize_input(point4D *& data, unsigned int size)
{
    for(unsigned int idx = 0; idx < size; idx++)
    {
        point4D* d = &data[idx];
        d->x = 1;
        d->y = 0;
        d->z = 0;
        d->w = 0;
    }
}

void cudaCheckError(cudaError_t cudaStatus, const char* err)
{
    if(cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "%s\n", err);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

And here is the Nsight screenshot:

Answer

You're getting proper overlap, from what I can see in your profiler image. I ran your code and see something similar.

In general, the critical sequence in your code is like this:

  1. cudaMemcpyAsyncH2D
  2. kernel call
  3. cudaMemcpyAsyncD2H
  4. cpu function
  5. cudaStreamSynchronize

The CPU thread processes those steps in that order. Steps 1-3 are asynchronous, meaning control returns to the CPU thread immediately, without waiting for the underlying CUDA operation to complete. You want step 4 to overlap as much as possible with steps 1, 2, and 3.
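
To make the asynchronous semantics concrete, here is a minimal sketch (the host-side timing is illustrative, reusing the names from your code): the calls in steps 1-3 return to the CPU thread almost immediately, and only the synchronize call blocks:

auto t0 = std::chrono::high_resolution_clock::now();
cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);   // step 1
gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);              // step 2
cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);      // step 3
auto t1 = std::chrono::high_resolution_clock::now();  // t1 - t0 is tiny: nothing has waited
cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);   // step 4
cudaStreamSynchronize(stream);                        // step 5: blocks until the stream drains
auto t2 = std::chrono::high_resolution_clock::now();
printf("launch calls took %lld us; overlapped region took %lld us\n",
    (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(),
    (long long)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count());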

What we see is that the cudaStreamSynchronize() call shows up in the timeline approximately coincident with the start of the kernel execution. This means that all CPU thread activity preceding the cudaStreamSynchronize() call has completed at that point (i.e. approximately at the point of the beginning of the actual kernel execution.) Therefore the cpu function (step 4) that we are desiring to overlap with steps 1-3 is actually completed by the start of step 2 (in terms of actual CUDA execution). Therefore you are getting full overlap of your cpu function with the first host->device memcpy operation.

So it's working as expected. Because the cudaStreamSynchronize() call blocks the CPU thread until all stream activity is complete, it occupies the timeline from when it is encountered until the point at which the stream activity is complete.
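
If you would rather have the CPU thread interleave its own work with progress checks instead of blocking in one place, cudaStreamQuery() is a non-blocking alternative; a sketch under the assumption that the CPU share can be processed in slices (process_cpu_slice() is a hypothetical helper, not part of your code):

// Poll the stream between slices of CPU work instead of blocking once at the end.
while (cudaStreamQuery(stream) == cudaErrorNotReady)
{
    process_cpu_slice();   // hypothetical: one chunk of the CPU's share of the data
}
cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");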

The fact that the cudaStreamSynchronize() call is curiously coincident with the start of kernel execution, and that there is a gap between the end of the H2D memcpy and the start of the kernel, is likely due to WDDM batching of commands. When I profile your code under Linux, I don't see the gap and exact coincidence, but otherwise the general flow is the same. Here is what I see using the visual profiler under Linux:

Note that in the above image, the cudaStreamSynchronize() is actually encountered during the H2D memcpy operation before the kernel begins.
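
If the WDDM batching gap bothers you, one commonly suggested (but driver-dependent, so treat it as an experiment rather than a guaranteed fix) nudge is to issue a cheap cudaStreamQuery() right after the kernel launch to encourage the driver to flush its command batch:

gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
cudaStreamQuery(stream);   // non-blocking; may prompt WDDM to submit the batched commands
cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);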

Responding to a question in the comments, I modified the app so the split percentage was 50 instead of 25:

unsigned int ratioPercentCPUtoGPU = 50;

Here is what the new profiler output looks like:

We see that the CPU is taking more time relative to the GPU kernel call, and so the cudaStreamSynchronize() call is not encountered by the CPU thread until during the D2H memcpy operation. We continue to see under Linux that there is no fixed relationship between this point and the start of the kernel execution. Now the CPU execution is fully overlapping the H2D memcpy, the kernel execution, and a small portion of the D2H memcpy.
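
If you want to choose the split systematically rather than by trial and error, one approach (illustrative arithmetic, not part of the original code) is to measure both sides once and solve for the ratio at which they would finish together:

// cpuTime: measured wall time of cpu_function for its current share
// gpuTime: measured time of the stream (copies + kernel) for its share
float cpuCostPerElem = cpuTime / (float)h_dataSize;   // ms per element on the CPU
float gpuCostPerElem = gpuTime / (float)d_dataSize;   // ms per element on the GPU
// Balanced when cpuShare * cpuCostPerElem == gpuShare * gpuCostPerElem:
unsigned int newRatioPercentCPUtoGPU =
    (unsigned int)(100.0f * gpuCostPerElem / (cpuCostPerElem + gpuCostPerElem));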
