False dependency issue for the Fermi architecture


Problem description

I am trying to achieve "3-way overlapping" using 3 streams, as in the examples in the CUDA streams and concurrency webinar (http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf), but I couldn't achieve it.

I have a GeForce GT 550M (Fermi architecture with one copy engine) and I am using Windows 7 (64 bit).
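As a quick check, the number of copy engines can be queried at run time through the CUDA runtime API; a minimal standalone snippet for this (assuming device 0) is:

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // properties of device 0
    // asyncEngineCount is 1 on GPUs with a single copy engine (such as the GT 550M):
    // transfers in the two directions cannot overlap each other, but they can overlap kernel execution
    printf("%s: asyncEngineCount = %d\n", prop.name, prop.asyncEngineCount);
    return 0;
}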

Here is the code I wrote:

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions 

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end =  dataSize;
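    // Grid-stride loop: each thread squares the elements at indices start, start + blockDim.x*gridDim.x, start + 2*blockDim.x*gridDim.x, ...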
    for (int i = start; i < end; i += blockDim.x * gridDim.x) 
    {
        out[i] = in[i] * in[i];
    }
}

int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // register host memory as page-locked (needed for cudaMemcpyAsync to be truly asynchronous)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, 0);

    // overlapped execution using version 2

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    }

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }



    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    cudaEventRecord(stop, 0);

    checkCudaErrors(cudaStreamSynchronize(0));

    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));


    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();  

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;


    // GPU output check
    int sum = 0;
    for(int i = 0; i < dataSize; i++)       
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}

Using Nsight for profiling, I have this result:

It may seem correct, but why does the D2H transfer in stream #1 only start after the last kernel launch of stream #2, and not before? I also tried using 8 streams (just by changing NUM_STREAMS to 8) to achieve such a "3-way overlap", and here is the result:

The interesting thing is that when I use 8 streams, the overlap between computation and memory transfers seems to be much better.

What is the reason for this problem? Is it due to the WDDM driver, or is there something wrong with my program?

Solution

From the comments above, it seems that the OP's problem is a false dependency issue, suffered by the Fermi architecture and solved by the Hyper-Q feature of the Kepler architecture. On Fermi, work from all streams is issued into a single hardware queue, so an operation can be blocked by operations from other streams that were enqueued before it, even though the streams are logically independent.

To summarize, the OP is highlighting the fact that the first D2H transfer (stream #1) does not start immediately after the last H2D (stream #3) finishes, although in principle it could. The time gap is highlighted by the red circle in the following figure (henceforth, unless otherwise specified, all the tests refer to a GeForce GT540M, which belongs to the Fermi family):

The OP's approach is a breadth-first approach, which operates according to the following scheme:

for(int i = 0; i < NUM_STREAMS; i++)
    cudaMemcpyAsync(..., cudaMemcpyHostToDevice,   streams[i]);

for(int i = 0; i < NUM_STREAMS; i++)
{
    kernel_launch_1<<<..., 0, streams[i]>>>(...);
    kernel_launch_2<<<..., 0, streams[i]>>>(...);
}

for(int i = 0; i < NUM_STREAMS; i++)
    cudaMemcpyAsync(..., cudaMemcpyDeviceToHost,   streams[i]);

Using a depth-first approach, operating according to the following scheme,

for(int i = 0; i < NUM_STREAMS; i++)
{
    cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);

    kernel_launch_1<<<...., 0, streams[i]>>>(....);
    kernel_launch_2<<<...., 0, streams[i]>>>(....);

    cudaMemcpyAsync(...., cudaMemcpyDeviceToHost,   streams[i]);
}

does not seem to improve the situation, according to the following timeline (the depth-first code is reported at the bottom of the answer); if anything, it shows worse overlap:

Under the breadth-first approach, with the second kernel launch commented out, the first D2H copy starts as soon as it can, as shown by the corresponding timeline.
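For reference, a minimal sketch of that experiment, i.e., the kernel-launch loop of the OP's breadth-first code with the second launch commented out (only the first half of each chunk is then computed, so this is meant for timeline inspection only, not for the correctness check):

for(int i = 0; i < NUM_STREAMS; i++)
{
    int offset = i * streamSize;
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

    // only the first of the two per-chunk launches is issued in this experiment
    kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
    //kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
}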

Finally, running the code on a Kepler K20c, the problem does not show up, as illustrated by the following figure:

Here is the code for the depth-first approach:

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end =  dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}

int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // register host memory as page-locked (needed for cudaMemcpyAsync to be truly asynchronous)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, 0);

    for(int i = 0; i < NUM_STREAMS; i++)
    {
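        // Depth-first issue order: the H2D copy, both kernel launches and the D2H copy
        // for this chunk are all enqueued on stream i before moving on to the next stream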
        int offset = i * streamSize;

        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);

        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);

        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }



    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    cudaEventRecord(stop, 0);

    checkCudaErrors(cudaStreamSynchronize(0));

    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));


    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();  

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;


    // GPU output check
    int sum = 0;
    for(int i = 0; i < dataSize; i++)      
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}

