测量CUDA程序和CUDA内核所用时间的故障 [英] Trouble measuring the elapsed time of a CUDA program and CUDA kernels

查看:482
本文介绍了测量CUDA程序和CUDA内核所用时间的故障的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我目前有三个方面测量经过的时间,两个使用CUDA事件,另一个记录开始和结束UNIX。使用CUDA事件的方法测量两件事,一个测量整个外部循环时间,另一个测量所有内核执行时间。



这里是代码:

  int64 x1,x2; 

cudaEvent_t start;
cudaEvent_t end;
cudaEvent_t s1,s2;
float timeValue;


#define timer_s cudaEventRecord(start,0);
#define timer_e cudaEventRecord(end,0); cudaEventSynchronize(end); cudaEventElapsedTime(&timeValue,start,end); printf("time: %f ms \n",timeValue);


cudaEventCreate(& start);
cudaEventCreate(& end);
cudaEventCreate(& s1);
cudaEventCreate(& s2);

cudaEventRecord(s1,0);
x1 = GetTimeMs64();

for(int r = 0; r <2; r ++)
{
timer_s
kernel1<<<1,x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum + = timeValue;

for(int j = 0; j <5; j ++)
{
timer_s
kernel2<<<1,x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum + = timeValue;

timer_s
kernel3<<< 1,x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum + = timeValue;
}

timer_s
kernel4<<<y,x>>>(gl_devdata_ptr);
cudaThreadSynchronize();
timer_e
sum + = timeValue;
}

x2 = GetTimeMs64();

cudaEventRecord(s2,0);
cudaEventSynchronize(s2);
cudaEventElapsedTime(& timeValue,s1,s2);
printf("elapsed cuda: %f ms \n",timeValue);
printf("elapsed sum: %f ms \n",sum);
printf("elapsed win: %d ms \n",x2-x1);

GetTimeMs64是我在StackOverflow上找到的:

  int64 GetTimeMs64()
{
/ * Windows * /
FILETIME ft;
LARGE_INTEGER li;
uint64 ret;

/ *获取自1601年1月1日(UTC)以来所经过的100纳秒间隔的量,并将它
*复制到LARGE_INTEGER结构。 * /
GetSystemTimeAsFileTime(& ft);
li.LowPart = ft.dwLowDateTime;
li.HighPart = ft.dwHighDateTime;

ret = li.QuadPart;
ret - = 116444736000000000LL; / *从文件时间转换为UNIX纪元时间。 * /
ret / = 10000; / *从100纳秒(10 ^ -7)到1毫秒(10 ^ -3)间隔* /

return ret;
}



这些不是真正的变量名称或正确的内核名称,我只是



因此,问题是,每种测量方法都给我一个完全不同的总时间。



我刚刚运行的一些示例:

  elapsed cuda:21.076832 
elapsed sum:4.177984
win:27

那么为什么会有这么大的差别?所有内核调用的总和约为4ms,其余的18ms去哪了?是CPU时间吗?

解决方案

cudaThreadSynchronize是一个非常高的开销操作,因为它必须等待GPU上的所有工作完成。

如果按以下方式构建代码,则应该得到正确的结果:

  int64 x1,x2; 

cudaEvent_t start;
cudaEvent_t end;
const int k_maxEvents = 5 +(2 * 2)+(2 * 5 * 2);
cudaEvent_t events [k_maxEvents];
int eIdx = 0;
float timeValue;

for(int e = 0; e < k_maxEvents; ++e)
{
cudaEventCreate(&events[e]);
}

x1 = GetTimeMs64();
cudaEventRecord(events [eIdx ++],0);
for(int r = 0; r< 2; r ++)
{
cudaEventRecord(events [eIdx ++],0);
kernel1<<< 1,x>>>(gl_devdata_ptr);

for(int j = 0; j< 5; j ++)
{
cudaEventRecord(events [eIdx ++],0);
kernel2<<< 1,x>>>(gl_devdata_ptr);

cudaEventRecord(events [eIdx ++],0);
kernel3<<< 1,x>>>(gl_devdata_ptr);
}

cudaEventRecord(events [eIdx ++],0);
kernel4<<< y,x>>> (gl_devdata_ptr);
}

cudaEventRecord(events[eIdx++],0);
cudaDeviceSynchronize();

x2 = GetTimeMs64();

cudaEventElapsedTime(&timeValue, events[0], events[eIdx - 1]);
printf("elapsed cuda: %f ms \n",timeValue);
// TODO: 每两个相邻事件之间的时间即是执行每个内核的时间。
// 在WDDM上,任何内核之间都可能发生上下文切换,
// 导致高于预期的结果。
// printf("elapsed sum: %f ms \n",sum);
printf("elapsed win: %d ms \n",x2-x1);

在Windows上,测量时间的一种更简单的方法是使用QueryPerformanceCounter和QueryPerformanceFrequency。



如果你写上面没有事件的例子

#include "NvToolsExt.h"
nvtxRangePushA(CPU Time);
for(int r = 0; r <2; r ++)
{
kernel1<<<1,x>>>(gl_devdata_ptr);

for(int j = 0; j <5; j ++)
{
kernel2<<<1,x>>>(gl_devdata_ptr);
kernel3<<< 1,x>>>(gl_devdata_ptr);
}
kernel4<<<y,x>>>(gl_devdata_ptr);
}

cudaDeviceSynchronize();
nvtxRangePop();

并在Nsight Visual Studio Edition 1.5-2.2的CUDA跟踪活动或Visual Profiler 4.0+中运行,所有时间都将可用。GPU时间将比使用cudaEvents API收集的时间更精确。使用nvtxRangePush来测量CPU时间范围是可选的。这也可以通过从示例中的第一个CUDA API测量到cudaDeviceSynchronize结束来实现。


I currently have three methos of measuring the elapsed time, two using CUDA events and the other recording start and end UNIX. The ones using CUDA events measure two things, one measures the entire outer loop time, and the other sum all kernel execution times.

Here's the code:

int64 x1, x2;

cudaEvent_t start;
cudaEvent_t end;
cudaEvent_t s1, s2;
float timeValue;


 #define timer_s cudaEventRecord(start, 0);
 #define timer_e cudaEventRecord(end, 0);   cudaEventSynchronize(end); cudaEventElapsedTime( &timeValue, start, end ); printf("time:  %f  ms \n", timeValue);


cudaEventCreate( &start );
cudaEventCreate( &end );
cudaEventCreate( &s1 );
cudaEventCreate( &s2 );

cudaEventRecord(s1, 0);   
x1 = GetTimeMs64();

for(int r = 0 ; r < 2 ; r++)
{
    timer_s
    kernel1<<<1, x>>>(gl_devdata_ptr);
    cudaThreadSynchronize();
    timer_e
    sum += timeValue;

    for(int j = 0 ; j < 5; j++)
    {
        timer_s
        kernel2<<<1,x>>>(gl_devdata_ptr);
        cudaThreadSynchronize();
        timer_e
        sum += timeValue;

        timer_s
        kernel3<<<1,x>>>(gl_devdata_ptr);
        cudaThreadSynchronize();
        timer_e
        sum += timeValue;
    }

    timer_s
    kernel4<<<y, x>>> (gl_devdata_ptr);
    cudaThreadSynchronize();
    timer_e
    sum += timeValue;
}

x2 = GetTimeMs64();

cudaEventRecord(s2, 0);   
cudaEventSynchronize(s2); 
cudaEventElapsedTime( &timeValue, s1, s2 ); 
printf("elapsed cuda :       %f  ms \n", timeValue);
printf("elapsed sum :       %f  ms \n", sum);
printf("elapsed win :       %d  ms \n", x2-x1);

The GetTimeMs64 is something I found here on StackOverflow:

int64 GetTimeMs64()
{
 /* Windows */
 FILETIME ft;
 LARGE_INTEGER li;
 uint64 ret;

 /* Get the amount of 100 nano seconds intervals elapsed since January 1, 1601 (UTC) and copy it
  * to a LARGE_INTEGER structure. */
 GetSystemTimeAsFileTime(&ft);
 li.LowPart = ft.dwLowDateTime;
 li.HighPart = ft.dwHighDateTime;

 ret = li.QuadPart;
 ret -= 116444736000000000LL; /* Convert from file time to UNIX epoch time. */
 ret /= 10000; /* From 100 nano seconds (10^-7) to 1 millisecond (10^-3) intervals */

 return ret;
}

Those aren't the real variable names nor the right kernel names, I just removed some to make the code smaller.

So the problem is, every measure gives me a really different total time.

Some examples I just ran:

elapsed cuda : 21.076832    
elapsed sum :  4.177984     
elapsed win :  27

So why is there such a huge difference? The sum of all kernel calls is around 4 ms, where are the other 18ms? CPU time?

解决方案

cudaThreadSynchronize is a very high overhead operation as it has to wait for all work on the GPU to complete.

You should get the correct result if you structure your code as follows:

int64 x1, x2;

cudaEvent_t start;
cudaEvent_t end;
const int k_maxEvents = 5 + (2 * 2) + (2 * 5 * 2);
cudaEvent_t events[k_maxEvents];
int eIdx = 0;
float timeValue;

for (int e = 0; e < k_maxEvents; ++e)
{
    cudaEventCreate(&events[e]);
}

x1 = GetTimeMs64();
cudaEventRecord(events[eIdx++], 0);       
for(int r = 0 ; r < 2 ; r++)
{
    cudaEventRecord(events[eIdx++], 0);
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < 5; j++)
    {
        cudaEventRecord(events[eIdx++], 0);
        kernel2<<<1,x>>>(gl_devdata_ptr);

        cudaEventRecord(events[eIdx++], 0);
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }

    cudaEventRecord(events[eIdx++], 0);
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

cudaEventRecord(events[eIdx++], 0);   
cudaDeviceSynchronize(); 

x2 = GetTimeMs64();

cudaEventElapsedTime( &timeValue, events[0], events[eIdx - 1] ); 
printf("elapsed cuda :       %f  ms \n", timeValue);
// TODO the time between each events is the time to execute each kernel.
// On WDDM a context switch may occur between any of the kernels leading
// to higher than expected results.
// printf("elapsed sum :       %f  ms \n", sum);
printf("elapsed win :       %d  ms \n", x2-x1);

On Windows an easier way to measure time is to use QueryPerformanceCounter and QueryPerformanceFrequency.

If you write the above example without the events as

#include "NvToolsExt.h"
nvtxRangePushA("CPU Time");
for(int r = 0 ; r < 2 ; r++)
{
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < 5; j++)
    {
        kernel2<<<1,x>>>(gl_devdata_ptr); 
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

cudaDeviceSynchronize(); 
nvtxRangePop();

and run in Nsight Visual Studio Edition 1.5-2.2 CUDA Trace Activity or Visual Profiler 4.0+ all of the times will be available. The GPU times will be more accurate than what you can collect using cudaEvents API. Using nvtxRangePush to measure the CPU time range is optional. This can also be accomplished by measuring from the first CUDA API in the example to the end of cudaDeviceSynchronize.

这篇关于测量CUDA程序和CUDA内核所用时间的故障的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆