在CUDA中获取结果的运行时间和时间之间的差异 [英] the Difference between running time and time of obtaining results in CUDA

查看:447
本文介绍了在CUDA中获取结果的运行时间和时间之间的差异的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想使用CUDA在GPU上实现我的算法。程序运行正常,但有一个问题:当我尝试打印结果时,结果要过很久才显示出来。
下面是我的部分代码。请假设计算结果是否正确并不重要。

  // 128-byte state buffer in device global memory. The host fills it with
  // cudaMemcpyToSymbolAsync and the AESROUND kernel updates it in place.
  __device__ unsigned char dev_state[128];

// Multiplies two bytes in the Galois field GF(2^8) by shift-and-add.
// Whenever the high bit of `a` is shifted out, the result is reduced with
// 0x1b, i.e. the polynomial x^8 + x^4 + x^3 + x + 1.
//   a, b    : the two field elements (passed by value, modified locally).
//   returns : the product of a and b in GF(2^8).
__device__ unsigned char GMul(unsigned char a, unsigned char b) {
    unsigned char p = 0;
    int counter;
    unsigned char hi_bit_set;
    for (counter = 0; counter < 8; counter++) {
        // Add (XOR) the current multiple of `a` when the low bit of `b` is set.
        if ((b & 1) != 0) {
            p ^= a;
        }
        // Remember whether doubling `a` overflows 8 bits before shifting.
        hi_bit_set = a & 0x80;
        a <<= 1;
        if (hi_bit_set != 0) {
            a ^= 0x1b; /* x^8 + x^4 + x^3 + x + 1 */
        }
        b >>= 1;
    }
    return p;
}


// Kernel: for 512 iterations, recomputes one byte of the global dev_state
// buffer from bytes in its first 16 entries and stores it back in place.
// Takes no arguments. The byte is selected by blockIdx.x alone, so every
// thread of a block works on the same dev_state element.
__global__ void AESROUND()
{
    // One-byte scratch values shared by the whole block.
    // NOTE(review): every thread of the block writes these with no
    // __syncthreads(); this looks like a data race. The question states that
    // the correctness of the result does not matter here.
    __shared__ unsigned char dev_rkey;
    __shared__ unsigned char dev_sh_state;
    int state_idx = blockIdx.x;
    int offset = ((state_idx / 4)) * 4;  // state_idx rounded down to a multiple of 4

    for (int i = 0; i < 512; i++)
    {
        dev_rkey = dev_state[state_idx];

        // NOTE(review): the last two XOR terms read the same element, so they
        // cancel each other out.
        dev_sh_state = GMul(dev_state[state_idx], 0x02)
                     ^ GMul(dev_state[(state_idx + 5) % 16], 0x03)
                     ^ dev_state[(offset + 5) % 16]
                     ^ dev_state[(offset + 5) % 16];

        // Written back to global memory with no ordering against other blocks
        // that may be reading this element (their read indices are taken mod 16).
        dev_state[state_idx] = dev_sh_state ^ dev_rkey;
    }
}






调用AESROUND

  int main()
  {
      // Host driver, reproduced as posted in the question: fills a 128-byte
      // state from a repeated 16-byte pattern, uploads it, launches AESROUND
      // 1024 times while timing with clock(), then copies the state back and
      // prints its first 16 bytes. No CUDA return code is checked.
      unsigned char p[] = { 0x19, 0x3d, 0xe3, 0xbe, 0xa0, 0xf4, 0xe2, 0x2b, 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08 };

      unsigned char h_state[128];
      for (long long i = 0; i < 128; i++)
          h_state[i] = p[i % 16];

      cudaMemcpyToSymbolAsync(dev_state, h_state, 128, 0, cudaMemcpyHostToDevice);

      clock_t start, finish;

      // Nothing between the two clock() calls waits for the kernels to finish,
      // so this interval covers issuing the launches, not executing them.
      start = clock();
      for (long long i = 0; i < 1024; i++)
          AESROUND<<<128, 128>>>();
      finish = clock();

      // Raw clock() tick difference; it is printed with an "(ms)" label but is
      // not divided by CLOCKS_PER_SEC.
      float Time = finish - start;

      printf("\n\nprocessing time: %2.15f (ms)\n", Time);

      // No explicit synchronization is issued before h_state is read below.
      // NOTE(review): the ~5 s pause reported before the next output suggests
      // the host ends up waiting for the queued kernels around here - confirm.
      cudaMemcpyFromSymbolAsync(h_state, dev_state, 128, 0, cudaMemcpyDeviceToHost);
      // NOTE(review): "\s" is not a standard escape sequence; "\n" was
      // probably intended. Kept as in the original post.
      printf("\n\state After Encryption:\n ");
      for (int i = 0; i < 16; i++)
          printf("%x ", h_state[i]);

      getchar();
      return 0;
  }

这里是结果:

 处理时间:1.0000000000000(ms)



-经过很长时间(约5秒)后,才显示下一行-

 加密后的状态:
88 91 23 09 78 65 11 87 65 43 56 71 20 93 18 70

如您所见,处理时间非常短,但这128字节的结果却要过很久才显示出来。
为什么会发生这种情况?这与GPU有关吗?
我该如何解决?

解决方案

这里的困惑似乎源于使用基于主机的计时方法来测量(主要是)设备端的活动。



内核启动是异步的。主机代码启动内核后会继续执行,而不等待内核完成。因此,这种计时方式:

  start = clock();
  // Each launch only queues the kernel and returns to the host right away.
  for (long long i = 0; i < 1024; i++)
      AESROUND<<<128, 128>>>();
  // Read as soon as the launches have been issued; nothing waits for the device.
  finish = clock();

只是测量了内核的启动时间。(即使像本例这样在循环中重复启动内核,情况也是如此。只要没有超出设备队列的某些限制,每次内核启动都是异步的,主机线程(即for循环)可以继续执行。)

为了测量完整的设备执行时间,您可以这样做:

  start = clock();
  // The launches are queued asynchronously, one after another.
  for (long long i = 0; i < 1024; i++)
      AESROUND<<<128, 128>>>();
  cudaDeviceSynchronize();   // wait for the device to finish
  // Read only after the device has completed the queued kernels.
  finish = clock();


I am trying to implement my algorithm on a GPU using CUDA. The program works well, but there is a problem: when I try to print the results, they are shown too late. Here is some of my code. Assume that the correctness of the results does not matter.

// 128-byte state buffer in device global memory. main() transfers it with
// cudaMemcpyToSymbolAsync / cudaMemcpyFromSymbolAsync, and the AESROUND kernel
// updates it in place.
__device__ unsigned char dev_state[128];

// Product of two bytes in the Galois field GF(2^8).
// Walks the eight bits of `b` from least to most significant, accumulating
// the matching multiple of `a`; each doubling of `a` that overflows 8 bits is
// reduced with 0x1b (x^8 + x^4 + x^3 + x + 1).
//   a, b    : field elements (by value; only local copies are modified).
//   returns : a * b in GF(2^8).
__device__ unsigned char GMul(unsigned char a, unsigned char b) {
    unsigned char product = 0;
    for (int bit = 0; bit < 8; ++bit) {
        if (b & 1)
            product ^= a;

        // Capture the overflow bit before doubling `a`.
        const bool carries = (a & 0x80) != 0;
        a <<= 1;
        if (carries)
            a ^= 0x1b;

        b >>= 1;
    }
    return product;
}


// Kernel: for 512 iterations, recomputes one byte of the global dev_state
// buffer from bytes in its first 16 entries and stores it back in place.
// Takes no arguments. The byte is selected by blockIdx.x alone, so every
// thread of a block works on the same dev_state element.
__global__ void AESROUND()
{
    // One-byte scratch values shared by the whole block.
    // NOTE(review): every thread of the block writes these with no
    // __syncthreads(); this looks like a data race. The question states that
    // the correctness of the result does not matter here.
    __shared__ unsigned char dev_rkey;
    __shared__ unsigned char dev_sh_state;
    int state_idx = blockIdx.x;
    // state_idx rounded down to a multiple of 4.
    int offset = ((state_idx / 4)) *4;

    for (int i = 0; i < 512; i++)
    {
        // Keep the current byte; it is XORed back into the result below.
        dev_rkey = dev_state[state_idx];

        // NOTE(review): the last two XOR terms read the same element, so they
        // cancel each other out.
        dev_sh_state= GMul(dev_state[state_idx], 0x02) ^ GMul(dev_state[(state_idx + 5) % 16], 0x03) ^ dev_state[(offset + 5) % 16] ^ dev_state[(offset + 5) % 16];


        // Written back to global memory with no ordering against other blocks
        // that may be reading this element (their read indices are taken mod 16).
        dev_state[state_idx] = dev_sh_state ^ dev_rkey;
    }


}


calling AESROUND

// Host driver: fills a 128-byte state from a repeated 16-byte pattern, uploads
// it to dev_state, launches AESROUND 1024 times while timing with clock(),
// then copies the state back and prints its first 16 bytes.
// No CUDA return code is checked anywhere in this function.
int main()
{

    unsigned char p[] = { 0x19, 0x3d, 0xe3, 0xbe, 0xa0, 0xf4, 0xe2, 0x2b, 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08 };

// Repeat the 16-byte pattern eight times to fill the 128-byte host state.
unsigned char h_state[128];
for (long long i = 0; i < 128; i++)
    h_state[i] = p[i%16];

cudaMemcpyToSymbolAsync(dev_state, h_state, 128, 0, cudaMemcpyHostToDevice);

clock_t start, finish;

// Nothing between the two clock() calls waits for the kernels to finish, so
// this interval covers issuing the launches, not executing them.
start = clock();
for (long long i = 0; i < 1024; i++)

     AESROUND << <128, 128 >> >();
finish = clock();

// Raw clock() tick difference; it is printed with an "(ms)" label but is not
// divided by CLOCKS_PER_SEC.
float Time = finish - start;

printf("\n\nprocessing time: %2.15f (ms)\n", Time);

// No explicit synchronization is issued before h_state is read below.
// NOTE(review): the ~5 s pause reported before the next output suggests the
// host ends up waiting for the queued kernels around here - confirm.
cudaMemcpyFromSymbolAsync(h_state, dev_state, 128, 0, cudaMemcpyDeviceToHost);
// NOTE(review): "\s" is not a standard escape sequence; "\n" was probably intended.
printf("\n\state After Encryption:\n ");
for (int i = 0; i < 16; i++)
    printf("%x ", h_state[i]);

// Keep the console window open until a key is pressed.
getchar();
return 0;
}

here are the Results:

processing time: 1.0000000000000 (ms)

-after a long time ( ~ 5 seconds), next line will be shown-

state after encryption:
88 91 23 09 78 65 11 87 65 43 56 71 20 93 18 70

As you can see, the processing time is very short, but the 128 bytes are shown much later. Why does this happen? Is it related to the GPU? How can I fix it?

解决方案

The confusion here seems to have arisen out of using a host-based timing method to time what is (mostly) device activity.

Kernel launches are asynchronous. The host code launches the kernel, and then proceeds without waiting for the kernel to complete. Therefore this kind of timing:

start = clock();
// Each launch only queues the kernel and returns to the host right away.
for (long long i = 0; i < 1024; i++)

     AESROUND << <128, 128 >> >();
// Read as soon as the launches have been issued; nothing waits for the device.
finish = clock();

is only measuring the kernel launch time. (This is true even in this case where the kernel is launched repeatedly in a loop. If some device queues are not exceeded, each kernel launch will be asynchronous, allowing the host thread, i.e. the for-loop, to continue.)

In order to measure the complete device execution time, you could do something like this:

start = clock();
// The launches are queued asynchronously, one after another.
for (long long i = 0; i < 1024; i++)

     AESROUND << <128, 128 >> >();
cudaDeviceSynchronize();   // block the host until the device has finished
// Read only after the device has completed the queued kernels.
finish = clock();

这篇关于在CUDA中获取结果的运行时间和时间之间的差异的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆