Cuda性能测量 - 已用时间返回零 [英] Cuda Performance measuring - Elapsed time returns zero

查看:338
本文介绍了Cuda性能测量 - 已用时间返回零的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我写了几核函数,不知道有多少毫秒来处理这些功能。



 使用命名空间std; 
#include< iostream>
#include< stdio.h>
#include< stdlib.h>
的#defineñ8000

无效fillArray为(int *数据,诠释计数){
的for(int i = 0; I<计数;我++)
数据[i] = rand()%100;
}

__global__ void add(int * a,int * b){
int add = 0;

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
add = a [tid] + b [tid];
}
}

__global__ void subtract(int * a,int * b){
int subtract = 0;

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
subtract = a [tid] -b [tid];
}
}

__global__ void multiply(int * a,int * b){
int multiply = 0;

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
multiply = a [tid] * b [tid];
}
}

__global__ void divide(int * a,int * b){
int divide = 0;

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
divide = a [tid] / b [tid];
}
}

__global__ void modu(int * a,int * b){
int modulus = 0;

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
modulus = a [tid]%b [tid];
}
}

__global__ void neg(int * data){

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
data [tid] = -data [tid];
}
}

float duration(int * devA,int * devB,int blocksPerGrid,int threadsPerBlock){

cudaEvent_t start,stop;
float elapsedTime;

cudaEventCreate(& start);
cudaEventCreate(& stop);
cudaEventRecord(start,0);

add<<<<块blockPerGrid,threadsPerBlock>>>(devA,devB);
subtract<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB);
multiply<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB);
divide<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB);
modu<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB);
neg<<<< blocksPerGrid,threadsPerBlock>>>(devA);
neg<<<<块blockPerGrid,threadsPerBlock>>>(devB);

cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(& elapsedTime,start,stop);

cudaEventDestroy(start);
cudaEventDestroy(stop);

return elapsedTime;
}

int main(void){

int a [N],b [N];
float dur = 0;



int * devA,* devB;

cudaMalloc((void **)& devA,N * sizeof(int));
cudaMalloc((void **)& devB,N * sizeof(int));

fillArray(a,N);
fillArray(b,N);

cudaMemcpy(devA,a,N * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(devA,b,N * sizeof(int),cudaMemcpyHostToDevice);



dur = duration(a,b,N,1);

cout<< 全局内存版本:\\\
;
cout<< Process completed in<< dur;
cout<< 对于数据集<< N< integer。

return 0;
}

毫秒总是返回零。为什么?我在这里缺少什么?如果我从持续时间功能中删除neg函数。它返回0.15687 ms。我认为这是一个小数字来处理这些功能。



编辑后,我这样做:

  using namespace std; 
#include< iostream>
#include< stdio.h>
#include< stdlib.h>

const int N = 8000;

void fillArray(int * data,int count){
for(int i = 0; i data [i] 100;
}

__global__ void add(int * a,int * b,int * c){

int tid = threadIdx.x + blockIdx.x * blockDim 。X;
if(tid< N){
c [tid] = a [tid] + b [tid];
}
}

__global__ void subtract(int * a,int * b,int * c){

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
c [tid] = a [tid] - b [tid];
}
}

__global__ void multiply(int * a,int * b,int * c){

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
c [tid] = a [tid] * b [tid];
}
}

__global__ void divide(int * a,int * b,int * c){

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
c [tid] = a [tid] / b [tid];
}
}

__global__ void modu(int * a,int * b,int * c){

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
c [tid] = a [tid]%b [tid];
}
}

__global__ void neg(int * data,int * c){

int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid< N){
c [tid] = -data [tid];
}
}

float duration(int * devA,int * devB,int * devC,int blocksPerGrid,int threadsPerBlock){

cudaEvent_t start , 停止;
float elapsedTime;

cudaEventCreate(& start);
cudaEventCreate(& stop);
cudaEventRecord(start,0);

double hArrayC [N];

add<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

subtract<<<<块blockPerGrid,threadsPerBlock>>>(devA,devB,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

multiply<<<<块blockPerGrid,threadsPerBlock>>>(devA,devB,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

divide<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

modu<<<< blocksPerGrid,threadsPerBlock>>>(devA,devB,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

neg<<<<块blockPerGrid,threadsPerBlock>>>(devA,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

neg<<<<块blockPerGrid,threadsPerBlock>>(devB,devC);
cudaMemcpy(hArrayC,devC,N * sizeof(int),cudaMemcpyDeviceToHost);

cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(& elapsedTime,start,stop);

cudaEventDestroy(start);
cudaEventDestroy(stop);

return elapsedTime;
}

int main(void){

int a [N],b [N],c [N]
float dur = 0;

int * devA,* devB,* devC;

cudaMalloc((void **)& devA,N * sizeof(int));
cudaMalloc((void **)& devB,N * sizeof(int));
cudaMalloc((void **)& devC,N * sizeof(int));

fillArray(a,N);
fillArray(b,N);

cudaMemcpy(devA,a,N * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(devB,b,N * sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(devC,c,N * sizeof(int),cudaMemcpyHostToDevice);




dur = duration(devA,devB,devC,N,1);

cout<< 全局内存版本:\\\
;
cout<< Process completed in<< dur;
cout<< 对于数据集<< N< integer。



cudaFree(devA);
cudaFree(devB);
return 0;
}


解决方案

,因为您只将结果存储在寄存器中。当编译时,你得到一些警告:


kernel.cu(13):warning:设置变量add p>

此外,如果你想看到更好的时间,使用NVIDIA的分析器: nvprof (CLI)或 nvvp (GUI)。


$ nvprof ./kernel

  ======== NVPROF正在剖析内核... 
======= =命令:kernel
全局内存版本:对于8000个整数的数据集,进程在0中完成。
========分析结果:
时间(%)时间调用平均最小最大值
100.00 18.46us 2 9.23us 6.02us 12.45us [CUDA memcpy HtoD]
0.00 0ns 1 0ns 0ns 0ns multiply(int *,int *)
0.00 0ns 1 0ns 0ns 0ns add(int *,int *)
0.00 0ns 1 0ns 0ns 0ns modu(int *,int *)
0.00 0ns 2 0ns 0ns 0ns neg(int *)
0.00 0ns 1 0ns 0ns 0ns subtract(int *,int *)
0.00 0ns 1 0ns 0ns 0ns divide(int *,int *)


N 每个网格块,每个块1个线程。您应该考虑阅读此问题的答案。 p>

UPDATE



关于向量加法(和其他简单的操作),你应该学习 vectorAdd示例的CUDA SDK,或使用 Thrust 。第一个选项将教你如何使用CUDA,第二个选项将显示你可以用Thrust做的高级操作。如果我是你,我会这样做。


I wrote a few kernel function and wonder how many miliseconds to process these functions.

using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define N 8000

void fillArray(int *data, int count) {
    // Fill the first `count` slots with pseudo-random values in [0, 100).
    // Uses rand(); seed with srand() beforehand for reproducible contents.
    int *end = data + count;
    while (data != end) {
        *data++ = rand() % 100;
    }
}

// Intended element-wise addition of a and b.
// BUG (this is the question's problem): the sum is stored only in the
// register-local variable `add` and never written to memory, so the kernel
// has no observable effect and the compiler can eliminate the whole body
// (nvcc warns: variable "add" was set but never used).
__global__ void add(int* a, int *b) {
    int add = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        add = a[tid] + b[tid];
    }
}

// Intended element-wise subtraction a - b.
// BUG: result lives only in a register and is never stored; the kernel is
// effectively a no-op and may be optimized away entirely.
__global__ void subtract(int* a, int *b) {
    int subtract = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        subtract = a[tid] - b[tid];
    }
}

// Intended element-wise multiplication a * b.
// BUG: result is discarded (register-only); no global-memory write, so the
// kernel does no observable work.
__global__ void multiply(int* a, int *b) {
    int multiply = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        multiply = a[tid] * b[tid];
    }
}

// Intended element-wise integer division a / b.
// BUG: result is discarded (register-only). Also note b[tid] can be 0
// (fillArray yields [0, 100)), so the division itself would be undefined
// if the result were ever kept.
__global__ void divide(int* a, int *b) {
    int divide = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        divide = a[tid] / b[tid];
    }
}

// Intended element-wise modulus a % b.
// BUG: result is discarded (register-only). As with divide, b[tid] == 0 is
// possible and would make the modulus undefined.
__global__ void modu(int* a, int *b) {
    int modulus = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        modulus = a[tid] % b[tid];
    }
}

// Negates each element of `data` in place. Unlike the kernels above, this
// one writes its result back to global memory, so it is the only kernel
// with an observable effect — which is consistent with the questioner's
// observation that removing the neg launches changed the measured time.
__global__ void neg(int *data) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        data[tid] = -data[tid];
    }
}

// Times a sequence of kernel launches with CUDA events and returns the
// elapsed time in milliseconds.
//
// devA/devB are expected to be DEVICE pointers (note: main() below passes
// the host arrays instead — every launch then fails). No cudaGetLastError()
// check follows the launches, so such failures are silent and the measured
// interval collapses to ~0 ms.
float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // Record the start event on the default stream.
    cudaEventRecord(start, 0);

    // Kernel launches are asynchronous; they are all enqueued on the default
    // stream and execute in order between the two events.
    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB);

    cudaEventRecord(stop, 0);
    // Block the host until the stop event has completed, making the
    // elapsed-time query below valid.
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    // Host input arrays, filled with pseudo-random values in [0, 100).
    int a[N], b[N];
    float dur = 0;

    int *devA, *devB;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    // BUG FIX: this copy previously targeted devA again, overwriting it with
    // b and leaving devB uninitialized.
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // BUG FIX: pass the DEVICE pointers, not the host arrays a/b. Passing
    // host pointers makes every kernel launch fail silently, which is why
    // the measured time came out as zero.
    dur = duration(devA, devB, N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";

    // BUG FIX: release device memory (was leaked).
    cudaFree(devA);
    cudaFree(devB);

    return 0;
}

The measured elapsed time always returns zero. Why? What am I missing here? If I remove the neg functions from the duration function, it returns 0.15687 ms — which seems too small a time to process all these functions. What is wrong with this program?

After edit, I did this:

using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

const int N = 8000;

void fillArray(int *data, int count) {
    // Populate data[0 .. count-1] with pseudo-random integers in [0, 100).
    for (int idx = 0; idx != count; ++idx) {
        data[idx] = rand() % 100;
    }
}

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
// Expects a 1D launch; out-of-range threads are guarded.
__global__ void add(int* a, int *b, int *c) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // grid tail guard
    c[idx] = a[idx] + b[idx];
}

// Element-wise vector subtraction: c[i] = a[i] - b[i] for i in [0, N).
// Expects a 1D launch; out-of-range threads are guarded.
__global__ void subtract(int* a, int *b, int *c) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // grid tail guard
    c[idx] = a[idx] - b[idx];
}

// Element-wise vector multiplication: c[i] = a[i] * b[i] for i in [0, N).
// Expects a 1D launch; out-of-range threads are guarded.
__global__ void multiply(int* a, int *b, int *c) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // grid tail guard
    c[idx] = a[idx] * b[idx];
}

// Element-wise integer division: c[i] = a[i] / b[i] for i in [0, N).
// Expects a 1D launch; out-of-range threads are guarded.
__global__ void divide(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        // BUG FIX: fillArray produces values in [0, 100), so b[tid] may be 0
        // and integer division by zero is undefined on the device. Define the
        // result as 0 for a zero divisor.
        c[tid] = (b[tid] != 0) ? a[tid] / b[tid] : 0;
    }
}

// Element-wise modulus: c[i] = a[i] % b[i] for i in [0, N).
// Expects a 1D launch; out-of-range threads are guarded.
__global__ void modu(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        // BUG FIX: b[tid] may be 0 (fillArray yields [0, 100)); modulus by
        // zero is undefined on the device. Define the result as 0 instead.
        c[tid] = (b[tid] != 0) ? a[tid] % b[tid] : 0;
    }
}

// Element-wise negation: c[i] = -data[i] for i in [0, N).
// Expects a 1D launch; out-of-range threads are guarded.
__global__ void neg(int *data, int *c) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return;  // grid tail guard
    c[idx] = -data[idx];
}

// Times seven kernel launches — each followed by a device-to-host copy of
// the result buffer devC — using CUDA events, and returns the elapsed time
// in milliseconds. devA/devB/devC must be device pointers of N ints.
// Note: the D2H copies are deliberately inside the timed region, so the
// reported time includes transfer cost, not just kernel execution.
float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // BUG FIX: this staging buffer was declared double[N] while every copy
    // below transfers N * sizeof(int) bytes of int data — the element type
    // must match the device buffer's type.
    int hArrayC[N];

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devA, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devB, devC);
    cudaMemcpy(hArrayC, devC, N * sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    // Block the host until the stop event completes so the elapsed-time
    // query below is valid.
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    // Host input arrays; results come back through duration()'s internal
    // staging buffer, so no host output array is needed here.
    int a[N], b[N];
    float dur = 0;

    int *devA, *devB, *devC;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
    // BUG FIX: devC is a pure output buffer — the old code copied an
    // uninitialized host array into it, which was pointless (every kernel
    // overwrites it anyway).

    // Use a conventional launch configuration: 256 threads per block and
    // enough blocks to cover N elements (instead of N blocks x 1 thread,
    // which wastes almost the entire warp of every block).
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    dur = duration(devA, devB, devC, blocksPerGrid, threadsPerBlock);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";

    cudaFree(devA);
    cudaFree(devB);
    // BUG FIX: devC was leaked.
    cudaFree(devC);
    return 0;
}

解决方案

Your kernels are not doing anything, since you only store results in registers. When compiling, you get some warnings:

kernel.cu(13): warning: variable "add" was set but never used

Also, if you want to see some better timings, use NVIDIA's profiler: either nvprof (CLI) or nvvp (GUI).

$ nvprof ./kernel

======== NVPROF is profiling kernel...
======== Command: kernel
Global memory version: Process completed in 0 for a data set of 8000 integers.
======== Profiling result:
  Time(%)     Time   Calls       Avg       Min       Max  Name
  100.00   18.46us       2    9.23us    6.02us   12.45us  [CUDA memcpy HtoD]
    0.00       0ns       1       0ns       0ns       0ns  multiply(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  add(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  modu(int*, int*)
    0.00       0ns       2       0ns       0ns       0ns  neg(int*)
    0.00       0ns       1       0ns       0ns       0ns  subtract(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  divide(int*, int*)

You are also using N blocks per grid, and 1 thread per block. You should consider reading the answer to this question.

UPDATE

Concerning the vector addition (and the other simple operations) in itself, you should either study the vectorAdd sample of the CUDA SDK, or use Thrust. The first option will teach you how to use CUDA, and the second option will show you the kind of high-level operations you can do with Thrust. If I were you, I would do both.

这篇关于Cuda性能测量 - 已用时间返回零的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆