CUDA Performance - Always return different values


Problem Description

Here is my code:

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

const int N = 8000;

void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100 + 1;  // keep values non-zero so the divide and modulo kernels are well-defined
}

__global__ void add(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

__global__ void subtract(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}

__global__ void multiply(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}

__global__ void divide(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] / b[tid];
    }
}

__global__ void modu(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] % b[tid];
    }
}

__global__ void neg(int *data, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}

float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    int hArrayC[N];

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devA,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    int *a, *b;
    a = new int[N];
    b = new int [N];

    float dur = 0;

    int *devA, *devB,*devC;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);


    dur = duration(devA, devB, devC,N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";



    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devC);
    delete [] a;
    delete [] b;

    return 0;
}

What I want to know is the total number of milliseconds measured by the duration function. But the milliseconds always come back different: sometimes it is 10 ms, sometimes 0.78652 ms, sometimes 30 ms. Why? What is wrong with my code?
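To see this run-to-run variance, one option (a small sketch reusing the duration function and the device buffers from the code above) is to call duration several times in a row and print each measurement, which makes it easy to compare the first call with the later ones:

for (int run = 0; run < 5; run++) {
    float ms = duration(devA, devB, devC, N, 1);
    cout << "Run " << run << ": " << ms << " ms\n";
}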

Answer

This may be caused by the loading/unloading of the NVIDIA drivers. Think of it as an initialization step for the GPU.

You can either set your GPU to persistence mode:

nvidia-smi -pm 1

Or you can run a dummy kernel before timing your GPU code to trigger the loading of the drivers:

__global__ void dummy()
{
    // This kernel does nothing, this is just a "warm-up"
}

// Before your cudaEventRecord etc.
dummy<<<blocksPerGrid, threadsPerBlock>>>();

Or maybe just use cudaThreadSynchronize() (deprecated in newer CUDA releases in favor of cudaDeviceSynchronize()) before timing your kernels.
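Putting these suggestions together, here is a minimal sketch of how the start of the duration function could look (assuming the dummy kernel above and the same launch parameters), so that driver/context initialization happens before the timer starts:

    // Warm up the GPU and wait for it to go idle before starting the timer.
    dummy<<<blocksPerGrid, threadsPerBlock>>>();
    cudaDeviceSynchronize();

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // ... launch add/subtract/multiply/... and copy results back as before ...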

