The result of an experiment differs from the CUDA Occupancy Calculator

Problem description

I am studying the CUDA architecture.

I wrote some parallel processing code in the environment below.

GPU: GTX 580 (compute capability 2.0)

Threads per block: 16x16 = 256

Shared memory per block: 48 bytes

I know the number of registers and the shared memory size from the compile option --ptxas-options=-v. In addition, the grid size is 32x32 = 1024 blocks and there is no extra shared memory.
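
For reference (an illustration added here, not part of the original post): compiling with --ptxas-options=-v makes ptxas print, per kernel, the register count and static shared memory usage that the Occupancy Calculator asks for. The file name below is hypothetical; -arch=sm_20 matches the GTX 580's compute capability 2.0.

nvcc --ptxas-options=-v -arch=sm_20 -o renderer renderer.cu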

So I tried NVIDIA's CUDA_Occupancy_Calculator. It reported:

3.) GPU Occupancy Data is displayed here and in the graphs:

Active Threads per Multiprocessor: 1536
Active Warps per Multiprocessor: 48
Active Thread Blocks per Multiprocessor: 6
Occupancy of each Multiprocessor: 100%
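
As a side note (an addition to this write-up, not something from the original post), newer CUDA toolkits (6.5 and later, which postdate the toolkits typically used with a GTX 580) can cross-check the calculator's numbers at runtime with cudaOccupancyMaxActiveBlocksPerMultiprocessor. A minimal sketch for the 16x16 configuration, using a hypothetical helper name and assuming it is placed after the PRINT_POLYGON definition:

void print_occupancy_estimate()
{
        int numBlocks = 0;
        int blockSize = 16 * 16;        //256 threads per block, as in the question

        //how many blocks of this kernel fit on one SM at this block size
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, PRINT_POLYGON, blockSize, 0);

        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        float occupancy = float(numBlocks * blockSize) / prop.maxThreadsPerMultiProcessor;
        printf("Active blocks per SM: %d, estimated occupancy: %.0f%%\n", numBlocks, occupancy * 100.0f);
}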

So I ran the application, but the result showed that the 8x8 block size is faster than 16x16.

8x8 refers to the block size with a grid size of 64x64; 16x16 refers to the block size with a grid size of 32x32. So the total number of threads is the same; it is unchanged (see the two launch configurations sketched below).
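
To make the comparison concrete (an illustration added here, not the asker's code), the two launch configurations are:

        dim3 block8(8, 8);      //64 threads per block
        dim3 grid8(64, 64);     //4096 blocks : 64*64*8*8   = 262144 threads in total
        dim3 block16(16, 16);   //256 threads per block
        dim3 grid16(32, 32);    //1024 blocks : 32*32*16*16 = 262144 threads in total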

I don't know why. Please help me.

The following code is part of my program.

void LOAD_VERTEX(){
        MEM[0] = 60;    //y0 
        MEM[1] = 50;    //x0
        MEM[2] = 128;   //r0
        MEM[3] = 0;     //g0
        MEM[4] = 70;    //b0
        MEM[5] = 260;
        MEM[6] = 50;
        MEM[7] = 135;
        MEM[8] = 70;
        MEM[9] = 0;
        MEM[10] = 260;
        MEM[11] = 250;
        MEM[12] = 0;
        MEM[13] = 200;
        MEM[14] = 55;
        MEM[15] = 60;
        MEM[16] = 250;
        MEM[17] = 55;
        MEM[18] = 182;
        MEM[19] = 100;
        MEM[20] = 30;
        MEM[21] = 330;
        MEM[22] = 72;
        MEM[23] = 12;
        MEM[24] = 25;
        MEM[25] = 30;
        MEM[26] = 130;
        MEM[27] = 80;
        MEM[28] = 255;
        MEM[29] = 15;
        MEM[30] = 230; 
        MEM[31] = 330;
        MEM[32] = 56;   
        MEM[33] = 186;  
        MEM[34] = 201;
}

__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, int dev_ID, int a, int b, int c)
{
        int i = blockIdx.x*TILE_WIDTH + threadIdx.x;
        int j = blockIdx.y*TILE_HEIGHT + threadIdx.y;

        float result_a, result_b;
        int temp[15];
        int k;

        for(k = 0; k < 5; k++){
                temp[k] = a*5+k;
                temp[k+5] = b*5+k;
                temp[k+10] = c*5+k;
        }

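        //(result_a, result_b) below act as barycentric-style coordinates of pixel (i, j)
        //in triangle (a, b, c); they drive the point-in-triangle test and the color interpolation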
        int result_a_up = ((MEMin[temp[11]]-MEMin[temp[1]])*(i-MEMin[temp[0]]))-((MEMin[temp[10]]-MEMin[temp[0]])*(j-MEMin[temp[1]]));
        int result_a_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));

        int result_b_up = ((MEMin[temp[6]] -MEMin[temp[1]])*(MEMin[temp[0]]-i))-((MEMin[temp[5]] -MEMin[temp[0]])*(MEMin[temp[1]]-j));
        int result_b_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));

        result_a = float(result_a_up) / float(result_a_down);
        result_b = float(result_b_up) / float(result_b_down);

        int isIn = (0 <= result_a && result_a <=1) && ((0 <= result_b && result_b <= 1)) && ((0 <= (result_a+result_b) && (result_a+result_b) <= 1));

        IMAGEin[(i*HEIGHTs+j)*CHANNELS] += (int)(float(MEMin[temp[2]]) + (float(MEMin[temp[7]])-float(MEMin[temp[2]]))*result_a + (float(MEMin[temp[12]])-float(MEMin[temp[2]]))*result_b) * isIn;      //Red Channel
        IMAGEin[(i*HEIGHTs+j)*CHANNELS+1] += (int)(float(MEMin[temp[3]]) + (float(MEMin[temp[8]])-float(MEMin[temp[3]]))*result_a + (float(MEMin[temp[13]])-float(MEMin[temp[3]]))*result_b) * isIn;    //Green Channel
        IMAGEin[(i*HEIGHTs+j)*CHANNELS+2] += (int)(float(MEMin[temp[4]]) + (float(MEMin[temp[9]])-float(MEMin[temp[4]]))*result_a + (float(MEMin[temp[14]])-float(MEMin[temp[4]]))*result_b) * isIn;    //Blue Channel

}

//The information for each device
struct DataStruct {
    int                 deviceID;
    unsigned char       IMAGE_SEG[WIDTH*HEIGHTs*CHANNELS];
};

void* routine( void *pvoidData ) {
        DataStruct  *data = (DataStruct*)pvoidData;
        unsigned char *dev_IMAGE;
        int *dev_MEM;
        unsigned char *IMAGE_SEG = data->IMAGE_SEG;

        HANDLE_ERROR(cudaSetDevice(data->deviceID));

        //initialize array
        memset(IMAGE_SEG, 0, WIDTH*HEIGHTs*CHANNELS);

        printf("Device %d Starting..\n", data->deviceID);

        //Evaluate Time
        cudaEvent_t start, stop;
        cudaEventCreate( &start );
        cudaEventCreate( &stop );

        HANDLE_ERROR( cudaMalloc( (void **)&dev_MEM, sizeof(int)*35) );                                //device array for the 35 vertex values
        HANDLE_ERROR( cudaMalloc( (void **)&dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS) ); //output array

        cudaMemcpy(dev_MEM, MEM, sizeof(int)*35, cudaMemcpyHostToDevice);              //copy only the 35 ints that dev_MEM was allocated for
        cudaMemset(dev_IMAGE, 0, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS);

        dim3    grid(WIDTH/TILE_WIDTH, HEIGHTs/TILE_HEIGHT);            //blocks in a grid
        dim3    block(TILE_WIDTH, TILE_HEIGHT);                         //threads in a block

        cudaEventRecord(start, 0);

        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 1, 2);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 2, 3);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 3, 4);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 4, 5);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 3, 2, 4);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 2, 6, 4);                    //Start the Kernel

        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        HANDLE_ERROR( cudaMemcpy( IMAGE_SEG, dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS, cudaMemcpyDeviceToHost ) );
        HANDLE_ERROR( cudaFree( dev_MEM ) );
        HANDLE_ERROR( cudaFree( dev_IMAGE ) );

        cudaEventElapsedTime( &elapsed_time_ms[data->deviceID], start, stop );          //Calculate elapsed time
        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        printf("Algorithm Elapsed Time : %f ms(Device %d)\n", elapsed_time_ms[data->deviceID], data->deviceID);
        printf("Device %d Complete!\n", data->deviceID);

        return 0;
}

int main( void )
{       
        int i;
        CUTThread thread[DEVICENUM];

        printf("Program Start.\n");                     
        LOAD_VERTEX();

        DataStruct data[DEVICENUM];                     //define device info

        for(i = 0; i < DEVICENUM; i++){
                data[i].deviceID = i;
                thread[i] = start_thread(routine, &(data[i]));
        }

        for(i = 0; i < DEVICENUM; i++){
                end_thread(thread[i]);
        }

        cudaFreeHost(MEM);

    return 0;
}


Recommended answer

Since you copied your question over from the Nvidia forum, I'll copy my answer as well:

For your kernel, the reduced performance at higher occupancy is easily explained by the cache overflowing at the higher occupancy.

At full occupancy the local array temp[] requires 1536×15×4 = 92160 bytes of cache per SM, while at 33% occupancy (for the smaller 8×8 block size) only 512×15×4 = 30720 bytes per SM are needed. With the larger 48 kB cache/SM setting the latter can be cached completely, eliminating off-chip memory accesses for temp[] almost entirely, and even with the default 16 kB cache/SM setting the cache hit probability is substantially higher.

As the temp[] array is not needed anyway, the fastest option (at either occupancy) would be to eliminate it completely. The compiler might already be able to achieve this if you just insert #pragma unroll before the initialization loop. Otherwise, replace all uses of temp[] with a little macro or inline function, or simply substitute the result into the code (which in this case I would even find more readable).
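
A minimal sketch of the last suggestion (an illustration added here, not the answerer's code; it assumes the same TILE_WIDTH, TILE_HEIGHT, HEIGHTs, and CHANNELS macros as the original program): the kernel drops temp[] entirely by indexing MEMin through a small inline helper. Note that result_a_down and result_b_down in the original are identical, so a single denominator suffices.

__device__ __forceinline__ int V(const int *MEMin, int vtx, int comp)
{
        return MEMin[vtx*5 + comp];                     //component 'comp' of vertex 'vtx' (replaces temp[])
}

__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, int dev_ID, int a, int b, int c)
{
        int i = blockIdx.x*TILE_WIDTH + threadIdx.x;
        int j = blockIdx.y*TILE_HEIGHT + threadIdx.y;

        //temp[k] == MEMin[a*5+k], temp[k+5] == MEMin[b*5+k], temp[k+10] == MEMin[c*5+k]
        int result_a_up = (V(MEMin,c,1)-V(MEMin,a,1))*(i-V(MEMin,a,0)) - (V(MEMin,c,0)-V(MEMin,a,0))*(j-V(MEMin,a,1));
        int result_b_up = (V(MEMin,b,1)-V(MEMin,a,1))*(V(MEMin,a,0)-i) - (V(MEMin,b,0)-V(MEMin,a,0))*(V(MEMin,a,1)-j);
        int result_down = (V(MEMin,c,1)-V(MEMin,a,1))*(V(MEMin,b,0)-V(MEMin,a,0)) - (V(MEMin,b,1)-V(MEMin,a,1))*(V(MEMin,c,0)-V(MEMin,a,0));

        float result_a = float(result_a_up) / float(result_down);
        float result_b = float(result_b_up) / float(result_down);

        int isIn = (0 <= result_a && result_a <= 1) && (0 <= result_b && result_b <= 1) && (0 <= (result_a+result_b) && (result_a+result_b) <= 1);

        IMAGEin[(i*HEIGHTs+j)*CHANNELS]   += (int)(float(V(MEMin,a,2)) + (float(V(MEMin,b,2))-float(V(MEMin,a,2)))*result_a + (float(V(MEMin,c,2))-float(V(MEMin,a,2)))*result_b) * isIn;       //Red Channel
        IMAGEin[(i*HEIGHTs+j)*CHANNELS+1] += (int)(float(V(MEMin,a,3)) + (float(V(MEMin,b,3))-float(V(MEMin,a,3)))*result_a + (float(V(MEMin,c,3))-float(V(MEMin,a,3)))*result_b) * isIn;       //Green Channel
        IMAGEin[(i*HEIGHTs+j)*CHANNELS+2] += (int)(float(V(MEMin,a,4)) + (float(V(MEMin,b,4))-float(V(MEMin,a,4)))*result_a + (float(V(MEMin,c,4))-float(V(MEMin,a,4)))*result_b) * isIn;       //Blue Channel
}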
