使用共享内存的CUDA 3D卷积 [英] 3D Convolution with CUDA using shared memory

查看:273
本文介绍了使用共享内存的CUDA 3D卷积的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我目前正尝试修改这个二维卷积代码

I'm currently trying to adapt the 2D convolution code from THIS question to 3D and having trouble trying to understand where my error is.

我的二维码看起来像这样:

My 2D Code looks like this:

#include <iostream>

#define     MASK_WIDTH      3
#define     MASK_RADIUS     MASK_WIDTH / 2
#define     TILE_WIDTH      8
#define         W           (TILE_WIDTH + MASK_WIDTH - 1)

/**
 * GPU 2D Convolution using shared memory
 */
__global__ void convolution(float *I, float* M, float *P, int width, int height)
{
    /***** WRITE TO SHARED MEMORY *****/
    __shared__ float N_ds[W][W];

    // First batch loading
    int dest = threadIdx.x + (threadIdx.y * TILE_WIDTH);
    int destY = dest / W;
    int destX = dest % W;

    int srcY = destY + (blockIdx.y * TILE_WIDTH) - MASK_RADIUS;
    int srcX = destX + (blockIdx.x * TILE_WIDTH) - MASK_RADIUS;
    int src = srcX + (srcY * width);

    if(srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destY][destX] = I[src];
    else
        N_ds[destY][destX] = 0;

    // Second batch loading
    dest = threadIdx.x + (threadIdx.y * TILE_WIDTH) + TILE_WIDTH * TILE_WIDTH;
    destY = dest / W;
    destX = dest % W;

    srcY = destY + (blockIdx.y * TILE_WIDTH) - MASK_RADIUS;
    srcX = destX + (blockIdx.x * TILE_WIDTH) - MASK_RADIUS;
    src = srcX + (srcY * width);

    if(destY < W)
    {
        if(srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
            N_ds[destY][destX] = I[src];
        else
            N_ds[destY][destX] = 0;
    }
    __syncthreads();

    /***** Perform Convolution *****/
    float sum = 0;
    int y;
    int x;
    for(y = 0; y < MASK_WIDTH; y++)
        for(x = 0; x < MASK_WIDTH; x++)
            sum = sum + N_ds[threadIdx.y + y][threadIdx.x + x] * M[x + (y * MASK_WIDTH)];
    y = threadIdx.y + (blockIdx.y * TILE_WIDTH);
    x = threadIdx.x + (blockIdx.x * TILE_WIDTH);
    if(y < height && x < width)
        P[x + (y * width)] = sum;

    __syncthreads();

}

int main(int argc, char* argv[])
{
    int image_width  = 16;
    int image_height = 16;

    float *deviceInputImageData;
    float *deviceOutputImageData;
    float *deviceMaskData;

    float data[] =
    {
         1.0f,  1.0f,  1.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         2.0f,  2.0f,  2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         3.0f,  3.0f,  3.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         4.0f,  4.0f,  4.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         5.0f,  5.0f,  5.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         6.0f,  6.0f,  6.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         7.0f,  7.0f,  7.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         8.0f,  8.0f,  8.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
         9.0f,  9.0f,  9.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        10.0f, 10.0f, 10.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        11.0f, 11.0f, 11.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        12.0f, 12.0f, 12.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        13.0f, 13.0f, 13.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        14.0f, 14.0f, 14.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        15.0f, 15.0f, 15.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        16.0f, 16.0f, 16.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
    };

    float mask[] =
    {
        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f
    };

    // CHECK CHECK CHECK CHECK CHECK
    int shared_memory_size = W * W;
    int block_size = TILE_WIDTH * TILE_WIDTH;
    int max_size = 2 * block_size;
    std::cout << "Block Size: " << block_size << " - Shared Memory Size: " << shared_memory_size << " - Max Size: " << max_size << std::endl;
    std::cout << "SHARED MEMORY SIZE HAS TO BE SMALLER THAN MAX SIZE IN ORDER TO WORK PROPERLY !!!!!!!";

    cudaMalloc((void **)&deviceInputImageData,  image_width * image_height * sizeof(float));
    cudaMalloc((void **)&deviceOutputImageData, image_width * image_height * sizeof(float));
    cudaMalloc((void **)&deviceMaskData,        MASK_WIDTH * MASK_WIDTH  * sizeof(float));

    cudaMemcpy(deviceInputImageData, data, image_width * image_height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMaskData,       mask, MASK_WIDTH * MASK_WIDTH  * sizeof(float), cudaMemcpyHostToDevice);

    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    dim3 dimGrid((image_width + TILE_WIDTH - 1) / TILE_WIDTH, (image_height + TILE_WIDTH - 1) / TILE_WIDTH);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, image_width, image_height);
    cudaDeviceSynchronize();

    cudaMemcpy(data, deviceOutputImageData, image_width * image_height * sizeof(float), cudaMemcpyDeviceToHost);

    // Print data
    for(int i = 0; i < image_width * image_height; ++i)
    {
        if(i % image_width == 0)
        {
            std::cout << std::endl;
        }
        std::cout << data[i] << " - ";
    }

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

    return 0;
}

与3D相同:

#include <iostream>

#define     MASK_WIDTH      3
#define     MASK_RADIUS     MASK_WIDTH / 2
#define     TILE_WIDTH      8
#define         W           (TILE_WIDTH + MASK_WIDTH - 1)

/**
 * GPU 2D Convolution using shared memory
 */
__global__ void convolution(float *I, float* M, float *P, int width, int height, int depth)
{
    /***** WRITE TO SHARED MEMORY *****/
    __shared__ float N_ds[W][W][W];

    // First batch loading
    int dest = threadIdx.x + (threadIdx.y * TILE_WIDTH) + (threadIdx.z * TILE_WIDTH * TILE_WIDTH);
    int destTmp = dest;
    int destX = destTmp % W;
    destTmp = destTmp / W;
    int destY = destTmp % W;
    destTmp = destTmp / W;
    int destZ = destTmp;

    int srcZ = destZ + (blockIdx.z * TILE_WIDTH) - MASK_RADIUS;
    int srcY = destY + (blockIdx.y * TILE_WIDTH) - MASK_RADIUS;
    int srcX = destX + (blockIdx.x * TILE_WIDTH) - MASK_RADIUS;
    int src = srcX + (srcY * width) + (srcZ * width * height);

    if(srcZ >= 0 && srcZ < depth && srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
        N_ds[destZ][destY][destX] = I[src];
    else
        N_ds[destZ][destY][destX] = 0;

    // Second batch loading
    dest = threadIdx.x + (threadIdx.y * TILE_WIDTH) + (threadIdx.z * TILE_WIDTH * TILE_WIDTH) + TILE_WIDTH * TILE_WIDTH;
    destTmp = dest;
    destX = destTmp % W;
    destTmp = destTmp / W;
    destY = destTmp % W;
    destTmp = destTmp / W;
    destZ = destTmp;

    srcZ = destZ + (blockIdx.z * TILE_WIDTH) - MASK_RADIUS;
    srcY = destY + (blockIdx.y * TILE_WIDTH) - MASK_RADIUS;
    srcX = destX + (blockIdx.x * TILE_WIDTH) - MASK_RADIUS;
    src = srcX + (srcY * width) + (srcZ * width * height);

    if(destZ < W)
    {
        if(srcZ >= 0 && srcZ < depth && srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
            N_ds[destZ][destY][destX] = I[src];
        else
            N_ds[destZ][destY][destX] = 0;
    }
    __syncthreads();

    /***** Perform Convolution *****/
    float sum = 0;
    int z;
    int y;
    int x;
    for(z = 0; z < MASK_WIDTH; z++)
        for(y = 0; y < MASK_WIDTH; y++)
            for(x = 0; x < MASK_WIDTH; x++)
                sum = sum + N_ds[threadIdx.z + z][threadIdx.y + y][threadIdx.x + x] * M[x + (y * MASK_WIDTH) + (z * MASK_WIDTH * MASK_WIDTH)];
    z = threadIdx.z + (blockIdx.z * TILE_WIDTH);
    y = threadIdx.y + (blockIdx.y * TILE_WIDTH);
    x = threadIdx.x + (blockIdx.x * TILE_WIDTH);
    if(z < depth && y < height && x < width)
        P[x + (y * width) + (z * width * height)] = sum;

    __syncthreads();

}

int main(int argc, char* argv[])
{
    int image_width  = 16;
    int image_height = 16;
    int image_depth  = 5;

    float *deviceInputImageData;
    float *deviceOutputImageData;
    float *deviceMaskData;

    float data[] =
    {
        1.0f,  1.0f,  1.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        2.0f,  2.0f,  2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        3.0f,  3.0f,  3.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        4.0f,  4.0f,  4.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        5.0f,  5.0f,  5.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        6.0f,  6.0f,  6.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        7.0f,  7.0f,  7.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        8.0f,  8.0f,  8.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        9.0f,  9.0f,  9.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       10.0f, 10.0f, 10.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       11.0f, 11.0f, 11.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       12.0f, 12.0f, 12.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       13.0f, 13.0f, 13.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       14.0f, 14.0f, 14.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       15.0f, 15.0f, 15.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       16.0f, 16.0f, 16.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,

        1.0f,  1.0f,  1.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        2.0f,  2.0f,  2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        3.0f,  3.0f,  3.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        4.0f,  4.0f,  4.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        5.0f,  5.0f,  5.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        6.0f,  6.0f,  6.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        7.0f,  7.0f,  7.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        8.0f,  8.0f,  8.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        9.0f,  9.0f,  9.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       10.0f, 10.0f, 10.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       11.0f, 11.0f, 11.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       12.0f, 12.0f, 12.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       13.0f, 13.0f, 13.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       14.0f, 14.0f, 14.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       15.0f, 15.0f, 15.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       16.0f, 16.0f, 16.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,

        1.0f,  1.0f,  1.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        2.0f,  2.0f,  2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        3.0f,  3.0f,  3.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        4.0f,  4.0f,  4.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        5.0f,  5.0f,  5.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        6.0f,  6.0f,  6.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        7.0f,  7.0f,  7.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        8.0f,  8.0f,  8.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        9.0f,  9.0f,  9.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       10.0f, 10.0f, 10.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       11.0f, 11.0f, 11.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       12.0f, 12.0f, 12.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       13.0f, 13.0f, 13.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       14.0f, 14.0f, 14.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       15.0f, 15.0f, 15.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       16.0f, 16.0f, 16.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,

        1.0f,  1.0f,  1.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        2.0f,  2.0f,  2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        3.0f,  3.0f,  3.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        4.0f,  4.0f,  4.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        5.0f,  5.0f,  5.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        6.0f,  6.0f,  6.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        7.0f,  7.0f,  7.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        8.0f,  8.0f,  8.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        9.0f,  9.0f,  9.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       10.0f, 10.0f, 10.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       11.0f, 11.0f, 11.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       12.0f, 12.0f, 12.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       13.0f, 13.0f, 13.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       14.0f, 14.0f, 14.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       15.0f, 15.0f, 15.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       16.0f, 16.0f, 16.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,

        1.0f,  1.0f,  1.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        2.0f,  2.0f,  2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        3.0f,  3.0f,  3.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        4.0f,  4.0f,  4.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        5.0f,  5.0f,  5.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        6.0f,  6.0f,  6.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        7.0f,  7.0f,  7.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        8.0f,  8.0f,  8.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        9.0f,  9.0f,  9.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       10.0f, 10.0f, 10.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       11.0f, 11.0f, 11.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       12.0f, 12.0f, 12.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       13.0f, 13.0f, 13.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       14.0f, 14.0f, 14.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       15.0f, 15.0f, 15.0f, 1.0f, 3.0f, 1.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       16.0f, 16.0f, 16.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
    };

    float mask[] =
    {
        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f,

        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f,

        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f
    };

    // CHECK CHECK CHECK CHECK CHECK
    int shared_memory_size = W * W * W;
    int block_size = TILE_WIDTH * TILE_WIDTH * TILE_WIDTH;
    int max_size = 3 * block_size;
    std::cout << "Block Size: " << block_size << " - Shared Memory Size: " << shared_memory_size << " - Max Size: " << max_size << std::endl;
    std::cout << "SHARED MEMORY SIZE HAS TO BE SMALLER THAN MAX SIZE IN ORDER TO WORK PROPERLY !!!!!!!";

    cudaMalloc((void **)&deviceInputImageData,  image_width * image_height * image_depth * sizeof(float));
    cudaMalloc((void **)&deviceOutputImageData, image_width * image_height * image_depth * sizeof(float));
    cudaMalloc((void **)&deviceMaskData,        MASK_WIDTH  * MASK_WIDTH   * MASK_WIDTH  * sizeof(float));

    cudaMemcpy(deviceInputImageData, data, image_width * image_height * image_depth * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceMaskData,       mask, MASK_WIDTH  * MASK_WIDTH   * MASK_WIDTH  * sizeof(float), cudaMemcpyHostToDevice);

    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((image_width + TILE_WIDTH - 1) / TILE_WIDTH, (image_height + TILE_WIDTH - 1) / TILE_WIDTH, (image_depth + TILE_WIDTH - 1) / TILE_WIDTH);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, image_width, image_height, image_depth);
    cudaDeviceSynchronize();

    cudaMemcpy(data, deviceOutputImageData, image_width * image_height * image_depth * sizeof(float), cudaMemcpyDeviceToHost);

    // Print data
    for(int i = 0; i < image_width * image_height * image_depth; ++i)
    {
        if((i % image_width) == 0)
            std::cout << std::endl;

        if((i % (image_width * image_height)) == 0)
            std::cout << std::endl;

        std::cout << data[i] << " - ";
    }

    cudaFree(deviceInputImageData);
    cudaFree(deviceOutputImageData);
    cudaFree(deviceMaskData);

    return 0;
}

使用 TILE_WIDTH 8,卷积似乎部分工作很好,因为第二和第三层是相同的,也是值似乎是正确的。在3D情况下,我计算了 destX destY destZ 根据说明。我改变的第二件事是第二批加载的if条件: if(destZ< W)使用 destZ 而不是 destY

When using a TILE_WIDTH of 8, the convolution seems to partially work nicely, since the second and third layers are the same and also the values seem to be correct. In the 3D case, I calculated the destX, destY and destZ indices according to THIS explanation. The second thing that I changed is the if-condition for the second batch loading: if(destZ < W) to use destZ instead of destY.

现在我的问题是什么原因在第4层5的输出是。我想我错过了一些理解,为了正常工作 TILE_WIDTH 必须是多大。从此答案,我创建了以下检查,因为每个线程应该从全局到共享内存执行至少2次加载:

My question now is what the reason for the incorrect values inside layer 4 and 5 of the output is. I guess I'm missing some understanding on how big the TILE_WIDTH MUST be in order to work properly. From this answer, I created the following check because every thread is supposed to perform at least 2 loads from global to shared memory:

// CHECK CHECK CHECK CHECK CHECK
    int shared_memory_size = W * W;
    int block_size = TILE_WIDTH * TILE_WIDTH;
    int max_size = 2 * block_size;
    std::cout << "Block Size: " << block_size << " - Shared Memory Size: " << shared_memory_size << " - Max Size: " << max_size << std::endl;
    std::cout << "SHARED MEMORY SIZE HAS TO BE SMALLER THAN MAX SIZE IN ORDER TO WORK PROPERLY !!!!!!!";

它是否也适用于3D情况,如果是,是否在我的3D检查?

Does it also apply in the 3D case, and if so, is it adapted correctly in my 3D check?

推荐答案

似乎我正确地修改它,除了一个愚蠢的错误:

Seems like I adapted it correctly, apart from one stupid error:

// Second batch loading
dest = threadIdx.x + (threadIdx.y * TILE_WIDTH) + (threadIdx.z * TILE_WIDTH * TILE_WIDTH) + TILE_WIDTH * TILE_WIDTH;

我忘了一个 * TILE_WIDTH 应为:

// Second batch loading
dest = threadIdx.x + (threadIdx.y * TILE_WIDTH) + (threadIdx.z * TILE_WIDTH * TILE_WIDTH) + TILE_WIDTH * TILE_WIDTH * TILE_WIDTH;

这篇关于使用共享内存的CUDA 3D卷积的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆