PyCuda:在CUDA内核中通过指针解引用数组元素 [英] PyCuda: Dereferencing Array Element Via Pointer in Cuda Kernel

查看:278
本文介绍了PyCuda:CUDA中内核解引用数组元素通过指针的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在使用PyCuda通过指针把成对的数组传递给CUDA内核。这些数组是另一个内核的输出,因此数据已经在GPU上。

在内核中,我试图访问每个数组中的元素来做向量减法。但我得到的数组元素值不正确(下面代码中的 h 和 p 是错的)。

谁能帮我看看我究竟做错了什么?

我的代码:

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time
import cv2
from pycuda.tools import DeviceMemoryPool as DMP
from scipy.spatial import distance
import os
import glob

def get_cuda_hist_kernel():
        #Make the kernel
    histogram_kernel = """
    __global__ void kernel_getHist(unsigned int* array,unsigned int size, unsigned int* histo, float bucket_size, unsigned int num_bins, unsigned int* out_max)
    {
        unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
        if(x<size){
            unsigned int value = array[x];

            unsigned int bin = floor(float(value) * bucket_size) - 1;


            //Faster Modulo 3 for channel assignment
            unsigned int offset = x;
            offset = (offset >> 16) + (offset & 0xFFFF);
            offset = (offset >>  8) + (offset & 0xFF);
            offset = (offset >>  4) + (offset & 0xF);
            offset = (offset >>  2) + (offset & 0x3);
            offset = (offset >>  2) + (offset & 0x3);
            offset = (offset >>  2) + (offset & 0x3);
            if (offset > 2) offset = offset - 3;
            offset = offset * num_bins;

            bin += offset;

            atomicAdd(&histo[bin + offset],1);
        }
    }


    __global__ void kernel_chebyshev(unsigned int* histo, unsigned int* prev_histo, unsigned int number, int* output)
    {

        const unsigned int size = 12;
        //Get all of the differences
        __shared__ int temp_diffs[size];
        unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;

        if (i < size){
            unsigned int diff = 0;
            unsigned int h = histo[i];
            unsigned int p = prev_histo[i];

            if (h > p)
            {
                diff = h - p;
            }
            else
            {
                diff = p - h;
            }
            temp_diffs[i] = (int)diff;
        }

        __syncthreads();

        output[number] = 0;
        atomicMax(&output[number], temp_diffs[i]);
    }
    """

    mod = SourceModule(histogram_kernel)
    return mod


def cuda_histogram(ims, block_size, kernel):

    start = time.time()
    max_val = 4
    num_bins = np.uint32(4)
    num_channels = np.uint32(3)
    bin_size = np.float32(1 / np.uint32(max_val / num_bins))

    #Memory Pool
    pool = DMP()
    print 'Pool Held Blocks: ', pool.held_blocks

    #Compute block & Grid dimensions

    bdim = (block_size, 1, 1)
    cols = ims[0].size
    rows = 1
    channels = 1

    dx, mx = divmod(cols, bdim[0])
    dy, my = divmod(rows, bdim[1])
    dz, mz = divmod(channels, bdim[2])
    g_x = (dx + (mx>0)) * bdim[0]
    g_y = (dy + (my>0)) * bdim[1]
    g_z = (dz + (mz>0)) * bdim[2]
    gdim = (g_x, g_y, g_z)

    #get the function
    func = kernel.get_function('kernel_getHist')
    func2 = kernel.get_function('kernel_chebyshev')

    #build list of histograms
    #send the histogram to the gpu
    hists = []
    device_hists = []
    for im in range(len(ims)):
        hists.append(np.zeros([num_channels * num_bins]).astype(np.uint32))

    end = time.time()
    dur = end - start
    print(' '.join(['Prep Time: ', str(dur)]))

    start = time.time()


    #Copy all of the image data to GPU
    device_images = []
    for im in range(len(ims)):
        #print('Allocating data for image :', im)
        #convert the image to 1D array of uint32s
        a = ims[im].astype(np.uint32)
        a = a.flatten('C')
        a_size = np.uint32(a.size)

        #allocate & send im data to gpu
        device_images.append(pool.allocate(a.nbytes))
        cuda.memcpy_htod(device_images[im], a)

        d_hist = pool.allocate(hists[im].nbytes)
        device_hists.append(d_hist)
        cuda.memcpy_htod(d_hist, hists[im])


    differences = np.zeros(len(ims)).astype(np.uint32)
    device_diffs = pool.allocate(differences.nbytes)
    cuda.memcpy_htod(device_diffs, differences)


    for im in range(len(ims)):
        #run histogram function
        func(device_images[im], a_size, device_hists[im], bin_size, num_bins, block=(block_size,1,1), grid=gdim)

    cuda.Context.synchronize()
    device_hist_size = np.uint32(len(device_hists[im]))
    for im in range(1, len(ims)):
        number = np.uint32(im - 1)
        func2(device_hists[im], device_hists[im - 1], number, device_diffs, block=(32,1,1))

    cuda.memcpy_dtoh(differences, device_diffs)
    print(differences)

    for im in range(len(ims)):
        #get histogram back
        cuda.memcpy_dtoh(hists[im], device_hists[im])
        device_hists[im] = 0


    end = time.time()
    dur = end - start
    print(' '.join(['Load, Compute, & Gather Time: ', str(dur)]))
    pool.free_held()
    return differences

def get_all_files(directory):
    pattern = os.path.join(directory, '*.jpg')
    files = [f for f in glob.glob(pattern)]
    return files
if __name__ == "__main__":
    RESOURCES_PATH = "../data/ims/"
    MAX_IMS = 1000
    direc = os.path.join(RESOURCES_PATH, '21JumpStreet', 'source_video_frames')
    files = get_all_files(direc)
    a = cv2.imread('t.png')
    ims = [cv2.imread(f) for f in files]
    print 'Shape of my image: ', ims[0].shape
    print 'Number of images to histogram: ', len(ims)
    block_size = 128
    kernel = get_cuda_hist_kernel()
    start = time.time()

    num_diffs = len(ims) // MAX_IMS + 1
    cuda_diffs = []

    for i in range(num_diffs):

        first = i * MAX_IMS
        last = (i + 1) * MAX_IMS
        print(first)
        small_set = ims[first:last]
        print 'Small set size: ', str(len(small_set))
        cuda_diffs.extend(cuda_histogram(small_set, block_size, kernel))

    end = time.time()
    dur = end - start
    print(' '.join(['CUDA version took:', str(dur)]))

    start = time.time()
    cv_hists = []
    for i in range(len(ims)):
        im = ims[i % len(ims)]
        h = []
        for j in range(3):
            hist = cv2.calcHist([im], [j], None, [4], [0, 100])
            h.extend(hist)
        cv_hists.append(h)

    #run Chebyshev on CPU:
    color_hist_diffs = np.array([distance.chebyshev(cv_hists[i-1], cv_hists[i]) \
                                 for i in range(len(cv_hists)) if i != 0])
    print(color_hist_diffs)
    end = time.time()
    dur = end - start
    print(' '.join(['CPU & cv2 version took:', str(dur)]))


解决方案

这是一个不好的问题,因为错误其实出在我代码的其他地方。很抱歉造成困惑。

I am using PyCuda to pass pairs of arrays to a cuda kernel via a pointer. The arrays are the output of a different kernel, so the data is already on the GPU.

Within the kernel, I'm trying to access elements in each of the arrays to do a vector subtraction. The values that I'm getting for the elements in the array are not correct (h & p are wrong in the code below).

Can anyone help me see what I am doing wrong?

My code:

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time
import cv2
from pycuda.tools import DeviceMemoryPool as DMP
from scipy.spatial import distance
import os
import glob

def get_cuda_hist_kernel():
    """Compile the CUDA source holding the histogram and Chebyshev kernels.

    Returns:
        The ``SourceModule`` built from the kernel source below. Callers
        fetch ``kernel_getHist`` and ``kernel_chebyshev`` from it with
        ``get_function``.
    """
    histogram_kernel = """
    __global__ void kernel_getHist(unsigned int* array,unsigned int size, unsigned int* histo, float bucket_size, unsigned int num_bins, unsigned int* out_max)
    {
        // out_max is kept for signature compatibility; this kernel never reads it.
        unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
        if(x<size){
            unsigned int value = array[x];

            unsigned int bin = floor(float(value) * bucket_size) - 1;

            // Skip values whose bin falls outside [0, num_bins): writing them
            // would land in another channel's bins or past the end of histo.
            // NOTE(review): the "- 1" sends value 0 below the first bin --
            // confirm the intended binning against the CPU reference.
            if (bin >= num_bins) return;

            //Faster Modulo 3 for channel assignment
            unsigned int offset = x;
            offset = (offset >> 16) + (offset & 0xFFFF);
            offset = (offset >>  8) + (offset & 0xFF);
            offset = (offset >>  4) + (offset & 0xF);
            offset = (offset >>  2) + (offset & 0x3);
            offset = (offset >>  2) + (offset & 0x3);
            offset = (offset >>  2) + (offset & 0x3);
            if (offset > 2) offset = offset - 3;
            offset = offset * num_bins;

            // Apply the channel offset exactly once (it used to be added to
            // bin and then added again in the index expression).
            atomicAdd(&histo[bin + offset],1);
        }
    }


    __global__ void kernel_chebyshev(unsigned int* histo, unsigned int* prev_histo, unsigned int number, int* output)
    {

        const unsigned int size = 12;
        unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;

        // Reset the result slot once, before any thread contributes to it.
        // This assumes a single-block launch so __syncthreads() orders the
        // reset ahead of every atomicMax below.
        if (i == 0){
            output[number] = 0;
        }

        __syncthreads();

        // Only the first `size` threads map to histogram entries; the rest
        // must not contribute (they previously read past temp_diffs).
        if (i < size){
            unsigned int diff = 0;
            unsigned int h = histo[i];
            unsigned int p = prev_histo[i];

            if (h > p)
            {
                diff = h - p;
            }
            else
            {
                diff = p - h;
            }
            atomicMax(&output[number], (int)diff);
        }
    }
    """

    mod = SourceModule(histogram_kernel)
    return mod


def cuda_histogram(ims, block_size, kernel):
    """Histogram every image on the GPU and compare consecutive histograms.

    Args:
        ims: sequence of image arrays; each is converted with
            ``astype(np.uint32)`` and flattened before upload.
        block_size: threads per block for the ``kernel_getHist`` launch.
        kernel: compiled module providing ``kernel_getHist`` and
            ``kernel_chebyshev`` through ``get_function``.

    Returns:
        A uint32 array with one slot per image. Slot ``i - 1`` is written by
        ``kernel_chebyshev`` for the histograms of images ``i - 1`` and
        ``i``; the final slot is never written and stays 0. An empty ``ims``
        yields an empty array.
    """
    # Nothing to upload or launch; also avoids indexing into an empty chunk.
    if len(ims) == 0:
        return np.zeros(0).astype(np.uint32)

    start = time.time()
    max_val = 4
    num_bins = np.uint32(4)
    num_channels = np.uint32(3)
    bin_size = np.float32(1 / np.uint32(max_val / num_bins))

    # Memory pool backing every device allocation made below.
    pool = DMP()
    print(' '.join(['Pool Held Blocks: ', str(pool.held_blocks)]))

    func = kernel.get_function('kernel_getHist')
    func2 = kernel.get_function('kernel_chebyshev')

    # One zeroed host-side histogram per image (channels * bins entries).
    hist_len = int(num_channels * num_bins)
    hists = [np.zeros(hist_len, dtype=np.uint32) for _ in ims]

    end = time.time()
    dur = end - start
    print(' '.join(['Prep Time: ', str(dur)]))

    start = time.time()

    # Upload each image as a flat uint32 array together with its zeroed
    # histogram, remembering each image's own element count for the launch.
    device_images = []
    device_hists = []
    image_sizes = []
    for image, hist in zip(ims, hists):
        flat = image.astype(np.uint32).flatten('C')
        image_sizes.append(np.uint32(flat.size))

        d_image = pool.allocate(flat.nbytes)
        cuda.memcpy_htod(d_image, flat)
        device_images.append(d_image)

        d_hist = pool.allocate(hist.nbytes)
        cuda.memcpy_htod(d_hist, hist)
        device_hists.append(d_hist)

    differences = np.zeros(len(ims)).astype(np.uint32)
    device_diffs = pool.allocate(differences.nbytes)
    cuda.memcpy_htod(device_diffs, differences)

    # NOTE(review): kernel_getHist declares a trailing out_max pointer that
    # is not supplied here; the kernel does not read it.
    for d_image, size, d_hist in zip(device_images, image_sizes, device_hists):
        # The grid is a count of blocks: ceil(elements / threads per block).
        blocks = (int(size) + int(block_size) - 1) // int(block_size)
        func(d_image, size, d_hist, bin_size, num_bins,
             block=(block_size, 1, 1), grid=(blocks, 1, 1))

    cuda.Context.synchronize()

    # Compare each histogram with its predecessor; result goes to slot im - 1.
    for im in range(1, len(ims)):
        number = np.uint32(im - 1)
        func2(device_hists[im], device_hists[im - 1], number, device_diffs,
              block=(32, 1, 1))

    cuda.memcpy_dtoh(differences, device_diffs)
    print(differences)

    for im in range(len(ims)):
        # Copy the histogram back, then drop the reference to the device
        # allocation.
        cuda.memcpy_dtoh(hists[im], device_hists[im])
        device_hists[im] = 0

    end = time.time()
    dur = end - start
    print(' '.join(['Load, Compute, & Gather Time: ', str(dur)]))
    pool.free_held()
    return differences

def get_all_files(directory):
    """Return the ``*.jpg`` paths directly inside *directory*, sorted by name.

    ``glob.glob`` gives no ordering guarantee, and the caller compares each
    image with the one before it, so the list is sorted to make the sequence
    deterministic.

    Args:
        directory: path of the folder to search (not recursive).

    Returns:
        A list of matching paths in ascending lexicographic order; empty when
        nothing matches or the directory does not exist.
    """
    pattern = os.path.join(directory, '*.jpg')
    return sorted(glob.glob(pattern))
if __name__ == "__main__":
    # Benchmark script: run the GPU histogram/Chebyshev pipeline over a
    # folder of frames, then the cv2 + scipy equivalent on the CPU, printing
    # both results and timings.
    RESOURCES_PATH = "../data/ims/"
    MAX_IMS = 1000
    direc = os.path.join(RESOURCES_PATH, '21JumpStreet', 'source_video_frames')
    files = get_all_files(direc)

    # cv2.imread returns None for files it cannot decode; drop those so the
    # later .shape / .astype calls do not fail on them.
    ims = [im for im in (cv2.imread(f) for f in files) if im is not None]
    if not ims:
        raise SystemExit(' '.join(['No readable images found in:', direc]))

    print(' '.join(['Shape of my image: ', str(ims[0].shape)]))
    print(' '.join(['Number of images to histogram: ', str(len(ims))]))
    block_size = 128
    kernel = get_cuda_hist_kernel()
    start = time.time()

    cuda_diffs = []

    # Process the images in chunks of at most MAX_IMS. Stepping by MAX_IMS
    # never yields an empty trailing chunk, unlike len(ims) // MAX_IMS + 1.
    for first in range(0, len(ims), MAX_IMS):
        print(first)
        small_set = ims[first:first + MAX_IMS]
        print(' '.join(['Small set size: ', str(len(small_set))]))
        cuda_diffs.extend(cuda_histogram(small_set, block_size, kernel))

    end = time.time()
    dur = end - start
    print(' '.join(['CUDA version took:', str(dur)]))

    start = time.time()
    cv_hists = []
    for im in ims:
        # Concatenate the 4-bin histograms of the three channels.
        h = []
        for j in range(3):
            hist = cv2.calcHist([im], [j], None, [4], [0, 100])
            h.extend(hist)
        cv_hists.append(h)

    # Run Chebyshev on the CPU between each histogram and its predecessor.
    color_hist_diffs = np.array([distance.chebyshev(cv_hists[i - 1], cv_hists[i])
                                 for i in range(1, len(cv_hists))])
    print(color_hist_diffs)
    end = time.time()
    dur = end - start
    print(' '.join(['CPU & cv2 version took:', str(dur)]))

解决方案

This was a bad question, as the error was elsewhere in my code. Sorry for the confusion.

这篇关于PyCuda:CUDA中内核解引用数组元素通过指针的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆