OpenCL buffer allocation and mapping best practice


Problem Description


I am a little confused as to whether my code using OpenCL mapped buffers is correct.

I have two examples, one using CL_MEM_USE_HOST_PTR and one using CL_MEM_ALLOC_HOST_PTR. Both work and run on my local machine and OpenCL devices, but I am interested in whether this is the correct way of doing the mapping, and whether it should work on all OpenCL devices. I am especially unsure about the USE_HOST_PTR example.

I am only interested in the buffer/map specific operations. I am aware I should do error checking and so forth.
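
A minimal error check around each returned ret might look like this (just a sketch; the check helper name is made up here and is not part of the original code):

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

// Abort with the numeric OpenCL error code if a call failed.
static void check(cl_int err, const char *what)
{
    if (err != CL_SUCCESS) {
        fprintf(stderr, "%s failed with error %d\n", what, err);
        exit(EXIT_FAILURE);
    }
}

// usage: check(ret, "clSetKernelArg");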

CL_MEM_ALLOC_HOST_PTR:

// pointer to hold the result
int * host_ptr = malloc(size * sizeof(int));

d_mem = clCreateBuffer(context,CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR,
                       size*sizeof(cl_int), NULL, &ret);

int * map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_WRITE,
                                   0,size*sizeof(int),0,NULL,NULL,&ret);
// initialize data
int i;
for (i=0; i<size;i++) {
  map_ptr[i] = i;
}

ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL); 

//Set OpenCL Kernel Parameters
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_mem);

size_t global_work[1]  = { size };
//Execute OpenCL Kernel
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
                             global_work, NULL, 0, NULL, NULL);

map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_READ,
                             0,size*sizeof(int),0,NULL,NULL,&ret);
// copy the data to result array 
for (i=0; i<size;i++){
  host_ptr[i] = map_ptr[i];
} 

ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL);        

// cl finish etc     

CL_MEM_USE_HOST_PTR:

// pointer to hold the result
int * host_ptr = malloc(size * sizeof(int));
int i;
for(i=0; i<size;i++) {
  host_ptr[i] = i;
}

d_mem = clCreateBuffer(context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,
                       size*sizeof(cl_int), host_ptr, &ret);

// No need to map or unmap here, since with USE_HOST_PTR the original data
// is already in the buffer?

//Set OpenCL Kernel Parameters
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_mem);

size_t global_work[1]  = { size };
//Execute OpenCL Kernel
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
                             global_work, NULL, 0, NULL, NULL);

// For CL_MEM_USE_HOST_PTR this should return host_ptr (I assume it always will?),
// but we do need to call the map function to ensure the data is copied back.
// There's no need to manually copy it back into host_ptr, since the buffer
// already uses that memory.
int * map_ptr = clEnqueueMapBuffer(command_queue,d_mem,CL_TRUE,CL_MAP_READ,
                                   0,size*sizeof(int),0,NULL,NULL,&ret); 

ret = clEnqueueUnmapMemObject(command_queue,d_mem,map_ptr,0,NULL,NULL);        

// cl finish, cleanup etc
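
For reference, the "cl finish, cleanup etc" step at the end of both examples would usually amount to something like this (a sketch; program is assumed to exist from earlier setup, and the exact objects to release depend on the rest of the code):

clFinish(command_queue);                     // wait for all enqueued commands to complete

ret = clReleaseMemObject(d_mem);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);             // 'program' assumed from earlier setup
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(host_ptr);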

Solution

If you use CL_MEM_ALLOC_HOST_PTR there is a chance that the underlying OpenCL implementation will use page-locked memory.

That means the pages cannot be swapped out to disk, and transfers between host and device memory can be done DMA-style without wasting CPU cycles. Therefore, in this case CL_MEM_ALLOC_HOST_PTR would be the best solution.
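
To make the DMA point concrete: a common pattern in vendor samples is to treat a CL_MEM_ALLOC_HOST_PTR buffer purely as a pinned staging area and copy from its mapped pointer into a separate device buffer. The sketch below reuses context, command_queue and size from the question; pinned, device_buf and staging are illustrative names, not from the original post.

cl_int err;
// Host-side staging buffer that the implementation may back with pinned memory.
cl_mem pinned = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR,
                               size*sizeof(cl_int), NULL, &err);
// Ordinary device buffer that the kernel will actually use.
cl_mem device_buf = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   size*sizeof(cl_int), NULL, &err);

// Map once to obtain a host pointer backed by (potentially) page-locked memory.
int *staging = (int *)clEnqueueMapBuffer(command_queue, pinned, CL_TRUE, CL_MAP_WRITE,
                                         0, size*sizeof(int), 0, NULL, NULL, &err);
for (size_t i = 0; i < size; i++) {
    staging[i] = (int)i;
}

// Transfers from pinned host memory can be done DMA-style by the driver.
err = clEnqueueWriteBuffer(command_queue, device_buf, CL_TRUE, 0,
                           size*sizeof(int), staging, 0, NULL, NULL);

err = clEnqueueUnmapMemObject(command_queue, pinned, staging, 0, NULL, NULL);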

nVidia has the page-locked (pinned) memory feature and they should also use it in their OpenCL implementation. For AMD it's not certain if they do the same. Check here for more details.

Using CL_MEM_USE_HOST_PTR just makes the programmer's life easier, so you could fall back to this option in the unlikely case that the hardware cannot use page-locked memory.
