OpenCL 内核仅部分写入输出缓冲区 [英] OpenCL Kernel only partly writing to output buffer

查看：47 发布时间：2021/6/12 19:32:53 c opencl

本文介绍了OpenCL 内核仅部分写入输出缓冲区的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在从包含超过一百万个元素的数组中读取大整数值.获得的值来自使用 libsndfile 库的 wav 文件.现在，如果我不使用内核，我可以将原始数组写入我的输出文件并毫无问题地收听音频.然而，当我决定使用内核做完全相同的事情时，它只写了不到一秒钟的歌曲.

起初，我认为这是一个内存问题，所以我尝试了缓冲区大小，但仍然没有运气.然后我认为这可能是我在内核中执行的循环，我还使用循环值来得出相同的结论(仍然不起作用.)我现在很困惑，不知道该怎么做.下面是我的代码.下面的一些代码是我的，但我在网上找到的主要结构可以帮助我设置内核.

在这段代码的最底部，如果我将 OutputData 更改为 Array，我会得到完全相同的音频.我很确定内核有问题，这就是为什么它没有写回整首歌曲.

我知道这段代码很乱，但是要尝试和测试这段代码，你只需复制并粘贴它，然后更改输入 wav 文件和输出 wav 文件的路径即可.

为了明确目标，我将尝试修改 wav 文件中的每个值，看看会发生什么.到目前为止，如果我将内核中的输出值乘以 2，它会扭曲它.但同样，只持续 1 秒，其余的剪辑是空的.请注意，输入和输出文件的大小相同.

我的 For 循环也在循环进行 120 万次迭代，因为这是我的示例 wav 文件中的项目数

const char* prog = "__kernel void exchange(__global int *Array, __global int *Output) { for(int j = 0; j <100000; j++){ for(int i = 0; i< 12; i++){ 输出[j+i] = (Array[j+i]);} } }";int main() {//此代码在 OpenCL 主机上执行SNDFILE *sf;SF_INFO 信息；int num_channels;int num, num_items;//输入输出数据int *数组；int *输出数据；int f, sr, c;内部 i, j;文件*输出；/* 打开 WAV 文件.*/信息格式 = 0;sf = sf_open("你的wavfilepathhere", SFM_READ, &info);如果(sf == NULL){printf("打开文件失败.\n");perror("错误");退出(-1)；}/* 打印一些信息，并计算出要读取的数据量.*/f = info.frames;sr = info.samplerate;int 格式 = info.format;c = info.channels;printf("帧数=%d\n", f);printf("采样率=%d\n", sr);printf("channels=%d\n", c);printf("格式 %i\n", 格式);num_items = f*c;printf("num_items=%d\n", num_items);/* 为要读取的数据分配空间，然后读取.*/数组 = (int *)malloc(num_items*sizeof(int));输出数据 = (int*)malloc(num_items*sizeof(int));;num = sf_read_int(sf, Array, num_items);sf_close(sf);printf("读取 %d 个项目\n", num);//性能执行的时间变量.时序约束所需的事件变量cl_event someEvent;cl_ulong 开始 = (cl_ulong)0;cl_ulong 结束 = (cl_ulong)0;cl_ulong finalTime = (cl_ulong)0;//采样点数int 样本大小 = 100;浮动 h = 0;//用于乘以进入内核内部实现的FIFO缓冲区的值的系数浮动系数 = 1/样本大小；//以Hz为单位的信号频率浮动信号频率 = 10;//0到最大值之间的点数(T_Sample)浮动频率样本 = 样本大小 * 信号频率；//Step = 最大值或 T_Sample.******stepSize 为 1/freqSample 或 1/sampleSize ******浮动步长 = 1.0/freqSample;/*这是一个不同的例子*///使用它来检查每个 API 调用的输出cl_int 状态；//-----------------------------------------------------//第 1 步:发现并初始化平台//-----------------------------------------------------cl_uint numPlatforms = 0;cl_platform_id *platforms = NULL;//使用 clGetPlatformIDs() 来检索数量//平台状态 = clGetPlatformIDs(0, NULL, &numPlatforms);//为每个平台分配足够的空间平台 =(cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));//用 clGetPlatformIDs() 填充平台status = clGetPlatformIDs(numPlatforms,platforms,空值);//-----------------------------------------------------//STEP 2: 发现并初始化设备//-----------------------------------------------------cl_uint numDevices = 0;cl_device_id *devices = NULL;//使用 clGetDeviceIDs() 来检索//设备存在状态 = 
clGetDeviceIDs(平台[0]，CL_DEVICE_TYPE_CPU，0,空值，&numDevices);//为每个设备分配足够的空间设备 =(cl_device_id*)malloc(numDevices*sizeof(cl_device_id));//用 clGetDeviceIDs() 填充设备状态 = clGetDeviceIDs(平台[0]，CL_DEVICE_TYPE_CPU，数量设备，设备，空值);//-----------------------------------------------------//第 3 步:创建上下文//-----------------------------------------------------cl_context 上下文 = NULL;//使用 clCreateContext() 创建上下文//将其与设备关联上下文 = clCreateContext(空值，数量设备，设备，空值，空值，&状态);//-----------------------------------------------------//STEP 4: 创建命令队列//-----------------------------------------------------cl_command_queue cmdQueue;//使用 clCreateCommandQueue() 创建命令队列，//并将其与您要执行的设备相关联//上cmdQueue = clCreateCommandQueue(语境，设备[0]，CL_QUEUE_PROFILING_ENABLE，&状态);//-----------------------------------------------------//第 5 步:创建设备缓冲区//-----------------------------------------------------cl_mem 输入；cl_mem 输出；cl_float 系数；输入 = clCreateBuffer(语境，CL_MEM_READ_ONLY，num_items,空值，&状态);输出 = clCreateBuffer(语境，CL_MEM_WRITE_ONLY，num_items,空值，&状态);//-----------------------------------------------------//第 6 步:将主机数据写入设备缓冲区//-----------------------------------------------------//使用 clEnqueueWriteBuffer() 将输入数组 Array 写入//设备缓冲区输入状态 = clEnqueueWriteBuffer(cmd队列，输入，CL_FALSE，0,num_items,大批，0,空值，空值);printf("状态 %i \n", 状态);//-----------------------------------------------------//STEP 7: 创建并编译程序//-----------------------------------------------------//使用 clCreateProgramWithSource() 创建程序cl_program 程序 = clCreateProgramWithSource(语境，1、(const char**)&prog,空值，&状态);printf("状态 %i \n", 状态);//为设备构建(编译)程序//clBuildProgram()状态 = clBuildProgram(程序，数量设备，设备，空值，空值，空值);//-----------------------------------------------------//第 8 步:创建内核//-----------------------------------------------------cl_kernel 内核 = NULL;kernel = clCreateKernel(program, "exchange", &status);//-----------------------------------------------------//STEP 9: 设置内核参数//-----------------------------------------------------//将输入和输出缓冲区与//核心//使用 clSetKernelArg()状态 = 
clSetKernelArg(核心，0,大小(cl_mem)，&输入);printf("状态 %i \n",status);状态 |= clSetKernelArg(核心，1、大小(cl_mem)，&输出);//-----------------------------------------------------//STEP 10: 配置工作项结构//-----------------------------------------------------//定义工作的索引空间(全局工作大小)//项目//执行.工作组大小(本地工作大小)不是//必需的，//但可以使用.size_t globalWorkSize[1];//有元素"工作项globalWorkSize[0] = 样本大小；//-----------------------------------------------------//STEP 11: 将内核排队执行//-----------------------------------------------------//通过使用执行内核//clEnqueueNDRangeKernel().//'globalWorkSize' 是一维的维度//工作项状态 = clEnqueueNDRangeKernel(cmd队列，核心，1、空值，全局工作大小，空值，0,空值，&someEvent);clFinish(cmdQueue);clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);double totalTime = 结束 - 开始；printf("总时间为:%f ms \n", totalTime/1000000.0);//-----------------------------------------------------//STEP 12: 将输出缓冲区读回主机//-----------------------------------------------------//使用 clEnqueueReadBuffer() 读取 OpenCL 输出//缓冲区(缓冲区C)//到主机输出数组 (C)printf("成功了！%i \n", status);clEnqueueReadBuffer(cmd队列，输出，CL_TRUE,0,num_items,输出数据，0,空值，空值);printf("在这里成功2！%i \n", status);SNDFILE * outfile = sf_open("outputwavfilepathhere", SFM_WRITE, &info);sf_count_t count = sf_write_int(outfile, OutputData, num_items);sf_write_sync(输出文件)；sf_close(输出文件)；//-----------------------------------------------------//STEP 13: 释放 OpenCL 资源//-----------------------------------------------------//免费 OpenCL 资源clReleaseKernel(内核)；clReleaseProgram(程序)；clReleaseCommandQueue(cmdQueue);clReleaseMemObject(输入);clReleaseMemObject(输出)；clReleaseContext(上下文)；//释放主机资源免费(输出数据)；免费(平台)；免费(设备)；自由(数组)；}

解决方案

试试这个:

/*
 * Copies Array into Output. Each work-item starts at its own global id and
 * steps forward by the total number of launched work-items, so together the
 * work-items write every index below itemCount exactly once.
 */
__kernel void exchange(__global int *Array, __global int *Output)
{
    /* Must equal the number of ints held by both buffers; ideally this is
       passed in from the host as an additional kernel argument. */
    const int itemCount = 1200000;
    int globalSize = get_global_size(0);
    int globalId = get_global_id(0);

    for (int i = globalId; i < itemCount; i += globalSize) {
        Output[i] = Array[i];
    }
}

确保在 for 循环中使用正确的上限.理想情况下，您将其作为另一个参数传入.

您最初的错误在于：只是反复写入前 100011 个元素（下标 0 到 100010）.请阅读工作项函数的相关文档，以了解这些变量的含义.可参考 OpenCL 1.2 参考文档.

I am reading large integer values from an array that has over a million elements. The values are obtained from a wav file using the libsndfile library. If I do not use the kernel, I can write the original array to my output file and listen to the audio with no issues. However, when I use the kernel to do the exact same thing, it only writes maybe less than a second of the song.

At first, I thought this was a memory issue, so I played around with the buffer sizes and still had no luck. Then I thought it could be the loop I am doing in the kernel, so I also played around with the loop values, but came to the same conclusion (it still doesn't work). I am pretty stumped right now and do not know what to do. Here is my code below. Some of the code below is mine, but I found the main structure online to help me set up the kernel.

At the very bottom of this code, if I change OutputData to Array, I get the exact same audio back. I'm pretty sure something is wrong with the kernel and that's why it is not writing the whole song back.

I know this code is messy, but all you have to do to try and test it is copy and paste it and simply change the paths to an input wav file and an output wav file.

Just so the objective is clear, I am going to attempt to modify each value in the wav file, to see what would happen. So far, if I multiply the Output value in the kernel by 2, it distorts it. But again, only lasts for like 1 second and the rest of the clip is empty. Note that both input and output files are the same size.

My for loop is also meant to do 1.2 million iterations, because that's the number of items I have in my sample wav file.

const char* prog = "__kernel void exchange(__global int *Array, __global int *Output) { for(int j = 0; j < 100000; j++){ for(int i = 0; i < 12; i++){ Output[j+i] = (Array[j+i]);}  }  }";

int main() {
// This code executes on the OpenCL host
SNDFILE *sf;
SF_INFO info;
int num_channels;
int num, num_items;
//input and output data
int *Array;
int *OutputData;

int f, sr, c;
int i, j;
FILE *out;

/* Open the WAV file. */
info.format = 0;
sf = sf_open("Yourwavfilepathhere", SFM_READ, &info);
if (sf == NULL)
{
    printf("Failed to open the file.\n");
    perror("Error");
    exit(-1);
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
int format = info.format;
c = info.channels;
printf("frames=%d\n", f);
printf("samplerate=%d\n", sr);
printf("channels=%d\n", c);
printf("format %i\n", format);
num_items = f*c;
printf("num_items=%d\n", num_items);
/* Allocate space for the data to be read, then read it. */
Array = (int *)malloc(num_items*sizeof(int));
OutputData = (int*)malloc(num_items*sizeof(int));;
num = sf_read_int(sf, Array, num_items);
sf_close(sf);
printf("Read %d items\n", num);

//Time variables for performance execution. Event variable needed for timing constraint 
cl_event someEvent;
cl_ulong start = (cl_ulong)0;
cl_ulong end = (cl_ulong)0;
cl_ulong finalTime = (cl_ulong)0;

//Number of sampling points 
int sampleSize = 100;
float h = 0;

//Coefficient used to multiply the values entering the FIFO buffer implemented inside the kernel
float coefficient = 1 / sampleSize;

//Signal Frequency in Hz
float signalFreq = 10;

//Number of points between 0 and max val (T_Sample)
float freqSample = sampleSize*signalFreq;

//Step = max value or T_Sample. ******Either 1/freqSample or 1/sampleSize for the stepSize******
float stepSize = 1.0 / freqSample;

/*
  This is a different Example
*/


// Use this to check the output of each API call
cl_int status;

//-----------------------------------------------------
// STEP 1: Discover and initialize the platforms
//-----------------------------------------------------

cl_uint numPlatforms = 0;

cl_platform_id *platforms = NULL;

// Use clGetPlatformIDs() to retrieve the number of 
// platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);

// Allocate enough space for each platform
platforms =
    (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

// Fill in platforms with clGetPlatformIDs()
status = clGetPlatformIDs(numPlatforms, platforms,
    NULL);

//-----------------------------------------------------
// STEP 2: Discover and initialize the devices
//----------------------------------------------------- 

cl_uint numDevices = 0;
cl_device_id *devices = NULL;

// Use clGetDeviceIDs() to retrieve the number of 
// devices present
status = clGetDeviceIDs(
    platforms[0],
    CL_DEVICE_TYPE_CPU,
    0,
    NULL,
    &numDevices);

// Allocate enough space for each device
devices =
    (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

// Fill in devices with clGetDeviceIDs()
status = clGetDeviceIDs(
    platforms[0],
    CL_DEVICE_TYPE_CPU,
    numDevices,
    devices,
    NULL);


//-----------------------------------------------------
// STEP 3: Create a context
//----------------------------------------------------- 

cl_context context = NULL;

// Create a context using clCreateContext() and 
// associate it with the devices
context = clCreateContext(
    NULL,
    numDevices,
    devices,
    NULL,
    NULL,
    &status);

//-----------------------------------------------------
// STEP 4: Create a command queue
//----------------------------------------------------- 

cl_command_queue cmdQueue;

// Create a command queue using clCreateCommandQueue(),
// and associate it with the device you want to execute 
// on
cmdQueue = clCreateCommandQueue(
    context,
    devices[0],
    CL_QUEUE_PROFILING_ENABLE,
    &status);

//-----------------------------------------------------
// STEP 5: Create device buffers
//----------------------------------------------------- 

cl_mem input;
cl_mem output;
cl_float coeff;

input = clCreateBuffer(
    context,
    CL_MEM_READ_ONLY,
    num_items,
    NULL,
    &status);

output = clCreateBuffer(
    context,
    CL_MEM_WRITE_ONLY,
    num_items,
    NULL,
    &status);

//-----------------------------------------------------
// STEP 6: Write host data to device buffers
//----------------------------------------------------- 

// Use clEnqueueWriteBuffer() to write input array Array to
// the device buffer input
status = clEnqueueWriteBuffer(
    cmdQueue,
    input,
    CL_FALSE,
    0,
    num_items,
    Array,
    0,
    NULL,
    NULL);

printf("status %i \n", status);

//-----------------------------------------------------
// STEP 7: Create and compile the program
//----------------------------------------------------- 

// Create a program using clCreateProgramWithSource()
cl_program program = clCreateProgramWithSource(
    context,
    1,
    (const char**)&prog,
    NULL,
    &status);
printf("status %i \n", status);

// Build (compile) the program for the devices with
// clBuildProgram()
status = clBuildProgram(
    program,
    numDevices,
    devices,
    NULL,
    NULL,
    NULL);

//-----------------------------------------------------
// STEP 8: Create the kernel
//----------------------------------------------------- 

cl_kernel kernel = NULL;

kernel = clCreateKernel(program, "exchange", &status);

//-----------------------------------------------------
// STEP 9: Set the kernel arguments
//----------------------------------------------------- 

// Associate the input and output buffers with the 
// kernel 
// using clSetKernelArg()
status = clSetKernelArg(
    kernel,
    0,
    sizeof(cl_mem),
    &input);
printf("Status %i \n",status);

status |= clSetKernelArg(
    kernel,
    1,
    sizeof(cl_mem),
    &output);


//-----------------------------------------------------
// STEP 10: Configure the work-item structure
//----------------------------------------------------- 

// Define an index space (global work size) of work 
// items for 
// execution. A workgroup size (local work size) is not 
// required, 
// but can be used.
size_t globalWorkSize[1];
// There are 'elements' work-items 
globalWorkSize[0] = sampleSize;

//-----------------------------------------------------
// STEP 11: Enqueue the kernel for execution
//----------------------------------------------------- 

// Execute the kernel by using 
// clEnqueueNDRangeKernel().
// 'globalWorkSize' is the 1D dimension of the 
// work-items
status = clEnqueueNDRangeKernel(
    cmdQueue,
    kernel,
    1,
    NULL,
    globalWorkSize,
    NULL,
    0,
    NULL,
    &someEvent);

clFinish(cmdQueue);

clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

double totalTime = end - start;

printf("Total time is: %f ms \n", totalTime / 1000000.0);
//-----------------------------------------------------
// STEP 12: Read the output buffer back to the host
//----------------------------------------------------- 

// Use clEnqueueReadBuffer() to read the OpenCL output  
// buffer (bufferC) 
// to the host output array (C)
printf("Made it here! %i \n", status);
clEnqueueReadBuffer(
    cmdQueue,
    output,
    CL_TRUE,
    0,
    num_items,
    OutputData,
    0,
    NULL,
    NULL);
printf("Made it here2! %i \n", status);


SNDFILE * outfile = sf_open("outputwavfilepathhere", SFM_WRITE, &info);
sf_count_t count = sf_write_int(outfile, OutputData, num_items);
sf_write_sync(outfile);
sf_close(outfile);

//-----------------------------------------------------
// STEP 13: Release OpenCL resources
//----------------------------------------------------- 

// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseContext(context);

// Free host resources
free(OutputData);
free(platforms);
free(devices);
free(Array);
}

解决方案

Try this:

/*
 * Copies Array into Output. Each work-item starts at its own global id and
 * steps forward by the total number of launched work-items, so together the
 * work-items write every index below itemCount exactly once.
 */
__kernel void exchange(__global int *Array, __global int *Output)
{
    /* Must equal the number of ints held by both buffers; ideally this is
       passed in from the host as an additional kernel argument. */
    const int itemCount = 1200000;
    int globalSize = get_global_size(0);
    int globalId = get_global_id(0);

    for (int i = globalId; i < itemCount; i += globalSize) {
        Output[i] = Array[i];
    }
}

Make sure you use the correct upper limit in the for loop. Ideally, you pass this in as another parameter.

What you were doing wrong originally was writing only the first 100,011 elements (indices 0 through 100010), over and over. Read up on the work-item functions to get an idea of what the variables mean; see the OpenCL 1.2 reference.

这篇关于OpenCL 内核仅部分写入输出缓冲区的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

OpenCL 内核仅部分写入输出缓冲区 [英] OpenCL Kernel only partly writing to output buffer

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

OpenCL 内核仅部分写入输出缓冲区 [英] OpenCL Kernel only partly writing to output buffer

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭