Mac OSx上的OpenCL内核错误 [英] OpenCL kernel error on Mac OSx
问题描述
inline float8 __OVERLOAD__ _name(float8 x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30:note:候选函数
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:421:29:
注意:从宏'__CLFN_FD_1FD_FAST_RELAX'扩展
inline float16 __OVERLOAD__ _name(float16 x){ return _default_name(x); }
^
<程序源>:206:19:错误:调用'__fast_relax_log'是不明确的
det_zkinin + log((2.0)* 3.14));
^ ~~~~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4608:22:
注意:从宏'log'扩展
#define log(__x) __fast_relax_log(__x)
^ ~~~~~~~~~~~~~~ ~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30:
注意:候选函数
__CLFN_FD_1FD_FAST_RELAX (__fast_relax_log,native_log,__cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:416:27:
注意:从宏'__CLFN_FD_1FD_FAST_RELAX'扩展
inline float __OVERLOAD__ _name(float x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30
注意:候选函数
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
^
这是内核代码:
#pragma OPENCL EXTENSION cl_khr_fp64:enable
/*
 * ckf_kernel2 - one measurement-update step, one work-item per filter.
 *
 * Work-item ckf_id (global id 0) processes filter ckf_id; work-items with
 * ckf_id >= numOfCKF do nothing. Per-filter arrays store element k of
 * filter ckf_id at [k * aligned_ckf + ckf_id].
 * NOTE(review): "CKF" presumably means cubature Kalman filter - confirm.
 *
 * Every buffer is double, so the target device must support double
 * precision (cl_khr_fp64).
 *
 * dimx               number of state components; 2 * dimx points are built
 * aligned_dimx       row stride of zrlists
 * numOfCKF           number of filters that are actually processed
 * aligned_ckf        stride between successive elements of one filter
 * iter               selects yrlists[iter] and row iter of zrlists
 * epsilon            scale applied to temp_var when building sing_j
 * yrlists            in:  observation for this step, read at [iter]
 * zrlists            in:  dimx coefficients, read at [iter * aligned_dimx + i]
 * rlists             in:  per-filter term added to innCovMatrixSum
 * init_state         in/out: dimx state values, overwritten with the update
 * init_var           out: dimx values
 * sing_j             out: dimx x (2 * dimx) points per filter
 * covMatrixSum       in:  dimx x dimx per filter, only the diagonal is read
 * cummulative        in/out: running sum of llk per filter
 * temp_var           in:  dimx x dimx values per filter
 * x_k_f              in:  dimx values per filter
 * z_k_j              out: 2 * dimx values per filter
 * crossCovMatrixSum  out: dimx values per filter
 * z_k_f              out: one value per filter
 * innCovMatrixSum    out: one value per filter
 * zk_diff            out: one value per filter
 * reduce_gain_matrix out: dimx values per filter
 * llk                out: one value per filter
 */
__kernel void ckf_kernel2(int dimx, int aligned_dimx,
                          int numOfCKF, int aligned_ckf,
                          int iter,
                          double epsilon,
                          __global double * yrlists,
                          __global double * zrlists,
                          __global double * rlists,
                          __global double * init_state,
                          __global double * init_var,
                          __global double * sing_j,
                          __global double * covMatrixSum,
                          __global double * cummulative,
                          __global double * temp_var,
                          __global double * x_k_f,
                          __global double * z_k_j,
                          __global double * crossCovMatrixSum,
                          __global double * z_k_f,
                          __global double * innCovMatrixSum,
                          __global double * zk_diff,
                          __global double * reduce_gain_matrix,
                          __global double * llk
                          )
{
    int ckf_id = get_global_id(0);
    if (ckf_id < numOfCKF) {
        // Build the 2 * dimx points: init_state[i] plus (first dimx columns)
        // or minus (last dimx columns) epsilon * temp_var[i][column].
        for (int i = 0; i < dimx; i++)
        {
            for (int idx = 0; idx < dimx * 2; idx++)
            {
                int column = idx % dimx;
                int mode = (idx >= dimx) ? -1 : 1;
                sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] = temp_var[(i * dimx + column) * aligned_ckf + ckf_id] * epsilon * mode + init_state[i * aligned_ckf + ckf_id];
            }
        }

        // Project every point through row iter of zrlists; accumulate the
        // sum and the sum of squares of the projections.
        z_k_f[ckf_id] = 0;
        innCovMatrixSum[ckf_id] = 0;
        for (int idx = 0; idx < dimx * 2; idx++)
        {
            z_k_j[idx * aligned_ckf + ckf_id] = 0;
            for (int i = 0; i < dimx; i++)
            {
                z_k_j[idx * aligned_ckf + ckf_id] += sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] * zrlists[iter * aligned_dimx + i];
            }
            z_k_f[ckf_id] += z_k_j[idx * aligned_ckf + ckf_id];
            innCovMatrixSum[ckf_id] += z_k_j[idx * aligned_ckf + ckf_id] * z_k_j[idx * aligned_ckf + ckf_id];
        }

        // Mean, then (mean of squares - mean^2) + rlists[ckf_id].
        z_k_f[ckf_id] = z_k_f[ckf_id] / (dimx * 2);
        innCovMatrixSum[ckf_id] = innCovMatrixSum[ckf_id] / (dimx * 2);
        innCovMatrixSum[ckf_id] = (innCovMatrixSum[ckf_id] - z_k_f[ckf_id] * z_k_f[ckf_id]) + rlists[ckf_id];

        // calculate crossCovMatrixSum
        for (int idx = 0; idx < dimx; idx++)
        {
            crossCovMatrixSum[idx * aligned_ckf + ckf_id] = 0;
            for (int i = 0; i < 2 * dimx; i++)
            {
                crossCovMatrixSum[idx * aligned_ckf + ckf_id] += sing_j[(idx * dimx * 2 + i) * aligned_ckf + ckf_id] * z_k_j[i * aligned_ckf + ckf_id];
            }
            crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] / (dimx * 2);
            crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] - x_k_f[idx * aligned_ckf + ckf_id] * z_k_f[ckf_id];
        }

        // calculate zk_diff
        // An observation that truncates to -1 yields a zero difference.
        // NOTE(review): -1 looks like a "no observation" marker - confirm.
        int z_check = (int)yrlists[iter];
        if (z_check == -1)
            zk_diff[ckf_id] = 0;
        else
            zk_diff[ckf_id] = yrlists[iter] - z_k_f[ckf_id];

        // calculate reduce_gain_matrix and (reduce_state_matrix <==> init_state)
        for (int idx = 0; idx < dimx; idx++)
        {
            reduce_gain_matrix[idx * aligned_ckf + ckf_id] = (crossCovMatrixSum[idx * aligned_ckf + ckf_id] / innCovMatrixSum[ckf_id]);
            init_state[idx * aligned_ckf + ckf_id] = reduce_gain_matrix[idx * aligned_ckf + ckf_id] * zk_diff[ckf_id] + x_k_f[idx * aligned_ckf + ckf_id];
        }

        // init_var: diagonal of covMatrixSum minus gain * innCov * gain.
        for (int idx = 0; idx < dimx; idx++)
        {
            init_var[idx * aligned_ckf + ckf_id] = covMatrixSum[(idx * dimx + idx) * aligned_ckf + ckf_id] -
                reduce_gain_matrix[idx * aligned_ckf + ckf_id] * innCovMatrixSum[ckf_id] *
                reduce_gain_matrix[idx * aligned_ckf + ckf_id];
        }

        // NOTE(review): this division (and the one above) happens before the
        // innCovMatrixSum <= 0 check below.
        double det_zkinin = zk_diff[ckf_id] * zk_diff[ckf_id] * (1.0f / innCovMatrixSum[ckf_id]);
        if (innCovMatrixSum[ckf_id] <= 0)
            llk[ckf_id] = 0;
        else
            // NOTE(review): 3.14 is presumably a rounded pi - confirm.
            llk[ckf_id] = 0.5 * ((log(innCovMatrixSum[ckf_id])) +
                                 det_zkinin + log((2.0) * 3.14));
        cummulative[ckf_id] += llk[ckf_id];
    }
}
我怀疑你正在尝试在不支持双精度的集成Intel GPU上运行。在我自己的Macbook Pro上,只有针对Intel HD 4000编译你的内核代码时才能重现这个错误——当目标设备是CPU或独立的NVIDIA GPU时,它可以正常编译。
您可以通过查询 CL_DEVICE_DOUBLE_FP_CONFIG
设备信息参数来检查设备是否支持双精度:
cl_device_fp_config cfg;
clGetDeviceInfo(device,CL_DEVICE_DOUBLE_FP_CONFIG,sizeof(cfg),& cfg,NULL);
printf(Double FP config =%llu\\\
,cfg);
如果查询得到的 cfg 值为 0
那么不支持双精度。这就解释了为什么编译器日志仅报告 log
函数的 float
变体。
I wrote some OpenCL code which works fine on LINUX, but it is failing with errors on Mac OSX. Can someone please help me to identify why these should occur. The kernel code is shown after the error. My kernel uses double, so I have the corresponding pragma at the top. But I don't know why the error shows float data type:
inline float8 __OVERLOAD__ _name(float8 x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30: note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:421:29:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float16 __OVERLOAD__ _name(float16 x){ return _default_name(x); }
^
<program source>:206:19: error: call to '__fast_relax_log' is ambiguous
det_zkinin + log((2.0) * 3.14));
^~~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4608:22:
note: expanded from macro 'log'
#define log(__x) __fast_relax_log(__x)
^~~~~~~~~~~~~~~~
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30:
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:416:27:
note: expanded from macro '__CLFN_FD_1FD_FAST_RELAX'
inline float __OVERLOAD__ _name(float x) { return _default_name(x); } \
^
/System/Library/Frameworks/OpenCL.framework/Versions/A/lib/clang/3.2/include/cl_kernel.h:4606:30
note: candidate function
__CLFN_FD_1FD_FAST_RELAX(__fast_relax_log, native_log, __cl_log);
^
^
This is the kernel code:
#pragma OPENCL EXTENSION cl_khr_fp64: enable
/*
 * ckf_kernel2 - one measurement-update step, one work-item per filter.
 *
 * Work-item ckf_id (global id 0) processes filter ckf_id; work-items with
 * ckf_id >= numOfCKF do nothing. Per-filter arrays store element k of
 * filter ckf_id at [k * aligned_ckf + ckf_id].
 * NOTE(review): "CKF" presumably means cubature Kalman filter - confirm.
 *
 * Every buffer is double, so the target device must support double
 * precision (cl_khr_fp64).
 *
 * dimx               number of state components; 2 * dimx points are built
 * aligned_dimx       row stride of zrlists
 * numOfCKF           number of filters that are actually processed
 * aligned_ckf        stride between successive elements of one filter
 * iter               selects yrlists[iter] and row iter of zrlists
 * epsilon            scale applied to temp_var when building sing_j
 * yrlists            in:  observation for this step, read at [iter]
 * zrlists            in:  dimx coefficients, read at [iter * aligned_dimx + i]
 * rlists             in:  per-filter term added to innCovMatrixSum
 * init_state         in/out: dimx state values, overwritten with the update
 * init_var           out: dimx values
 * sing_j             out: dimx x (2 * dimx) points per filter
 * covMatrixSum       in:  dimx x dimx per filter, only the diagonal is read
 * cummulative        in/out: running sum of llk per filter
 * temp_var           in:  dimx x dimx values per filter
 * x_k_f              in:  dimx values per filter
 * z_k_j              out: 2 * dimx values per filter
 * crossCovMatrixSum  out: dimx values per filter
 * z_k_f              out: one value per filter
 * innCovMatrixSum    out: one value per filter
 * zk_diff            out: one value per filter
 * reduce_gain_matrix out: dimx values per filter
 * llk                out: one value per filter
 */
__kernel void ckf_kernel2(int dimx, int aligned_dimx,
int numOfCKF, int aligned_ckf,
int iter,
double epsilon,
__global double * yrlists,
__global double * zrlists,
__global double * rlists,
__global double * init_state,
__global double * init_var,
__global double * sing_j,
__global double * covMatrixSum,
__global double * cummulative,
__global double * temp_var,
__global double * x_k_f,
__global double * z_k_j,
__global double * crossCovMatrixSum,
__global double * z_k_f,
__global double * innCovMatrixSum,
__global double * zk_diff,
__global double * reduce_gain_matrix,
__global double * llk
)
{
int ckf_id = get_global_id(0);
if( ckf_id < numOfCKF){
// Build the 2 * dimx points: init_state[i] plus (first dimx columns)
// or minus (last dimx columns) epsilon * temp_var[i][column].
for (int i = 0 ; i < dimx ; i++)
{
for (int idx = 0; idx < dimx * 2 ; idx++)
{
int column = idx % dimx;
int mode = (idx >= dimx) ? -1 : 1;
sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] = temp_var[(i * dimx + column) * aligned_ckf + ckf_id] * epsilon * mode + init_state[i * aligned_ckf + ckf_id];
}
}
// Project every point through row iter of zrlists; accumulate the sum
// and the sum of squares of the projections.
z_k_f[ckf_id] = 0;
innCovMatrixSum[ckf_id] = 0;
for (int idx = 0; idx < dimx * 2 ; idx++)
{
z_k_j[idx * aligned_ckf + ckf_id] = 0;
// Braceless loop: only the next statement is its body.
for (int i = 0 ; i < dimx ; i++)
z_k_j[idx * aligned_ckf + ckf_id] += sing_j[(i * dimx * 2 + idx) * aligned_ckf + ckf_id] * zrlists[iter * aligned_dimx + i ];
z_k_f[ckf_id] += z_k_j[idx* aligned_ckf + ckf_id] ;
innCovMatrixSum[ckf_id] += z_k_j[idx* aligned_ckf + ckf_id] * z_k_j[idx* aligned_ckf + ckf_id];
}
// Mean, then (mean of squares - mean^2) + rlists[ckf_id].
z_k_f[ckf_id] = z_k_f[ckf_id] / (dimx * 2);
innCovMatrixSum[ckf_id] = innCovMatrixSum[ckf_id] / (dimx * 2);
innCovMatrixSum[ckf_id] = (innCovMatrixSum[ckf_id] - z_k_f[ckf_id] *z_k_f[ckf_id]) + rlists[ckf_id];
// calculate crossCovMatrixSum
for (int idx = 0; idx < dimx; idx ++)
{
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = 0;
for (int i = 0 ; i < 2 * dimx ; i++)
{
crossCovMatrixSum[idx * aligned_ckf + ckf_id] += sing_j[(idx * dimx*2 + i) * aligned_ckf + ckf_id ] * z_k_j[i* aligned_ckf + ckf_id];
}
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id]/ (dimx * 2);
crossCovMatrixSum[idx * aligned_ckf + ckf_id] = crossCovMatrixSum[idx * aligned_ckf + ckf_id] - x_k_f[idx* aligned_ckf + ckf_id] * z_k_f[ckf_id];
}
// calculate zk_diff
// An observation that truncates to -1 yields a zero difference.
// NOTE(review): -1 looks like a "no observation" marker - confirm.
int z_check = (int)yrlists[iter];
if (z_check == -1)
zk_diff[ckf_id] = 0;
else
zk_diff[ckf_id] = yrlists[iter] - z_k_f[ckf_id];
// calculate reduce_gain_matrix and (reduce_state_matrix <==> init_state);
for (int idx = 0 ; idx < dimx; idx++)
{
reduce_gain_matrix[idx * aligned_ckf + ckf_id] = (crossCovMatrixSum[idx * aligned_ckf + ckf_id] / innCovMatrixSum[ckf_id]);
init_state[idx * aligned_ckf + ckf_id] = reduce_gain_matrix[idx * aligned_ckf + ckf_id] * zk_diff[ckf_id] + x_k_f[idx* aligned_ckf + ckf_id];
}
// init_var: diagonal of covMatrixSum minus gain * innCov * gain.
for (int idx = 0 ; idx < dimx; idx++)
{
init_var[idx * aligned_ckf + ckf_id ] = covMatrixSum[(idx * dimx + idx) * aligned_ckf + ckf_id] -
reduce_gain_matrix[idx * aligned_ckf + ckf_id] * innCovMatrixSum[ckf_id] *
reduce_gain_matrix[idx * aligned_ckf + ckf_id];
}
// NOTE(review): this division (and the one above) happens before the
// innCovMatrixSum <= 0 check below.
double det_zkinin = zk_diff[ckf_id] * zk_diff[ckf_id] * (1.0f /innCovMatrixSum[ckf_id]);
if (innCovMatrixSum[ckf_id] <= 0)
llk[ckf_id] = 0;
else
// NOTE(review): 3.14 is presumably a rounded pi - confirm.
llk[ckf_id] = 0.5 * ((log(innCovMatrixSum[ckf_id])) +
det_zkinin + log((2.0) * 3.14));
cummulative[ckf_id] += llk[ckf_id];
}
}
I suspect you are trying to run this on an integrated Intel GPU which does not support double precision. I can only reproduce your error on my own Macbook Pro if I compile your kernel code for the Intel HD 4000 - it compiles just fine when I target the CPU or the discrete NVIDIA GPU.
You can check if the device supports double precision by querying the CL_DEVICE_DOUBLE_FP_CONFIG
device information parameter:
// Query the double-precision capability bitfield of `device` into cfg.
cl_device_fp_config cfg;
// NOTE(review): the status code returned by clGetDeviceInfo is discarded.
clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
// Print the raw bitfield value.
printf("Double FP config = %llu\n", cfg);
If the queried value stored in cfg is 0
, then double precision is not supported. This explains why the compiler log is only reporting float
variants of the log
function.
这篇关于Mac OSx上的OpenCL内核错误的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!