ARM内联汇编代码错误“asm中不可能的约束” [英] ARM inline assembly code with error "impossible constraint in asm"

查看:637
本文介绍了ARM内联汇编代码错误“asm中不可能的约束”的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我试图优化下面的代码complex.cpp:

$ p $ typedef struct {
float re;
float im;
} dcmplx;

dcmplx ComplexConv(int len,dcmplx * hat,dcmplx * buf)
{
int i;
dcmplx z,xout;

xout.re = xout.im = 0.0;
asm volatile(
movs r3,#0\\\
\t
.loop:\\\
\t
vldr s11,[% #4] \\\\\
vldr s13,[%[hat]] \\\
vneg.f32 s11,s11 \\\ n \ t
vldr s15,[%[buf],#4] \\\
\t
vldr s12,[%[buf]] \\\
\
vmul.f32 s14,s15,s13 \\\
\t
vmul.f32 s15,s11,s15 \\\
加%[帽],#8 \\\
\t
vmla.f32 s14,s11,s12\
'\\t
vnmls.f32 s15,s12,s13\\\
\
增加%[buf],#8 \\\
vadd.f32 s1,s1,s14\\\
\t
vadd.f32 s0,s0,s15 \\\
\t
将r3,r3,#1 \\\
添加到cmp r3,r0\\\
\
bne .loop \\\
\t
:= r(xout)
:[hat]r(hat),[buf]r(buf)
:s0 ,cc
);
return xout;
}

使用arm-linux-gnueabihf-g ++ -c complex .cpp -o complex.o -mfpu = neon,
我收到以下错误:'asm'中的不可约束。



当我注释掉= r(xout)时,编译没有抱怨,但是我怎样才能将寄存器's0'的结果写入xout?



此外,如果r0包含返回值,但返回类型是复杂结构,它是如何工作的,因为r0只是一个32位?注册。



我在这里发布的原始c代码:

  dcmplx ComplexConv(int len,dcmplx * hat,dcmplx * buf)
{
int i;
dcmplx z,xout;
xout.re = xout.im = 0.0;
for(int i = 0; i< len; i ++){
z = BI_dcmul(BI_dconjg(hat [i]),buf [i]);
xout = BI_dcadd(xout,z);
}
return xout;
}
dcmplx BI_dcmul(dcmplx x,dcmplx y)
{
dcmplx z;
z.re = x.re * y.re - x.im * y.im;
z.im = x.im * y.re + x.re * y.im;
return z;
}
dcmplx BI_dconjg(dcmplx x)
{
dcmplx y;
y.re = x.re;
y.im = -x.im;
return y;
}
dcmplx BI_dcadd(dcmplx x,dcmplx y)
{
dcmplx z;
z.re = x.re + y.re;
z.im = x.im + y.im;
return z;


解决方案

错误次数:


  • 它尝试将64位结构用作32位输出寄存器的操作数(= r)约束。这是什么给你的错误。
  • 它不会在任何地方使用该输出操作数
  • 它不告诉编译器输出实际是(S0 / S1)
  • 它不会告诉编译器 len 应该是一个输入

  • 在不告诉编译器的情况下,它封锁了多个寄存器R3,S11,S12,S13,S14,S14。
  • 它使用一个标签 .loop 不必要地阻止编译器在多处插入代码。
  • 它实际上并不等同于C ++代码,已经显示,计算其他东西。



我不打算解释如何解决所有这些错误,因为你不应该使用内联汇编。您可以使用C ++编写代码,并让编译器进行向量化。



例如编译以下代码,相当于您的示例C ++代码,使用GCC 4.9和 -O3 -funsafe-math-optimizations options:

  dcmplx ComplexConv(int len ,dcmplx * hat,dcmplx * buf)
{
int i;
dcmplx xout;
xout.re = xout.im = 0.0;
for(i = 0; i< len; i ++){
xout.re + = hat [i] .re * buf [i] .re + hat [i] .im * buf [ I] .IM;
xout.im + = hat [i] .re * buf [i] .im - hat [i] .im * buf [i] .re;
}
return xout;
}

生成以下程序集作为其内部循环:

  .L97:
add lr,lr,#1
cmp ip,lr
vld2.32 {d20-d23},[r5]!
vld2.32 {d24-d27},[r4]!
vmul.f32 q15,q12,q10
vmul.f32 q14,q13,q10
vmla.f32 q15,q13,q11
vmls.f32 q14,q12,q11
vadd.f32 q9,q9,q15
vadd.f32 q8,q8,q14
bhi .L97

基于内联汇编代码,编译器生成的结果可能比您想要自己对其进行矢量化时的结果要好。



-funsafe-math-optimizations 是必要的,因为NEON指令不完全符合IEEE 754标准。正如 GCC文档所述:


如果选定的浮点硬件包含NEON扩展
(例如 -mfpu ='neon' code>),请注意,除非
-funsafe-math-optimizations ,否则浮点运算不是由GCC的自动向量化过程生成的
。也被指定。这是因为NEON硬件没有完全实现用于
浮点运算的IEEE 754标准(特别是非正常值被视为
为零),所以使用NEON指令可能会导致
精度。

我还应该注意到,如果您不自己推出自己的编译器,那么编译器的生成几乎与上面的代码一样好复杂类型,如下例所示:

  #include< complex> 
typedef std :: complex< float>复杂;
complex ComplexConv_std(int len,complex * hat,complex * buf)
{
int i;
complex xout(0.0f,0.0f);
for(i = 0; i< len; i ++){
xout + = std :: conj(hat [i])* buf [i];
}
return xout;
}

然而,使用自己的类型的一个好处是可以改进代码编译器生成一个小小的改变,如何声明 struct dcmplx

  typedef struct {
float re;
float im;
$ __attribute __((aligned(8))dcmplx;

8字节(64位)对齐,这允许编译器跳过检查,看看它是否适当对齐,然后回落到较慢的标量实现。



现在,假设你可以说你对GCC向量化你的代码感到不满意,并认为你可以做得更好。这是否证明使用内联汇编是合理的?不,下一个要尝试的是 ARM NEON intrinsics 。使用内在函数就像正常的C ++编程一样,您不必担心例如下面是我将上面的向量化程序集转换为使用内在函数的未经测试的代码:

  #include< assert.h> 
#include< arm_neon.h>
dcmplx ComplexConv(int len,dcmplx * hat,dcmplx * buf)
{
int i;
dcmplx xout;

/ *一切都需要适当对齐* /
assert(len%4 == 0);
assert(((unsigned)hat%8)== 0);
assert(((unsigned)buf%8)== 0);

float32x4_t re,im; (i = 0; i< len; i + = 4){
float32x4x2_t h = vld2q_f32(& hat [i] .re)的
;
float32x4x2_t b = vld2q_f32(& buf [i] .re);
re = vaddq_f32(re,vmlaq_f32(vmulq_f32(h.val [0],b.val [0]),
b.val [1],h.val [1]));
im = vaddq_f32(im,vmlsq_f32(vmulq_f32(h.val [1],b.val [1]),
b.val [0],h.val [0]));
}
float32x2_t re_tmp = vadd_f32(vget_low_f32(re),vget_high_f32(re));
float32x2_t im_tmp = vadd_f32(vget_low_f32(im),vget_high_f32(im));
xout.re = vget_lane_f32(vpadd_f32(re_tmp,re_tmp),0);
xout.im = vget_lane_f32(vpadd_f32(im_tmp,im_tmp),0);
return xout;
}

最后,如果这不够好,你需要调整每一点的性能,那么使用内联汇编仍然不是一个好主意。相反,你最后的手段应该是使用常规组装。由于您在汇编中重写了大部分函数,​​因此您最好在汇编中完整地编写它。这意味着你不用担心告诉编译器你在内联汇编中做的所有事情。您只需要符合ARM ABI,这可能会非常棘手,但比通过内联汇编更正所有问题要容易得多。


I am trying to optimize the following code complex.cpp:

typedef struct {
    float re;
    float im;
} dcmplx;

dcmplx ComplexConv(int len, dcmplx *hat, dcmplx *buf)
{
    int    i;
    dcmplx    z, xout;

    xout.re = xout.im = 0.0;
    asm volatile (
    "movs r3, #0\n\t"
    ".loop:\n\t"
    "vldr s11, [%[hat], #4]\n\t"
    "vldr s13, [%[hat]]\n\t"
    "vneg.f32 s11, s11\n\t"
    "vldr s15, [%[buf], #4]\n\t"
    "vldr s12, [%[buf]]\n\t"
    "vmul.f32 s14, s15, s13\n\t"
    "vmul.f32 s15, s11, s15\n\t"
    "adds %[hat], #8\n\t"
    "vmla.f32 s14, s11, s12\n\t"
    "vnmls.f32 s15, s12, s13\n\t"
    "adds %[buf], #8\n\t"
    "vadd.f32 s1, s1, s14\n\t"
    "vadd.f32 s0, s0, s15\n\t"
    "adds r3, r3, #1\n\t"
    "cmp r3, r0\n\t"
    "bne .loop\n\t"
    : "=r"(xout)
    : [hat]"r"(hat),[buf]"r"(buf) 
    : "s0","cc"
    );
    return xout;
}

When it is compiled with "arm-linux-gnueabihf-g++ -c complex.cpp -o complex.o -mfpu=neon", I got the following error: impossible constraint in 'asm'.

When I comment out "=r"(xout), the compile doesn't complain, but how can I get result of register 's0' into xout?

Besides, how it works if r0 contains return value but the return type is a complicate structure, since r0 is only a 32-bit? register.

The original c code I post here:

dcmplx ComplexConv(int len, dcmplx *hat, dcmplx *buf)
{
    int    i;
    dcmplx    z, xout;
    xout.re = xout.im = 0.0;
    for(int i = 0; i < len; i++) {
        z = BI_dcmul(BI_dconjg(hat[i]),buf[i]);
        xout = BI_dcadd(xout,z);
    }
    return xout;
}
dcmplx BI_dcmul(dcmplx x, dcmplx y)
{
    dcmplx    z;
    z.re = x.re * y.re - x.im * y.im;
    z.im = x.im * y.re + x.re * y.im;
    return z;
}
dcmplx BI_dconjg(dcmplx x)
{
    dcmplx    y;
    y.re = x.re;
    y.im = -x.im;
    return y;
}
dcmplx BI_dcadd(dcmplx x, dcmplx y)
{
    dcmplx    z;
    z.re = x.re + y.re;
    z.im = x.im + y.im;
    return z;
}

解决方案

Your inline assembly code makes a number of mistakes:

  • It tries to use a 64-bit structure as an operand with a 32-bit output register ("=r") constraint. This is what gives you the error.
  • It doesn't use that output operand anywhere
  • It doesn't tell the compiler where the output actually is (S0/S1)
  • It doesn't tell the compiler that len is supposed to be an input
  • It clobbers a number of registers, R3, S11, S12, S13, S14, S14, without telling the compiler.
  • It uses a label .loop that unnecessarily prevents the compiler from inlining your code in multiple places.
  • It doesn't actually appear to be the equivalent of the C++ code you've shown, calculating something else instead.

I'm not going to bother to explain how you can fix all these mistakes, because you shouldn't be using inline assembly. You can write your code in C++ and let the compiler do the vectorization.

For example compiling following code, equivalent to your example C++ code, with GCC 4.9 and the -O3 -funsafe-math-optimizations options:

dcmplx ComplexConv(int len, dcmplx *hat, dcmplx *buf)
{
    int    i;
    dcmplx xout;
    xout.re = xout.im = 0.0;
    for (i = 0; i < len; i++) {
        xout.re += hat[i].re * buf[i].re + hat[i].im * buf[i].im;
        xout.im += hat[i].re * buf[i].im - hat[i].im * buf[i].re;
    }
    return xout;
}

generates the following assembly as its inner loop:

.L97:
    add lr, lr, #1
    cmp ip, lr
    vld2.32 {d20-d23}, [r5]!
    vld2.32 {d24-d27}, [r4]!
    vmul.f32    q15, q12, q10
    vmul.f32    q14, q13, q10
    vmla.f32    q15, q13, q11
    vmls.f32    q14, q12, q11
    vadd.f32    q9, q9, q15
    vadd.f32    q8, q8, q14
    bhi .L97

Based on your inline assembly code, it's likely that the compiler generated better than what you would've come up with if you tried to vectorize it yourself.

The -funsafe-math-optimizations is necessary because the NEON instructions aren't fully IEEE 754 conformant. As the GCC documentation states:

If the selected floating-point hardware includes the NEON extension (e.g. -mfpu=‘neon’), note that floating-point operations are not generated by GCC's auto-vectorization pass unless -funsafe-math-optimizations is also specified. This is because NEON hardware does not fully implement the IEEE 754 standard for floating-point arithmetic (in particular denormal values are treated as zero), so the use of NEON instructions may lead to a loss of precision.

I should also note that the compiler generates almost as good as code above if you don't roll your own complex type, like in the following example:

#include <complex>
typedef std::complex<float> complex;
complex ComplexConv_std(int len, complex *hat, complex *buf)
{
    int    i;
    complex xout(0.0f, 0.0f); 
    for (i = 0; i < len; i++) {
        xout += std::conj(hat[i]) * buf[i];
    }
    return xout;
}

One advantage to using your own type however, is that you can improve the code compiler generates making one small change to how you declare struct dcmplx:

typedef struct {
    float re;
    float im;
} __attribute__((aligned(8)) dcmplx;

By saying it needs to be 8-byte (64-bit) aligned, this allows the compiler to skip the check to see if it is suitably aligned and then fall back on the slower scalar implementation instead.

Now, hypothetically, lets say you were unsatisfied with how GCC vectorized your code and thought you could do better. Would this justify using inline assembly? No, the next thing to try are the ARM NEON intrinsics. Using intrinics is just like normal C++ programming, you don't have worry about a bunch of special rules you need to follow. For example here's how I converted the vectorized assembly above into this untested code that uses intrinsics:

#include <assert.h>
#include <arm_neon.h>
dcmplx ComplexConv(int len, dcmplx *hat, dcmplx *buf)
{
    int    i;
    dcmplx xout;

    /* everything needs to be suitably aligned */
    assert(len % 4 == 0);
    assert(((unsigned) hat % 8) == 0);
    assert(((unsigned) buf % 8) == 0);

    float32x4_t re, im;
    for (i = 0; i < len; i += 4) {
        float32x4x2_t h = vld2q_f32(&hat[i].re);
        float32x4x2_t b = vld2q_f32(&buf[i].re);
        re = vaddq_f32(re, vmlaq_f32(vmulq_f32(h.val[0], b.val[0]),
                                     b.val[1], h.val[1]));
        im = vaddq_f32(im, vmlsq_f32(vmulq_f32(h.val[1], b.val[1]),
                                     b.val[0], h.val[0]));
    }
    float32x2_t re_tmp = vadd_f32(vget_low_f32(re), vget_high_f32(re));
    float32x2_t im_tmp = vadd_f32(vget_low_f32(im), vget_high_f32(im));
    xout.re = vget_lane_f32(vpadd_f32(re_tmp, re_tmp), 0);
    xout.im = vget_lane_f32(vpadd_f32(im_tmp, im_tmp), 0);
    return xout;
}

Finally if this wasn't good enough and you needed to tweak out every bit of performance you could then it's still not a good idea to use inline assembly. Instead your last resort should be to use regular assembly instead. Since your rewriting most of the function in assembly, you might as well write it completely in assembly. That means you don't have worry about telling the compiler about everything you're doing in the inline assembly. You only need to conform to the ARM ABI, which can be tricky enough, but is a lot easier than getting everything correct with inline assembly.

这篇关于ARM内联汇编代码错误“asm中不可能的约束”的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆