如何在gcc中启用sse3自动插件 [英] How to enable sse3 autovectorization in gcc

查看：224 发布时间：2018/4/20 16:26:58 c gcc sse auto-vectorization

本文介绍了如何在gcc中启用sse3自动插件的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我有一个简单的循环，带有n个复数的乘积。当我数百万次执行这个循环时，我希望它尽可能快。我明白，使用SSE3和gcc内在函数可以快速完成此操作，但是我对是否有可能让gcc自动矢量化代码感兴趣。以下是一些示例代码

  #include< complex.h> 
 complex float f（complex float x []，int n）{
 complex float p = 1.0; 
 for（int i = 0; i< n; i ++）
 p * = x [i]; 
 return p; 
}

您从gcc -S -O3 -ffast-math获得的程序集为：

  .filetest.c
 .section .text.unlikely，ax，@ progbits 
 .LCOLDB2：
 .text 
 .LHOTB2：
 .p2align 4，，15 
 .globl f 
 .type f，@function 
f： 
 .LFB0：
 .cfi_startproc 
 testl％esi，％esi 
 jle .L4 
 leal -1（％rsi），％eax 
 pxor％ xmm2，％xmm2 
 movss .LC1（％rip），％xmm3 
 leaq 8（％rdi，％rax，8），％rax 
 .p2align 4，，10 
 .p2align 3 
 .L3：
 movaps％xmm3，％xmm5 
 movaps％xmm3，％xmm4 
 movss（％rdi），％xmm0 
 addq $ 8，％ rdi 
 movss -4（％rdi），％xmm1 
 mulss％xmm0，％xmm5 
 mulss％xmm1，％xmm4 
 cmpq％rdi，％rax 
 mulss ％xmm2，％xmm0 
 mulss％xmm2，％xmm1 
 movaps％xmm5，％xmm3 
 movaps％xmm4，％xmm2 
 subss％xmm1，％xmm3 
 addss％xmm0，％xmm2 
 jne .L3 
 movaps％xmm2 （％rsp）
 movq％xmm3，-8（％rsp） ％xmm0 
 ret 
 .L4：
 movss .LC1（％rip），％xmm3 
 pxor％xmm1，％xmm1 
 jmp .L2 
。 cfi_endproc 
 .LFE0：
 .size f，。-f 
 .section .text.unlikely 
 .LCOLDE2：
 .text 
 .LHOTE2：
 .section .rodata.cst4，aM，@ progbits，4 
 .align 4 
 .LC1：
 .long 1065353216 
 .identGCC：（Ubuntu 5.4.0-6ubuntu1〜16.04.4）5.4.0 20160609
 .section .note.GNU-stack，，@ progbits

解决方案

问题是复杂类型不是SIMD友好的。我从来不是 complex 类型的粉丝，因为它是一个复合对象，通常不会映射到硬件中的原始类型或单个操作（当然不适用于x86硬件）。

为了使复杂算术SIMD更友好，您需要同时操作多个复数。对于SSE，您需要一次操作四个复数。

我们可以使用GCC的矢量扩展，以使语法更加简单。

  typedef float v4sf __attribute__（（vector_size（16）））;

然后我们可以删除一个数组的联合和向量扩展

  typedef union {
 v4sf v; 
 float e [4]; 
} float4

最后，我们定义一个包含四个复数的块，如

  typedef struct {
 float4 x; 
 float4 y; 
} complex4;

其中 x 是四个实数部分， code> y 是四个虚部。

一旦我们有了这个，我们可以像这样一次复数4个复数。

  static complex4 complex4_mul（complex4 a，complex4 b）{
 return（complex4）{axv * bxv -ayv * byv，ayv * bxv + axv * byv}; 
 
 
 
 
 
 $ b 最后我们将你的函数修改为在四个复数上运行时间。
 
 
  complex4 f4（complex4 x []，int n）{
 v4sf one = {1,1， 1,1}; 
 complex4 p = {one，one}; 
 for（int i = 0; i  return p; 
} 
  让我们看看汇编语言（英特尔语法） p 
 
 $ $ p $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ xmm1，XMMWORD PTR -16 [b] 
 cmp rdx，rsi 
 movaps xmm2，xmm4 
 movaps xmm5，xmm1 
 mulps xmm1，xmm3 
 mulps xmm2，xmm3 
 mulps xmm5，xmm0 
 mulps xmm0，xmm4 
 subps xmm2，xmm5 
 addps xmm0，xmm1 
 movaps xmm3，xmm2 
 jne .L3

恰好有四个4宽乘法，一个4宽加法和一个4宽减法。变量 p 保留在寄存器中，只有数组 x 从内存中加载，就像我们想要的一样。

让我们来看看复数乘积的代数

pre $ code> {a，bi } * {c，di} = {（ac - bd），（bc + ad）i}

这正是四次乘法，一次加法和一次减法。

正如我在 answer 有效的SIMD代数通常与标量算术相同。因此，我们用四个4宽乘法，加法和减法代替了四个1宽乘法，加法和减法。请注意，除了SSE之外，这不需要任何指令，也不需要额外的SSE指令（除了FMA4）将会更好。因此，在64位系统上，您可以使用 -O3 来编译。

将其扩展为8 AVX全SIMD。

使用GCC的矢量扩展的一个主要优势是您无需额外的工作即可获得FMA。例如。如果使用 -O3 -mfma4 进行编译，则主循环为

 。 L3：
 vmovaps xmm0，XMMWORD PTR 16 
 add rsi，32 
 vmulps xmm1，xmm0，xmm2 
 vmulps xmm0，xmm0，xmm3 
 vfmsubps xmm1， xmm3，XMMWORD PTR -32 [rsi]，xmm1 
 vmovaps xmm3，xmm1 
 vfmaddps xmm2，xmm2，XMMWORD PTR -32 [rsi]，xmm0 
 cmp rdx，rsi 
 jne .L3

I have a simple loop with takes the product of n complex numbers. As I perform this loop millions of times I want it to be as fast as possible. I understand that it's possible to do this quickly using SSE3 and gcc intrinsics but I am interested in whether it is possible to get gcc to auto-vectorize the code. Here is some sample code

#include <complex.h>
complex float f(complex float x[], int n ) {
  complex float p = 1.0;
  for (int i = 0; i < n; i++)
    p *= x[i];
  return p;
}

The assembly you get from gcc -S -O3 -ffast-math is:

        .file   "test.c"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB2:
        .text
.LHOTB2:
        .p2align 4,,15
        .globl  f
        .type   f, @function
f:
.LFB0:
        .cfi_startproc
        testl   %esi, %esi
        jle     .L4
        leal    -1(%rsi), %eax
        pxor    %xmm2, %xmm2
        movss   .LC1(%rip), %xmm3
        leaq    8(%rdi,%rax,8), %rax
        .p2align 4,,10
        .p2align 3
.L3:
        movaps  %xmm3, %xmm5
        movaps  %xmm3, %xmm4
        movss   (%rdi), %xmm0
        addq    $8, %rdi
        movss   -4(%rdi), %xmm1
        mulss   %xmm0, %xmm5
        mulss   %xmm1, %xmm4
        cmpq    %rdi, %rax
        mulss   %xmm2, %xmm0
        mulss   %xmm2, %xmm1
        movaps  %xmm5, %xmm3
        movaps  %xmm4, %xmm2
        subss   %xmm1, %xmm3
        addss   %xmm0, %xmm2
        jne     .L3
        movaps  %xmm2, %xmm1
.L2:
        movss   %xmm3, -8(%rsp)
        movss   %xmm1, -4(%rsp)
        movq    -8(%rsp), %xmm0
        ret
.L4:
        movss   .LC1(%rip), %xmm3
        pxor    %xmm1, %xmm1
        jmp     .L2
        .cfi_endproc
.LFE0:
        .size   f, .-f
        .section        .text.unlikely
.LCOLDE2:
        .text
.LHOTE2:
        .section        .rodata.cst4,"aM",@progbits,4
        .align 4
.LC1:
        .long   1065353216
        .ident  "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609"
        .section        .note.GNU-stack,"",@progbits

解决方案

The problem is that the complex type is not SIMD friendly. I have never been a fan of the complex type because it's a composite object that usually does not map to a primitive type or single operation in hardware (certainly not with x86 hardware).

In order to make complex arithmetic SIMD friendly you need to operate on multiple complex numbers simultaneous. For SSE you need to operate on four complex numbers at once.

We can use GCC's vector extensions to make the syntax easier.
typedef float v4sf __attribute__ ((vector_size (16)));
Then we can delcare a union of an array and the vector extension
typedef union { v4sf v; float e[4]; } float4
And lastly we define a block of four complex numbers like this
typedef struct { float4 x; float4 y; } complex4;
where x is four real parts and y is four imaginary components.

Once we have this we can multiple 4 complex numbers at once like this
static complex4 complex4_mul(complex4 a, complex4 b) { return (complex4){a.x.v*b.x.v -a.y.v*b.y.v, a.y.v*b.x.v + a.x.v*b.y.v}; }
and finally we get to your function modified to operate on four complex numbers at a time.
complex4 f4(complex4 x[], int n) { v4sf one = {1,1,1,1}; complex4 p = {one,one}; for (int i = 0; i < n; i++) p = complex4_mul(p, x[i]); return p; }
Let's look at the assembly (Intel syntax) to see if it's optimal
.L3: movaps xmm4, XMMWORD PTR [rsi] add rsi, 32 movaps xmm1, XMMWORD PTR -16[rsi] cmp rdx, rsi movaps xmm2, xmm4 movaps xmm5, xmm1 mulps xmm1, xmm3 mulps xmm2, xmm3 mulps xmm5, xmm0 mulps xmm0, xmm4 subps xmm2, xmm5 addps xmm0, xmm1 movaps xmm3, xmm2 jne .L3
That's exactly four 4-wide multiplications, one 4-wide addition, and one 4-wide subtraction. The variable p stays in register and only the array x is loaded from memory just like we want.

Let's look at the algebra for the product of complex numbers
{a, bi}*{c, di} = {(ac - bd),(bc + ad)i}
That's exactly four multiplications, one addition, and one subtraction.

As I explained in this answer efficient SIMD algebraically is often identical to the scalar arithmetic. So we have replaced four 1-wide multiplications, addition, and subtraction, with four 4-wide multiplications, addition, and subtraction. That's the best you can do with 4-wide SIMD: four for the price of one.

Note that this does not need any instructions beyond SSE and no additional SSE instructions (except for FMA4) will be any better. So on a 64-bit system you can compile with -O3.

It is trivial to extend this for 8-wide SIMD with AVX.

One major advantage of using GCC's vector extensions is you get FMA without any additional effort. E.g. if you compile with -O3 -mfma4 the main loop is
.L3: vmovaps xmm0, XMMWORD PTR 16[rsi] add rsi, 32 vmulps xmm1, xmm0, xmm2 vmulps xmm0, xmm0, xmm3 vfmsubps xmm1, xmm3, XMMWORD PTR -32[rsi], xmm1 vmovaps xmm3, xmm1 vfmaddps xmm2, xmm2, XMMWORD PTR -32[rsi], xmm0 cmp rdx, rsi jne .L3

这篇关于如何在gcc中启用sse3自动插件的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

如何在gcc中启用sse3自动插件 [英] How to enable sse3 autovectorization in gcc

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

如何在gcc中启用sse3自动插件 [英] How to enable sse3 autovectorization in gcc

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭