Unroll loop and do independent sum with vectorization

Problem Description

For the following loop, GCC will only vectorize it if I tell it to use associative math, e.g. with -Ofast.

float sumf(float *x)
{
  x = (float*)__builtin_assume_aligned(x, 64);
  float sum = 0;
  for(int i=0; i<2048; i++) sum += x[i];
  return sum;
}

Here is the assembly with -Ofast -mavx

sumf(float*):
    vxorps  %xmm0, %xmm0, %xmm0
    leaq    8192(%rdi), %rax
.L2:
    vaddps  (%rdi), %ymm0, %ymm0
    addq    $32, %rdi
    cmpq    %rdi, %rax
    jne .L2
    vhaddps %ymm0, %ymm0, %ymm0
    vhaddps %ymm0, %ymm0, %ymm1
    vperm2f128  $1, %ymm1, %ymm1, %ymm0
    vaddps  %ymm1, %ymm0, %ymm0
    vzeroupper
    ret

This clearly shows the loop has been vectorized.

But this loop also has a dependency chain. In order to overcome the latency of the addition I need to unroll and do partial sums at least three times on x86_64 (Skylake needs eight partial sums, and doing the addition with FMA instructions on Haswell and Broadwell needs ten). As far as I understand, I can unroll the loop with -funroll-loops.
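As an illustration (my sketch, not part of the original question), "independent partial sums" at the source level means keeping several accumulators so that consecutive additions do not wait on each other:

float sumf_unrolled(float *x)
{
  x = (float*)__builtin_assume_aligned(x, 64);
  /* Four independent accumulators: each add only depends on its own
     previous value, so the dependency chain is a quarter as long. */
  float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
  for(int i=0; i<2048; i+=4)
    {
      s0 += x[i+0];
      s1 += x[i+1];
      s2 += x[i+2];
      s3 += x[i+3];
    }
  return (s0 + s1) + (s2 + s3);
}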

Here is the assembly with -Ofast -mavx -funroll-loops.

sumf(float*):
    vxorps  %xmm7, %xmm7, %xmm7
    leaq    8192(%rdi), %rax
.L2:
    vaddps  (%rdi), %ymm7, %ymm0
    addq    $256, %rdi
    vaddps  -224(%rdi), %ymm0, %ymm1
    vaddps  -192(%rdi), %ymm1, %ymm2
    vaddps  -160(%rdi), %ymm2, %ymm3
    vaddps  -128(%rdi), %ymm3, %ymm4
    vaddps  -96(%rdi), %ymm4, %ymm5
    vaddps  -64(%rdi), %ymm5, %ymm6
    vaddps  -32(%rdi), %ymm6, %ymm7
    cmpq    %rdi, %rax
    jne .L2
    vhaddps %ymm7, %ymm7, %ymm8
    vhaddps %ymm8, %ymm8, %ymm9
    vperm2f128  $1, %ymm9, %ymm9, %ymm10
    vaddps  %ymm9, %ymm10, %ymm0
    vzeroupper
    ret

GCC does unroll the loop eight times. However, it does not do independent sums. It does eight dependent sums. That's pointless and no better than without unrolling.

How can I get GCC to unroll the loop and do independent partial sums?


Edit:

Clang unrolls to four independent partial sums for SSE even without -funroll-loops, but I am not sure its AVX code is as efficient. The compiler should not need -funroll-loops with -Ofast anyway, so it's good to see that Clang is doing this right, at least for SSE.

Clang 3.5.1 with -Ofast.

sumf(float*):                              # @sumf(float*)
    xorps   %xmm0, %xmm0
    xorl    %eax, %eax
    xorps   %xmm1, %xmm1
.LBB0_1:                                # %vector.body
    movups  (%rdi,%rax,4), %xmm2
    movups  16(%rdi,%rax,4), %xmm3
    addps   %xmm0, %xmm2
    addps   %xmm1, %xmm3
    movups  32(%rdi,%rax,4), %xmm0
    movups  48(%rdi,%rax,4), %xmm1
    addps   %xmm2, %xmm0
    addps   %xmm3, %xmm1
    addq    $16, %rax
    cmpq    $2048, %rax             # imm = 0x800
    jne .LBB0_1
    addps   %xmm0, %xmm1
    movaps  %xmm1, %xmm2
    movhlps %xmm2, %xmm2            # xmm2 = xmm2[1,1]
    addps   %xmm1, %xmm2
    pshufd  $1, %xmm2, %xmm0        # xmm0 = xmm2[1,0,0,0]
    addps   %xmm2, %xmm0
    retq

ICC 13.0.1 with -O3 unrolls to two independent partial sums. ICC apparently assumes associative math with only -O3.

.B1.8: 
    vaddps    (%rdi,%rdx,4), %ymm1, %ymm1                   #5.29
    vaddps    32(%rdi,%rdx,4), %ymm0, %ymm0                 #5.29
    vaddps    64(%rdi,%rdx,4), %ymm1, %ymm1                 #5.29
    vaddps    96(%rdi,%rdx,4), %ymm0, %ymm0                 #5.29
    addq      $32, %rdx                                     #5.3
    cmpq      %rax, %rdx                                    #5.3
    jb        ..B1.8        # Prob 99%                      #5.3

Solution

Some use of gcc intrinsics and __builtin_ functions produces this:

#include <stddef.h>   /* for size_t */
#include <stdint.h>   /* for uint32_t */

typedef float v8sf __attribute__((vector_size(32)));
typedef uint32_t v8u32 __attribute__((vector_size(32)));

static v8sf sumfvhelper1(v8sf arr[4])
{
  v8sf retval = {0};
  for (size_t i = 0; i < 4; i++)
    retval += arr[i];
  return retval;
}

static float sumfvhelper2(v8sf x)
{
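  /* Reduce the eight lanes to a scalar: swap the 128-bit halves, then the
     64-bit pairs, then adjacent elements, adding after each shuffle, so
     x[0] ends up holding the total. */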
  v8sf t = __builtin_shuffle(x, (v8u32){4,5,6,7,0,1,2,3});
  x += t;
  t = __builtin_shuffle(x, (v8u32){2,3,0,1,6,7,4,5});
  x += t;
  t = __builtin_shuffle(x, (v8u32){1,0,3,2,5,4,7,6});
  x += t;
  return x[0];
}

float sumfv(float *x)
{
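  /* Four independent vector accumulators break the vaddps dependency
     chain, so the loop is not bound by the addition latency. */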
  //x = __builtin_assume_aligned(x, 64);
  v8sf *vx = (v8sf*)x;
  v8sf sumvv[4] = {{0}};
  for (size_t i = 0; i < 2048/8; i+=4)
    {
      sumvv[0] += vx[i+0];
      sumvv[1] += vx[i+1];
      sumvv[2] += vx[i+2];
      sumvv[3] += vx[i+3];
    }
  v8sf sumv = sumfvhelper1(sumvv);
  return sumfvhelper2(sumv);
}

Which gcc 4.8.4, invoked as gcc -Wall -Wextra -Wpedantic -std=gnu11 -march=native -O3 -fno-signed-zeros -fno-trapping-math -freciprocal-math -ffinite-math-only -fassociative-math -S, turns into:

sumfv:
    vxorps  %xmm2, %xmm2, %xmm2
    xorl    %eax, %eax
    vmovaps %ymm2, %ymm3
    vmovaps %ymm2, %ymm0
    vmovaps %ymm2, %ymm1
.L7:
    addq    $4, %rax
    vaddps  (%rdi), %ymm1, %ymm1
    subq    $-128, %rdi
    vaddps  -96(%rdi), %ymm0, %ymm0
    vaddps  -64(%rdi), %ymm3, %ymm3
    vaddps  -32(%rdi), %ymm2, %ymm2
    cmpq    $256, %rax
    jne .L7
    vaddps  %ymm2, %ymm3, %ymm2
    vaddps  %ymm0, %ymm1, %ymm0
    vaddps  %ymm0, %ymm2, %ymm0
    vperm2f128  $1, %ymm0, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vpermilps   $78, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vpermilps   $177, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vzeroupper
    ret

The second helper function isn't strictly necessary, but summing over the elements of a vector tends to produce terrible code in gcc. If you're willing to use platform-dependent intrinsics, you can probably replace most of it with __builtin_ia32_haddps256().
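As an illustration of that platform-dependent route (my sketch, not from the original answer, written with the <immintrin.h> intrinsics that wrap those builtins; hsum256 is just a made-up helper name):

#include <immintrin.h>

static float hsum256(__m256 v)
{
    __m256 h = _mm256_hadd_ps(v, v);                    /* pairwise sums within each 128-bit lane */
    h = _mm256_hadd_ps(h, h);                           /* each lane now holds its own total */
    __m256 swapped = _mm256_permute2f128_ps(h, h, 1);   /* swap the two 128-bit lanes */
    return _mm256_cvtss_f32(_mm256_add_ps(h, swapped)); /* combine the lanes, extract lane 0 */
}

This should compile to roughly the same vhaddps/vperm2f128/vaddps tail that gcc emitted for the loop above.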
