展开循环并用向量化做独立求和 [英] Unroll loop and do independent sum with vectorization
问题描述
对于下面的循环,GCC 只有在我告诉它使用结合律数学(例如加上 -Ofast)时才会对循环进行向量化。
float sumf(float *x)
{
    x = (float*)__builtin_assume_aligned(x, 64);
    float sum = 0;
    for(int i=0; i<2048; i++) sum += x[i];
    return sum;
}
下面是使用 -Ofast -mavx 生成的汇编代码:
sumf(float*):
    vxorps  %xmm0, %xmm0, %xmm0
    leaq    8192(%rdi), %rax
.L2:
    vaddps  (%rdi), %ymm0, %ymm0
    addq    $32, %rdi
    cmpq    %rdi, %rax
    jne     .L2
    vhaddps %ymm0, %ymm0, %ymm0
    vhaddps %ymm0, %ymm0, %ymm1
    vperm2f128 $1, %ymm1, %ymm1, %ymm0
    vaddps  %ymm1, %ymm0, %ymm0
    vzeroupper
    ret
这清楚地表明循环已经被向量化了。
但这个循环也存在依赖链。为了掩盖加法的延迟,在 x86_64 上我至少需要把循环展开三次并做部分和(Skylake 除外,它需要展开八次;而在 Haswell 和 Broadwell 上若用 FMA 指令做加法则需要展开十次)。据我了解,我可以用 -funroll-loops 来展开循环。
下面是使用 -Ofast -mavx -funroll-loops 生成的汇编代码:
sumf(float*):
    vxorps  %xmm7, %xmm7, %xmm7
    leaq    8192(%rdi), %rax
.L2:
    vaddps  (%rdi), %ymm7, %ymm0
    addq    $256, %rdi
    vaddps  -224(%rdi), %ymm0, %ymm1
    vaddps  -192(%rdi), %ymm1, %ymm2
    vaddps  -160(%rdi), %ymm2, %ymm3
    vaddps  -128(%rdi), %ymm3, %ymm4
    vaddps  -96(%rdi), %ymm4, %ymm5
    vaddps  -64(%rdi), %ymm5, %ymm6
    vaddps  -32(%rdi), %ymm6, %ymm7
    cmpq    %rdi, %rax
    jne     .L2
    vhaddps %ymm7, %ymm7, %ymm8
    vhaddps %ymm8, %ymm8, %ymm9
    vperm2f128 $1, %ymm9, %ymm9, %ymm10
    vaddps  %ymm9, %ymm10, %ymm0
    vzeroupper
    ret
GCC 确实把循环展开了八次。但它并没有做独立的求和,而是做了八个相互依赖的求和。这毫无意义,并不比不展开好。
如何才能让 GCC 在展开循环的同时做独立的部分和?
编辑:
即使不加 -funroll-loops,Clang 对 SSE 也会展开成四个独立的部分和,不过我不确定它的 AVX 代码是否同样高效。编译器在 -Ofast 下本不应该还需要 -funroll-loops,所以很高兴看到 Clang 至少对 SSE 把这件事做对了。
下面是 Clang 3.5.1 使用 -Ofast 的结果:
sumf(float*):                           # @sumf(float*)
    xorps   %xmm0, %xmm0
    xorl    %eax, %eax
    xorps   %xmm1, %xmm1
.LBB0_1:                                # %vector.body
    movups  (%rdi,%rax,4), %xmm2
    movups  16(%rdi,%rax,4), %xmm3
    addps   %xmm0, %xmm2
    addps   %xmm1, %xmm3
    movups  32(%rdi,%rax,4), %xmm0
    movups  48(%rdi,%rax,4), %xmm1
    addps   %xmm2, %xmm0
    addps   %xmm3, %xmm1
    addq    $16, %rax
    cmpq    $2048, %rax             # imm = 0x800
    jne     .LBB0_1
    addps   %xmm0, %xmm1
    movaps  %xmm1, %xmm2
    movhlps %xmm2, %xmm2            # xmm2 = xmm2[1,1]
    addps   %xmm1, %xmm2
    pshufd  $1, %xmm2, %xmm0        # xmm0 = xmm2[1,0,0,0]
    addps   %xmm2, %xmm0
    retq
ICC 13.0.1 使用 -O3 会展开成两个独立的部分和。显然 ICC 仅在 -O3 下就假定了结合律数学。
..B1.8:
    vaddps    (%rdi,%rdx,4), %ymm1, %ymm1       #5.29
    vaddps    32(%rdi,%rdx,4), %ymm0, %ymm0     #5.29
    vaddps    64(%rdi,%rdx,4), %ymm1, %ymm1     #5.29
    vaddps    96(%rdi,%rdx,4), %ymm0, %ymm0     #5.29
    addq      $32, %rdx                         #5.3
    cmpq      %rax, %rdx                        #5.3
    jb        ..B1.8        # Prob 99%          #5.3
使用一些 GCC 内建函数(intrinsics 和 __builtin_)可以产生如下代码:
typedef float v8sf __attribute__((vector_size(32)));
typedef uint32_t v8u32 __attribute__((vector_size(32)));

static v8sf sumfvhelper1(v8sf arr[4])
{
    v8sf retval = {0};
    for (size_t i = 0; i < 4; i++)
        retval += arr[i];
    return retval;
}

static float sumfvhelper2(v8sf x)
{
    v8sf t = __builtin_shuffle(x, (v8u32){4,5,6,7,0,1,2,3});
    x += t;
    t = __builtin_shuffle(x, (v8u32){2,3,0,1,6,7,4,5});
    x += t;
    t = __builtin_shuffle(x, (v8u32){1,0,3,2,5,4,7,6});
    x += t;
    return x[0];
}

float sumfv(float *x)
{
    //x = __builtin_assume_aligned(x, 64);
    v8sf *vx = (v8sf*)x;
    v8sf sumvv[4] = {{0}};
    for (size_t i = 0; i < 2048/8; i+=4)
    {
        sumvv[0] += vx[i+0];
        sumvv[1] += vx[i+1];
        sumvv[2] += vx[i+2];
        sumvv[3] += vx[i+3];
    }
    v8sf sumv = sumfvhelper1(sumvv);
    return sumfvhelper2(sumv);
}
GCC 4.8.4 使用 gcc -Wall -Wextra -Wpedantic -std=gnu11 -march=native -O3 -fno-signed-zeros -fno-trapping-math -freciprocal-math -ffinite-math-only -fassociative-math -S 会把它编译成:
sumfv:
    vxorps  %xmm2, %xmm2, %xmm2
    xorl    %eax, %eax
    vmovaps %ymm2, %ymm3
    vmovaps %ymm2, %ymm0
    vmovaps %ymm2, %ymm1
.L7:
    addq    $4, %rax
    vaddps  (%rdi), %ymm1, %ymm1
    subq    $-128, %rdi
    vaddps  -96(%rdi), %ymm0, %ymm0
    vaddps  -64(%rdi), %ymm3, %ymm3
    vaddps  -32(%rdi), %ymm2, %ymm2
    cmpq    $256, %rax
    jne     .L7
    vaddps  %ymm2, %ymm3, %ymm2
    vaddps  %ymm0, %ymm1, %ymm0
    vaddps  %ymm0, %ymm2, %ymm0
    vperm2f128 $1, %ymm0, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vpermilps  $78, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vpermilps  $177, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vzeroupper
    ret
第二个辅助函数并不是严格必需的,但在 GCC 中对向量的各元素求和往往会产生很糟糕的代码。如果你愿意使用平台相关的内建函数,大概可以用 __builtin_ia32_haddps256() 替换其中的大部分。
For the following loop GCC will only vectorize the loop if I tell it to use associative math e.g. with -Ofast
.
float sumf(float *x)
{
x = (float*)__builtin_assume_aligned(x, 64);
float sum = 0;
for(int i=0; i<2048; i++) sum += x[i];
return sum;
}
Here is the assembly with -Ofast -mavx
sumf(float*):
vxorps %xmm0, %xmm0, %xmm0
leaq 8192(%rdi), %rax
.L2:
vaddps (%rdi), %ymm0, %ymm0
addq $32, %rdi
cmpq %rdi, %rax
jne .L2
vhaddps %ymm0, %ymm0, %ymm0
vhaddps %ymm0, %ymm0, %ymm1
vperm2f128 $1, %ymm1, %ymm1, %ymm0
vaddps %ymm1, %ymm0, %ymm0
vzeroupper
ret
This clearly shows the loop has been vectorized.
But this loop also has a dependency chain. In order to overcome the latency of the addition I need to unroll and do partial sums at least three times on x86_64 (excluding Skylake which needs to unroll eight times and doing the addition with FMA instructions which need to unroll 10 times on Haswell and Broadwell). As far as I understand I can unroll the loop with -funroll-loops
.
Here is the assembly with -Ofast -mavx -funroll-loops
.
sumf(float*):
vxorps %xmm7, %xmm7, %xmm7
leaq 8192(%rdi), %rax
.L2:
vaddps (%rdi), %ymm7, %ymm0
addq $256, %rdi
vaddps -224(%rdi), %ymm0, %ymm1
vaddps -192(%rdi), %ymm1, %ymm2
vaddps -160(%rdi), %ymm2, %ymm3
vaddps -128(%rdi), %ymm3, %ymm4
vaddps -96(%rdi), %ymm4, %ymm5
vaddps -64(%rdi), %ymm5, %ymm6
vaddps -32(%rdi), %ymm6, %ymm7
cmpq %rdi, %rax
jne .L2
vhaddps %ymm7, %ymm7, %ymm8
vhaddps %ymm8, %ymm8, %ymm9
vperm2f128 $1, %ymm9, %ymm9, %ymm10
vaddps %ymm9, %ymm10, %ymm0
vzeroupper
ret
GCC does unroll the loop eight times. However, it does not do independent sums. It does eight dependent sums. That's pointless and no better than without unrolling.
How can I get GCC to unroll the loop and do independent partial sums?
Edit:
Clang unrolls to four independent partial sums even without -funroll-loops
for SSE but I am not sure its AVX code is as efficient. The compiler should not need -funroll-loops
with -Ofast
anyway so it's good to see Clang is doing this right at least for SSE.
Clang 3.5.1 with -Ofast
.
sumf(float*): # @sumf(float*)
xorps %xmm0, %xmm0
xorl %eax, %eax
xorps %xmm1, %xmm1
.LBB0_1: # %vector.body
movups (%rdi,%rax,4), %xmm2
movups 16(%rdi,%rax,4), %xmm3
addps %xmm0, %xmm2
addps %xmm1, %xmm3
movups 32(%rdi,%rax,4), %xmm0
movups 48(%rdi,%rax,4), %xmm1
addps %xmm2, %xmm0
addps %xmm3, %xmm1
addq $16, %rax
cmpq $2048, %rax # imm = 0x800
jne .LBB0_1
addps %xmm0, %xmm1
movaps %xmm1, %xmm2
movhlps %xmm2, %xmm2 # xmm2 = xmm2[1,1]
addps %xmm1, %xmm2
pshufd $1, %xmm2, %xmm0 # xmm0 = xmm2[1,0,0,0]
addps %xmm2, %xmm0
retq
ICC 13.0.1 with -O3
unrolls to two independent partial sums. ICC apparently assumes associative math with only -O3
.
.B1.8:
vaddps (%rdi,%rdx,4), %ymm1, %ymm1 #5.29
vaddps 32(%rdi,%rdx,4), %ymm0, %ymm0 #5.29
vaddps 64(%rdi,%rdx,4), %ymm1, %ymm1 #5.29
vaddps 96(%rdi,%rdx,4), %ymm0, %ymm0 #5.29
addq $32, %rdx #5.3
cmpq %rax, %rdx #5.3
jb ..B1.8 # Prob 99% #5.3
Some use of gcc intrinsics and __builtin_
produce this:
typedef float v8sf __attribute__((vector_size(32)));
typedef uint32_t v8u32 __attribute__((vector_size(32)));
static v8sf sumfvhelper1(v8sf arr[4])
{
v8sf retval = {0};
for (size_t i = 0; i < 4; i++)
retval += arr[i];
return retval;
}
static float sumfvhelper2(v8sf x)
{
v8sf t = __builtin_shuffle(x, (v8u32){4,5,6,7,0,1,2,3});
x += t;
t = __builtin_shuffle(x, (v8u32){2,3,0,1,6,7,4,5});
x += t;
t = __builtin_shuffle(x, (v8u32){1,0,3,2,5,4,7,6});
x += t;
return x[0];
}
float sumfv(float *x)
{
//x = __builtin_assume_aligned(x, 64);
v8sf *vx = (v8sf*)x;
v8sf sumvv[4] = {{0}};
for (size_t i = 0; i < 2048/8; i+=4)
{
sumvv[0] += vx[i+0];
sumvv[1] += vx[i+1];
sumvv[2] += vx[i+2];
sumvv[3] += vx[i+3];
}
v8sf sumv = sumfvhelper1(sumvv);
return sumfvhelper2(sumv);
}
Which gcc 4.8.4 gcc -Wall -Wextra -Wpedantic -std=gnu11 -march=native -O3 -fno-signed-zeros -fno-trapping-math -freciprocal-math -ffinite-math-only -fassociative-math -S
turns into:
sumfv:
vxorps %xmm2, %xmm2, %xmm2
xorl %eax, %eax
vmovaps %ymm2, %ymm3
vmovaps %ymm2, %ymm0
vmovaps %ymm2, %ymm1
.L7:
addq $4, %rax
vaddps (%rdi), %ymm1, %ymm1
subq $-128, %rdi
vaddps -96(%rdi), %ymm0, %ymm0
vaddps -64(%rdi), %ymm3, %ymm3
vaddps -32(%rdi), %ymm2, %ymm2
cmpq $256, %rax
jne .L7
vaddps %ymm2, %ymm3, %ymm2
vaddps %ymm0, %ymm1, %ymm0
vaddps %ymm0, %ymm2, %ymm0
vperm2f128 $1, %ymm0, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vpermilps $78, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vpermilps $177, %ymm0, %ymm1
vaddps %ymm0, %ymm1, %ymm0
vzeroupper
ret
The second helper function isn't strictly necessary, but summing over the elements of a vector tends to produce terrible code in gcc. If you're willing to do platform-dependent intrinsics, you can probably replace most of it with __builtin_ia32_haddps256()
.
这篇关于展开循环,做独立的总和与矢量的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!