为什么使用浮点类型时,O3优化不提高性能? [英] Why does O3 optimization not improve performance when using the float type?

查看:947
本文介绍了为什么使用浮点类型时,O3优化不提高性能?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我分别编译了 float 和 int 两个版本的矩阵乘法 C 程序。使用 O2 编译时,两者几乎完全相同;但使用 O3 标志开启自动向量化之后,两者得到的加速比却不一样。我查看了汇编输出并发现了差异,但不明白 GCC 为什么会这样编译。float 类型和 int 类型之间出现这种差异的原因是什么?

出于某些原因,我在相乘之前先对第二个矩阵做了转置。矩阵的大小为 128×128。对于 int 实现,启用 O3 标志后相对于 O2 标量版本的加速比为 5.4;而对于 float 实现,加速比反而略差,约为 0.94。
int 版本的汇编输出:

  .L2:
    # GCC -O3 output for the int matrix multiply (x86-64, AT&T syntax, AVX2).
    # Outer loop: load 16 ymm vectors (16 * 32 = 512 bytes) starting at (%rdi).
    # Fourteen stay in %ymm2-%ymm15; the vectors at offsets 448 and 480 are
    # spilled to -48(%rsp) and -80(%rsp).
    vmovdqa 448(%rdi), %ymm0
    movl    $c_tra, %eax                # %rax = start of c_tra
    movq    %r8, %rdx                   # %rdx = output cursor for this pass
    vmovdqa (%rdi), %ymm15
    vmovdqa %ymm0, -48(%rsp)
    vmovdqa 480(%rdi), %ymm0
    vmovdqa 32(%rdi), %ymm14
    vmovdqa 64(%rdi), %ymm13
    vmovdqa 96(%rdi), %ymm12
    vmovdqa 128(%rdi), %ymm11
    vmovdqa 160(%rdi), %ymm10
    vmovdqa 192(%rdi), %ymm9
    vmovdqa 224(%rdi), %ymm8
    vmovdqa 256(%rdi), %ymm7
    vmovdqa 288(%rdi), %ymm6
    vmovdqa 320(%rdi), %ymm5
    vmovdqa 352(%rdi), %ymm4
    vmovdqa 384(%rdi), %ymm3
    vmovdqa 416(%rdi), %ymm2
    vmovdqa %ymm0, -80(%rsp)
    .p2align 4,,10
    .p2align 3
.L5:
    # Inner loop: multiply the 512 bytes at (%rax) element-wise (32-bit lanes)
    # with the vectors loaded above and accumulate the products in %ymm1.
    vpmulld 32(%rax), %ymm14, %ymm0
    vpmulld (%rax), %ymm15, %ymm1
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 64(%rax), %ymm13, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 96(%rax), %ymm12, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 128(%rax), %ymm11, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 160(%rax), %ymm10, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 192(%rax), %ymm9, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 224(%rax), %ymm8, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 256(%rax), %ymm7, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 288(%rax), %ymm6, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 320(%rax), %ymm5, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 352(%rax), %ymm4, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 384(%rax), %ymm3, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 416(%rax), %ymm2, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vmovdqa -48(%rsp), %ymm0            # reload spilled vector (offset 448)
    addq    $512, %rax                  # advance 512 bytes in c_tra
    addq    $4, %rdx                    # advance output cursor by one int
    # %rax was already advanced, so -64/-32 address old offsets 448/480.
    vpmulld -64(%rax), %ymm0, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm0
    vmovdqa -80(%rsp), %ymm1            # reload spilled vector (offset 480)
    vpmulld -32(%rax), %ymm1, %ymm1
    vpaddd  %ymm0, %ymm1, %ymm1
    # Horizontal sum of the eight 32-bit lanes of %ymm1 into %ecx.
    vmovdqa %xmm1, %xmm0                # low 128 bits
    vextracti128    $0x1, %ymm1, %xmm1  # high 128 bits
    vpextrd $1, %xmm0, %esi
    vpextrd $0, %xmm0, %ecx
    addl    %esi, %ecx
    vpextrd $2, %xmm0, %esi
    addl    %esi, %ecx
    vpextrd $3, %xmm0, %esi
    addl    %esi, %ecx
    vpextrd $0, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $1, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $2, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $3, %xmm1, %esi
    addl    %esi, %ecx
    movl    %ecx, -4(%rdx)              # store result (cursor already advanced)
    cmpq    $c_tra+65536, %rax          # 65536 = 128 * 512 bytes
    jne .L5
    addq    $512, %r8
    addq    $512, %rdi
    cmpq    $c_result+65536, %r8
    jne .L2

float 版本的汇编输出:

  .L2:
    # GCC -O3 output for the float matrix multiply (x86-64, AT&T syntax).
    # Everything here is scalar (…ss instructions); no ymm vectors are used.
    xorl    %esi, %esi
    .p2align 4,,10
    .p2align 3
.L7:
    movq    %rdi, %rsi
    xorl    %eax, %eax                  # %rax = inner byte offset, starts at 0
    xorl    %edx, %edx                  # %edx = running sum held as an INTEGER
    salq    $5, %rsi                    # %rsi = %rdi * 32
    .p2align 4,,10
    .p2align 3
.L5:
    # Each iteration converts the integer sum to float, does one fused
    # multiply-add, and truncates the result back to an integer.
    vcvtsi2ss   %edx, %xmm0, %xmm0      # int sum -> float
    vmovss  a(%rcx,%rax), %xmm2
    vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0  # xmm0 += xmm2 * c_tra[...]
    addq    $4, %rax
    vcvttss2si  %xmm0, %edx             # float -> int (truncating)
    cmpq    $128, %rax
    jne .L5
    vcvtsi2ss   %edx, %xmm0, %xmm0      # final int sum -> float for the store
    vmovss  %xmm0, c_result(%rcx,%rdi)
    addq    $4, %rdi
    cmpq    $128, %rdi
    jne .L7


解决方案

可以看出,vcvttss2si 和 vcvtsi2ss 这两条转换指令阻碍了自动向量化。我修改了一些变量以避免这种类型转换之后,使用 Ofast 标志时编译器就自动把程序向量化了。因此答案是:自动向量化在处理这类类型转换时存在问题。

I compiled C implementations of a matrix multiplication program for two element types, float and int. When I compile them with O2, almost everything is the same, but when I use the O3 flag to enable auto-vectorization, the two versions get different speedups. I looked at the assembly output and found the differences, but I don't know why GCC compiled them this way. What is the reason for the difference between the float type and the int type?

Before the multiplication I transposed the second matrix, for unrelated reasons. The matrices are 128x128. For the int implementation, enabling the O3 flag gives a speedup of 5.4 over the O2 scalar build of the same code; for the float implementation the result is actually a bit worse, with a speedup of about 0.94.
Int assembly output:

# Int version at -O3 (x86-64, AT&T syntax, AVX2): vectorized loop nest.
# %rdi and %r8 are set up outside this excerpt.
.L2:
    # Outer loop: load 16 ymm vectors (16 * 32 = 512 bytes) starting at (%rdi).
    # Fourteen stay in %ymm2-%ymm15; the vectors at offsets 448 and 480 are
    # spilled to -48(%rsp) and -80(%rsp).
    vmovdqa 448(%rdi), %ymm0
    # %rax = start of c_tra, %rdx = output cursor for this pass.
    movl    $c_tra, %eax
    movq    %r8, %rdx
    vmovdqa (%rdi), %ymm15
    vmovdqa %ymm0, -48(%rsp)
    vmovdqa 480(%rdi), %ymm0
    vmovdqa 32(%rdi), %ymm14
    vmovdqa 64(%rdi), %ymm13
    vmovdqa 96(%rdi), %ymm12
    vmovdqa 128(%rdi), %ymm11
    vmovdqa 160(%rdi), %ymm10
    vmovdqa 192(%rdi), %ymm9
    vmovdqa 224(%rdi), %ymm8
    vmovdqa 256(%rdi), %ymm7
    vmovdqa 288(%rdi), %ymm6
    vmovdqa 320(%rdi), %ymm5
    vmovdqa 352(%rdi), %ymm4
    vmovdqa 384(%rdi), %ymm3
    vmovdqa 416(%rdi), %ymm2
    vmovdqa %ymm0, -80(%rsp)
    .p2align 4,,10
    .p2align 3
.L5:
    # Inner loop: multiply the 512 bytes at (%rax) element-wise (32-bit lanes)
    # with the vectors loaded above and accumulate the products in %ymm1.
    vpmulld 32(%rax), %ymm14, %ymm0
    vpmulld (%rax), %ymm15, %ymm1
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 64(%rax), %ymm13, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 96(%rax), %ymm12, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 128(%rax), %ymm11, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 160(%rax), %ymm10, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 192(%rax), %ymm9, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 224(%rax), %ymm8, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 256(%rax), %ymm7, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 288(%rax), %ymm6, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 320(%rax), %ymm5, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 352(%rax), %ymm4, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 384(%rax), %ymm3, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 416(%rax), %ymm2, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    # Reload the vector spilled from offset 448, then advance both cursors:
    # 512 bytes in c_tra and one 4-byte element in the output.
    vmovdqa -48(%rsp), %ymm0
    addq    $512, %rax
    addq    $4, %rdx
    # %rax was already advanced, so -64/-32 address the old offsets 448/480.
    vpmulld -64(%rax), %ymm0, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm0
    vmovdqa -80(%rsp), %ymm1
    vpmulld -32(%rax), %ymm1, %ymm1
    vpaddd  %ymm0, %ymm1, %ymm1
    # Horizontal sum: split %ymm1 into its low (%xmm0) and high (%xmm1)
    # 128-bit halves and add all eight 32-bit lanes into %ecx.
    vmovdqa %xmm1, %xmm0
    vextracti128    $0x1, %ymm1, %xmm1
    vpextrd $1, %xmm0, %esi
    vpextrd $0, %xmm0, %ecx
    addl    %esi, %ecx
    vpextrd $2, %xmm0, %esi
    addl    %esi, %ecx
    vpextrd $3, %xmm0, %esi
    addl    %esi, %ecx
    vpextrd $0, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $1, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $2, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $3, %xmm1, %esi
    addl    %esi, %ecx
    # Store the scalar result; -4 because %rdx was advanced above.
    movl    %ecx, -4(%rdx)
    # Inner loop ends after 65536 bytes (128 * 512) of c_tra.
    cmpq    $c_tra+65536, %rax
    jne .L5
    # Advance both outer cursors by 512 bytes; stop once %r8 reaches
    # c_result + 65536.
    addq    $512, %r8
    addq    $512, %rdi
    cmpq    $c_result+65536, %r8
    jne .L2

Float assembly output:

  # Float version at -O3 (x86-64, AT&T syntax): every instruction here is
  # scalar (…ss); no ymm vectors are used. %rcx is set outside this excerpt.
  .L2:
    # NOTE(review): %esi is zeroed here but overwritten by the movq at .L7
    # before it is read, and %rdi is never initialised in this excerpt --
    # possibly a transcription slip for %edi; confirm against the full listing.
    xorl    %esi, %esi
    .p2align 4,,10
    .p2align 3
.L7:
    # Per-output setup: %rsi = %rdi * 32, inner byte offset %rax = 0, and the
    # running sum in %edx = 0. Note the sum lives in an INTEGER register.
    movq    %rdi, %rsi
    xorl    %eax, %eax
    xorl    %edx, %edx
    salq    $5, %rsi
    .p2align 4,,10
    .p2align 3
.L5:
    # Each iteration converts the integer sum to float, performs one fused
    # multiply-add (xmm0 += a[...] * c_tra[...]), then truncates the float
    # result back to an integer.
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  a(%rcx,%rax), %xmm2
    vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
    addq    $4, %rax
    vcvttss2si  %xmm0, %edx
    # 128 bytes = 32 four-byte elements per inner loop.
    cmpq    $128, %rax
    jne .L5
    # Convert the final integer sum to float and store it in c_result.
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  %xmm0, c_result(%rcx,%rdi)
    addq    $4, %rdi
    cmpq    $128, %rdi
    jne .L7

解决方案

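It can be seen that the vcvttss2si and vcvtsi2ss conversions restrict auto-vectorization: in the float listing the running sum is converted from int to float and back to int on every iteration of the inner loop. I changed some variables to prevent this conversion, and with the Ofast flag the compiler then vectorized the program automatically. So the answer is that auto-vectorization has a problem with these conversions.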

这篇关于为什么使用浮点类型时,O3优化不提高性能?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆