GCC优化技巧,它真的工作吗? [英] GCC optimization trick, does it really work?

查看:104
本文介绍了GCC优化技巧,它真的工作吗?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

在查看有关优化的一些问题时,此接受答案的关于编码实践的问题最有效地使用优化器引发我的好奇心。断言是局部变量应该用于函数中的计算,而不是输出参数。有人建议这将允许编译器进行额外的优化,否则不可能。



因此,编写一个简单的代码示例Foo类并编译代码片段g ++ v4.4和-O2给出了一些汇编器输出(使用-S)。汇编器列表的部分仅包含循环部分如下所示。在检查输出,似乎两个循环几乎相同,只有一个地址的差异。该地址是指向第一个示例的输出参数的指针,或者是第二个的局部变量的指针。



无论是否使用本地变量,实际效果似乎没有变化。所以问题分为3部分:



a) GCC 是否没有做额外的优化，即使已经给出了建议中的暗示；



b) GCC 是否在两种情况下都成功地进行了优化，但本不应如此；

c) GCC 是否在两种情况下都成功地进行了优化，并且产生了符合 C++ 标准定义的输出？





这里是未优化的函数:


  void DoSomething(const Foo& foo1,const Foo * foo2,int numFoo,Foo& barOut)
{
for(int i = 0; i < numFoo; i++)
{
barOut.munge(foo1,foo2 [i]);
}
}

与对应的程序集:

  .L3:
movl(%esi),%eax
addl $ 1,%ebx
addl $ 4,%esi
movl%eax,8(%esp)
movl(%edi),%eax
movl%eax,4(%esp)
movl 20(%ebp),%eax ;注意地址是输出参数的地址
movl%eax,(%esp)
call _ZN3Foo5mungeES_S_
cmpl%ebx,16(%ebp)
jg .L3

以下是重写的函数:

  void DoSomethingFaster(const Foo& foo1,const Foo * foo2,int numFoo,Foo& barOut)
{
Foo barTemp = barOut;
for(int i = 0; i < numFoo; i++)
{
barTemp.munge(foo1,foo2 [i]);
}
barOut = barTemp;
}

这里是使用局部变量的函数的编译器输出：

  .L3:
movl(%esi),%eax;将foo2 [i]指针插入EAX
addl $ 1,%ebx;增量i
addl $ 4,%esi;增量foo2 [i](32位系统,64位系统上为8)
movl%eax,8(%esp); PUSH foo2 [i]到栈(小心!从EAX,而不是ESI)
movl(%edi),%eax;将foo1指针加载到EAX
movl%eax,4(%esp); PUSH foo1
leal -28(%ebp),%eax;加载barTemp指针到EAX
movl%eax,(%esp); PUSH barTemp的这个指针
call _ZN3Foo5mungeES_S_; munge()!
cmpl%ebx,16(%ebp); i< numFoo
jg .L3;回忆增加一个进入循环
;所以测试如果更大


解决方案

不是一个非常好的一个,因为调用一个未知的函数编译器不能推理很多。下面是一个更好的例子:

  void FillOneA(int * array,int length,int& startIndex)
{
for(int i = 0; i < length; i++) array[startIndex + i] = 1;
}

void FillOneB(int * array,int length,int& startIndex)
{
int localIndex = startIndex;
for(int i = 0; i < length; i++) array[localIndex + i] = 1;
}

第一个版本优化很差,因为它需要防止某人调用它作为

  int array [10] = {0}; 
FillOneA(array,5,array [1]);

导致 {1,1,0,1,1,1, 0,0,0,0} ,因为与 i = 1 的迭代修改 startIndex 参数。



第二个版本不需要担心 array[localIndex + i] = 1 会修改 localIndex，因为 localIndex 是一个从未被取过地址的局部变量。



在汇编(英特尔符号,因为这是我使用):

  FillOneA:
mov edx,[esp + 8]
xor eax,eax
test edx,edx
jle $ b
push esi
mov esi,[esp + 16]
push edi
mov edi,[esp + 12]
$ a:mov ecx,[esi]
add ecx,eax
inc eax
mov [edi + ecx * 4],1
cmp eax,edx
jl $ a
pop edi
pop esi
$ b:ret

FillOneB:
mov ecx,[esp + 8]
mov eax,[esp + 12]
mov edx,[eax]
test ecx,ecx
jle $ a
mov eax,[esp + 4]
push edi
lea edi,[eax + edx * 4]
mov eax,1
rep stosd
pop edi
$ a:ret

ADDED:编译器的洞察力是Bar,而不是munge:

  class Bar 
{
public:
float getValue()const
{
return valueBase * boost;
}

private:
float valueBase;
float boost;
};

class Foo
{
public:
void munge(float adjustment);
};

void Adjust10A(Foo& foo,const Bar& bar)
{
for(int i = 0; i <10; i ++)
foo.munge (bar.getValue());
}

void Adjust10B(Foo& foo,const Bar& bar)
{
Bar localBar = bar;
for(int i = 0; i <10; i ++)
foo.munge(localBar.getValue());
}

生成的代码是

  Adjust10A:
push ecx
push ebx
mov ebx,[esp + 12] ;; foo
push esi
mov esi,[esp + 20] ;; bar
push edi
mov edi,10
$ a:fld [esi + 4] ;; bar.valueBase
push ecx
fmul [esi] ;; valueBase * boost
mov ecx,ebx
fstp [esp + 16]
fld [esp + 16]
fstp [esp]
call Foo :: munge
dec edi
jne $ a
pop edi
pop esi
pop ebx
pop ecx
ret 0

Adjust10B :
sub esp,8
mov ecx,[esp + 16] ;; bar
mov eax,[ecx] ;; bar.valueBase
mov [esp],eax ;; localBar.valueBase
fld [esp] ;; localBar.valueBase
mov eax,[ecx + 4] ;; bar.boost
mov [esp + 4],eax ;; localBar.boost
fmul [esp + 4] ;; localBar.getValue()
push esi
push edi
mov edi,[esp + 20] ;; foo
fstp [esp + 24]
fld [esp + 24] ;; cache localBar.getValue()
mov esi,10 ;;循环计数器
$ a:push ecx
mov ecx,edi ;; foo
fstp [esp] ;;使用缓存的值
call Foo :: munge
fld [esp]
dec esi
jne $ a ;; loop
pop edi
fstp ST(0)
pop esi
add esp,8
ret 0
Adjust10A 中的内循环必须重新计算值,因为它必须防止 foo.munge 更改 bar



也就是说，这种风格的优化并不是稳赢的。（例如，我们可以通过手动把 bar.getValue() 缓存到 localValue 中获得相同的效果。）它往往对向量化操作最有帮助，因为那些操作可以并行化。


While looking at some questions on optimization, this accepted answer for the question on coding practices for most effective use of the optimizer piqued my curiosity. The assertion is that local variables should be used for computations in a function, not output arguments. It was suggested this would allow the compiler to make additional optimizations otherwise not possible.

So, writing a simple bit of code for the example Foo class and compiling the code fragments with g++ v4.4 and -O2 gave some assembler output (use -S). The parts of the assembler listing with just the loop portion shown below. On examination of the output, it seems the loop is nearly identical for both, with just a difference in one address. That address being a pointer to the output argument for the first example or the local variable for the second.

There seems to no change in the actual effect whether the local variable is used or not. So the question breaks down to 3 parts:

a) is GCC not doing additional optimization, even given the hint suggested;

b) is GCC successfully optimizing in both cases, but should not be;

c) is GCC successfully optimizing in both cases, and is producing compliant output as defined by the C++ standard?

Here is the unoptimized function:

// Munges barOut in place, once per element of foo2.
// Because barOut is a reference (output) argument, the compiler must assume
// each munge() call may alias it with foo1/foo2 and reload it every
// iteration — which is the point this article investigates.
//
// foo1   - first operand passed to every munge() call
// foo2   - array of numFoo elements; foo2[i] is the per-iteration operand
// numFoo - number of elements in foo2 (loop runs numFoo times; 0 => no-op)
// barOut - object updated in place by each munge() call
void DoSomething(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    for (int i = 0; i < numFoo; i++)  // fixed: ',' -> ';' (original was a syntax error)
    {
         barOut.munge(foo1, foo2[i]);
    }
}

And corresponding assembly:

.L3:
    movl    (%esi), %eax
    addl    $1, %ebx
    addl    $4, %esi
    movl    %eax, 8(%esp)
    movl    (%edi), %eax
    movl    %eax, 4(%esp)
    movl    20(%ebp), %eax       ; Note address is that of the output argument
    movl    %eax, (%esp)
    call    _ZN3Foo5mungeES_S_
    cmpl    %ebx, 16(%ebp)
    jg      .L3

Here is the re-written function:

// Same contract as DoSomething, but munges a local copy and writes it back
// once at the end.  The intent is to let the optimizer keep barTemp out of
// aliasable memory; the article examines whether GCC actually benefits.
//
// foo1   - first operand passed to every munge() call
// foo2   - array of numFoo elements; foo2[i] is the per-iteration operand
// numFoo - number of elements in foo2 (loop runs numFoo times; 0 => copy in/out only)
// barOut - read once at entry, overwritten once at exit with the final result
void DoSomethingFaster(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    Foo barTemp = barOut;             // local working copy; address only escapes via munge()
    for (int i = 0; i < numFoo; i++)  // fixed: ',' -> ';' (original was a syntax error)
    {
         barTemp.munge(foo1, foo2[i]);
    }
    barOut = barTemp;                 // single write-back to the output argument
}

And here is the compiler output for the function using a local variable:

.L3:
    movl    (%esi), %eax          ; Load foo2[i] pointer into EAX
    addl    $1, %ebx              ; increment i
    addl    $4, %esi              ; increment foo2[i] (32-bit system, 8 on 64-bit systems)
    movl    %eax, 8(%esp)         ; PUSH foo2[i] onto stack (careful! from EAX, not ESI)
    movl    (%edi), %eax          ; Load foo1 pointer into EAX
    movl    %eax, 4(%esp)         ; PUSH foo1
    leal    -28(%ebp), %eax       ; Load barTemp pointer into EAX
    movl    %eax, (%esp)          ; PUSH the this pointer for barTemp
    call    _ZN3Foo5mungeES_S_    ; munge()!
    cmpl    %ebx, 16(%ebp)        ; i < numFoo
    jg      .L3                   ; recall incrementing i by one coming into the loop
                                  ; so test if greater

解决方案

The example given in that answer was not a very good one because of the call to an unknown function the compiler cannot reason much about. Here's a better example:

// Writes 1 into array[startIndex + i] for i in [0, length).
// startIndex is deliberately a reference that is re-read on every pass:
// if the caller aliases it into array (e.g. FillOneA(a, 5, a[1])), a store
// can change the index mid-loop, so the compiler cannot hoist the load.
void FillOneA(int *array, int length, int& startIndex)
{
    int pos = 0;
    while (pos < length)
    {
        array[startIndex + pos] = 1;  // startIndex reloaded each iteration (may alias array)
        ++pos;
    }
}

// Same external contract as FillOneA, but snapshots startIndex into a local
// whose address never escapes.  Stores into array therefore cannot change the
// index, so the compiler may turn the loop into a block fill (rep stosd).
void FillOneB(int *array, int length, int& startIndex)
{
    const int localIndex = startIndex;  // one read up front; immune to aliasing
    for (int offset = 0; offset < length; ++offset)
    {
        array[localIndex + offset] = 1;
    }
}

The first version optimizes poorly because it needs to protect against the possibility that somebody called it as

int array[10] = { 0 };
FillOneA(array, 5, array[1]);

resulting in {1, 1, 0, 1, 1, 1, 0, 0, 0, 0 } since the iteration with i=1 modifies the startIndex parameter.

The second one doesn't need to worry about the possibility that the array[localIndex + i] = 1 will modify localIndex because localIndex is a local variable whose address has never been taken.

In assembly (Intel notation, because that's what I use):

FillOneA:
    mov     edx, [esp+8]
    xor     eax, eax
    test    edx, edx
    jle     $b
    push    esi
    mov     esi, [esp+16]
    push    edi
    mov     edi, [esp+12]
$a: mov     ecx, [esi]
    add     ecx, eax
    inc     eax
    mov     [edi+ecx*4], 1
    cmp     eax, edx
    jl      $a
    pop     edi
    pop     esi
$b: ret

FillOneB:
    mov     ecx, [esp+8]
    mov     eax, [esp+12]
    mov     edx, [eax]
    test    ecx, ecx
    jle     $a
    mov     eax, [esp+4]
    push    edi
    lea     edi, [eax+edx*4]
    mov     eax, 1
    rep stosd
    pop     edi
$a: ret

ADDED: Here's an example where the compiler's insight is into Bar, and not munge:

// Small value type used to show what the optimizer can do when it CAN see a
// function body: getValue() is defined inline, so callers may fold it.
class Bar
{
public:
    // Product of the two private fields; const — reads but never writes them.
    float getValue() const { return valueBase * boost; }

private:
    float valueBase;  // first factor (first member, offset 0)
    float boost;      // second factor (second member)
};

// Declaration only: munge() is defined in some other translation unit, so
// when compiling Adjust10A/Adjust10B the optimizer cannot see its body and
// must assume it may read or write any memory reachable through its
// arguments (including `bar` via the Foo& it is called on).
class Foo
{
public:
    // Applies `adjustment` to this object; body intentionally not visible here.
    void munge(float adjustment);
};

// Calls foo.munge(bar.getValue()) ten times.  Since bar is a reference
// parameter, the compiler must assume each munge() call may have modified
// bar, so getValue() is recomputed on every iteration.
void Adjust10A(Foo& foo, const Bar& bar)
{
    int remaining = 10;
    while (remaining-- > 0)
        foo.munge(bar.getValue());  // bar re-read each pass (possible aliasing through foo)
}

// Same as Adjust10A, but copies bar into a local first.  The local's address
// never escapes to munge(), so the compiler is free to compute getValue()
// once before the loop and reuse the cached result.
void Adjust10B(Foo& foo, const Bar& bar)
{
    Bar localBar = bar;  // private snapshot; munge() cannot alias it
    int remaining = 10;
    while (remaining-- > 0)
        foo.munge(localBar.getValue());
}

The resulting code is

Adjust10A:
    push    ecx
    push    ebx
    mov     ebx, [esp+12] ;; foo
    push    esi
    mov     esi, [esp+20] ;; bar
    push    edi
    mov     edi, 10
$a: fld     [esi+4] ;; bar.valueBase
    push    ecx
    fmul    [esi] ;; valueBase * boost
    mov     ecx, ebx
    fstp    [esp+16]
    fld     [esp+16]
    fstp    [esp]
    call    Foo::munge
    dec     edi
    jne     $a
    pop     edi
    pop     esi
    pop     ebx
    pop     ecx
    ret     0

Adjust10B:
    sub     esp, 8
    mov     ecx, [esp+16] ;; bar
    mov     eax, [ecx] ;; bar.valueBase
    mov     [esp], eax ;; localBar.valueBase
    fld     [esp] ;; localBar.valueBase
    mov     eax, [ecx+4] ;; bar.boost
    mov     [esp+4], eax ;; localBar.boost
    fmul    [esp+4] ;; localBar.getValue()
    push    esi
    push    edi
    mov     edi, [esp+20] ;; foo
    fstp    [esp+24]
    fld     [esp+24] ;; cache localBar.getValue()
    mov     esi, 10 ;; loop counter
$a: push    ecx
    mov     ecx, edi ;; foo
    fstp    [esp] ;; use cached value
    call    Foo::munge
    fld     [esp]
    dec     esi
    jne     $a ;; loop
    pop     edi
    fstp    ST(0)
    pop     esi
    add     esp, 8
    ret     0

Observe that the inner loop in Adjust10A must recalculate the value since it must protect against the possibility that foo.munge changed bar.

That said, this style of optimization is not a slam dunk. (For example, we could've gotten the same effect by manually caching bar.getValue() into localValue.) It tends to be most helpful for vectorized operations, since those can be paralellized.

这篇关于GCC优化技巧,它真的工作吗?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆