GCC优化技巧,它真的工作吗? [英] GCC optimization trick, does it really work?
问题描述
在查看有关优化的一些问题时,一个关于"如何编写代码以最有效地利用优化器"的问题下的已接受答案引发了我的好奇心。其中的断言是:函数中的计算应该使用局部变量,而不是输出参数。有人建议这样做可以让编译器进行原本不可能的额外优化。
因此,编写一个简单的代码示例Foo类并编译代码片段g ++ v4.4和-O2给出了一些汇编器输出(使用-S)。汇编器列表的部分仅包含循环部分如下所示。在检查输出,似乎两个循环几乎相同,只有一个地址的差异。该地址是指向第一个示例的输出参数的指针,或者是第二个的局部变量的指针。
无论是否使用本地变量,实际效果似乎没有变化。所以问题分为3部分:
a) GCC是否没有进行额外的优化,即使给出了建议的提示;
b) GCC是否在两种情况下都成功进行了优化,但本不应该这样做;
c) GCC是否在两种情况下都成功进行了优化,并且产生了符合C++标准定义的输出?
这里是未优化的函数:
void DoSomething(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    for (int i = 0; i < numFoo; i++)
    {
        barOut.munge(foo1, foo2[i]);
    }
}
与对应的程序集:
.L3:
    movl    (%esi), %eax
    addl    $1, %ebx
    addl    $4, %esi
    movl    %eax, 8(%esp)
    movl    (%edi), %eax
    movl    %eax, 4(%esp)
    movl    20(%ebp), %eax    ; 注意:该地址是输出参数的地址
    movl    %eax, (%esp)
    call    _ZN3Foo5mungeES_S_
    cmpl    %ebx, 16(%ebp)
    jg      .L3
以下是重写的函数:
void DoSomethingFaster(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    Foo barTemp = barOut;
    for (int i = 0; i < numFoo; i++)
    {
        barTemp.munge(foo1, foo2[i]);
    }
    barOut = barTemp;
}
下面是使用局部变量的函数的编译器输出:
.L3:
    movl    (%esi), %eax       ; 将foo2[i]指针加载到EAX
    addl    $1, %ebx           ; i自增
    addl    $4, %esi           ; 递增foo2指针(32位系统;64位系统上为加8)
    movl    %eax, 8(%esp)      ; 将foo2[i]压栈(小心!来自EAX,而不是ESI)
    movl    (%edi), %eax       ; 将foo1指针加载到EAX
    movl    %eax, 4(%esp)      ; 将foo1压栈
    leal    -28(%ebp), %eax    ; 将barTemp指针加载到EAX
    movl    %eax, (%esp)       ; 将barTemp的this指针压栈
    call    _ZN3Foo5mungeES_S_ ; munge()!
    cmpl    %ebx, 16(%ebp)     ; i < numFoo
    jg      .L3                ; 回忆:进入循环前i已加一,所以这里测试是否大于
解决方案:该答案中给出的示例并不是很好的一个,因为对一个未知函数的调用使得编译器无法进行太多推理。下面是一个更好的例子:
void FillOneA(int* array, int length, int& startIndex)
{
    for (int i = 0; i < length; i++)
        array[startIndex + i] = 1;
}

void FillOneB(int* array, int length, int& startIndex)
{
    int localIndex = startIndex;
    for (int i = 0; i < length; i++)
        array[localIndex + i] = 1;
}
第一个版本优化很差,因为它需要防止某人调用它作为
int array [10] = {0};
FillOneA(array,5,array [1]);
将导致 {1, 1, 0, 1, 1, 1, 0, 0, 0, 0},因为 i = 1 的那次迭代修改了 startIndex 参数。
第二个版本则不需要担心 array[localIndex + i] = 1 会修改 localIndex,因为 localIndex 是一个从未被取过地址的局部变量。
在汇编(英特尔符号,因为这是我使用):
FillOneA:
mov edx,[esp + 8]
xor eax,eax
test edx,edx
jle $b
push esi
mov esi,[esp + 16]
push edi
mov edi,[esp + 12]
$ a:mov ecx,[esi]
add ecx,eax
inc eax
mov [edi + ecx * 4],1
cmp eax,edx
jl $ a
pop edi
pop esi
$ b:ret
FillOneB:
mov ecx,[esp + 8]
mov eax,[esp + 12]
mov edx,[eax]
test ecx,ecx
jle $ a
mov eax,[esp + 4]
push edi
lea edi,[eax + edx * 4]
mov eax,1
rep stosd
pop edi
$ a:ret
ADDED:下面是一个例子,其中编译器能够洞察的是Bar的内部,而不是munge:
class Bar
{
public:
float getValue()const
{
return valueBase * boost;
}
private:
float valueBase;
float boost;
};
class Foo
{
public:
void munge(float adjustment);
};
void Adjust10A(Foo& foo,const Bar& bar)
{
for(int i = 0; i <10; i ++)
foo.munge (bar.getValue());
}
void Adjust10B(Foo& foo,const Bar& bar)
{
Bar localBar = bar;
for(int i = 0; i <10; i ++)
foo.munge(localBar.getValue());
}
生成的代码是
Adjust10A:
push ecx
push ebx
mov ebx,[esp + 12] ;; foo
push esi
mov esi,[esp + 20] ;; bar
push edi
mov edi,10
$ a:fld [esi + 4] ;; bar.valueBase
push ecx
fmul [esi] ;; valueBase * boost
mov ecx,ebx
fstp [esp + 16]
fld [esp + 16]
fstp [esp]
call Foo :: munge
dec edi
jne $ a
pop edi
pop esi
pop ebx
pop ecx
ret 0
Adjust10B:
sub esp,8
mov ecx,[esp + 16] ;; bar
mov eax,[ecx] ;; bar.valueBase
mov [esp],eax ;; localBar.valueBase
fld [esp] ;; localBar.valueBase
mov eax,[ecx + 4] ;; bar.boost
mov [esp + 4],eax ;; localBar.boost
fmul [esp + 4] ;; localBar.getValue()
push esi
push edi
mov edi,[esp + 20] ;; foo
fstp [esp + 24]
fld [esp + 24] ;; cache localBar.getValue()
mov esi,10 ;;循环计数器
$ a:push ecx
mov ecx,edi ;; foo
fstp [esp] ;;使用缓存的值
call Foo :: munge
fld [esp]
dec esi
jne $ a ;; loop
pop edi
fstp ST(0)
pop esi
add esp,8
ret 0
请注意,Adjust10A 中的内循环必须重新计算该值,因为它必须防止 foo.munge 更改 bar 的可能性。
也就是说,这种风格的优化并不是十拿九稳的。(例如,我们可以通过手动将 bar.getValue() 缓存到 localValue 中获得相同的效果。)它往往对向量化操作最有帮助,因为那些操作可以并行化。

While looking at some questions on optimization, this accepted answer for the question on coding practices for most effective use of the optimizer piqued my curiosity. The assertion is that local variables should be used for computations in a function, not output arguments. It was suggested this would allow the compiler to make additional optimizations otherwise not possible.
So, writing a simple bit of code for the example Foo class and compiling the code fragments with g++ v4.4 and -O2 gave some assembler output (use -S). The parts of the assembler listing with just the loop portion shown below. On examination of the output, it seems the loop is nearly identical for both, with just a difference in one address. That address being a pointer to the output argument for the first example or the local variable for the second.
There seems to no change in the actual effect whether the local variable is used or not. So the question breaks down to 3 parts:
a) is GCC not doing additional optimization, even given the hint suggested;
b) is GCC successfully optimizing in both cases, but should not be;
c) is GCC successfully optimizing in both cases, and is producing compliant output as defined by the C++ standard?
Here is the unoptimized function:
void DoSomething(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut) { for (int i=0; i<numFoo; i++) { barOut.munge(foo1, foo2[i]); } }
And corresponding assembly:
.L3: movl (%esi), %eax addl $1, %ebx addl $4, %esi movl %eax, 8(%esp) movl (%edi), %eax movl %eax, 4(%esp) movl 20(%ebp), %eax ; Note address is that of the output argument movl %eax, (%esp) call _ZN3Foo5mungeES_S_ cmpl %ebx, 16(%ebp) jg .L3
Here is the re-written function:
void DoSomethingFaster(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut) { Foo barTemp = barOut; for (int i=0; i<numFoo; i++) { barTemp.munge(foo1, foo2[i]); } barOut = barTemp; }
And here is the compiler output for the function using a local variable:
.L3: movl (%esi), %eax ; Load foo2[i] pointer into EAX addl $1, %ebx ; increment i addl $4, %esi ; increment foo2[i] (32-bit system, 8 on 64-bit systems) movl %eax, 8(%esp) ; PUSH foo2[i] onto stack (careful! from EAX, not ESI) movl (%edi), %eax ; Load foo1 pointer into EAX movl %eax, 4(%esp) ; PUSH foo1 leal -28(%ebp), %eax ; Load barTemp pointer into EAX movl %eax, (%esp) ; PUSH the this pointer for barTemp call _ZN3Foo5mungeES_S_ ; munge()! cmpl %ebx, 16(%ebp) ; i < numFoo jg .L3 ; recall incrementing i by one coming into the loop ; so test if greater
解决方案The example given in that answer was not a very good one because of the call to an unknown function the compiler cannot reason much about. Here's a better example:
void FillOneA(int *array, int length, int& startIndex) { for (int i = 0; i < length; i++) array[startIndex + i] = 1; } void FillOneB(int *array, int length, int& startIndex) { int localIndex = startIndex; for (int i = 0; i < length; i++) array[localIndex + i] = 1; }
The first version optimizes poorly because it needs to protect against the possibility that somebody called it as
int array[10] = { 0 }; FillOneA(array, 5, array[1]);
resulting in
{1, 1, 0, 1, 1, 1, 0, 0, 0, 0 }
since the iteration withi=1
modifies thestartIndex
parameter.The second one doesn't need to worry about the possibility that the
array[localIndex + i] = 1
will modifylocalIndex
becauselocalIndex
is a local variable whose address has never been taken.In assembly (Intel notation, because that's what I use):
FillOneA: mov edx, [esp+8] xor eax, eax test edx, edx jle $b push esi mov esi, [esp+16] push edi mov edi, [esp+12] $a: mov ecx, [esi] add ecx, eax inc eax mov [edi+ecx*4], 1 cmp eax, edx jl $a pop edi pop esi $b: ret FillOneB: mov ecx, [esp+8] mov eax, [esp+12] mov edx, [eax] test ecx, ecx jle $a mov eax, [esp+4] push edi lea edi, [eax+edx*4] mov eax, 1 rep stosd pop edi $a: ret
ADDED: Here's an example where the compiler's insight is into Bar, and not munge:
class Bar { public: float getValue() const { return valueBase * boost; } private: float valueBase; float boost; }; class Foo { public: void munge(float adjustment); }; void Adjust10A(Foo& foo, const Bar& bar) { for (int i = 0; i < 10; i++) foo.munge(bar.getValue()); } void Adjust10B(Foo& foo, const Bar& bar) { Bar localBar = bar; for (int i = 0; i < 10; i++) foo.munge(localBar.getValue()); }
The resulting code is
Adjust10A: push ecx push ebx mov ebx, [esp+12] ;; foo push esi mov esi, [esp+20] ;; bar push edi mov edi, 10 $a: fld [esi+4] ;; bar.valueBase push ecx fmul [esi] ;; valueBase * boost mov ecx, ebx fstp [esp+16] fld [esp+16] fstp [esp] call Foo::munge dec edi jne $a pop edi pop esi pop ebx pop ecx ret 0 Adjust10B: sub esp, 8 mov ecx, [esp+16] ;; bar mov eax, [ecx] ;; bar.valueBase mov [esp], eax ;; localBar.valueBase fld [esp] ;; localBar.valueBase mov eax, [ecx+4] ;; bar.boost mov [esp+4], eax ;; localBar.boost fmul [esp+4] ;; localBar.getValue() push esi push edi mov edi, [esp+20] ;; foo fstp [esp+24] fld [esp+24] ;; cache localBar.getValue() mov esi, 10 ;; loop counter $a: push ecx mov ecx, edi ;; foo fstp [esp] ;; use cached value call Foo::munge fld [esp] dec esi jne $a ;; loop pop edi fstp ST(0) pop esi add esp, 8 ret 0
Observe that the inner loop in
Adjust10A
must recalculate the value since it must protect against the possibility thatfoo.munge
changedbar
.That said, this style of optimization is not a slam dunk. (For example, we could've gotten the same effect by manually caching
bar.getValue()
intolocalValue
.) It tends to be most helpful for vectorized operations, since those can be paralellized.这篇关于GCC优化技巧,它真的工作吗?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!