在 C++ 中,我应该费心缓存变量,还是让编译器来做优化?(别名) [英] In C++, should I bother to cache variables, or let the compiler do the optimization? (Aliasing)

查看:104
本文介绍了在C ++中,我应该麻烦缓存变量,还是让编译器做优化? (混叠)的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

请考虑以下代码(p 的类型为 unsigned char*,bitmap->width 是某种整数类型,具体是哪一种并不确定,取决于我们使用的某个外部库的版本):

  for(unsigned x = 0; x< static_cast< unsigned>(bitmap-> width); ++ x)
{
* p ++ = 0xAA;
* p ++ = 0xBB;
* p ++ = 0xCC;
}

是否值得对其进行优化 [..]



可以这样写:

unsigned width(static_cast<unsigned>(bitmap->width));
for(unsigned x = 0; x< width; ++ x)
{
* p ++ = 0xAA;
* p ++ = 0xBB;
* p ++ = 0xCC;
}

……还是说这种优化对编译器来说轻而易举?

您认为哪一种才是“更好”的代码?



来自编辑(Ike)的说明:如果有人对带删除线的文本感到疑惑——原问题的措辞非常接近偏离主题的范围,尽管得到了正面反馈,仍差点被关闭。这些内容已被划掉。但请不要因此惩罚那些针对被划掉部分作出回答的人。

解决方案

乍一看,我以为在开启优化标志的情况下,编译器可以为两个版本生成等效的汇编代码。但实际检查之后,结果让我很惊讶:



unoptimized.cpp



注意:此代码并不是为了实际执行而编写的。

  struct bitmap_t 
{
long long width;
} bitmap;

int main(int argc,char ** argv)
{
for(unsigned x = 0; x< static_cast< unsigned>(bitmap.width); ++ x)
{
argv [x] [0] ='\0';
}
return 0;
}



optimized.cpp
  



注意:此代码并不是为了实际执行而编写的。

struct bitmap_t
{
long long width;
} bitmap;

int main(int argc,char ** argv)
{
const unsigned width = static_cast< unsigned>(bitmap.width);
for(unsigned x = 0; x< width; ++ x)
{
argv [x] [0] ='\0';
}
return 0;
}






编译




  • $ g ++ -s -O3 unoptimized.cpp

  • $ g ++ -s -O3 optimized.cpp






程序集(unoptimized.s)



    .file   "unoptimized.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L3:
    mov %eax, %edx
    addl    $1, %eax
    movq    (%rsi,%rdx,8), %rdx
    movb    $0, (%rdx)
    cmpl    bitmap(%rip), %eax
    jb  .L3
.L2:
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
.globl bitmap
    .bss
    .align 8
    .type   bitmap, @object
    .size   bitmap, 8
bitmap:
    .zero   8
    .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
    .section    .note.GNU-stack,"",@progbits



Assembly(optimized.s)



    .file   "optimized.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
    subl    $1, %eax
    leaq    8(,%rax,8), %rcx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L3:
    movq    (%rsi,%rax), %rdx
    addq    $8, %rax
    cmpq    %rcx, %rax
    movb    $0, (%rdx)
    jne .L3
.L2:
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
.globl bitmap
    .bss
    .align 8
    .type   bitmap, @object
    .size   bitmap, 8
bitmap:
    .zero   8
    .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
    .section    .note.GNU-stack,"",@progbits



diff



$ diff -uN unoptimized.s optimized.s
--- unoptimized.s   2015-11-24 16:11:55.837922223 +0000
+++ optimized.s 2015-11-24 16:12:02.628922941 +0000
@@ -1,4 +1,4 @@
-   .file   "unoptimized.cpp"
+   .file   "optimized.cpp"
    .text
    .p2align 4,,15
 .globl main
@@ -10,16 +10,17 @@
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
+   subl    $1, %eax
+   leaq    8(,%rax,8), %rcx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
 .L3:
-   mov %eax, %edx
-   addl    $1, %eax
-   movq    (%rsi,%rdx,8), %rdx
+   movq    (%rsi,%rax), %rdx
+   addq    $8, %rax
+   cmpq    %rcx, %rax
    movb    $0, (%rdx)
-   cmpl    bitmap(%rip), %eax
-   jb  .L3
+   jne .L3
 .L2:
    xorl    %eax, %eax
    ret






优化版本生成的汇编确实只加载(lea)一次 width 常量,而未优化版本则在每次迭代时都要计算 width 偏移量(movq,参见 http://stackoverflow.com/questions/3852909/movq-assembly-function )。



等我有时间的时候,我会补充一些基准测试结果。好问题。


Consider the following code (p is of type unsigned char* and bitmap->width is of some integer type, exactly which is unknown and depends on which version of some external library we're using):

for (unsigned x = 0;  x < static_cast<unsigned>(bitmap->width);  ++x)
{
    *p++ = 0xAA;
    *p++ = 0xBB;
    *p++ = 0xCC;
}

Is it worth optimizing it [..]

Could there be a case where this could yield more efficient results by writing:

unsigned width(static_cast<unsigned>(bitmap->width));
for (unsigned x = 0;  x < width;  ++x)
{
    *p++ = 0xAA;
    *p++ = 0xBB;
    *p++ = 0xCC;
}

... or is this trivial for the compiler to optimize?

What would you consider to be "better" code?

Note from editor (Ike): for those wondering about the strikeout text, the original question, as phrased, was dangerously close to off-topic territory and was very close to being closed in spite of positive feedback. These have been stricken out. Yet please do not punish the answerers who addressed these stricken sections of the question.

解决方案

At first glance, I thought the compiler could generate equivalent assembly for both versions with optimization flags activated. When I checked it, I was surprised to see the result:

Source unoptimized.cpp

note: this code is not meant to be executed.

// Minimal stand-in for the question's bitmap object: a namespace-scope
// variable whose only member, `width`, supplies the loop bound.
struct bitmap_t
{
    long long width;
} bitmap;

// "Uncached" variant: the loop condition itself reads bitmap.width and casts
// it to unsigned, so the bound expression is part of every condition check.
// The snippet exists only to inspect the generated assembly; it indexes argv
// by a bound unrelated to argc and is not meant to be run.
int main(int argc, char** argv)
{
    for (unsigned x = 0 ; x < static_cast<unsigned>(bitmap.width) ; ++x)
    {
        // Single-byte store through a char*. NOTE(review): presumably chosen
        // because a char store may alias any object, bitmap.width included.
        argv[x][0] = '\0';
    }
    return 0;
}

Source optimized.cpp

note: this code is not meant to be executed.

// Minimal stand-in for the question's bitmap object: a namespace-scope
// variable whose only member, `width`, supplies the loop bound.
struct bitmap_t
{
    long long width;
} bitmap;

// "Cached" variant: bitmap.width is read and converted to unsigned exactly
// once, into a const local, before the loop starts.
// The snippet exists only to inspect the generated assembly; it indexes argv
// by a bound unrelated to argc and is not meant to be run.
int main(int argc, char** argv)
{
    // Loop bound captured up front; stores inside the loop cannot change it.
    const unsigned width = static_cast<unsigned>(bitmap.width);
    for (unsigned x = 0 ; x < width ; ++x)
    {
        // Single-byte store through a char*.
        argv[x][0] = '\0';
    }
    return 0;
}


Compilation

  • $ g++ -s -O3 unoptimized.cpp
  • $ g++ -s -O3 optimized.cpp

Assembly (unoptimized.s)

    # g++ -O3 output for unoptimized.cpp (loop bound written inline in the
    # loop condition). Comments are annotations; instructions are unchanged.
    .file   "unoptimized.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    # Load the low 32 bits of bitmap.width (the unsigned cast) and skip the
    # loop entirely when it is zero.
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
    # x = 0
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L3:
    # %edx = x (32-bit move zero-extends into %rdx), then ++x.
    mov %eax, %edx
    addl    $1, %eax
    # %rdx = argv[x]; %rsi holds argv (second argument in the System V
    # x86-64 calling convention). The index is scaled by 8 on every pass.
    movq    (%rsi,%rdx,8), %rdx
    # argv[x][0] = '\0'
    movb    $0, (%rdx)
    # The width is re-read from memory on every iteration and compared
    # against x. NOTE(review): presumably because the byte store above
    # could alias bitmap, so the compiler cannot keep it in a register.
    cmpl    bitmap(%rip), %eax
    jb  .L3
.L2:
    # return 0
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
.globl bitmap
    .bss
    .align 8
    .type   bitmap, @object
    .size   bitmap, 8
bitmap:
    # 8 zero-initialised bytes: the long long width member.
    .zero   8
    .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
    .section    .note.GNU-stack,"",@progbits

Assembly (optimized.s)

    # g++ -O3 output for optimized.cpp (loop bound cached in a const local).
    # Comments are annotations; instructions are unchanged.
    .file   "optimized.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    # Load the low 32 bits of bitmap.width once; skip the loop when zero.
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
    # %rcx = 8*(width-1) + 8, i.e. the byte offset one past the last argv
    # entry to visit. Computed once, before the loop.
    subl    $1, %eax
    leaq    8(,%rax,8), %rcx
    # %rax = 0: running byte offset into argv (replaces the counter x).
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L3:
    # %rdx = argv entry at the current byte offset; %rsi holds argv (second
    # argument in the System V x86-64 calling convention).
    movq    (%rsi,%rax), %rdx
    # Advance by one pointer (8 bytes) and compare with the precomputed end
    # offset held in a register; memory is not re-read for the bound.
    addq    $8, %rax
    cmpq    %rcx, %rax
    # argv[x][0] = '\0' (movb leaves the flags set by cmpq untouched).
    movb    $0, (%rdx)
    jne .L3
.L2:
    # return 0
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
.globl bitmap
    .bss
    .align 8
    .type   bitmap, @object
    .size   bitmap, 8
bitmap:
    # 8 zero-initialised bytes: the long long width member.
    .zero   8
    .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
    .section    .note.GNU-stack,"",@progbits

diff

$ diff -uN unoptimized.s optimized.s
--- unoptimized.s   2015-11-24 16:11:55.837922223 +0000
+++ optimized.s 2015-11-24 16:12:02.628922941 +0000
@@ -1,4 +1,4 @@
-   .file   "unoptimized.cpp"
+   .file   "optimized.cpp"
    .text
    .p2align 4,,15
 .globl main
@@ -10,16 +10,17 @@
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
+   subl    $1, %eax
+   leaq    8(,%rax,8), %rcx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
 .L3:
-   mov %eax, %edx
-   addl    $1, %eax
-   movq    (%rsi,%rdx,8), %rdx
+   movq    (%rsi,%rax), %rdx
+   addq    $8, %rax
+   cmpq    %rcx, %rax
    movb    $0, (%rdx)
-   cmpl    bitmap(%rip), %eax
-   jb  .L3
+   jne .L3
 .L2:
    xorl    %eax, %eax
    ret


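The generated assembly for the optimized version actually computes the loop bound from the width once, before the loop (lea), unlike the unoptimized version, which re-reads the width from memory (cmpl bitmap(%rip), %eax) and recomputes the scaled element offset (movq) at each iteration.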

When I get time, I'll eventually post some benchmarks on that. Good question.

这篇关于在C ++中,我应该麻烦缓存变量,还是让编译器做优化? (混叠)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆