gcc -O0 outperforming -O3 on matrix sizes that are powers of 2 (matrix transpositions)


Problem description


(For testing purposes) I have written a simple method to calculate the transpose of an nxn matrix:

#include <cstddef>      // size_t
#include <sys/types.h>  // uint (the non-standard glibc typedef used below)

void transpose(const size_t _n, double* _A) {
    for (uint i = 0; i < _n; ++i) {
        for (uint j = i + 1; j < _n; ++j) {
            // swap element (i,j) with element (j,i) above the diagonal
            double tmp   = _A[i*_n + j];
            _A[i*_n + j] = _A[j*_n + i];
            _A[j*_n + i] = tmp;
        }
    }
}

When using optimization level O3 or Ofast I expected the compiler to unroll some loops, which would lead to higher performance, especially when the matrix size is a multiple of 2 (i.e., a doubled loop body could be executed on every iteration without a remainder) or similar. Instead, what I measured was the exact opposite: powers of 2 actually show a significant spike in execution time.
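(For illustration, the anticipated transformation would look roughly like the following 2x unroll of the inner swap. This is a hypothetical sketch added here, not gcc's actual output; transpose_unrolled is an invented name.)

#include <cstddef>

// Hypothetical 2x unroll of the inner swap -- the transformation the
// question expects the optimizer to perform, not what gcc emits.
void transpose_unrolled(const std::size_t _n, double* _A) {
    for (unsigned int i = 0; i < _n; ++i) {
        unsigned int j = i + 1;
        for (; j + 1 < _n; j += 2) {   // two swaps per iteration
            double t0 = _A[i*_n + j];
            _A[i*_n + j]     = _A[j*_n + i];
            _A[j*_n + i]     = t0;
            double t1 = _A[i*_n + j + 1];
            _A[i*_n + j + 1] = _A[(j+1)*_n + i];
            _A[(j+1)*_n + i] = t1;
        }
        if (j < _n) {                  // leftover swap when the trip count is odd
            double t = _A[i*_n + j];
            _A[i*_n + j] = _A[j*_n + i];
            _A[j*_n + i] = t;
        }
    }
}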

These spikes also occur at regular intervals of 64, more pronounced at intervals of 128, and so on. Each spike extends to the neighboring matrix sizes, as in the following table:

size n  time(us)
1020    2649
1021    2815
1022    3100
1023    5428
1024    15791
1025    6778
1026    3106
1027    2847
1028    2660
1029    3038
1030    2613
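(Numbers of this kind could be collected with a minimal harness along the following lines. This is a sketch added for reference, not the asker's actual benchmark code; it assumes the transpose function shown above and times one call per size.)

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

void transpose(const std::size_t _n, double* _A);  // the function shown above

int main() {
    for (std::size_t n = 1020; n <= 1030; ++n) {
        std::vector<double> A(n * n, 1.0);  // initialization also touches the pages
        auto t0 = std::chrono::steady_clock::now();
        transpose(n, A.data());
        auto t1 = std::chrono::steady_clock::now();
        long long us =
            std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
        std::printf("%zu\t%lld\n", n, us);
    }
    return 0;
}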

I compiled with gcc version 4.8.2, but the same thing happens with clang 3.5, so this might be some generic thing?

So my question basically is: why is there this periodic increase in execution time? Is it some generic thing that comes with any of the optimization options (it happens with clang and gcc alike)? If so, which optimization option is causing this?

And how can this be so significant that even the O0 version of the program outperforms the O3 version at multiples of 512?


EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024x1024 matrix with optimization actually takes as much time as transposing a 1300x1300 matrix without optimization. If this is a cache-miss / page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program that it fails for powers of two, only to recover high performance for slightly larger matrices. Shouldn't cache misses create more of a step-like pattern? Why do the execution times go down again at all? (And why should optimization create cache misses that weren't there before?)


EDIT: the following should be the assembly code that gcc produced

no optimization (O0):

_Z9transposemRPd:
.LFB0:
    .cfi_startproc
    push    rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    mov rbp, rsp
    .cfi_def_cfa_register 6
    mov QWORD PTR [rbp-24], rdi
    mov QWORD PTR [rbp-32], rsi
    mov DWORD PTR [rbp-4], 0
    jmp .L2
.L5:
    mov eax, DWORD PTR [rbp-4]
    add eax, 1
    mov DWORD PTR [rbp-8], eax
    jmp .L3
.L4:
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-4]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-8]
    add rax, rcx
    sal rax, 3
    add rax, rdx
    mov rax, QWORD PTR [rax]
    mov QWORD PTR [rbp-16], rax
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-4]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-8]
    add rax, rcx
    sal rax, 3
    add rdx, rax
    mov rax, QWORD PTR [rbp-32]
    mov rcx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-8]
    imul    rax, QWORD PTR [rbp-24]
    mov rsi, rax
    mov eax, DWORD PTR [rbp-4]
    add rax, rsi
    sal rax, 3
    add rax, rcx
    mov rax, QWORD PTR [rax]
    mov QWORD PTR [rdx], rax
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-8]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-4]
    add rax, rcx
    sal rax, 3
    add rdx, rax
    mov rax, QWORD PTR [rbp-16]
    mov QWORD PTR [rdx], rax
    add DWORD PTR [rbp-8], 1
.L3:
    mov eax, DWORD PTR [rbp-8]
    cmp rax, QWORD PTR [rbp-24]
    jb  .L4
    add DWORD PTR [rbp-4], 1
.L2:
    mov eax, DWORD PTR [rbp-4]
    cmp rax, QWORD PTR [rbp-24]
    jb  .L5
    pop rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   _Z9transposemRPd, .-_Z9transposemRPd
    .ident  "GCC: (Debian 4.8.2-15) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

with optimization (O3):

_Z9transposemRPd:
.LFB0:
    .cfi_startproc
    push    rbx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    xor r11d, r11d
    xor ebx, ebx
.L2:
    cmp r11, rdi
    mov r9, r11
    jae .L10
    .p2align 4,,10
    .p2align 3
.L7:
    add ebx, 1
    mov r11d, ebx
    cmp rdi, r11
    mov rax, r11
    jbe .L2
    mov r10, r9
    mov r8, QWORD PTR [rsi]
    mov edx, ebx
    imul    r10, rdi
    .p2align 4,,10
    .p2align 3
.L6:
    lea rcx, [rax+r10]
    add edx, 1
    imul    rax, rdi
    lea rcx, [r8+rcx*8]
    movsd   xmm0, QWORD PTR [rcx]
    add rax, r9
    lea rax, [r8+rax*8]
    movsd   xmm1, QWORD PTR [rax]
    movsd   QWORD PTR [rcx], xmm1
    movsd   QWORD PTR [rax], xmm0
    mov eax, edx
    cmp rdi, rax
    ja  .L6
    cmp r11, rdi
    mov r9, r11
    jb  .L7
.L10:
    pop rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE0:
    .size   _Z9transposemRPd, .-_Z9transposemRPd
    .ident  "GCC: (Debian 4.8.2-15) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

Solution

The periodic increase in execution time must be due to the cache being only N-way associative instead of fully associative. You are witnessing hash collisions related to the cache-line selection algorithm.

The fastest cache, L1, has a smaller number of cache lines than the next level, L2. In each level, each cache line can be filled only from a limited set of source addresses. Because of the simplicity of the implementation, the hash algorithm that selects the source address typically uses only a few rotations/shifts (which are free in hardware) and some xor operations.
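To see why powers of two are special here, the set index of a simple N-way cache can be computed directly. The following sketch is an addition for illustration; it assumes a typical but hypothetical L1 geometry (32 KiB, 8-way, 64-byte lines, hence 64 sets, with plain modulo indexing instead of the hash described above) and counts how many elements of one matrix column compete for the same set:

#include <cstddef>
#include <cstdio>

int main() {
    // assumed geometry: 32 KiB, 8-way L1 with 64-byte lines -> 64 sets
    const std::size_t line_size = 64, num_sets = 64;
    const std::size_t sizes[] = {1024, 1030};
    for (std::size_t n : sizes) {
        unsigned per_set[64] = {0};
        // element (i,0) of an n x n double matrix sits at byte offset i*n*8
        for (std::size_t i = 0; i < 64; ++i)
            ++per_set[(i * n * sizeof(double) / line_size) % num_sets];
        unsigned worst = 0;
        for (unsigned c : per_set)
            if (c > worst) worst = c;
        std::printf("n=%zu: up to %u of 64 column elements map to one set"
                    " (8 ways available)\n", n, worst);
    }
    return 0;
}

Under that assumed geometry, a row of 1024 doubles is 8192 bytes, a multiple of num_sets x line_size = 4096 bytes, so all 64 sampled column elements land in a single set of which only 8 lines can be resident at once; at n = 1030 the same accesses spread across almost all sets. This matches the spikes at powers of two and the quick recovery just beside them, though real L1 geometries vary by CPU.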

This limited associativity causes competition between memory ranges, e.g. between addresses 0x300010 and 0x341010. In a fully sequential algorithm this doesn't matter -- N is large enough for practically all algorithms of the form:

 for (i = 0; i < 1000; i++) a[i] += b[i] * c[i] + d[i];

But when the number of inputs (or outputs) gets larger, which happens internally when an algorithm is optimized, having one input in the cache forces another input out of the cache.

 // one possible method of optimization with 2 outputs and 6 inputs
 // with two unrelated execution paths -- should be faster, but maybe it isn't
 for (i=0;i<500;i++) { 
       a[i]     += b[i]     * c[i]     + d[i];
       a[i+500] += b[i+500] * c[i+500] + d[i+500];
 }
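A standard way to act on this explanation (added here for completeness, not part of the original answer) is to break the power-of-two row stride in the transpose itself, for example by padding the matrix's leading dimension by one element; transpose_padded and ld are illustrative names:

#include <cstddef>

// Sketch: the same swap loop, but the matrix is stored with a padded
// leading dimension `ld` (e.g. ld = n + 1). For n = 1024 and ld = 1025,
// rows are then 8200 bytes apart instead of 8192, so successive column
// elements no longer map to the same cache set.
void transpose_padded(std::size_t n, std::size_t ld, double* A) {
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = i + 1; j < n; ++j) {
            double tmp  = A[i*ld + j];
            A[i*ld + j] = A[j*ld + i];
            A[j*ld + i] = tmp;
        }
}
// usage: allocate n * (n + 1) doubles and call transpose_padded(n, n + 1, A)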
