为什么在循环中添加额外的检查在某些机器上影响很大,而在其他机器上影响很小? [英] Why does adding extra check in loop make big difference on some machines, and small difference on others?

查看:84
本文介绍了为什么添加额外的检查循环在一些机器上有很大的不同,在其他机器上有小的差别?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我一直在做一些测试,想看看在循环中增加额外的边界检查会带来多大的差异。起因是我在思考 C#、Java 等语言在访问数组时隐式插入的边界检查的开销。



更新:我在另外几台计算机上运行了同一个可执行程序,这让我们对实际发生的情况有了更清楚的了解。下面首先列出的是最初的计算机,其次是我较新的笔记本电脑。在我较新的笔记本电脑上,在循环中添加额外的检查只使耗时增加 1% 到 4%,而在最初的硬件上则增加 3% 到 30%。

 处理器x86系列6型号30步进5 GenuineIntel〜2793 Mhz 
比例2检查:1检查= 1.0310
比例3检查:1检查= 1.2769

处理器Intel(R)Core(TM)i7-3610QM CPU @ 2.30GHz,2301 Mhz,4个核心,8个逻辑处理器
比率2检查:1检查= 1.0090
比率3检查:1检查= 1.0393

处理器Intel(R)Core(TM)i5-2500 CPU @ 3.30GHz,4个内核
比率2检查:1检查= 1.0035
比率3检查:1检查= 1.0639

处理器Intel(R)Core(TM)2 Duo CPU T9300 @ 2.50GHz,2501 Mhz,2个内核,2个逻辑处理器(s)
比率2检查:1检查= 1.1195
比率3检查:1检查= 1.3597

处理器x86家族15型号43步进1 AuthenticAMD〜2010 Mhz
比率2检查:1检查= 1.0776
比率3检查:1检查= 1.1451

在下面的测试程序中,第一个函数只检查一个边界,第二个函数检查两个,第三个函数检查三个(在调用代码中, n1 = n2 = n3 )。我发现“两次检查 : 一次检查”的耗时比率约为 1.03,而“三次检查 : 一次检查”的比率约为 1.3。我很惊讶,仅仅再多加一个检查就会对性能造成这么大的影响。针对我最初的问题,我曾得到一个关于现代处理器上边界检查开销很低的有趣回答,这也许有助于解释这里观察到的差异。



请注意,编译程序时一定不要打开全程序优化(whole program optimization);否则编译器可以直接把额外的边界检查去掉。

// dotprod.cpp
#include "dotprod.h"

// Returns the dot product of the first n elements of v1 and v2.
// The loop tests a single bound (i < n) on every iteration; when n <= 0 the
// loop body never runs and the result is 0.
//   v1, v2 : input arrays, read at indices 0 .. n-1
//   n      : number of leading elements to multiply and accumulate
double SumProduct(const double * v1,const double * v2,int n)
{
    double sum = 0;
    for(int i = 0;
        i < n;
        ++i)
        sum += v1[i] * v2[i];
    return sum;
}

// Returns the dot product of the leading elements of v1 and v2, stopping as
// soon as the index reaches either n1 or n2 (two bound checks per iteration).
// Returns 0 when either bound is <= 0.
//   v1, v2 : input arrays, read at indices 0 .. min(n1, n2)-1
//   n1, n2 : the two bounds tested by the loop condition
double SumProduct(const double * v1,const double * v2,int n1,int n2)
{
    double sum = 0;
    for(int i = 0;
        i < n1 && i < n2;
        ++i)
        sum += v1[i] * v2[i];
    return sum;
}

// Returns the dot product of the leading elements of v1 and v2, stopping as
// soon as the index reaches any of n1, n2 or n3 (three bound checks per
// iteration). Returns 0 when any bound is <= 0.
//   v1, v2     : input arrays, read at indices 0 .. min(n1, n2, n3)-1
//   n1, n2, n3 : the three bounds tested by the loop condition
double SumProduct(const double * v1,const double * v2,int n1,int n2,int n3)
{
    double sum = 0;
    for(int i = 0;
        i < n1 && i < n2 && i < n3;
        ++i)
        sum += v1[i] * v2[i];
    return sum;
}



此代码最初是使用 Visual Studio 2010、Release、Win32 配置构建的(我添加了 'C' 标签,因为造成速度差异的原因不太可能是 C++ 特有的,也未必是 Windows 特有的)。有人能解释一下吗?



其余代码如下,仅供参考,其中包含一些 C++ 特有的内容。

头文件

  

// dotprod.h
// Dot-product overloads that differ only in how many bounds the loop
// condition tests on each iteration.
// One bound: accumulates while i < n.
double SumProduct(const double *,const double *,int n);
// Two bounds: accumulates while i < n1 and i < n2.
double SumProduct(const double *,const double *,int n1,int n2);
// Three bounds: accumulates while i < n1, i < n2 and i < n3.
double SumProduct(const double *,const double *,int n1,int n2,int n3);

测试程序

// main.cpp

#include <stdio.h>
#include <math.h>
#include <numeric>
#include <vector>

#include <windows.h>

#include "../dotprod/dotprod.h" // separate lib

typedef __int64 timecount_t;
// Reads the Windows performance counter and returns its raw tick value.
// Terminates the process with exit code 1 if the counter cannot be read.
inline timecount_t GetTimeCount()
{
    LARGE_INTEGER counter;
    const bool ok = QueryPerformanceCounter(&counter) != 0;
    if (!ok)
        exit(1);
    return counter.QuadPart;
}

int main()
{
typedef std :: vector< double> dvec;
const int N = 100 * 1000;

//初始化
dvec v1(N);
dvec v2(N);
dvec dp1(N);
dvec dp2(N);
dvec dp3(N);
for(int i = 0; i v1 [i] = i;
v2 [i] = log(static_cast< double>(i + 1));
}

const timecount_t t0 = GetTimeCount();

//检查一个绑定的成本
for(int n = 0; n dp1 [n] = SumProduct(& [0]),&(v2 [0]),n);
}

const timecount_t t1 = GetTimeCount();

//用两个边界检查成本
for(int n = 0; n dp2 [n] = SumProduct(& [0]),&(v2 [0]),n,n);
}

const timecount_t t2 = GetTimeCount();

//用三个边界检查成本
for(int n = 0; n dp3 [n] = SumProduct(& [0]),&(v2 [0]),n,n,n)
}
const timecount_t t3 = GetTimeCount();

//检查结果
const double sumSumProducts1 = std :: accumulate(dp1.begin(),dp1.end(),0.0);
const double sumSumProducts2 = std :: accumulate(dp2.begin(),dp2.end(),0.0);
const double sumSumProducts3 = std :: accumulate(dp3.begin(),dp3.end(),0.0);
printf(点积的总和:%.1f,%.1f,%.1f\\\
,sumSumProducts1,sumSumProducts2,sumSumProducts3);

//输出定时
const timecount_t elapsed1 = t1-t0;
const timecount_t elapsed2 = t2-t1;
const timecount_t elapsed3 = t3-t2;
printf(Elapsed:%.0f,%.0f,%.0f\\\

static_cast< double>(elapsed1),
static_cast< double>(elapsed2),
static_cast< double>(elapsed3));
const double ratio2to1 = elapsed2 / static_cast< double>(elapsed1);
const double ratio3to1 = elapsed3 / static_cast< double>(elapsed1);
printf(Ratio 2:1 =%。2f \\\
,ratio2to1);
printf(Ratio 3:1 =%。2f \\\
,ratio3to1);

return 0;
}

为了生成汇编代码,我采纳了这个回答中的建议(情况 2,关闭全程序优化),得到了下面的 asm 文件。

 ;由Microsoft(R)Optimizing Compiler Version 16.00.40219.01生成的列表

TITLE C:\dev\TestSpeed\dotprod\dotprod.cpp
.686P
.XMM
include listing.inc
.model flat

INCLUDELIB OLDNAMES

PUBLIC __real @ 0000000000000000
PUBLIC?SumProduct @@ YANPBN0HHH @ Z; SumProduct
EXTRN __fltused:DWORD
; COMDAT __real @ 0000000000000000
;文件c:\dev\testspeed\dotprod\dotprod.cpp
CONST SEGMENT
__real @ 0000000000000000 DQ 00000000000000000r; 0
;函数编译标志:/ Ogtp
CONST ENDS
; COMDAT?SumProduct @@ YANPBN0HHH @ Z
_TEXT SEGMENT
tv491 = -4; size = 4
_v1 $ = 8; size = 4
_v2 $ = 12; size = 4
_n1 $ = 16; size = 4
_n2 $ = 20; size = 4
_n3 $ = 24; size = 4
?SumProduct @@ YANPBN0HHH @ Z PROC; SumProduct,COMDAT

; 25:{

push ebp
mov ebp,esp
push ecx

; 26:double sum = 0;

fldz
push ebx
mov ebx,DWORD PTR _v2 $ [ebp]
push esi
push edi
mov edi,DWORD PTR _n1 $ [ebp]

; 27:for(int i = 0;

xor ecx,ecx

; 28:i<n1 && i<n2 && i<n3;
; 29:++ i)

cmp edi,4
jl $ LC8 @ SumProduct

; 26:double sum = 0;

mov edi,DWORD PTR _v1 $ [ebp]
lea esi,DWORD PTR [edi + 24]

; 30:sum + = v1 [i] * v2 [i];

sub edi,ebx
lea edx,DWORD PTR [ecx + 2]
lea eax,DWORD PTR [ebx + 8]
mov DWORD PTR tv491 [ebp ],edi
$ LN15 @ SumProduct:

; 28:i<n1 && i<n2 && i<n3;
; 29:++i)

mov ebx,DWORD PTR _n2 $ [ebp]
cmp ecx,ebx
jge $ LN9 @ SumProduct
cmp ecx,DWORD PTR _n3 $ [ebp]
jge $ LN9 @ SumProduct

; 30:sum + = v1 [i] * v2 [i];

fld QWORD PTR [eax-8]
lea edi,DWORD PTR [edx-1]
fmul QWORD PTR [esi-24]
faddp ST(1),ST(0)
cmp edi,ebx
jge SHORT $ LN9 @ SumProduct

; 28:i<n1 && i<n2 && i<n3;
; 29:++i)

cmp edi,DWORD PTR _n3 $ [ebp]
jge SHORT $ LN9 @ SumProduct

; 30:sum + = v1 [i] * v2 [i];

mov edi,DWORD PTR tv491 [ebp]
fld QWORD PTR [edi + eax]
fmul QWORD PTR [eax]
faddp ST(1),ST(0)
cmp edx,ebx
jge SHORT $ LN9 @ SumProduct

; 28:i<n1 && i<n2 && i<n3;
; 29:++i)

cmp edx,DWORD PTR _n3 $ [ebp]
jge SHORT $ LN9 @ SumProduct

; 30:sum + = v1 [i] * v2 [i];

fld QWORD PTR [eax + 8]
lea edi,DWORD PTR [edx + 1]
fmul QWORD PTR [esi-8]
faddp ST(1),ST(0)
cmp edi,ebx
jge SHORT $ LN9 @ SumProduct

; 28:i<n1 && i<n2 && i<n3;
; 29:++i)

cmp edi,DWORD PTR _n3 $ [ebp]
jge SHORT $ LN9 @ SumProduct

; 30:sum + = v1 [i] * v2 [i];

fld QWORD PTR [eax + 16]
mov edi,DWORD PTR _n1 $ [ebp]
fmul QWORD PTR [esi]
add ecx,4
lea ebx,DWORD PTR [edi-3]
add eax,32; 00000020H
add esi,32; 00000020H
faddp ST(1),ST(0)
add edx,4
cmp ecx,ebx
jl SHORT $ LN15 @ SumProduct
mov ebx,DWORD PTR _v2 $ [ebp]
$ LC8 @ SumProduct:

; 28:i<n1 && i<n2 && i<n3;
; 29:++i)

cmp ecx,edi
jge SHORT $ LN9 @ SumProduct
mov edx,DWORD PTR _v1 $ [ebp]
lea eax,DWORD PTR [ebx + ecx * 8]
sub edx,ebx
$ LC3 @ SumProduct:
cmp ecx,DWORD PTR _n2 $ [ebp]
jge SHORT $ LN9 @ SumProduct
cmp ecx,DWORD PTR _n3 $ [ebp]
jge SHORT $ LN9 @ SumProduct

; 30:sum + = v1 [i] * v2 [i];

fld QWORD PTR [eax + edx]
inc ecx
fmul QWORD PTR [eax]
add eax,8
faddp ST(1), ST(0)
cmp ecx,edi
jl SHORT $ LC3 @ SumProduct
$ LN9 @ SumProduct:

; 31:return sum;
; 32:}

pop edi
pop esi
pop ebx
mov esp,ebp
pop ebp
ret 0
? SumProduct @@ YANPBN0HHH @ Z ENDP; SumProduct
_TEXT ENDS
PUBLIC?SumProduct @@ YANPBN0HH @ Z; SumProduct
;函数编译标志:/ Ogtp
; COMDAT?SumProduct @@ YANPBN0HH @ Z
_TEXT SEGMENT
tv448 = -4; size = 4
_v1 $ = 8; size = 4
_v2 $ = 12; size = 4
_n1 $ = 16; size = 4
_n2 $ = 20; size = 4
?SumProduct @@ YANPBN0HH @ Z PROC; SumProduct,COMDAT

; 15:{

push ebp
mov ebp,esp
push ecx

; 16:double sum = 0;

fldz
push ebx
mov ebx,DWORD PTR _v2 $ [ebp]
push esi
push edi
mov edi,DWORD PTR _n1 $ [ebp]

; 17:for(int i = 0;

xor ecx,ecx

; 18:i<n1 && i<n2;
; 19 :++ i)

cmp edi,4
jl SHORT $ LC8 @ SumProduct @ 2

; 16:double sum = 0;

mov edi,DWORD PTR _v1 $ [ebp]
lea edx,DWORD PTR [edi + 24]

; 20:sum + = v1 [i] * v2 [i];

sub edi,ebx
lea esi,DWORD PTR [ecx + 2]
lea eax,DWORD PTR [ebx + 8]
mov DWORD PTR tv448 [ebp ],edi
$ LN19 @ SumProduct @ 2:
mov edi,DWORD PTR _n2 $ [ebp]
cmp ecx,edi
jge SHORT $ LN9 @ SumProduct @ 2
fld QWORD PTR [eax-8]
lea ebx,DWORD PTR [esi-1]
fmul QWORD PTR [edx-24]
faddp ST(1),ST(0)
cmp ebx,edi
jge SHORT $ LN9 @ SumProduct @ 2
mov ebx,DWORD PTR tv448 [ebp]
fld QWORD PTR [ebx + eax]
fmul QWORD PTR [eax]
faddp ST(1),ST(0)
cmp esi,edi
jge SHORT $ LN9 @ SumProduct @ 2
fld QWORD PTR [eax + 8 ]
lea ebx,DWORD PTR [esi + 1]
fmul QWORD PTR [edx-8]
faddp ST(1),ST(0)
cmp ebx,edi
jge SHORT $ LN9 @ SumProduct @ 2
fld QWORD PTR [eax + 16]
mov edi,DWORD PTR _n1 $ [ebp]
fmul QWORD PTR [edx]
add ecx,4
lea ebx,DWORD PTR [edi-3]
add eax,32; 00000020H
add edx,32; 00000020H
faddp ST(1),ST(0)
add esi,4
cmp ecx,ebx
jl SHORT $ LN19 @ SumProduct @ 2
mov ebx, DWORD PTR _v2 $ [ebp]
$ LC8 @ SumProduct @ 2:

; 18:i<n1 && i<n2;
; 19:++i)

cmp ecx,edi
jge SHORT $ LN9 @ SumProduct @ 2
mov edx,DWORD PTR _v1 $ [ebp]
lea eax ,DWORD PTR [ebx + ecx * 8]
sub edx,ebx
$ LC3 @ SumProduct @ 2:
cmp ecx,DWORD PTR _n2 $ [ebp]
jge SHORT $ LN9 @ SumProduct @ 2

; 20:sum + = v1 [i] * v2 [i];

fld QWORD PTR [eax + edx]
inc ecx
fmul QWORD PTR [eax]
add eax,8
faddp ST(1), ST(0)
cmp ecx,edi
jl SHORT $ LC3 @ SumProduct @ 2
$ LN9 @ SumProduct @ 2:

; 21:return sum;
; 22:}

pop edi
pop esi
pop ebx
mov esp,ebp
pop ebp
ret 0
? SumProduct @@ YANPBN0HH @ Z ENDP; SumProduct
_TEXT ENDS
PUBLIC?SumProduct @@ YANPBN0H @ Z; SumProduct
;函数编译标志:/ Ogtp
; COMDAT?SumProduct @@ YANPBN0H @ Z
_TEXT SEGMENT
_v1 $ = 8; size = 4
_v2 $ = 12; size = 4
?SumProduct @@ YANPBN0H @ Z PROC; SumProduct,COMDAT
; _n $ = eax

; 5:{

push ebp
mov ebp,esp
mov edx,DWORD PTR _v2 $ [ebp]

; 6:double sum = 0;

fldz
push ebx
push esi
mov esi,eax

; 7:for(int i = 0;

xor ebx,ebx
push edi
mov edi,DWORD PTR _v1 $ [ebp]

; 8:i<n;
; 9:++i)

cmp esi,4
jl SHORT $ LC9 @ SumProduct @ 3

; 6:double sum = 0;

lea eax,DWORD PTR [edx + 8]
lea ecx,DWORD PTR [edi + 24]

; 10:sum + = v1 [i] * v2 [i];

sub edi,edx
lea edx,DWORD PTR [esi-4]
shr edx,2
inc edx
lea ebx,DWORD PTR [ edx * 4]
$ LN10 @ SumProduct @ 3:
fld QWORD PTR [eax-8]
add eax,32; 00000020H
fmul QWORD PTR [ecx-24]
add ecx,32; 00000020H
dec edx
faddp ST(1),ST(0)
fld QWORD PTR [edi + eax-32]
fmul QWORD PTR [eax-32]
faddp ST(1),ST(0)
fld QWORD PTR [eax-24]
fmul QWORD PTR [ecx-40]
faddp ST(1),ST(0)
fld QWORD PTR [eax-16]
fmul QWORD PTR [ecx-32]
faddp ST(1),ST(0)
jne SHORT $ LN10 @ SumProduct @ 3

; 6:double sum = 0;

mov edx,DWORD PTR _v2 $ [ebp]
mov edi,DWORD PTR _v1 $ [ebp]
$ LC9 @ SumProduct @ 3:
; 8:i<n;
; 9:++i)

cmp ebx,esi
jge SHORT $ LN8 @ SumProduct @ 3
sub edi,edx
lea eax,DWORD PTR [edx + ebx * 8]
sub esi,ebx
$ LC3 @ SumProduct @ 3:

; 10:sum + = v1 [i] * v2 [i];

fld QWORD PTR [eax + edi]
add eax,8
dec esi
fmul QWORD PTR [eax-8]
faddp ST(1),ST(0)
jne SHORT $ LC3 @ SumProduct @ 3
$ LN8 @ SumProduct @ 3:

; 11:return sum;
; 12:}

pop edi
pop esi
pop ebx
pop ebp
ret 0
?SumProduct @@ YANPBN0H @ Z ENDP; SumProduct
_TEXT ENDS
END


解决方案

不同 CPU 之间的一个重大区别在于流水线优化。



CPU 可以并行执行多条指令,直到遇到条件分支为止。从这一点开始,CPU 不必等到前面所有指令都执行完毕,而是可以先假定某个分支并继续并行执行,直到条件的结果可用并完成判定。如果假设是正确的,我们就获得了收益;否则 CPU 将转而执行另一个分支。



因此,对 CPU 来说,棘手之处在于做出最好的假设,并尽可能多地并行执行指令。

I have been doing some testing to see how much of a difference additional bounds checking makes in loops. This is prompted by thinking about the cost of implicit bounds checking inserted by languages such as C#, Java etc, when you access arrays.

Update: I have tried the same executable program out on several additional computers, which throws a lot more light onto what is happening. I've listed the original computer first, and second my modern laptop. On my modern laptop, adding additional checks in the loop adds only between 1 and 4% to the time taken, compared to between 3 and 30% for the original hardware.

Processor   x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz
Ratio 2 checks : 1 check = 1.0310
Ratio 3 checks : 1 check = 1.2769

Processor   Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s)
Ratio 2 checks : 1 check = 1.0090
Ratio 3 checks : 1 check = 1.0393

Processor   Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, 4 Cores(s)
Ratio 2 checks : 1 check = 1.0035
Ratio 3 checks : 1 check = 1.0639

Processor   Intel(R) Core(TM)2 Duo CPU     T9300  @ 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s)
Ratio 2 checks : 1 check = 1.1195
Ratio 3 checks : 1 check = 1.3597

Processor   x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz
Ratio 2 checks : 1 check = 1.0776
Ratio 3 checks : 1 check = 1.1451

In the test program below, the first function checks just one bound, the second function checks two, and the third checks three (in the calling code, n1=n2=n3). I found that the ratio two checks:one was about 1.03, and the ratio three checks:one was about 1.3. I was surprised that adding one more check made such a difference to performance. I got an interesting answer concerning the low cost of bounds checking on modern processors to my original question, which may throw some light on the differences observed here.

Note that it's important to compile the program without whole program optimization turned on; otherwise the compiler can simply remove the additional bounds checking.

// dotprod.cpp
#include "dotprod.h"

// Dot product of the first n elements of v1 and v2.
// Exactly one bound (idx < n) is tested per iteration; a non-positive n
// yields 0 because the loop body is never entered.
double SumProduct(const double* v1, const double* v2, int n)
{
    double total = 0;
    int idx = 0;
    while (idx < n) {
        total += v1[idx] * v2[idx];
        ++idx;
    }
    return total;
}

// Dot product of the leading elements of v1 and v2, ending at the smaller of
// the two bounds n1 and n2.
// The bounds are kept as two separate comparisons in the loop condition: the
// cost of the extra per-iteration check is what the benchmark measures.
double SumProduct(const double* v1, const double* v2, int n1, int n2)
{
    double total = 0;
    int idx = 0;
    while (idx < n1 && idx < n2) {
        total += v1[idx] * v2[idx];
        ++idx;
    }
    return total;
}

// Dot product of the leading elements of v1 and v2, ending at the smallest of
// the three bounds n1, n2 and n3.
// The bounds are kept as three separate comparisons in the loop condition:
// the cost of the extra per-iteration checks is what the benchmark measures.
double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3)
{
    double total = 0;
    int idx = 0;
    while (idx < n1 && idx < n2 && idx < n3) {
        total += v1[idx] * v2[idx];
        ++idx;
    }
    return total;
}

This code was originally built using Visual Studio 2010, Release, Win32 (I've added the 'C' tag because the reasoning behind the difference in speed is not likely to be C++ specific, and may not be Windows specific). Can anyone explain it?

Rest of the code below, for information. This has some C++ specific stuff in it.

Header file

// dotprod.h
// Dot-product overloads that differ only in how many bounds the loop
// condition tests on each iteration.
// One bound: accumulates while i < n.
double SumProduct(const double*, const double*, int n);
// Two bounds: accumulates while i < n1 and i < n2.
double SumProduct(const double*, const double*, int n1, int n2);
// Three bounds: accumulates while i < n1, i < n2 and i < n3.
double SumProduct(const double*, const double*, int n1, int n2, int n3);

Test harness

// main.cpp

#include <stdio.h>
#include <math.h>
#include <numeric>
#include <vector>

#include <windows.h>

#include "../dotprod/dotprod.h" // separate lib

typedef __int64 timecount_t;
// Reads the Windows performance counter and returns its raw tick value.
// Terminates the process with exit code 1 if the counter cannot be read.
inline timecount_t GetTimeCount()
{
    LARGE_INTEGER counter;
    const bool ok = QueryPerformanceCounter(&counter) != 0;
    if (!ok)
        exit(1);
    return counter.QuadPart;
}

int main()
{
    typedef std::vector<double> dvec;
    const int N  = 100 * 1000;

    // Initialize
    dvec v1(N);
    dvec v2(N);
    dvec dp1(N);
    dvec dp2(N);
    dvec dp3(N);
    for(int i=0; i<N; ++i) {
        v1[i] = i;
        v2[i] = log(static_cast<double>(i+1));
    }

    const timecount_t t0 = GetTimeCount();

    // Check cost with one bound
    for(int n=0; n<N; ++n) {
        dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n); 
    }

    const timecount_t t1 = GetTimeCount();

    // Check cost with two bounds
    for(int n=0; n<N; ++n) {
        dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n); 
    }

    const timecount_t t2 = GetTimeCount();

    // Check cost with three bounds
    for(int n=0; n<N; ++n) {
        dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n); 
    }
    const timecount_t t3 = GetTimeCount();

    // Check results
    const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0);
    const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0);
    const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0);
    printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3);

    // Output timings
    const timecount_t elapsed1 = t1-t0;
    const timecount_t elapsed2 = t2-t1;
    const timecount_t elapsed3 = t3-t2;
    printf("Elapsed: %.0f, %.0f, %.0f\n",
        static_cast<double>(elapsed1),
        static_cast<double>(elapsed2),
        static_cast<double>(elapsed3));
    const double ratio2to1 = elapsed2 / static_cast<double>(elapsed1);
    const double ratio3to1 = elapsed3 / static_cast<double>(elapsed1);
    printf("Ratio 2:1=%.2f\n", ratio2to1);
    printf("Ratio 3:1=%.2f\n", ratio3to1);

    return 0;
}

In order to produce assembly, I took the advice in this answer (case 2, turning off whole program optimization), producing the following asm file.

; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01 

    TITLE   C:\dev\TestSpeed\dotprod\dotprod.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB OLDNAMES

PUBLIC  __real@0000000000000000
PUBLIC  ?SumProduct@@YANPBN0HHH@Z           ; SumProduct
EXTRN   __fltused:DWORD
;   COMDAT __real@0000000000000000
; File c:\dev\testspeed\dotprod\dotprod.cpp
CONST   SEGMENT
__real@0000000000000000 DQ 00000000000000000r   ; 0
; Function compile flags: /Ogtp
CONST   ENDS
;   COMDAT ?SumProduct@@YANPBN0HHH@Z
_TEXT   SEGMENT
tv491 = -4                      ; size = 4
_v1$ = 8                        ; size = 4
_v2$ = 12                       ; size = 4
_n1$ = 16                       ; size = 4
_n2$ = 20                       ; size = 4
_n3$ = 24                       ; size = 4
?SumProduct@@YANPBN0HHH@Z PROC              ; SumProduct, COMDAT

; 25   : {

    push    ebp
    mov ebp, esp
    push    ecx

; 26   :     double sum=0;

    fldz
    push    ebx
    mov ebx, DWORD PTR _v2$[ebp]
    push    esi
    push    edi
    mov edi, DWORD PTR _n1$[ebp]

; 27   :     for(int i=0;

    xor ecx, ecx

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edi, 4
    jl  $LC8@SumProduct

; 26   :     double sum=0;

    mov edi, DWORD PTR _v1$[ebp]
    lea esi, DWORD PTR [edi+24]

; 30   :         sum += v1[i]*v2[i];

    sub edi, ebx
    lea edx, DWORD PTR [ecx+2]
    lea eax, DWORD PTR [ebx+8]
    mov DWORD PTR tv491[ebp], edi
$LN15@SumProduct:

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    mov ebx, DWORD PTR _n2$[ebp]
    cmp ecx, ebx
    jge $LN9@SumProduct
    cmp ecx, DWORD PTR _n3$[ebp]
    jge $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax-8]
    lea edi, DWORD PTR [edx-1]
    fmul    QWORD PTR [esi-24]
    faddp   ST(1), ST(0)
    cmp edi, ebx
    jge SHORT $LN9@SumProduct

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edi, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    mov edi, DWORD PTR tv491[ebp]
    fld QWORD PTR [edi+eax]
    fmul    QWORD PTR [eax]
    faddp   ST(1), ST(0)
    cmp edx, ebx
    jge SHORT $LN9@SumProduct

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edx, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+8]
    lea edi, DWORD PTR [edx+1]
    fmul    QWORD PTR [esi-8]
    faddp   ST(1), ST(0)
    cmp edi, ebx
    jge SHORT $LN9@SumProduct

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edi, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+16]
    mov edi, DWORD PTR _n1$[ebp]
    fmul    QWORD PTR [esi]
    add ecx, 4
    lea ebx, DWORD PTR [edi-3]
    add eax, 32                 ; 00000020H
    add esi, 32                 ; 00000020H
    faddp   ST(1), ST(0)
    add edx, 4
    cmp ecx, ebx
    jl  SHORT $LN15@SumProduct
    mov ebx, DWORD PTR _v2$[ebp]
$LC8@SumProduct:

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp ecx, edi
    jge SHORT $LN9@SumProduct
    mov edx, DWORD PTR _v1$[ebp]
    lea eax, DWORD PTR [ebx+ecx*8]
    sub edx, ebx
$LC3@SumProduct:
    cmp ecx, DWORD PTR _n2$[ebp]
    jge SHORT $LN9@SumProduct
    cmp ecx, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+edx]
    inc ecx
    fmul    QWORD PTR [eax]
    add eax, 8
    faddp   ST(1), ST(0)
    cmp ecx, edi
    jl  SHORT $LC3@SumProduct
$LN9@SumProduct:

; 31   :     return sum;
; 32   : }

    pop edi
    pop esi
    pop ebx
    mov esp, ebp
    pop ebp
    ret 0
?SumProduct@@YANPBN0HHH@Z ENDP              ; SumProduct
_TEXT   ENDS
PUBLIC  ?SumProduct@@YANPBN0HH@Z            ; SumProduct
; Function compile flags: /Ogtp
;   COMDAT ?SumProduct@@YANPBN0HH@Z
_TEXT   SEGMENT
tv448 = -4                      ; size = 4
_v1$ = 8                        ; size = 4
_v2$ = 12                       ; size = 4
_n1$ = 16                       ; size = 4
_n2$ = 20                       ; size = 4
?SumProduct@@YANPBN0HH@Z PROC               ; SumProduct, COMDAT

; 15   : {

    push    ebp
    mov ebp, esp
    push    ecx

; 16   :     double sum=0;

    fldz
    push    ebx
    mov ebx, DWORD PTR _v2$[ebp]
    push    esi
    push    edi
    mov edi, DWORD PTR _n1$[ebp]

; 17   :     for(int i=0;

    xor ecx, ecx

; 18   :         i<n1 && i <n2;
; 19   :         ++i)

    cmp edi, 4
    jl  SHORT $LC8@SumProduct@2

; 16   :     double sum=0;

    mov edi, DWORD PTR _v1$[ebp]
    lea edx, DWORD PTR [edi+24]

; 20   :         sum += v1[i]*v2[i];

    sub edi, ebx
    lea esi, DWORD PTR [ecx+2]
    lea eax, DWORD PTR [ebx+8]
    mov DWORD PTR tv448[ebp], edi
$LN19@SumProduct@2:
    mov edi, DWORD PTR _n2$[ebp]
    cmp ecx, edi
    jge SHORT $LN9@SumProduct@2
    fld QWORD PTR [eax-8]
    lea ebx, DWORD PTR [esi-1]
    fmul    QWORD PTR [edx-24]
    faddp   ST(1), ST(0)
    cmp ebx, edi
    jge SHORT $LN9@SumProduct@2
    mov ebx, DWORD PTR tv448[ebp]
    fld QWORD PTR [ebx+eax]
    fmul    QWORD PTR [eax]
    faddp   ST(1), ST(0)
    cmp esi, edi
    jge SHORT $LN9@SumProduct@2
    fld QWORD PTR [eax+8]
    lea ebx, DWORD PTR [esi+1]
    fmul    QWORD PTR [edx-8]
    faddp   ST(1), ST(0)
    cmp ebx, edi
    jge SHORT $LN9@SumProduct@2
    fld QWORD PTR [eax+16]
    mov edi, DWORD PTR _n1$[ebp]
    fmul    QWORD PTR [edx]
    add ecx, 4
    lea ebx, DWORD PTR [edi-3]
    add eax, 32                 ; 00000020H
    add edx, 32                 ; 00000020H
    faddp   ST(1), ST(0)
    add esi, 4
    cmp ecx, ebx
    jl  SHORT $LN19@SumProduct@2
    mov ebx, DWORD PTR _v2$[ebp]
$LC8@SumProduct@2:

; 18   :         i<n1 && i <n2;
; 19   :         ++i)

    cmp ecx, edi
    jge SHORT $LN9@SumProduct@2
    mov edx, DWORD PTR _v1$[ebp]
    lea eax, DWORD PTR [ebx+ecx*8]
    sub edx, ebx
$LC3@SumProduct@2:
    cmp ecx, DWORD PTR _n2$[ebp]
    jge SHORT $LN9@SumProduct@2

; 20   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+edx]
    inc ecx
    fmul    QWORD PTR [eax]
    add eax, 8
    faddp   ST(1), ST(0)
    cmp ecx, edi
    jl  SHORT $LC3@SumProduct@2
$LN9@SumProduct@2:

; 21   :     return sum;
; 22   : }

    pop edi
    pop esi
    pop ebx
    mov esp, ebp
    pop ebp
    ret 0
?SumProduct@@YANPBN0HH@Z ENDP               ; SumProduct
_TEXT   ENDS
PUBLIC  ?SumProduct@@YANPBN0H@Z             ; SumProduct
; Function compile flags: /Ogtp
;   COMDAT ?SumProduct@@YANPBN0H@Z
_TEXT   SEGMENT
_v1$ = 8                        ; size = 4
_v2$ = 12                       ; size = 4
?SumProduct@@YANPBN0H@Z PROC                ; SumProduct, COMDAT
; _n$ = eax

; 5    : {

    push    ebp
    mov ebp, esp
    mov edx, DWORD PTR _v2$[ebp]

; 6    :     double sum=0;

    fldz
    push    ebx
    push    esi
    mov esi, eax

; 7    :     for(int i=0;

    xor ebx, ebx
    push    edi
    mov edi, DWORD PTR _v1$[ebp]

; 8    :         i<n;
; 9    :         ++i)

    cmp esi, 4
    jl  SHORT $LC9@SumProduct@3

; 6    :     double sum=0;

    lea eax, DWORD PTR [edx+8]
    lea ecx, DWORD PTR [edi+24]

; 10   :         sum += v1[i]*v2[i];

    sub edi, edx
    lea edx, DWORD PTR [esi-4]
    shr edx, 2
    inc edx
    lea ebx, DWORD PTR [edx*4]
$LN10@SumProduct@3:
    fld QWORD PTR [eax-8]
    add eax, 32                 ; 00000020H
    fmul    QWORD PTR [ecx-24]
    add ecx, 32                 ; 00000020H
    dec edx
    faddp   ST(1), ST(0)
    fld QWORD PTR [edi+eax-32]
    fmul    QWORD PTR [eax-32]
    faddp   ST(1), ST(0)
    fld QWORD PTR [eax-24]
    fmul    QWORD PTR [ecx-40]
    faddp   ST(1), ST(0)
    fld QWORD PTR [eax-16]
    fmul    QWORD PTR [ecx-32]
    faddp   ST(1), ST(0)
    jne SHORT $LN10@SumProduct@3

; 6    :     double sum=0;

    mov edx, DWORD PTR _v2$[ebp]
    mov edi, DWORD PTR _v1$[ebp]
$LC9@SumProduct@3:

; 8    :         i<n;
; 9    :         ++i)

    cmp ebx, esi
    jge SHORT $LN8@SumProduct@3
    sub edi, edx
    lea eax, DWORD PTR [edx+ebx*8]
    sub esi, ebx
$LC3@SumProduct@3:

; 10   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+edi]
    add eax, 8
    dec esi
    fmul    QWORD PTR [eax-8]
    faddp   ST(1), ST(0)
    jne SHORT $LC3@SumProduct@3
$LN8@SumProduct@3:

; 11   :     return sum;
; 12   : }

    pop edi
    pop esi
    pop ebx
    pop ebp
    ret 0
?SumProduct@@YANPBN0H@Z ENDP                ; SumProduct
_TEXT   ENDS
END

解决方案

One big difference between CPUs is the pipeline optimization

The CPU can execute several instructions in parallel until it reaches a conditional branch. From this point, instead of waiting until all the instructions are executed, the CPU can continue with a branch in parallel until the condition is available and ready to be evaluated. If the assumption was correct, then we have a gain. Otherwise the CPU will go with the other branch.

So the tricky part for a CPU is to find the best assumptions and to execute as many instructions in parallel as possible.

这篇关于为什么添加额外的检查循环在一些机器上有很大的不同,在其他机器上有小的差别?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆