为什么加入环路额外的检查让别人在某些机器上很大的区别,小差异? [英] Why does adding extra check in loop make big difference on some machines, and small difference on others?

查看:149
本文介绍了为什么加入环路额外的检查让别人在某些机器上很大的区别,小差异?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我一直在做一些测试,看看有多少的差别额外的边界检查使得循环。这是通过考虑隐含界限由语言,如C#,Java的等等,当你访问阵列插入检查的费用提示。

更新:我已经试过几个额外的计算机相同的可执行程序时,它抛出了很多更多的光线到发生了什么。我列出原来的计算机第一,二我现代的笔记本电脑。在我的现代的笔记本电脑,在环中加入额外的检查增加了拍摄,相比于图3和30%之间的原始硬件的时间只有1和4%之间。

 处理器x86系列6型30步进5 GenuineIntel〜2793兆赫
比2检查:1检查= 1.0310
比3检查:1检查= 1.2769处理器英特尔(R)酷睿(TM)i7-3610QM CPU @ 2.30GHz中,2301兆赫,4芯(S),8个逻辑处理器(S)
比2检查:1检查= 1.0090
比3检查:1检查= 1.0393处理器英特尔(R)酷睿(TM)i5-2500 CPU @ 3.30GHz,4个内核(S)
比2检查:1检查= 1.0035
比3检查:1检查= 1.0639处理器英特尔(R)酷睿(TM)2双核T9300 CPU @ 2.50GHz,2501兆赫,酷睿2(S),2个逻辑处理器(S)
比2检查:1检查= 1.1195
比3检查:1检查= 1.3597处理器x86系列15型号43步进1 AuthenticAMD〜2010兆赫
比2检查:1检查= 1.0776
比3检查:1检查= 1.1451

在测试程序中,在下面,第一函数检查只是一个约束,在第二功能检查两个,所述第三检查三(在调用code, N1 = N2 = N3 )。我发现比的两项检查:一是的约为1.03,比值的三项检查:一是的约1.3。我很惊讶,通过增加一个支票性能这样的差异。我得到了一个关于的我原来的问题,这可能会在抛出一些轻href=\"http://stackoverflow.com/a/16719474/834521\">有趣的答案这里观察到的差异。

请注意,它编译没有整个程序优化开启该程序是很重要的;否则编译器可以简单地删除其他边界检查。

  // dotprod.cpp
#包括dotprod.h双SUMPRODUCT(常量双* V1,常量双* V2,INT N)
{
    双总和= 0;
    的for(int i = 0;
        I< N;
        ++ I)
        总和+ = V1 [I] * V2 [I]
    返回总和;
}双SUMPRODUCT(常量双* V1,常量双* V2,INT N1,N2 INT)
{
    双总和= 0;
    的for(int i = 0;
        I< N1和放大器;&安培; I< N2;
        ++ I)
        总和+ = V1 [I] * V2 [I]
    返回总和;
}双SUMPRODUCT(常量双* V1,常量双* V2,INT N1,N2 INT,INT N3)
{
    双总和= 0;
    的for(int i = 0;
        I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
        ++ I)
        总和+ = V1 [I] * V2 [I]
    返回总和;
}

这code最初是使用Visual Studio 2010内置,发布的Win32(我已经添加了'C'的标签,因为在速度上的差异背后的推理不可能是C ++特有的,不得视窗具体)。谁能解释一下吗?

低于code,信息的其余部分。这有一些C ++具体的东西在里面。

头文件

  // dotprod.h
双SUMPRODUCT(常量双*,常量双*,INT N);
双SUMPRODUCT(常量双*,常量双*,诠释N1,N2 INT);
双SUMPRODUCT(常量双*,常量双*,诠释N1,N2 INT,INT N3);

测试工具

  //的main.cpp#包括LT&;&stdio.h中GT;
#包括LT&;&math.h中GT;
#包括LT&;&数字GT;
#包括LT&;矢量>#包括LT&;&WINDOWS.H GT;#包括../dotprod/dotprod.h//独立的lib的typedef __int64 timecount_t;
内嵌timecount_t GetTimeCount()
{
    LARGE_INTEGER立;
    如果(QueryPerformanceCounter的(安培;!李)){
        出口(1);
    }
    返回li.QuadPart;
}诠释的main()
{
    的typedef的std ::矢量<&双GT; dvec;
    const int的N = 100 * 1000;    //初始化
    dvec V1(N);
    dvec V2(N);
    dvec DP1(N);
    dvec DP2(N);
    dvec DP3(N);
    的for(int i = 0; I< N ++我){
        V1 [i] =我;
        V2 [I] =日志(的static_cast<双>第(i + 1));
    }    常量timecount_t T0 = GetTimeCount();    //检查一个成本约束
    对于(INT N = 0; N< N ++ N){
        DP1 [η] = SUMPRODUCT(及(V1 [0]),及(2版[0])中,n);
    }    常量timecount_t T1 = GetTimeCount();    //检查两个边界成本
    对于(INT N = 0; N< N ++ N){
        DP2 [η] = SUMPRODUCT(及(V1 [0]),及(2版[0]),N,N);
    }    常量timecount_t T2 = GetTimeCount();    //检查有三个界限成本
    对于(INT N = 0; N< N ++ N){
        DP3 [η] = SUMPRODUCT(及(V1 [0]),及(2版[0])中,n,N,N);
    }
    常量timecount_t T3 = GetTimeCount();    //检查结果
    常量双sumSumProducts1 =的std ::累加(dp1.begin(),dp1.end(),0.0);
    常量双sumSumProducts2 =的std ::累加(dp2.begin(),dp2.end(),0.0);
    常量双sumSumProducts3 =的std ::累加(dp3.begin(),dp3.end(),0.0);
    的printf(的点积和:%.1F,%.1F,%.1F \\ n,sumSumProducts1,sumSumProducts2,sumSumProducts3);    //输出计时
    常量timecount_t elapsed1 = T1-T0;
    常量timecount_t elapsed2 = T2-T1;
    常量timecount_t elapsed3 = T3-T2;
    的printf(经历:到%.0f,到%.0f,到%.0f \\ n
        的static_cast<双>(elapsed1)
        的static_cast<双>(elapsed2)
        的static_cast<双>(elapsed3));
    常量双ratio2to1 = elapsed2 /&的static_cast LT;双>(elapsed1);
    常量双ratio3to1 = elapsed3 /&的static_cast LT;双>(elapsed1);
    的printf(比2:1 =%2F \\ n,ratio2to1);
    的printf(比例3:1 =%2F \\ n,ratio3to1);    返回0;
}

为了生产组装,我在了建议这个答案(2的情况下,关闭整个程序优化) ,产下的asm文件。

 ;由微软(R)优化编译器版本16.00.40219.01清单生成    TITLE C:\\ dev的\\ TestSpeed​​ \\ dotprod \\ dotprod.cpp
    .686P
    .XMM
    包括listing.inc
    .MODEL平INCLUDELIB OLDNAMES公共__real @ 0000000000000000
公共SUMPRODUCT @@ YANPBN0HHH @ Z?; SUMPRODUCT
EXTRN __fltused:DWORD
; COMDAT __real @ 0000000000000000
;文件C:\\ dev的\\ testspeed \\ dotprod \\ dotprod.cpp
CONST段
__real @ 0000000000000000 DQ 00000000000000000r; 0
;功能编译标志:/ Ogtp
CONST ENDS
; COMDAT?SUMPRODUCT @@ YANPBN0HHH @ Z
_TEXT段
tv491 = -4;大小= 4
_v1 $ = 8;大小= 4
_v2 $ = 12;大小= 4
_n1 $ = 16;大小= 4
_n2 $ = 20;大小= 4
_n3 $ = 24;大小= 4
?SUMPRODUCT @@ YANPBN0HHH @ Z PROC; SUMPRODUCT,COMDAT; 25:{    推EBP
    MOV EBP,ESP
    推ECX; 26:双总和= 0;    FLDZ
    推EBX
    MOV EBX,DWORD PTR _v2 $ [EBP]
    推ESI
    推EDI
    MOV EDI,DWORD PTR _n1 $ [EBP]; 27:为(INT I = 0;    异或ECX,ECX; 28:I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
; 29:++ I)    CMP EDI,4
    JL $ @ LC8 SUMPRODUCT; 26:双总和= 0;    MOV EDI,DWORD PTR _v1 $ [EBP]
    LEA ESI,DWORD PTR [EDI + 24]; 30:总和+ = V1 [I] * V2 [I]    子EDI,EBX
    LEA EDX,DWORD PTR [ECX + 2]
    LEA EAX,DWORD PTR [EBX + 8]
    MOV DWORD PTR tv491 [EBP],EDI
$ @ LN15 SUMPRODUCT:; 28:I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
; 29:++ I)    MOV EBX,DWORD PTR _n2 $ [EBP]
    CMP ECX,EBX
    JGE $ @ LN9 SUMPRODUCT
    CMP ECX,DWORD PTR _n3 $ [EBP]
    JGE $ @ LN9 SUMPRODUCT; 30:总和+ = V1 [I] * V2 [I]    FLD QWORD PTR [EAX-8]
    LEA EDI,DWORD PTR [EDX-1]
    FMUL QWORD PTR [ESI-24]
    FADDP ST(1),ST(0)
    CMP EDI,EBX
    JGE SHORT $ @ LN9 SUMPRODUCT; 28:I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
; 29:++ I)    CMP EDI,DWORD PTR _n3 $ [EBP]
    JGE SHORT $ @ LN9 SUMPRODUCT; 30:总和+ = V1 [I] * V2 [I]    MOV EDI,DWORD PTR tv491 [EBP]
    FLD QWORD PTR [EDI + EAX]
    FMUL QWORD PTR [EAX]
    FADDP ST(1),ST(0)
    CMP EDX,EBX
    JGE SHORT $ @ LN9 SUMPRODUCT; 28:I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
; 29:++ I)    CMP EDX,DWORD PTR _n3 $ [EBP]
    JGE SHORT $ @ LN9 SUMPRODUCT; 30:总和+ = V1 [I] * V2 [I]    FLD QWORD PTR [EAX + 8]
    LEA EDI,DWORD PTR [EDX + 1]
    FMUL QWORD PTR [ESI-8]
    FADDP ST(1),ST(0)
    CMP EDI,EBX
    JGE SHORT $ @ LN9 SUMPRODUCT; 28:I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
; 29:++ I)    CMP EDI,DWORD PTR _n3 $ [EBP]
    JGE SHORT $ @ LN9 SUMPRODUCT; 30:总和+ = V1 [I] * V2 [I]    FLD QWORD PTR [EAX + 16]
    MOV EDI,DWORD PTR _n1 $ [EBP]
    FMUL QWORD PTR [ESI]
    加ECX,4
    LEA EBX,DWORD PTR [EDI-3]
    添加EAX,32; 00000020H
    加ESI,32; 00000020H
    FADDP ST(1),ST(0)
    添加EDX,4
    CMP ECX,EBX
    JL SHORT $ @ LN15 SUMPRODUCT
    MOV EBX,DWORD PTR _v2 $ [EBP]
$ @ LC8 SUMPRODUCT:; 28:I< N1和放大器;&安培; I< N2和放大器;&安培; I< N3;
; 29:++ I)    CMP ECX,EDI
    JGE SHORT $ @ LN9 SUMPRODUCT
    MOV EDX,DWORD PTR _v1 $ [EBP]
    LEA EAX,DWORD PTR [EBX + ECX * 8]
    子EDX,EBX
$ @ LC3 SUMPRODUCT:
    CMP ECX,DWORD PTR _n2 $ [EBP]
    JGE SHORT $ @ LN9 SUMPRODUCT
    CMP ECX,DWORD PTR _n3 $ [EBP]
    JGE SHORT $ @ LN9 SUMPRODUCT; 30:总和+ = V1 [I] * V2 [I]    FLD QWORD PTR [EAX + EDX]
    INC ECX
    FMUL QWORD PTR [EAX]
    添加EAX,8
    FADDP ST(1),ST(0)
    CMP ECX,EDI
    JL SHORT $ @ LC3 SUMPRODUCT
$ @ LN9 SUMPRODUCT:; 31:收益总和;
; 32:}    流行EDI
    流行ESI
    流行EBX
    MOV ESP,EBP
    流行EBP
    RET 0
?SUMPRODUCT @@ YANPBN0HHH @ Z ENDP; SUMPRODUCT
_TEXT ENDS
公共SUMPRODUCT @@ YANPBN0HH @ Z?; SUMPRODUCT
;功能编译标志:/ Ogtp
; COMDAT?SUMPRODUCT @@ YANPBN0HH @ Z
_TEXT段
tv448 = -4;大小= 4
_v1 $ = 8;大小= 4
_v2 $ = 12;大小= 4
_n1 $ = 16;大小= 4
_n2 $ = 20;大小= 4
?SUMPRODUCT @@ YANPBN0HH @ Z PROC; SUMPRODUCT,COMDAT; 15:{    推EBP
    MOV EBP,ESP
    推ECX; 16:双总和= 0;    FLDZ
    推EBX
    MOV EBX,DWORD PTR _v2 $ [EBP]
    推ESI
    推EDI
    MOV EDI,DWORD PTR _n1 $ [EBP]; 17:为(INT I = 0;    异或ECX,ECX; 18:I< N1和放大器;&安培; I< N2;
; 19:++ I)    CMP EDI,4
    JL SHORT $ @ LC8 @ SUMPRODUCT 2; 16:双总和= 0;    MOV EDI,DWORD PTR _v1 $ [EBP]
    LEA EDX,DWORD PTR [EDI + 24]; 20:总和+ = V1 [I] * V2 [I]    子EDI,EBX
    LEA ESI,DWORD PTR [ECX + 2]
    LEA EAX,DWORD PTR [EBX + 8]
    MOV DWORD PTR tv448 [EBP],EDI
$ @ LN19 @ SUMPRODUCT 2:
    MOV EDI,DWORD PTR _n2 $ [EBP]
    CMP ECX,EDI
    JGE SHORT $ @ LN9 @ SUMPRODUCT 2
    FLD QWORD PTR [EAX-8]
    LEA EBX,DWORD PTR [ESI-1]
    FMUL QWORD PTR [EDX-24]
    FADDP ST(1),ST(0)
    CMP EBX,EDI
    JGE SHORT $ @ LN9 @ SUMPRODUCT 2
    MOV EBX,DWORD PTR tv448 [EBP]
    FLD QWORD PTR [EBX + EAX]
    FMUL QWORD PTR [EAX]
    FADDP ST(1),ST(0)
    CMP ESI,EDI
    JGE SHORT $ @ LN9 @ SUMPRODUCT 2
    FLD QWORD PTR [EAX + 8]
    LEA EBX,DWORD PTR [ESI + 1]
    FMUL QWORD PTR [EDX-8]
    FADDP ST(1),ST(0)
    CMP EBX,EDI
    JGE SHORT $ @ LN9 @ SUMPRODUCT 2
    FLD QWORD PTR [EAX + 16]
    MOV EDI,DWORD PTR _n1 $ [EBP]
    FMUL QWORD PTR [EDX]
    加ECX,4
    LEA EBX,DWORD PTR [EDI-3]
    添加EAX,32; 00000020H
    添加EDX,32; 00000020H
    FADDP ST(1),ST(0)
    加ESI,4
    CMP ECX,EBX
    JL SHORT $ @ LN19 @ SUMPRODUCT 2
    MOV EBX,DWORD PTR _v2 $ [EBP]
$ @ LC8 @ SUMPRODUCT 2:; 18:I< N1和放大器;&安培; I< N2;
; 19:++ I)    CMP ECX,EDI
    JGE SHORT $ @ LN9 @ SUMPRODUCT 2
    MOV EDX,DWORD PTR _v1 $ [EBP]
    LEA EAX,DWORD PTR [EBX + ECX * 8]
    子EDX,EBX
$ @ LC3 @ SUMPRODUCT 2:
    CMP ECX,DWORD PTR _n2 $ [EBP]
    JGE SHORT $ @ LN9 @ SUMPRODUCT 2; 20:总和+ = V1 [I] * V2 [I]    FLD QWORD PTR [EAX + EDX]
    INC ECX
    FMUL QWORD PTR [EAX]
    添加EAX,8
    FADDP ST(1),ST(0)
    CMP ECX,EDI
    JL SHORT $ @ LC3 @ SUMPRODUCT 2
$ @ LN9 @ SUMPRODUCT 2:; 21:收益总和;
; 22:}    流行EDI
    流行ESI
    流行EBX
    MOV ESP,EBP
    流行EBP
    RET 0
?SUMPRODUCT @@ YANPBN0HH @ Z ENDP; SUMPRODUCT
_TEXT ENDS
公共SUMPRODUCT @@ YANPBN0H @ Z?; SUMPRODUCT
;功能编译标志:/ Ogtp
; COMDAT?SUMPRODUCT @@ YANPBN0H @ Z
_TEXT段
_v1 $ = 8;大小= 4
_v2 $ = 12;大小= 4
?SUMPRODUCT @@ YANPBN0H @ Z PROC; SUMPRODUCT,COMDAT
; _n $ = EAX; 5:{    推EBP
    MOV EBP,ESP
    MOV EDX,DWORD PTR _v2 $ [EBP]; 6:双总和= 0;    FLDZ
    推EBX
    推ESI
    MOV ESI,EAX; 7:(INT I = 0;    XOR EBX,EBX
    推EDI
    MOV EDI,DWORD PTR _v1 $ [EBP]; 8:I< N;
; 9:++ I)    CMP ESI,4
    JL SHORT $ @ LC9 @ SUMPRODUCT 3; 6:双总和= 0;    LEA EAX,DWORD PTR [EDX + 8]
    LEA ECX,DWORD PTR [EDI + 24]; 10:总和+ = V1 [I] * V2 [I]    子EDI,EDX
    LEA EDX,DWORD PTR [ESI-4]
    SHR EDX,2
    INC EDX
    LEA EBX,DWORD PTR [EDX * 4]
$ @ LN10 @ SUMPRODUCT 3:
    FLD QWORD PTR [EAX-8]
    添加EAX,32; 00000020H
    FMUL QWORD PTR [ECX-24]
    加ECX,32; 00000020H
    十二月EDX
    FADDP ST(1),ST(0)
    FLD QWORD PTR [EDI + EAX-32]
    FMUL QWORD PTR [EAX-32]
    FADDP ST(1),ST(0)
    FLD QWORD PTR [EAX-24]
    FMUL QWORD PTR [ECX-40]
    FADDP ST(1),ST(0)
    FLD QWORD PTR [EAX-16]
    FMUL QWORD PTR [ECX-32]
    FADDP ST(1),ST(0)
    JNE SHORT $ @ LN10 @ SUMPRODUCT 3; 6:双总和= 0;    MOV EDX,DWORD PTR _v2 $ [EBP]
    MOV EDI,DWORD PTR _v1 $ [EBP]
$ @ LC9 @ SUMPRODUCT 3:; 8:I< N;
; 9:++ I)    CMP EBX,ESI
    JGE SHORT $ @ LN8 @ SUMPRODUCT 3
    子EDI,EDX
    LEA EAX,DWORD PTR [EDX + EBX * 8]
    子ESI,EBX
$ @ LC3 @ SUMPRODUCT 3:; 10:总和+ = V1 [I] * V2 [I]    FLD QWORD PTR [EAX + EDI]
    添加EAX,8
    十二月ESI
    FMUL QWORD PTR [EAX-8]
    FADDP ST(1),ST(0)
    JNE SHORT $ @ LC3 @ SUMPRODUCT 3
$ @ LN8 @ SUMPRODUCT 3:; 11:收益总和;
; 12:}    流行EDI
    流行ESI
    流行EBX
    流行EBP
    RET 0
?SUMPRODUCT @@ YANPBN0H @ Z ENDP; SUMPRODUCT
_TEXT ENDS
结束


解决方案

CPU之间的一大区别是管道优化

CPU可以并行几条指令执行,直到达到一个条件分支。从这个角度,而不是等待直到所有的指令被执行时,CPU可以与平行的分支继续,直到条件是可用的,并准备进行评估。如果假设是正确的,那么我们就有收益。否则,CPU会与其他分支。

因此​​,对于CPU棘手的部分是要找到最好的假设和并行地执行尽可能多的指令。

I have been doing some testing to see how much of a difference additional bounds checking makes in loops. This is prompted by thinking about the cost of implicit bounds checking inserted by languages such as C#, Java etc, when you access arrays.

Update: I have tried the same executable program out on several additional computers, which throws a lot more light onto what is happening. I've listed the original computer first, and second my modern laptop. On my modern laptop, adding additional checks in the loop adds only between 1 and 4% to the time taken, compared to between 3 and 30% for the original hardware.

Processor   x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz
Ratio 2 checks : 1 check = 1.0310
Ratio 3 checks : 1 check = 1.2769

Processor   Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s)
Ratio 2 checks : 1 check = 1.0090
Ratio 3 checks : 1 check = 1.0393

Processor   Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, 4 Cores(s)
Ratio 2 checks : 1 check = 1.0035
Ratio 3 checks : 1 check = 1.0639

Processor   Intel(R) Core(TM)2 Duo CPU     T9300  @ 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s)
Ratio 2 checks : 1 check = 1.1195
Ratio 3 checks : 1 check = 1.3597

Processor   x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz
Ratio 2 checks : 1 check = 1.0776
Ratio 3 checks : 1 check = 1.1451

In the test program, below, the first function checks just one bound, the second function checks two, and the third checks three (in the calling code, n1=n2=n3). I found that the ratio two checks:one was about 1.03, and the ratio three checks:one was about 1.3. I was surprised by that adding one more check made such a difference to performance. I got an interesting answer concerning the low cost of bounds checking on modern processors to my original question, which may throw some light on the differences observed here.

Note that it's important to compile the program without whole program optimization turned on; otherwise the compiler can simply remove the additional bounds checking.

// dotprod.cpp
#include "dotprod.h"

double SumProduct(const double* v1, const double* v2, int n)
{
    double sum=0;
    for(int i=0;
        i<n;
        ++i)
        sum += v1[i]*v2[i];
    return sum;
}

double SumProduct(const double* v1, const double* v2, int n1, int n2)
{
    double sum=0;
    for(int i=0;
        i<n1 && i <n2;
        ++i)
        sum += v1[i]*v2[i];
    return sum;
}

double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3)
{
    double sum=0;
    for(int i=0;
        i<n1 && i <n2 && i <n3;
        ++i)
        sum += v1[i]*v2[i];
    return sum;
}

This code was originally built using Visual Studio 2010, Release, Win32 (I've added the 'C' tag because the reasoning behind the difference in speed is not likely to be C++ specific, and may not be Windows specific). Can anyone explain it?

Rest of the code below, for information. This has some C++ specific stuff in it.

Header file

// dotprod.h
double SumProduct(const double*, const double*, int n);
double SumProduct(const double*, const double*, int n1, int n2);
double SumProduct(const double*, const double*, int n1, int n2, int n3);

Test harness

// main.cpp

#include <stdio.h>
#include <math.h>
#include <numeric>
#include <vector>

#include <windows.h>

#include "../dotprod/dotprod.h" // separate lib

typedef __int64 timecount_t;
inline timecount_t GetTimeCount()
{
    LARGE_INTEGER li;
    if (!QueryPerformanceCounter(&li)) {
        exit(1);
    }
    return li.QuadPart;
}

int main()
{
    typedef std::vector<double> dvec;
    const int N  = 100 * 1000;

    // Initialize
    dvec v1(N);
    dvec v2(N);
    dvec dp1(N);
    dvec dp2(N);
    dvec dp3(N);
    for(int i=0; i<N; ++i) {
        v1[i] = i;
        v2[i] = log(static_cast<double>(i+1));
    }

    const timecount_t t0 = GetTimeCount();

    // Check cost with one bound
    for(int n=0; n<N; ++n) {
        dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n); 
    }

    const timecount_t t1 = GetTimeCount();

    // Check cost with two bounds
    for(int n=0; n<N; ++n) {
        dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n); 
    }

    const timecount_t t2 = GetTimeCount();

    // Check cost with three bounds
    for(int n=0; n<N; ++n) {
        dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n); 
    }
    const timecount_t t3 = GetTimeCount();

    // Check results
    const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0);
    const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0);
    const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0);
    printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3);

    // Output timings
    const timecount_t elapsed1 = t1-t0;
    const timecount_t elapsed2 = t2-t1;
    const timecount_t elapsed3 = t3-t2;
    printf("Elapsed: %.0f, %.0f, %.0f\n",
        static_cast<double>(elapsed1),
        static_cast<double>(elapsed2),
        static_cast<double>(elapsed3));
    const double ratio2to1 = elapsed2 / static_cast<double>(elapsed1);
    const double ratio3to1 = elapsed3 / static_cast<double>(elapsed1);
    printf("Ratio 2:1=%.2f\n", ratio2to1);
    printf("Ratio 3:1=%.2f\n", ratio3to1);

    return 0;
}

In order to produce assembly, I took the advice in this answer (case 2, turning off whole program optimization), producing the following asm file.

; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01 

    TITLE   C:\dev\TestSpeed\dotprod\dotprod.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB OLDNAMES

PUBLIC  __real@0000000000000000
PUBLIC  ?SumProduct@@YANPBN0HHH@Z           ; SumProduct
EXTRN   __fltused:DWORD
;   COMDAT __real@0000000000000000
; File c:\dev\testspeed\dotprod\dotprod.cpp
CONST   SEGMENT
__real@0000000000000000 DQ 00000000000000000r   ; 0
; Function compile flags: /Ogtp
CONST   ENDS
;   COMDAT ?SumProduct@@YANPBN0HHH@Z
_TEXT   SEGMENT
tv491 = -4                      ; size = 4
_v1$ = 8                        ; size = 4
_v2$ = 12                       ; size = 4
_n1$ = 16                       ; size = 4
_n2$ = 20                       ; size = 4
_n3$ = 24                       ; size = 4
?SumProduct@@YANPBN0HHH@Z PROC              ; SumProduct, COMDAT

; 25   : {

    push    ebp
    mov ebp, esp
    push    ecx

; 26   :     double sum=0;

    fldz
    push    ebx
    mov ebx, DWORD PTR _v2$[ebp]
    push    esi
    push    edi
    mov edi, DWORD PTR _n1$[ebp]

; 27   :     for(int i=0;

    xor ecx, ecx

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edi, 4
    jl  $LC8@SumProduct

; 26   :     double sum=0;

    mov edi, DWORD PTR _v1$[ebp]
    lea esi, DWORD PTR [edi+24]

; 30   :         sum += v1[i]*v2[i];

    sub edi, ebx
    lea edx, DWORD PTR [ecx+2]
    lea eax, DWORD PTR [ebx+8]
    mov DWORD PTR tv491[ebp], edi
$LN15@SumProduct:

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    mov ebx, DWORD PTR _n2$[ebp]
    cmp ecx, ebx
    jge $LN9@SumProduct
    cmp ecx, DWORD PTR _n3$[ebp]
    jge $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax-8]
    lea edi, DWORD PTR [edx-1]
    fmul    QWORD PTR [esi-24]
    faddp   ST(1), ST(0)
    cmp edi, ebx
    jge SHORT $LN9@SumProduct

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edi, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    mov edi, DWORD PTR tv491[ebp]
    fld QWORD PTR [edi+eax]
    fmul    QWORD PTR [eax]
    faddp   ST(1), ST(0)
    cmp edx, ebx
    jge SHORT $LN9@SumProduct

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edx, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+8]
    lea edi, DWORD PTR [edx+1]
    fmul    QWORD PTR [esi-8]
    faddp   ST(1), ST(0)
    cmp edi, ebx
    jge SHORT $LN9@SumProduct

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp edi, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+16]
    mov edi, DWORD PTR _n1$[ebp]
    fmul    QWORD PTR [esi]
    add ecx, 4
    lea ebx, DWORD PTR [edi-3]
    add eax, 32                 ; 00000020H
    add esi, 32                 ; 00000020H
    faddp   ST(1), ST(0)
    add edx, 4
    cmp ecx, ebx
    jl  SHORT $LN15@SumProduct
    mov ebx, DWORD PTR _v2$[ebp]
$LC8@SumProduct:

; 28   :         i<n1 && i <n2 && i <n3;
; 29   :         ++i)

    cmp ecx, edi
    jge SHORT $LN9@SumProduct
    mov edx, DWORD PTR _v1$[ebp]
    lea eax, DWORD PTR [ebx+ecx*8]
    sub edx, ebx
$LC3@SumProduct:
    cmp ecx, DWORD PTR _n2$[ebp]
    jge SHORT $LN9@SumProduct
    cmp ecx, DWORD PTR _n3$[ebp]
    jge SHORT $LN9@SumProduct

; 30   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+edx]
    inc ecx
    fmul    QWORD PTR [eax]
    add eax, 8
    faddp   ST(1), ST(0)
    cmp ecx, edi
    jl  SHORT $LC3@SumProduct
$LN9@SumProduct:

; 31   :     return sum;
; 32   : }

    pop edi
    pop esi
    pop ebx
    mov esp, ebp
    pop ebp
    ret 0
?SumProduct@@YANPBN0HHH@Z ENDP              ; SumProduct
_TEXT   ENDS
PUBLIC  ?SumProduct@@YANPBN0HH@Z            ; SumProduct
; Function compile flags: /Ogtp
;   COMDAT ?SumProduct@@YANPBN0HH@Z
_TEXT   SEGMENT
tv448 = -4                      ; size = 4
_v1$ = 8                        ; size = 4
_v2$ = 12                       ; size = 4
_n1$ = 16                       ; size = 4
_n2$ = 20                       ; size = 4
?SumProduct@@YANPBN0HH@Z PROC               ; SumProduct, COMDAT

; 15   : {

    push    ebp
    mov ebp, esp
    push    ecx

; 16   :     double sum=0;

    fldz
    push    ebx
    mov ebx, DWORD PTR _v2$[ebp]
    push    esi
    push    edi
    mov edi, DWORD PTR _n1$[ebp]

; 17   :     for(int i=0;

    xor ecx, ecx

; 18   :         i<n1 && i <n2;
; 19   :         ++i)

    cmp edi, 4
    jl  SHORT $LC8@SumProduct@2

; 16   :     double sum=0;

    mov edi, DWORD PTR _v1$[ebp]
    lea edx, DWORD PTR [edi+24]

; 20   :         sum += v1[i]*v2[i];

    sub edi, ebx
    lea esi, DWORD PTR [ecx+2]
    lea eax, DWORD PTR [ebx+8]
    mov DWORD PTR tv448[ebp], edi
$LN19@SumProduct@2:
    mov edi, DWORD PTR _n2$[ebp]
    cmp ecx, edi
    jge SHORT $LN9@SumProduct@2
    fld QWORD PTR [eax-8]
    lea ebx, DWORD PTR [esi-1]
    fmul    QWORD PTR [edx-24]
    faddp   ST(1), ST(0)
    cmp ebx, edi
    jge SHORT $LN9@SumProduct@2
    mov ebx, DWORD PTR tv448[ebp]
    fld QWORD PTR [ebx+eax]
    fmul    QWORD PTR [eax]
    faddp   ST(1), ST(0)
    cmp esi, edi
    jge SHORT $LN9@SumProduct@2
    fld QWORD PTR [eax+8]
    lea ebx, DWORD PTR [esi+1]
    fmul    QWORD PTR [edx-8]
    faddp   ST(1), ST(0)
    cmp ebx, edi
    jge SHORT $LN9@SumProduct@2
    fld QWORD PTR [eax+16]
    mov edi, DWORD PTR _n1$[ebp]
    fmul    QWORD PTR [edx]
    add ecx, 4
    lea ebx, DWORD PTR [edi-3]
    add eax, 32                 ; 00000020H
    add edx, 32                 ; 00000020H
    faddp   ST(1), ST(0)
    add esi, 4
    cmp ecx, ebx
    jl  SHORT $LN19@SumProduct@2
    mov ebx, DWORD PTR _v2$[ebp]
$LC8@SumProduct@2:

; 18   :         i<n1 && i <n2;
; 19   :         ++i)

    cmp ecx, edi
    jge SHORT $LN9@SumProduct@2
    mov edx, DWORD PTR _v1$[ebp]
    lea eax, DWORD PTR [ebx+ecx*8]
    sub edx, ebx
$LC3@SumProduct@2:
    cmp ecx, DWORD PTR _n2$[ebp]
    jge SHORT $LN9@SumProduct@2

; 20   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+edx]
    inc ecx
    fmul    QWORD PTR [eax]
    add eax, 8
    faddp   ST(1), ST(0)
    cmp ecx, edi
    jl  SHORT $LC3@SumProduct@2
$LN9@SumProduct@2:

; 21   :     return sum;
; 22   : }

    pop edi
    pop esi
    pop ebx
    mov esp, ebp
    pop ebp
    ret 0
?SumProduct@@YANPBN0HH@Z ENDP               ; SumProduct
_TEXT   ENDS
PUBLIC  ?SumProduct@@YANPBN0H@Z             ; SumProduct
; Function compile flags: /Ogtp
;   COMDAT ?SumProduct@@YANPBN0H@Z
_TEXT   SEGMENT
_v1$ = 8                        ; size = 4
_v2$ = 12                       ; size = 4
?SumProduct@@YANPBN0H@Z PROC                ; SumProduct, COMDAT
; _n$ = eax

; 5    : {

    push    ebp
    mov ebp, esp
    mov edx, DWORD PTR _v2$[ebp]

; 6    :     double sum=0;

    fldz
    push    ebx
    push    esi
    mov esi, eax

; 7    :     for(int i=0;

    xor ebx, ebx
    push    edi
    mov edi, DWORD PTR _v1$[ebp]

; 8    :         i<n;
; 9    :         ++i)

    cmp esi, 4
    jl  SHORT $LC9@SumProduct@3

; 6    :     double sum=0;

    lea eax, DWORD PTR [edx+8]
    lea ecx, DWORD PTR [edi+24]

; 10   :         sum += v1[i]*v2[i];

    sub edi, edx
    lea edx, DWORD PTR [esi-4]
    shr edx, 2
    inc edx
    lea ebx, DWORD PTR [edx*4]
$LN10@SumProduct@3:
    fld QWORD PTR [eax-8]
    add eax, 32                 ; 00000020H
    fmul    QWORD PTR [ecx-24]
    add ecx, 32                 ; 00000020H
    dec edx
    faddp   ST(1), ST(0)
    fld QWORD PTR [edi+eax-32]
    fmul    QWORD PTR [eax-32]
    faddp   ST(1), ST(0)
    fld QWORD PTR [eax-24]
    fmul    QWORD PTR [ecx-40]
    faddp   ST(1), ST(0)
    fld QWORD PTR [eax-16]
    fmul    QWORD PTR [ecx-32]
    faddp   ST(1), ST(0)
    jne SHORT $LN10@SumProduct@3

; 6    :     double sum=0;

    mov edx, DWORD PTR _v2$[ebp]
    mov edi, DWORD PTR _v1$[ebp]
$LC9@SumProduct@3:

; 8    :         i<n;
; 9    :         ++i)

    cmp ebx, esi
    jge SHORT $LN8@SumProduct@3
    sub edi, edx
    lea eax, DWORD PTR [edx+ebx*8]
    sub esi, ebx
$LC3@SumProduct@3:

; 10   :         sum += v1[i]*v2[i];

    fld QWORD PTR [eax+edi]
    add eax, 8
    dec esi
    fmul    QWORD PTR [eax-8]
    faddp   ST(1), ST(0)
    jne SHORT $LC3@SumProduct@3
$LN8@SumProduct@3:

; 11   :     return sum;
; 12   : }

    pop edi
    pop esi
    pop ebx
    pop ebp
    ret 0
?SumProduct@@YANPBN0H@Z ENDP                ; SumProduct
_TEXT   ENDS
END

解决方案

One big difference between CPUs is the pipeline optimization

The CPU can execute in parallel several instructions until reaches a conditional branch. From this point instead of waiting until all the instructions are executed, the CPU can continue with a branch in parallel until the condition is available and ready to be evaluated. If the assumption was correct, then we have a gain. Otherwise the CPU will go with the other branch.

So the tricky part for a CPU is to find the best assumptions and to execute as many instructions in parallel as possible.

这篇关于为什么加入环路额外的检查让别人在某些机器上很大的区别,小差异?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆