How can adding code to a loop make it faster?

Problem description

I have a simple function with an inner loop - it scales the input value, looks up an output value in a lookup table, and copies it to the destination. (ftol_ambient is a trick I copied from the web for fast conversion of float to int).

for (i = 0;  i < iCount;  ++i)
{
    iScaled = ftol_ambient(*pSource * PRECISION3);
    if (iScaled <= 0)
        *pDestination = 0;
    else if (iScaled >= PRECISION3)
        *pDestination = 255;
    else
    {
        iSRGB = FloatToSRGBTable3[iScaled];
        *pDestination = iSRGB;
    }
    pSource++;
    pDestination++;
}
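
ftol_ambient itself is not shown above. For context, a typical "convert with the ambient rounding mode" helper for 32-bit MSVC (a sketch of what it might look like, judging from the fld/fistp sequence in the assembly further down, not necessarily the exact code used here) is:

static __inline int ftol_ambient(double d)
{
    /* Sketch only: converts using the FPU's current ("ambient") rounding
       mode via fistp, avoiding the slow truncating _ftol call that a plain
       (int) cast generates on this compiler. */
    int result;
    __asm {
        fld   d        ; push the value onto the FPU stack
        fistp result   ; pop it as a rounded 32-bit integer
    }
    return result;
}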

Now my lookup table is finite, and floats are infinite, so there's a possibility of off-by-one errors. I created a copy of the function with some code to handle that case. Notice that the only difference is the added 2 lines of code - please ignore the ugly pointer casting.

for (i = 0;  i < iCount;  ++i)
{
    iScaled = ftol_ambient(*pSource * PRECISION3);
    if (iScaled <= 0)
        *pDestination = 0;
    else if (iScaled >= PRECISION3)
        *pDestination = 255;
    else
    {
        iSRGB = FloatToSRGBTable3[iScaled];
        if (((int *)SRGBCeiling)[iSRGB] <= *((int *)pSource))
            ++iSRGB;
        *pDestination = (unsigned char) iSRGB;
    }
    pSource++;
    pDestination++;
}
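
The cast-heavy comparison works here because, for non-negative IEEE-754 floats, the raw 32-bit patterns order the same way as the values themselves, so an integer compare can stand in for a floating-point compare (presumably the point of reading SRGBCeiling's entries and *pSource as ints). A tiny self-contained check of that property, my own illustration rather than code from the function:

#include <assert.h>
#include <string.h>

static int float_bits(float f)
{
    int bits;
    memcpy(&bits, &f, sizeof bits);   /* reinterpret the 32-bit pattern */
    return bits;
}

int main(void)
{
    float a = 0.25f, b = 0.50f;
    /* For non-negative floats, bit-pattern order matches value order. */
    assert((a < b) == (float_bits(a) < float_bits(b)));
    return 0;
}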

Here's the strange part. I'm testing both versions with identical input of 100000 elements, repeated 100 times. On my Athlon 64 1.8 GHz (32 bit mode), the first function takes 0.231 seconds, and the second (longer) function takes 0.185 seconds. Both functions are adjacent in the same source file, so there's no possibility of different compiler settings. I've run the tests many times, reversing the order they're run in, and the timings are roughly the same every time.
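
For reference, a minimal sketch of the timing set-up described above (assumed; the actual test code isn't shown), with convert_simple and convert_with_ceiling as hypothetical names standing in for the two loops:

#include <time.h>

#define ELEMENTS 100000
#define REPS     100

/* hypothetical wrappers around the two versions of the loop */
void convert_simple(const float *src, unsigned char *dst, int count);
void convert_with_ceiling(const float *src, unsigned char *dst, int count);

static double time_one(void (*fn)(const float *, unsigned char *, int),
                       const float *src, unsigned char *dst)
{
    clock_t start = clock();
    int r;
    for (r = 0; r < REPS; ++r)
        fn(src, dst, ELEMENTS);
    return (double)(clock() - start) / CLOCKS_PER_SEC;
}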

I know there's a lot of mystery in modern processors, but how is this possible?

Here for comparison are the relevant assembler outputs from the Microsoft VC++6 compiler.


; 173  : 	for (i = 0;  i < iCount;  ++i)

$L4455:

; 174  : 	{
; 175  : 		iScaled = ftol_ambient(*pSource * PRECISION3);

	fld	DWORD PTR [esi]
	fmul	DWORD PTR __real@4@400b8000000000000000
	fstp	QWORD PTR $T5011[ebp]

; 170  : 	int i;
; 171  : 	int iScaled;
; 172  : 	unsigned int iSRGB;

	fld	QWORD PTR $T5011[ebp]

; 173  : 	for (i = 0;  i < iCount;  ++i)

	fistp	DWORD PTR _i$5009[ebp]

; 176  : 		if (iScaled <= 0)

	mov	edx, DWORD PTR _i$5009[ebp]
	test	edx, edx
	jg	SHORT $L4458

; 177  : 			*pDestination = 0;

	mov	BYTE PTR [ecx], 0

; 178  : 		else if (iScaled >= PRECISION3)

	jmp	SHORT $L4461
$L4458:
	cmp	edx, 4096				; 00001000H
	jl	SHORT $L4460

; 179  : 			*pDestination = 255;

	mov	BYTE PTR [ecx], 255			; 000000ffH

; 180  : 		else

	jmp	SHORT $L4461
$L4460:

; 181  : 		{
; 182  : 			iSRGB = FloatToSRGBTable3[iScaled];
; 183  : 			*pDestination = (unsigned char) iSRGB;

	mov	dl, BYTE PTR _FloatToSRGBTable3[edx]
	mov	BYTE PTR [ecx], dl
$L4461:

; 184  : 		}
; 185  : 		pSource++;

	add	esi, 4

; 186  : 		pDestination++;

	inc	ecx
	dec	edi
	jne	SHORT $L4455


$L4472:

; 199  : 	{
; 200  : 		iScaled = ftol_ambient(*pSource * PRECISION3);

	fld	DWORD PTR [esi]
	fmul	DWORD PTR __real@4@400b8000000000000000
	fstp	QWORD PTR $T4865[ebp]

; 195  : 	int i;
; 196  : 	int iScaled;
; 197  : 	unsigned int iSRGB;

	fld	QWORD PTR $T4865[ebp]

; 198  : 	for (i = 0;  i < iCount;  ++i)

	fistp	DWORD PTR _i$4863[ebp]

; 201  : 		if (iScaled <= 0)

	mov	edx, DWORD PTR _i$4863[ebp]
	test	edx, edx
	jg	SHORT $L4475

; 202  : 			*pDestination = 0;

	mov	BYTE PTR [edi], 0

; 203  : 		else if (iScaled >= PRECISION3)

	jmp	SHORT $L4478
$L4475:
	cmp	edx, 4096				; 00001000H
	jl	SHORT $L4477

; 204  : 			*pDestination = 255;

	mov	BYTE PTR [edi], 255			; 000000ffH

; 205  : 		else

	jmp	SHORT $L4478
$L4477:

; 206  : 		{
; 207  : 			iSRGB = FloatToSRGBTable3[iScaled];

	xor	ecx, ecx
	mov	cl, BYTE PTR _FloatToSRGBTable3[edx]

; 208  : 			if (((int *)SRGBCeiling)[iSRGB] <= *((int *)pSource))

	mov	edx, DWORD PTR _SRGBCeiling[ecx*4]
	cmp	edx, DWORD PTR [esi]
	jg	SHORT $L4481

; 209  : 				++iSRGB;

	inc	ecx
$L4481:

; 210  : 			*pDestination = (unsigned char) iSRGB;

	mov	BYTE PTR [edi], cl
$L4478:

; 211  : 		}
; 212  : 		pSource++;

	add	esi, 4

; 213  : 		pDestination++;

	inc	edi
	dec	eax
	jne	SHORT $L4472


Edit: To test Nils Pipenbrinck's hypothesis, I added a couple of lines before and inside the loop of the first function:

int one = 1;
int two = 2;

        if (one == two)
            ++iSRGB;

The run time of the first function is now down to 0.152 seconds. Interesting.


Edit 2: Nils pointed out that the comparison would be optimized out of a release build, and indeed it is. The changes in the assembly code are very subtle; I'll post it here to see if it provides any clues. At this point I'm wondering whether it's a code-alignment issue.

; 175  : 	for (i = 0;  i < iCount;  ++i)

$L4457:

; 176  : 	{
; 177  : 		iScaled = ftol_ambient(*pSource * PRECISION3);

	fld	DWORD PTR [edi]
	fmul	DWORD PTR __real@4@400b8000000000000000
	fstp	QWORD PTR $T5014[ebp]

; 170  : 	int i;
; 171  : 	int iScaled;
; 172  : 	int one = 1;

	fld	QWORD PTR $T5014[ebp]

; 173  : 	int two = 2;

	fistp	DWORD PTR _i$5012[ebp]

; 178  : 		if (iScaled <= 0)

	mov	esi, DWORD PTR _i$5012[ebp]
	test	esi, esi
	jg	SHORT $L4460

; 179  : 			*pDestination = 0;

	mov	BYTE PTR [edx], 0

; 180  : 		else if (iScaled >= PRECISION3)

	jmp	SHORT $L4463
$L4460:
	cmp	esi, 4096				; 00001000H
	jl	SHORT $L4462

; 181  : 			*pDestination = 255;

	mov	BYTE PTR [edx], 255			; 000000ffH

; 182  : 		else

	jmp	SHORT $L4463
$L4462:

; 183  : 		{
; 184  : 			iSRGB = FloatToSRGBTable3[iScaled];

	xor	ecx, ecx
	mov	cl, BYTE PTR _FloatToSRGBTable3[esi]

; 185  : 			if (one == two)
; 186  : 				++iSRGB;
; 187  : 			*pDestination = (unsigned char) iSRGB;

	mov	BYTE PTR [edx], cl
$L4463:

; 188  : 		}
; 189  : 		pSource++;

	add	edi, 4

; 190  : 		pDestination++;

	inc	edx
	dec	eax
	jne	SHORT $L4457

Solution

My guess is that in the first case two different branches end up in the same branch-prediction slot on the CPU. If these two branches are predicted differently each time, the code will slow down.
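
A quick way to see how much a poorly predicted branch can cost, independent of this particular code (my own illustration, not taken from the question): the loop below does the same work over the same values, but runs noticeably faster once the data is sorted, because the branch outcome becomes predictable.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N    100000
#define REPS 100

static int cmp_int(const void *a, const void *b)
{
    return *(const int *)a - *(const int *)b;
}

static double time_sum(const int *data, long *sum_out)
{
    long sum = 0;
    int r, i;
    clock_t start = clock();
    for (r = 0; r < REPS; ++r)
        for (i = 0; i < N; ++i)
            if (data[i] >= 128)      /* this branch is the whole point */
                sum += data[i];
    *sum_out = sum;
    return (double)(clock() - start) / CLOCKS_PER_SEC;
}

int main(void)
{
    static int data[N];
    long sum;
    int i;
    for (i = 0; i < N; ++i)
        data[i] = rand() % 256;

    printf("unsorted: %.3f s\n", time_sum(data, &sum));
    qsort(data, N, sizeof data[0], cmp_int);
    printf("sorted:   %.3f s\n", time_sum(data, &sum));
    return 0;
}

(An optimizing compiler may turn the inner branch into a conditional move, which hides the effect; build without optimization if you want to see the branch cost itself.)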

In the second loop, the added code may just be enough to move one of the branches to a different branch prediction slot.

To be sure, you can give the Intel VTune analyzer or the AMD CodeAnalyst tool a try. These tools will show you exactly what's going on in your code.

However, keep in mind that it's most probably not worth optimizing this code further. If you tune your code to be faster on your CPU, it may at the same time become slower on a different brand.


EDIT:

If you want to read up on branch prediction, give Agner Fog's excellent website a try: http://www.agner.org/optimize/

This PDF explains branch-prediction slot allocation in detail: http://www.agner.org/optimize/microarchitecture.pdf
