加快速度 [英] speed it up

查看:100
本文介绍了加快速度的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述




我有2个C代码片段,它们产生相同的结果。然而在我的PC(x86)上A比B快2倍,但在我的PDA(StrongARM @ 206 MHz)上A却比B慢1.5倍。

/ /启动条件+类型

pSrc =新的无符号短[320 * 240];

pDst =新的无符号短[320 * 240];

注册unsigned short x,y,* ldst;

short xptch = 320,yptch = -1;

dst = pDst + 319;

src = pSrc;

// A:

(unsigned long *)pDisplay =(unsigned long *)dst;

for(x = 0; x <240; x ++)

{

for(y = 0; y <160; y ++)

{

* pDisplay ++ =(*(src-240)<< 16)| *(SRC); //处理4个字节

一次

src- = 480;

}

src + = 76801; //(320 * 240 + 1); //前排+ 320行下来

底部

}


// B:

for(y = 0; y< 320; y ++)

{

ldst = dst; //获取当前行地址

for(x = 0; x <240; x ++)

{

*(ldst)= * SRC ++; // src上的一个像素

ldst + = xptch; //在dest右边添加一个像素

}

dst + = yptch; //在dst缓冲区添加一行

}


有人可以向我解释。更好:如何让这个真的很快?

使用ASM?我需要一个ARM处理器的优化版本。

例子B显示它显然做了什么,我想。


谢谢你的建议,


-

-Gernot

int main(int argc,char ** argv){printf

( %silto%c%cf%cgl%ssic%ccom%c,ma,58,g,64,ba,46,10);}


________________________________________

寻找好游戏?亲自动手吧!

GLBasic - 你可以这样做
www.GLBasic .com

解决方案

Gernot Frisch写道:



我有2个C代码片段,可以产生相同的结果。然而A比我的PC(x86)上的B快2倍,但我的PDA慢了1.5倍(strongARM @
206MhZ)

//启动条件+类型
pSrc = new unsigned short [320 * 240];
pDst = new unsigned short [320 * 240];
注册unsigned short x,y,* ldst;
short xptch = 320, yptch = -1;
dst = pDst + 319;
src = pSrc;

// A:
(unsigned long *)pDisplay =(unsigned long * )dst;
for(x = 0; x <240; x ++)
{
for(y = 0; y< 160; y ++)
{
* pDisplay ++ =(*(src-240)<< 16)| *(SRC); //处理4个字节
一次
src- = 480;
}
src + = 76801; //(320 * 240 + 1); //前排+ 320线下方


// B:
for(y = 0; y< 320; y ++ )
{
ldst = dst; //获取当前行地址
(x = 0; x <240; x ++)
{
*(ldst)= * src ++; // src上的一个像素
ldst + = xptch; //在右侧添加一个像素
}
dst + = yptch; //在dst缓冲区添加一行


有人可以向我解释一下。更好:如何使这个快速?
使用ASM?我需要一个针对ARM处理器的优化版本。
示例B显示了它的作用,我认为。

谢谢你的建议,




看起来你正在进行{矩阵或位图}旋转,

但是我不确定。


无论如何,要针对ARM进行优化:ARM处理器喜欢展开的for循环和较少的分支(对大多数处理器来说可能都是如此)。ARM处理器有特殊的指令,可以一次从内存加载多个寄存器,也可以一次将多个寄存器存入内存。据我所知,这两条指令都要求连续的内存地址。因此我们可以使用加载指令,但不能使用存储指令。


让我们专注于算法B.

我将优化它分步。

B1:

/ *const修饰符将允许编译器更好地优化代码。

* /

const short xpitch = 320;

const short ypitch = -1;

const unsigned short * src = pSrc;

for(y = 0; y< 320; ++ y)

{

ldst = dst;

for(x = 0; x <60; ++ x)

{

* ldst = * src ++;

ldst + = xpitch;

* ldst = * src ++;

ldst + = xpitch;

* ldst = * src ++;

ldst + = xpitch;

* ldst = * src ++;

ldst + = xpitch;

}

dst + = ypitch;

}

上面的修改,内部循环展开

,这样就可以为每个

分支执行4次内存传输,而不是每个分支一次传输,如

你的原始代码。


B2:将内循环替换为:

for(x = 0; x <60; ++ x)
{

注册未签名的短s1,s2,s3,s4;

注册unsigne d int index = 0;

s1 = * src ++;

s2 = * src ++;

s3 = * src ++;

s4 = * src ++;

ldst [index] = s1;

index + = xpitch;

ldst [index] = s2 ;

指数+ = xpitch;

ldst [index] = s3;

指数+ = xpitch;

ldst [index] = s4;

index + = xpitch;

}

上面的循环告诉编译器有4个寄存器

一次被加载,然后被写入内存。

希望这会触发特殊指令。

"ldst[index]" 赋值告诉编译器使用以寄存器为索引的存储(store)指令。

你可以根据可用寄存器的数量进一步展开内循环。一个函数中至少要占用三个寄存器:程序计数器、返回地址和局部变量指针。因此,请以汇编语言输出该函数,看看还剩下多少个寄存器,然后再展开内循环。


如果你告诉我们算法在做什么,也许

我们可以为处理器建议一种更优化的方法。


-

Thomas Matthews


C ++新闻组欢迎辞:
http://www.slack.net/~shiva/welcome.txt

C ++常见问题: http://www.parashift.com/c++-faq-lite

C常见问题: http://www.eskimo.com/~scs/ c-faq / top.html

alt.comp.lang.learn.c-c ++ faq:
http://www.raos.demon.uk/acllc-c++/faq.html

其他网站:
http://www.josuttis.com - C++ STL 库书籍


>看起来你正在执行{矩阵或位图}旋转,

但我不确定。


是的,是的。我在这里将我的后台缓冲区转移到显示内存。

(PDA设备)。

B2:用以下内容替换内环:
= 0; x <60; ++ x)
{
注册无符号短s1,s2,s3,s4;
注册unsigned int index = 0;
s1 = * src ++;
s2 = * src ++;
s3 = * src ++;
s4 = * src ++;
ldst [index] = s1;
index + = xpitch;
ldst [index] = s2;
index + = xpitch;
ldst [index] = s3;
index + = xpitch;
ldst [index] = s4;
index + = xpitch;
}




<klonk>(这是我下巴掉下来的声音)你让它快了将近5倍!难以置信。


非常感谢你,

Gernot




但是现在它执行时出错了。除了拼写错误 x(i)ptch 之外,我找不到任何错误。

这里真的很奇怪...

尝试定义/取消定义 FAST。

你能找出哪里不一样吗!?

-Gernot


#define FAST

/*
 * Self-check for the rotating copy discussed in this thread.
 *
 * Fills a 320*240-element source buffer with a test pattern, copies it into
 * pDst2 with the plain one-element-per-iteration loop ("old method") and into
 * pDst with the loop selected by FAST ("new method"), then prints the number
 * of destination elements that differ between the two buffers.
 */
void Do()
{
    unsigned short *src, *dst = NULL, *pDst, *pDst2, *pSrc;

    pSrc = new unsigned short[320*240];
    pDst = new unsigned short[320*240];
    pDst2 = new unsigned short[320*240];

    long n;
    /* Build the pattern in unsigned arithmetic: n*n exceeds LONG_MAX when
     * long is 32 bits wide, and signed overflow is undefined.  Only the low
     * 16 bits are stored, so the wrapped unsigned product gives the same
     * values. */
    for (n = 0; n < 320*240; n++)
        pSrc[n] = (unsigned short)(21ul + (unsigned long)n * (unsigned long)n);
    memset(pDst, 0, 320*240*sizeof *pDst);
    memset(pDst2, 0, 320*240*sizeof *pDst2);

    unsigned short x, y, *ldst;
    /* Presumably byte pitches of the destination; halved below to get
     * steps in unsigned short elements.  Division is used instead of >>
     * because right-shifting a negative value is implementation-defined. */
    short xpitch = 640;
    short ypitch = -2;
    short xptch = xpitch / 2, yptch = ypitch / 2;

    dst = pDst2 + 319;
    src = pSrc;
    /* Old method: reference result, one element per inner iteration. */
    for (y = 0; y < 320; y++) {
        ldst = dst;                 /* start address for this source row */
        for (x = 0; x < 240; x++) {
            *ldst = *src++;         /* next source element */
            ldst += xptch;          /* step by xptch elements in dest */
        }
        dst += yptch;               /* step by yptch elements for next row */
    }

    dst = pDst + 319;
    src = pSrc;
    /* New method */
    for (y = 0; y < 320; y++) {
        ldst = dst;                 /* start address for this source row */
#ifndef FAST
        for (x = 0; x < 240; x++) {
            *ldst = *src++;
            ldst += xptch;
        }
#else
        /* index must keep accumulating across all 60 passes.  It used to be
         * declared (and reset to 0) inside the loop while ldst never moved,
         * so every pass overwrote the same four destination elements. */
        unsigned int index = 0;
        for (x = 0; x < 60; x++) {
            unsigned short s1, s2, s3, s4;

            /* Four sequential loads first, then four strided stores. */
            s1 = *src++;
            s2 = *src++;
            s3 = *src++;
            s4 = *src++;
            ldst[index] = s1;
            index += xptch;
            ldst[index] = s2;
            index += xptch;
            ldst[index] = s3;
            index += xptch;
            ldst[index] = s4;
            index += xptch;
        }
#endif
        dst += yptch;               /* step by yptch elements for next row */
    }

    /* Count the elements where the two methods disagree. */
    long q = 0;
    for (n = 0; n < 320*240; n++) {
        if (pDst[n] != pDst2[n])
            q++;
    }
    printf("错误:%ld\n", q);        /* q is long, so %ld rather than %d */

    delete[] pSrc;
    delete[] pDst;
    delete[] pDst2;
}


Hi,

I have 2 C code snippets that produce the same result. However, A is 2x
faster than B on my PC (x86) but 1.5x slower on my PDA (StrongARM @
206 MHz).
// Startup conditions + types
pSrc = new unsigned short[320*240];
pDst = new unsigned short[320*240];
register unsigned short x, y, *ldst;
short xptch = 320, yptch = -1;
dst = pDst + 319;
src = pSrc;
// A:
(unsigned long*) pDisplay = (unsigned long*)dst;
for(x=0; x<240; x++)
{
for(y=0; y<160; y++)
{
*pDisplay++ = (*(src-240)<<16) | *(src); // Process 4 bytes at
once
src-=480;
}
src+=76801; // (320*240+1); // Get a row ahead+320 lines down to
the bottom
}

// B:
for (y = 0; y < 320; y++ )
{
ldst = dst; // Get current line address
for (x = 0; x < 240; x++ )
{
*(ldst) = *src++; // one pixel right on src
ldst += xptch; // add a pixel to the right on dest
}
dst += yptch; // add a line to dst buffer
}

Can someone explain this to me? And better: how can I make this really fast?
Using ASM? I need an optimized version for an ARM processor.
Example B shows most clearly what it does, I think.

Thank you in advance,

--
-Gernot
int main(int argc, char** argv) {printf
("%silto%c%cf%cgl%ssic%ccom%c", "ma", 58, ''g'', 64, "ba", 46, 10);}

________________________________________
Looking for a good game? Do it yourself!
GLBasic - you can do
www.GLBasic.com

解决方案

Gernot Frisch wrote:

Hi,

I have 2 C code snippets that produce the same result. However, A is 2x
faster than B on my PC (x86) but 1.5x slower on my PDA (StrongARM @
206 MHz).
// Startup conditions + types
pSrc = new unsigned short[320*240];
pDst = new unsigned short[320*240];
register unsigned short x, y, *ldst;
short xptch = 320, yptch = -1;
dst = pDst + 319;
src = pSrc;
// A:
(unsigned long*) pDisplay = (unsigned long*)dst;
for(x=0; x<240; x++)
{
for(y=0; y<160; y++)
{
*pDisplay++ = (*(src-240)<<16) | *(src); // Process 4 bytes at
once
src-=480;
}
src+=76801; // (320*240+1); // Get a row ahead+320 lines down to
the bottom
}

// B:
for (y = 0; y < 320; y++ )
{
ldst = dst; // Get current line address
for (x = 0; x < 240; x++ )
{
*(ldst) = *src++; // one pixel right on src
ldst += xptch; // add a pixel to the right on dest
}
dst += yptch; // add a line to dst buffer
}

Can someone explain this to me? And better: how can I make this really fast?
Using ASM? I need an optimized version for an ARM processor.
Example B shows most clearly what it does, I think.

Thank you in advance,



Looks like you are performing a {matrix or bitmap} rotation,
but I'm not sure.

Anyway, to optimize for the ARM: the ARM processor likes unrolled
for-loops and a reduced number of branches (which is probably true for
most processors). The ARM processor has special instructions that
can load many registers at once from memory and store many registers
to memory at once. My information is that both instructions require
sequential memory locations. Thus we can use the load but not the
store.

Let us concentrate on algorithm B.
I will optimize it in steps.
B1:
/* The "const" modifiers will allow the compiler to better
* optimize the code.
*/
const short xpitch = 320;
const short ypitch = -1;
const unsigned short * src = pSrc;
for (y = 0; y < 320; ++y)
{
ldst = dst;
for (x = 0; x < 60; ++x)
{
*ldst = *src++;
ldst += xpitch;
*ldst = *src++;
ldst += xpitch;
*ldst = *src++;
ldst += xpitch;
*ldst = *src++;
ldst += xpitch;
}
dst += ypitch;
}
In the above modification, the inner loop is unrolled
so that 4 memory transfers are performed for each
branch, rather than one transfer per branch as in
your original code.

B2: Replace inner loop with:
for (x = 0; x < 60; ++x)
{
register unsigned short s1, s2, s3, s4;
register unsigned int index = 0;
s1 = *src++;
s2 = *src++;
s3 = *src++;
s4 = *src++;
ldst[index] = s1;
index += xpitch;
ldst[index] = s2;
index += xpitch;
ldst[index] = s3;
index += xpitch;
ldst[index] = s4;
index += xpitch;
}
The above loop tells the compiler that 4 registers
are being loaded at once, then written to memory.
Hopefully this will trigger that special instruction.
The "ldst[index]" assignment is telling the compiler
to use a store at location indexed by register instruction.
You can expand or rollout the inner loop more by the
number of registers available. There are a minimum
of three variables used in a function: program counter,
return address, and local variable pointer. So print
the function in assembly language and see how many
registers are left, then expand the inner loop.

If you tell us what the algorithms are doing, perhaps
we can suggest a more optimal method for the processors.

--
Thomas Matthews

C++ newsgroup welcome message:
http://www.slack.net/~shiva/welcome.txt
C++ Faq: http://www.parashift.com/c++-faq-lite
C Faq: http://www.eskimo.com/~scs/c-faq/top.html
alt.comp.lang.learn.c-c++ faq:
http://www.raos.demon.uk/acllc-c++/faq.html
Other sites:
http://www.josuttis.com -- C++ STL Library book


> Looks like you are performing a {matrix or bitmap} rotation,
> but I'm not sure.

Yes, right. I'm transferring my back buffer to the display memory here
(PDA device).
B2: Replace inner loop with:
for (x = 0; x < 60; ++x)
{
register unsigned short s1, s2, s3, s4;
register unsigned int index = 0;
s1 = *src++;
s2 = *src++;
s3 = *src++;
s4 = *src++;
ldst[index] = s1;
index += xpitch;
ldst[index] = s2;
index += xpitch;
ldst[index] = s3;
index += xpitch;
ldst[index] = s4;
index += xpitch;
}



<klonk> (That was my jaw dropping.) You made it almost 5x faster! Unbelievable.

Thank you so much,
Gernot



But now it's got errors in execution. I can't find any error (other
than the typo x(i)ptch).
Really strange here...
Try defining/undefining FAST.
Can you find what's different!?
-Gernot

#define FAST
/*
 * Self-check for the rotating copy discussed in this thread.
 *
 * Fills a 320*240-element source buffer with a test pattern, copies it into
 * pDst2 with the plain one-element-per-iteration loop ("old method") and into
 * pDst with the loop selected by FAST ("new method"), then prints the number
 * of destination elements that differ between the two buffers.
 */
void Do()
{
    unsigned short *src, *dst = NULL, *pDst, *pDst2, *pSrc;

    pSrc = new unsigned short[320*240];
    pDst = new unsigned short[320*240];
    pDst2 = new unsigned short[320*240];

    long n;
    /* Build the pattern in unsigned arithmetic: n*n exceeds LONG_MAX when
     * long is 32 bits wide, and signed overflow is undefined.  Only the low
     * 16 bits are stored, so the wrapped unsigned product gives the same
     * values. */
    for (n = 0; n < 320*240; n++)
        pSrc[n] = (unsigned short)(21ul + (unsigned long)n * (unsigned long)n);
    memset(pDst, 0, 320*240*sizeof *pDst);
    memset(pDst2, 0, 320*240*sizeof *pDst2);

    unsigned short x, y, *ldst;
    /* Presumably byte pitches of the destination; halved below to get
     * steps in unsigned short elements.  Division is used instead of >>
     * because right-shifting a negative value is implementation-defined. */
    short xpitch = 640;
    short ypitch = -2;
    short xptch = xpitch / 2, yptch = ypitch / 2;

    dst = pDst2 + 319;
    src = pSrc;
    /* Old method: reference result, one element per inner iteration. */
    for (y = 0; y < 320; y++) {
        ldst = dst;                 /* start address for this source row */
        for (x = 0; x < 240; x++) {
            *ldst = *src++;         /* next source element */
            ldst += xptch;          /* step by xptch elements in dest */
        }
        dst += yptch;               /* step by yptch elements for next row */
    }

    dst = pDst + 319;
    src = pSrc;
    /* New method */
    for (y = 0; y < 320; y++) {
        ldst = dst;                 /* start address for this source row */
#ifndef FAST
        for (x = 0; x < 240; x++) {
            *ldst = *src++;
            ldst += xptch;
        }
#else
        /* index must keep accumulating across all 60 passes.  It used to be
         * declared (and reset to 0) inside the loop while ldst never moved,
         * so every pass overwrote the same four destination elements. */
        unsigned int index = 0;
        for (x = 0; x < 60; x++) {
            unsigned short s1, s2, s3, s4;

            /* Four sequential loads first, then four strided stores. */
            s1 = *src++;
            s2 = *src++;
            s3 = *src++;
            s4 = *src++;
            ldst[index] = s1;
            index += xptch;
            ldst[index] = s2;
            index += xptch;
            ldst[index] = s3;
            index += xptch;
            ldst[index] = s4;
            index += xptch;
        }
#endif
        dst += yptch;               /* step by yptch elements for next row */
    }

    /* Count the elements where the two methods disagree. */
    long q = 0;
    for (n = 0; n < 320*240; n++) {
        if (pDst[n] != pDst2[n])
            q++;
    }
    printf("Errors: %ld\n", q);     /* q is long, so %ld rather than %d */

    delete[] pSrc;
    delete[] pDst;
    delete[] pDst2;
}


这篇关于加快速度的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆