帮助我改进一些 SSE2 代码 [英] Help me improve some more SSE2 code
问题描述
我正在寻求帮助,希望改进这段在酷睿2(Core 2)CPU 上运行的双线性缩放 SSE2 代码。
在我的 Atom N270 和 i7 上,这段代码比 MMX 版本快约 2 倍;但在酷睿2处理器上,它的速度仅与 MMX 版本持平。
代码如下
无效ConversionProcess :: convert_SSE2(BBitmap *从,BBitmap *为)
{
UINT32 fromBPR,toBPR,fromBPRDIV4,X,Y,年,XR; ULLint开始= RDTSC();
ULLint停止;
如果(由&放大器;&放大器;向){
UINT32宽度,高度;
宽度=从 - 方式>界()IntegerWidth()+ 1;
高度=从 - 方式>界()IntegerHeight()+ 1; UINT32 toWidth,toHeight;
toWidth =用于─方式>界()IntegerWidth()+ 1;
toHeight =用于─方式>界()IntegerHeight()+ 1; fromBPR =从 - > BytesPerRow();
fromBPRDIV4 = fromBPR>> 2;
toBPR =&用于─GT; BytesPerRow(); UINT32 x_ratio =((宽-1) - ; 7;)/ toWidth;
UINT32 y_ratio =((高度-1) - ; 7;)/ toHeight; UINT8 * toPtr =(UINT8 *)款拟>位();
UINT8 * fromPtr1 =(UINT8 *)从 - >位();
UINT8 * fromPtr2 =(UINT8 *)从 - >位()+ fromBPR; 结构FilterInfo {
UINT16 one_minus_diff; //一减差异
UINT16差异; //使用差异值来计算用于平均像素的权重
UINT16 one_minus_diff_rep; //一减重复差异
UINT16 diff_rep; //使用差异值来计算用于重复平均像素的权重
}; FilterInfo * xWeights =(FilterInfo *)memalign可(16,toWidth * 8);
FilterInfo * yWeights =(FilterInfo *)memalign可(16,toHeight * 8);
UINT32 * xIndexes =(UINT32 *)memalign可(16,(toWidth + 2)* 4); //将2指数overread
UINT32 * yIndexes =(UINT32 *)memalign可(16,toHeight * 4); X = 0;
对于(UINT32 J = 0; J< toWidth; J ++){
XR = X>> 7;
xWeights [J] .diff作= X - (XR<< 7);
xWeights [J] .one_minus_diff = 127 - xWeights [J] .diff作;
xWeights [J] .one_minus_diff_rep = xWeights [J] .one_minus_diff;
xWeights [J] .diff_rep = xWeights [J] .diff作;
xIndexes [J] = XR<< 2; X + = x_ratio;
} Y = 0;
对于(UINT32 J = 0; J< toHeight; J ++){
年= Y>> 7;
yWeights [J] .diff作= Y - (年<< 7);
yWeights [J] .one_minus_diff = 127 - yWeights [J] .diff作;
yIndexes [J] =(年* fromBPR);
Y + = y_ratio; } 对于(UINT32我= 0; I< toHeight;我++){
_ScaleSSE2X2(toPtr,fromPtr1 + yIndexes [I],fromPtr2 + yIndexes [Ⅰ],xIndexes,xWeights,&放大器; yWeights [I],toWidth);
toPtr + = toBPR;
} 免费(xWeights);
免费(yWeights);
免费(xIndexes);
免费(yIndexes); 停止= RDTSC() - 启动;
如果(STOP< timeTaken){
timeTaken =停止;
}
}
};
;版权所有(C)2011戴维McPaul
;
;版权所有。根据MIT许可证的条款分发。
;;一个颇为未优化双线性缩放器%宏cglobal 1
全球_ 1%
定义%1%_%1
16对齐
%1:
%endmacro段.data ALIGN = 16RGB_AND分贝为0xFF
分贝为0x00
分贝为0x00
分贝为0x00
DB 0xFF的
分贝为0x00
分贝为0x00
分贝为0x00
DB 0xFF的
分贝为0x00
分贝为0x00
分贝为0x00
DB 0xFF的
分贝为0x00
分贝为0x00
分贝为0x00;无效_ScaleSSE2X2(无效* toPtr,无效* fromPtr1,无效* fromPtr2,无效* xIndexPtr,无效* xWeightPtr,无效* yWeightPtr,UINT32长度);长EQU EBP + 32
yWeightPtr EQU EBP + 28
xWeightPtr EQU EBP + 24
xIndexPtr EQU EBP + 20
fromPtr2 EQU EBP + 16
fromPtr1 EQU EBP + 12
toPtr EQU EBP + 8.text段ALIGN = 16
cglobal ScaleSSE2X2
;保留寄存器。 EAX,ECX,EDX自动提供
推EBP
MOV EBP,ESP
推EBX; yWeights,xIndexPtr
推动电子数据交换;刮
推ESI; fromPtr3 MOV ESI,[fromPtr1]
MOV EDX,[fromPtr2]
MOV EAX,[xWeightPtr]
MOV EBX,[yWeightPtr]
MOV ECX,[长度]; Ÿ计算权重和缓存
MOVD XMM7,[EBX]获得1 yDiff和yDiff
pshuflw XMM7,XMM7,01010000b; 1-yDiff,1- yDiff,yDiff,yDiff
pshufd XMM7,XMM7,01000100b;重复 MOV EBX,[xIndexPtr] 推EBP;重用toPtr框架PTR
MOV EBP,[toPtr]不能使用参数裁判了 SHR ECX,1 ;首先计算指数
MOV EDI,[EBX]指数 16对齐
REPEATLOOPX2: ;装载的第一和第二组加权成XMM3
MOVDQA XMM3,[EAX];获得1 xDiff,xDiff,1- xDiff,xDiff
pmullw XMM3,XMM7;计算F1,F2,F3,F4(2)
添加EAX,16 ;装载第一套源像素的
MOVQ XMM0,[ESI + EDI] XMM0 = fromPtr1 +指数| fromPtr1 +指数+ 4
MOVQ将xmm1,[EDX + EDI]将xmm1 = fromPtr2 +指数| fromPtr2 +指数+ 4
punpcklqdq XMM0,xmm1中;所有4个像素合并成XMM0 子EDI,[EBX + 4];如果X指数是一样的,然后跳过第二个负载
JZ SKIP ;计算第二个索引
MOV EDI,[EBX + 4];指数 ;加载第二组来源像素
MOVQ XMM4,[ESI + EDI] XMM4 = fromPtr1 +指数| fromPtr1 +指数+ 4
MOVQ xmm5,[EDX + EDI] xmm5 = fromPtr2 +指数| fromPtr2 +指数+ 4
punpcklqdq XMM4,xmm5;所有4个像素合并成XMM4 MOVDQA将xmm1,XMM0;副本XMM1,XMM2
pshufd XMM2,XMM0,0xE4
MOVDQA xmm5,XMM4;副本XMM1,XMM2
pshufd xmm6,XMM4,0xE4 JMP NEXT
16对齐
跳跃:
MOVDQA将xmm1,XMM0;副本XMM1,XMM2
pshufd XMM2,XMM0,0xE4
MOVDQA XMM4,XMM0;复制第一像素设置XMM0第二像素集XMM4
pshufd xmm5,XMM4,0xE4;副本XMM4,xmm6
MOVDQA xmm6,XMM4
下一个:
; prefetchnta [EDX + EDI + 16] 添加EBX,8;使用彩色计算DEST RGB值= A * F1 + B * F2 + C * F3 + D * F4;来自两个像素组提取b和结合成一个单一章
PAND XMM0 [RGB_AND]清除所有但的R值留下B000
PAND XMM4,[RGB_AND]清除所有但的R值留下B000
PACKSSDW XMM0,XMM4;包下降到16位值 MOVDQA XMM4,[RGB_AND] XMM4现在是免费的
PMADDWD XMM0,XMM3;乘法和加法获得=的temp1一个* F1 + B * F2,TEMP2 = C * F3 + D * F4;提取摹
psrld将xmm1,8;旋转克至低字节
PAND将xmm1,XMM4;提取Tg值G000
psrld xmm5,8;旋转克至低字节
PAND xmm5,XMM4;提取Tg值G000
PACKSSDW将xmm1,xmm5;包下降到16位值 PMADDWD将xmm1,XMM3;相乘并加;提取物 - [R
psrld XMM2,16;旋转b,来低字节
PAND XMM2,XMM4;提取物B值B000
psrld xmm6,16;旋转b,来低字节
PAND xmm6,XMM4;提取物B值B000
PACKSSDW XMM2,xmm6;包下降到16位值 PMADDWD XMM2,XMM3;相乘并加;添加和的temp1 TEMP2留给我们RRRR XXXX XXXX RRRR
psrld XMM0,14;缩减到范围
pshufd XMM3,XMM0,00110001b;提取TEMP2
paddd XMM0,XMM3;添加回temp1目录 psrld将xmm1,14;缩减到范围
pshufd XMM3,将xmm1,00110001b
paddd将xmm1,XMM3;加 psrld XMM2,14;缩减到范围
pshufd XMM3,XMM2,00110001b
paddd XMM2,XMM3;加;重组到2 RGBA值 PSLLD将xmm1,8
陈健波XMM0,xmm1中
PSLLD XMM2,16
陈健波XMM0,XMM2
pshufd XMM0,XMM0,00001000b;向下洗牌 MOVQ [EBP],XMM0;输出32位* 2
添加EBP,8 MOV EDI,[EBX]指数 子ECX,1
JNZ REPEATLOOPX2;清理 流行EBP
流行ESI
流行EDI
流行EBX
MOV ESP,EBP
流行EBP
RET
两点建议:
在酷睿2上,用一个好的性能分析器(例如 Zoom)在测试框架中运行这段代码,查看热点以及依赖/其他流水线停顿出现在哪里;
用内部函数(intrinsics)重写这段 SIMD 代码,让编译器处理寄存器分配、指令调度和其他优化——像 ICC 甚至 GCC 这样的编译器,会比你手写的汇编做得好得多。而且作为额外的好处,你还可以在不重写代码的情况下,将其重新面向不同的 x86 处理器家族。
I am looking for some help to improve this bilinear scaling sse2 code on core2 cpus
On my Atom N270 and on an i7 this code is about 2x faster than the mmx code. But under core2 cpus it is only equal to the mmx code.
Code follows
// Bilinearly scale the 32-bit bitmap 'from' into 'to' using the SSE2
// assembly kernel _ScaleSSE2X2 (two output pixels per kernel iteration).
//
// Horizontal and vertical blend weights use 7-bit fixed point (0..127) and
// are precomputed per destination column/row, so the per-row kernel only
// streams through cached tables. Timing: keeps the best (lowest) rdtsc()
// cycle count observed so far in the member 'timeTaken'.
//
// @param from  source bitmap (read only); must outlive the call
// @param to    destination bitmap; its Bounds() select the output size
void ConversionProcess::convert_SSE2(BBitmap *from, BBitmap *to)
{
	uint32 fromBPR, toBPR, x, y, yr, xr;
	ULLint start = rdtsc();
	ULLint stop;
	if (from && to) {
		// IntegerWidth()/IntegerHeight() are inclusive, hence the +1.
		uint32 width = from->Bounds().IntegerWidth() + 1;
		uint32 height = from->Bounds().IntegerHeight() + 1;
		uint32 toWidth = to->Bounds().IntegerWidth() + 1;
		uint32 toHeight = to->Bounds().IntegerHeight() + 1;
		fromBPR = from->BytesPerRow();
		toBPR = to->BytesPerRow();
		// 7-bit fixed-point source step per destination pixel.
		uint32 x_ratio = ((width-1) << 7) / toWidth;
		uint32 y_ratio = ((height-1) << 7) / toHeight;
		uint8* toPtr = (uint8*)to->Bits();
		uint8* fromPtr1 = (uint8*)from->Bits();
		uint8* fromPtr2 = (uint8*)from->Bits() + fromBPR; // row below fromPtr1
		// Layout matches what the asm kernel loads with a single movdqa:
		// four uint16 weight words per destination pixel.
		struct FilterInfo {
			uint16 one_minus_diff;     // 127 - diff
			uint16 diff;               // fractional position, 0..127
			uint16 one_minus_diff_rep; // repeat of one_minus_diff
			uint16 diff_rep;           // repeat of diff
		};
		FilterInfo *xWeights = (FilterInfo *)memalign(16, toWidth * sizeof(FilterInfo));
		FilterInfo *yWeights = (FilterInfo *)memalign(16, toHeight * sizeof(FilterInfo));
		// +2 entries: the kernel reads one index pair past the end of each row.
		uint32 *xIndexes = (uint32 *)memalign(16, (toWidth + 2) * sizeof(uint32));
		uint32 *yIndexes = (uint32 *)memalign(16, toHeight * sizeof(uint32));
		// Fix: the original wrote through these pointers without checking
		// the allocations succeeded.
		if (xWeights && yWeights && xIndexes && yIndexes) {
			x = 0;
			for (uint32 j = 0; j < toWidth; j++) {
				xr = x >> 7;                       // integer source column
				xWeights[j].diff = x - (xr << 7);  // fractional part, 0..127
				xWeights[j].one_minus_diff = 127 - xWeights[j].diff;
				xWeights[j].one_minus_diff_rep = xWeights[j].one_minus_diff;
				xWeights[j].diff_rep = xWeights[j].diff;
				xIndexes[j] = xr << 2;             // byte offset of the source pixel
				x += x_ratio;
			}
			// Fix: give the two deliberately over-read slots a defined value so
			// the kernel's trailing index loads never touch uninitialized memory.
			xIndexes[toWidth] = 0;
			xIndexes[toWidth + 1] = 0;
			y = 0;
			for (uint32 j = 0; j < toHeight; j++) {
				yr = y >> 7;                       // integer source row
				yWeights[j].diff = y - (yr << 7);
				yWeights[j].one_minus_diff = 127 - yWeights[j].diff;
				yIndexes[j] = (yr * fromBPR);      // byte offset of the source row
				y += y_ratio;
			}
			for (uint32 i = 0; i < toHeight; i++) {
				_ScaleSSE2X2(toPtr, fromPtr1 + yIndexes[i], fromPtr2 + yIndexes[i],
					xIndexes, xWeights, &yWeights[i], toWidth);
				toPtr += toBPR;
			}
		}
		// free(NULL) is a no-op, so unconditional frees are safe even when
		// one of the allocations above failed.
		free(xWeights);
		free(yWeights);
		free(xIndexes);
		free(yIndexes);
		stop = rdtsc() - start;
		if (stop < timeTaken) {
			timeTaken = stop;  // remember the fastest conversion observed
		}
	}
}
;
; Copyright (C) 2011 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised bilinear scaler
; cglobal: declare and define a global symbol using the C underscore-prefix
; naming convention (as used by Mach-O and 32-bit Windows object formats),
; and 16-byte align its entry point for the benefit of the loop alignment.
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro
SECTION .data align=16
; 16-byte mask holding 0x000000FF in each 32-bit lane. ANDing a register of
; packed pixels with it isolates whichever 8-bit colour channel is currently
; sitting in the low byte of each dword.
RGB_AND db 0xff
db 0x00
db 0x00
db 0x00
db 0xff
db 0x00
db 0x00
db 0x00
db 0xff
db 0x00
db 0x00
db 0x00
db 0xff
db 0x00
db 0x00
db 0x00
; Stack offsets of the cdecl arguments, relative to ebp after the prologue:
; void _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2, void* xIndexPtr, void *xWeightPtr, void *yWeightPtr, uint32 length);
length equ ebp+32
yWeightPtr equ ebp+28
xWeightPtr equ ebp+24
xIndexPtr equ ebp+20
fromPtr2 equ ebp+16
fromPtr1 equ ebp+12
toPtr equ ebp+8
SECTION .text align=16
; _ScaleSSE2X2: bilinearly scale one destination row, emitting two output
; pixels per loop iteration. Each output pixel blends a 2x2 source block:
;   out = a*F1 + b*F2 + c*F3 + d*F4
; where each F is the product of a 7-bit x fraction and a 7-bit y fraction
; (each 0..127, so F <= 127*127 = 16129 < 2^14).
; cdecl: clobbers eax/ecx/edx and xmm0-xmm7; preserves ebx/esi/edi/ebp.
; NOTE(review): the count is halved with shr, so an odd pixel count silently
; drops the final pixel - callers must pass an even width or pad.
cglobal ScaleSSE2X2
; reserve registers. eax, ecx, edx automatically available
push ebp
mov ebp, esp
push ebx ; holds yWeightPtr, then xIndexPtr
push edi ; scratch / current x byte-offset
push esi ; fromPtr1
mov esi, [fromPtr1] ; upper source row
mov edx, [fromPtr2] ; lower source row (upper + bytes-per-row)
mov eax, [xWeightPtr]
mov ebx, [yWeightPtr]
mov ecx, [length] ; destination pixel count for this row
; broadcast the per-row y weights once and keep them cached in xmm7
movd xmm7, [ebx] ; load {1-yDiff, yDiff} as two 16-bit words
pshuflw xmm7, xmm7, 01010000b ; 1-yDiff, 1-yDiff, yDiff, yDiff
pshufd xmm7, xmm7, 01000100b ; duplicate into the high qword
mov ebx, [xIndexPtr]
push ebp ; reuse frame ptr for toPtr
mov ebp, [toPtr] ; Cannot use parameter refs anymore
shr ecx,1 ; two pixels per iteration (odd count: last pixel dropped)
; calculate first index
mov edi, [ebx] ; byte offset of the first source pixel pair
align 16
REPEATLOOPX2:
; load both pixels' x weights and fold in the y weights -> F1..F4 pairs
movdqa xmm3, [eax] ; 1-xDiff, xDiff, 1-xDiff, xDiff (two pixels' worth)
pmullw xmm3, xmm7 ; F1, F2, F3, F4 for both output pixels
add eax, 16
; load the 2x2 source block for the first output pixel
movq xmm0, [esi+edi] ; top pair: fromPtr1+index | fromPtr1+index+4
movq xmm1, [edx+edi] ; bottom pair: fromPtr2+index | fromPtr2+index+4
punpcklqdq xmm0, xmm1 ; all 4 source pixels combined into xmm0
sub edi, [ebx+4] ; if the second x index is the same, skip its loads
jz SKIP
; second output pixel reads a different source column: load its 2x2 block
mov edi, [ebx+4] ; index
movq xmm4, [esi+edi] ; top pair for pixel 2
movq xmm5, [edx+edi] ; bottom pair for pixel 2
punpcklqdq xmm4, xmm5 ; all 4 source pixels combined into xmm4
movdqa xmm1, xmm0 ; copy of set 1 for the second channel
pshufd xmm2, xmm0, 0xE4 ; 0xE4 = identity shuffle, i.e. another copy
movdqa xmm5, xmm4 ; copy of set 2 for the second channel
pshufd xmm6, xmm4, 0xE4 ; identity-shuffle copy for the third channel
jmp NEXT
align 16
SKIP:
; both output pixels read the same source column: duplicate set 1 into set 2
movdqa xmm1, xmm0 ; copy for the second channel
pshufd xmm2, xmm0, 0xE4 ; identity-shuffle copy for the third channel
movdqa xmm4, xmm0 ; pixel set 2 = pixel set 1
pshufd xmm5, xmm4, 0xE4 ; identity-shuffle copy
movdqa xmm6, xmm4
NEXT:
; prefetchnta [edx+edi+16]
add ebx, 8 ; advance to the next pair of x indexes
; Per channel: mask the channel into the low byte of each dword, pack the
; two pixel sets from dwords to words, then one pmaddwd yields
; temp1 = a*F1 + b*F2 and temp2 = c*F3 + d*F4 for each output pixel.
; NOTE(review): the original comments disagreed on channel names (r vs b);
; channel 0 here is simply the low byte of each 32-bit pixel (blue for a
; little-endian BGRA layout) - the arithmetic is channel-agnostic.
pand xmm0, [RGB_AND] ; keep only the low byte of each dword
pand xmm4, [RGB_AND]
packssdw xmm0, xmm4 ; pack both pixel sets down to 16-bit values
movdqa xmm4, [RGB_AND] ; keep the mask in a register from here on
pmaddwd xmm0, xmm3 ; temp1 = a*F1 + b*F2, temp2 = c*F3 + d*F4
; channel 1 (bits 8-15)
psrld xmm1, 8 ; shift channel 1 into the low byte
pand xmm1, xmm4 ; isolate it
psrld xmm5, 8
pand xmm5, xmm4
packssdw xmm1, xmm5 ; pack down to 16-bit values
pmaddwd xmm1, xmm3 ; weighted sums for channel 1
; channel 2 (bits 16-23)
psrld xmm2, 16 ; shift channel 2 into the low byte
pand xmm2, xmm4 ; isolate it
psrld xmm6, 16
pand xmm6, xmm4
packssdw xmm2, xmm6 ; pack down to 16-bit values
pmaddwd xmm2, xmm3 ; weighted sums for channel 2
; Scale each sum back to 0..255 and fold temp2 into temp1.
; NOTE(review): each half is truncated by >>14 before the add, and the
; divisor 2^14=16384 differs from the weight total 127*127=16129, so the
; output is biased slightly dark (a couple of lsb worst case).
psrld xmm0, 14 ; scale back to range
pshufd xmm3, xmm0, 00110001b ; move dwords 1,3 down to 0,2 (temp2)
paddd xmm0, xmm3 ; channel 0 results now in dwords 0 and 2
psrld xmm1, 14 ; scale back to range
pshufd xmm3, xmm1, 00110001b
paddd xmm1, xmm3 ; add
psrld xmm2, 14 ; scale back to range
pshufd xmm3, xmm2, 00110001b
paddd xmm2, xmm3 ; add
; Recombine the three channels into 2 packed 32-bit pixels.
; Byte 3 (alpha, for BGRA) is never blended, so it comes out as 0.
pslld xmm1, 8 ; channel 1 back to bits 8-15
por xmm0, xmm1
pslld xmm2, 16 ; channel 2 back to bits 16-23
por xmm0, xmm2
pshufd xmm0, xmm0, 00001000b ; gather dwords 0 and 2 into the low qword
movq [ebp], xmm0 ; store both output pixels (32 bits * 2)
add ebp, 8
mov edi, [ebx] ; next x index - reads one entry past the last pair on the
               ; final iteration; the C caller over-allocates xIndexes by 2
sub ecx, 1
jnz REPEATLOOPX2
; Cleanup
pop ebp ; restore the real frame pointer first
pop esi
pop edi
pop ebx
mov esp, ebp
pop ebp
ret
Two suggestions:
run this code in a test harness under a decent profiler on Core 2 (e.g. Zoom) to see where the hotspots and dependency/other stalls are
re-write the SIMD code using intrinsics and then let the compiler handle register allocation, instruction scheduling and other optimisations - a decent compiler such as ICC, or even gcc will do a lot better job than your hand-coded assembly. And as a bonus you can also re-target for different x86 CPU families without having to re-write your code.
这篇关于帮助我改进一些 SSE2 代码的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持 IT 屋!