帮我改进一些 SSE2 代码 [英] Help me improve some more SSE2 code

查看:187
本文介绍了帮助我提高一些SSE2 code的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想寻求一些帮助，以改进这段双线性缩放 SSE2 代码在 Core 2 CPU 上的性能。

在我的 Atom N270 和 i7 上，这段代码比 MMX 代码快约 2 倍；但在 Core 2 处理器上，它的速度只与 MMX 代码相当。

code如下

 无效ConversionProcess :: convert_SSE2(BBitmap *从,BBitmap *为)
{
    UINT32 fromBPR,toBPR,fromBPRDIV4,X,Y,年,XR;    ULLint开始= RDTSC();
    ULLint停止;
    如果(由&放大器;&放大器;向){
        UINT32宽度,高度;
        宽度=从 - 方式>界()IntegerWidth()+ 1;
        高度=从 - 方式>界()IntegerHeight()+ 1;        UINT32 toWidth,toHeight;
        toWidth =用于─方式>界()IntegerWidth()+ 1;
        toHeight =用于─方式>界()IntegerHeight()+ 1;        fromBPR =从 - > BytesPerRow();
        fromBPRDIV4 = fromBPR>> 2;
        toBPR =&用于─GT; BytesPerRow();        UINT32 x_ratio =((宽-1) - ; 7;)/ toWidth;
        UINT32 y_ratio =((高度-1) - ; 7;)/ toHeight;        UINT8 * toPtr =(UINT8 *)款拟>位();
        UINT8 * fromPtr1 =(UINT8 *)从 - >位();
        UINT8 * fromPtr2 =(UINT8 *)从 - >位()+ fromBPR;        结构FilterInfo {
            UINT16 one_minus_diff; //一减差异
            UINT16差异; //使用差异值来计算用于平均像素的权重
            UINT16 one_minus_diff_rep; //一减重复差异
            UINT16 diff_rep; //使用差异值来计算用于重复平均像素的权重
        };        FilterInfo * xWeights =(FilterInfo *)memalign可(16,toWidth * 8);
        FilterInfo * yWeights =(FilterInfo *)memalign可(16,toHeight * 8);
        UINT32 * xIndexes =(UINT32 *)memalign可(16,(toWidth + 2)* 4); //将2指数overread
        UINT32 * yIndexes =(UINT32 *)memalign可(16,toHeight * 4);        X = 0;
        对于(UINT32 J = 0; J< toWidth; J ++){
            XR = X>> 7;
            xWeights [J] .diff作= X - (XR<< 7);
            xWeights [J] .one_minus_diff = 127 - xWeights [J] .diff作;
            xWeights [J] .one_minus_diff_rep = xWeights [J] .one_minus_diff;
            xWeights [J] .diff_rep = xWeights [J] .diff作;
            xIndexes [J] = XR<< 2;            X + = x_ratio;
        }        Y = 0;
        对于(UINT32 J = 0; J< toHeight; J ++){
            年= Y>> 7;
            yWeights [J] .diff作= Y - (年<< 7);
            yWeights [J] .one_minus_diff = 127 - yWeights [J] .diff作;
            yIndexes [J] =(年* fromBPR);
            Y + = y_ratio;        }        对于(UINT32我= 0; I< toHeight;我++){
            _ScaleSSE2X2(toPtr,fromPtr1 + yIndexes [I],fromPtr2 + yIndexes [Ⅰ],xIndexes,xWeights,&放大器; yWeights [I],toWidth);
            toPtr + = toBPR;
        }        免费(xWeights);
        免费(yWeights);
        免费(xIndexes);
        免费(yIndexes);        停止= RDTSC() - 启动;
        如果(STOP< timeTaken){
            timeTaken =停止;
        }
    }
};
;版权所有(C)2011戴维McPaul
;
;版权所有。根据MIT许可证的条款分发。
;;一个颇为未优化双线性缩放器%宏cglobal 1
    全球_ 1%
    定义%1%_%1
    16对齐
%1:
%endmacro段.data ALIGN = 16RGB_AND分贝为0xFF
        分贝为0x00
        分贝为0x00
        分贝为0x00
        DB 0xFF的
        分贝为0x00
        分贝为0x00
        分贝为0x00
        DB 0xFF的
        分贝为0x00
        分贝为0x00
        分贝为0x00
        DB 0xFF的
        分贝为0x00
        分贝为0x00
        分贝为0x00;无效_ScaleSSE2X2(无效* toPtr,无效* fromPtr1,无效* fromPtr2,无效* xIndexPtr,无效* xWeightPtr,无效* yWeightPtr,UINT32长度);长EQU EBP + 32
yWeightPtr EQU EBP + 28
xWeightPtr EQU EBP + 24
xIndexPtr EQU EBP + 20
fromPtr2 EQU EBP + 16
fromPtr1 EQU EBP + 12
toPtr EQU EBP + 8.text段ALIGN = 16
cglobal ScaleSSE2X2
;保留寄存器。 EAX,ECX,EDX自动提供
    推EBP
    MOV EBP,ESP
    推EBX; yWeights,xIndexPtr
    推动电子数据交换;刮
    推ESI; fromPtr3    MOV ESI,[fromPtr1]
    MOV EDX,[fromPtr2]
    MOV EAX,[xWeightPtr]
    MOV EBX,[yWeightPtr]
    MOV ECX,[长度]; Ÿ计算权重和缓存
    MOVD XMM7,[EBX]获得1 yDiff和yDiff
    pshuflw XMM7,XMM7,01010000b; 1-yDiff,1- yDiff,yDiff,yDiff
    pshufd XMM7,XMM7,01000100b;重复    MOV EBX,[xIndexPtr]    推EBP;重用toPtr框架PTR
    MOV EBP,[toPtr]不能使用参数裁判了    SHR ECX,1    ;首先计算指数
    MOV EDI,[EBX]指数    16对齐
REPEATLOOPX2:    ;装载的第一和第二组加权成XMM3
    MOVDQA XMM3,[EAX];获得1 xDiff,xDiff,1- xDiff,xDiff
    pmullw XMM3,XMM7;计算F1,F2,F3,F4(2)
    添加EAX,16    ;装载第一套源像素的
    MOVQ XMM0,[ESI + EDI] XMM0 = fromPtr1 +指数| fromPtr1 +指数+ 4
    MOVQ将xmm1,[EDX + EDI]将xmm1 = fromPtr2 +指数| fromPtr2 +指数+ 4
    punpcklqdq XMM0,xmm1中;所有4个像素合并成XMM0    子EDI,[EBX + 4];如果X指数是一样的,然后跳过第二个负载
    JZ SKIP    ;计算第二个索引
    MOV EDI,[EBX + 4];指数    ;加载第二组来源像素
    MOVQ XMM4,[ESI + EDI] XMM4 = fromPtr1 +指数| fromPtr1 +指数+ 4
    MOVQ xmm5,[EDX + EDI] xmm5 = fromPtr2 +指数| fromPtr2 +指数+ 4
    punpcklqdq XMM4,xmm5;所有4个像素合并成XMM4    MOVDQA将xmm1,XMM0;副本XMM1,XMM2
    pshufd XMM2,XMM0,0xE4
    MOVDQA xmm5,XMM4;副本XMM1,XMM2
    pshufd xmm6,XMM4,0xE4    JMP NEXT
16对齐
跳跃:
    MOVDQA将xmm1,XMM0;副本XMM1,XMM2
    pshufd XMM2,XMM0,0xE4
    MOVDQA XMM4,XMM0;复制第一像素设置XMM0第二像素集XMM4
    pshufd xmm5,XMM4,0xE4;副本XMM4,xmm6
    MOVDQA xmm6,XMM4
下一个:
; prefetchnta [EDX + EDI + 16]    添加EBX,8;使用彩色计算DEST RGB值= A * F1 + B * F2 + C * F3 + D * F4;来自两个像素组提取b和结合成一个单一章
    PAND XMM0 [RGB_AND]清除所有但的R值留下B000
    PAND XMM4,[RGB_AND]清除所有但的R值留下B000
    PACKSSDW XMM0,XMM4;包下降到16位值    MOVDQA XMM4,[RGB_AND] XMM4现在是免费的
    PMADDWD XMM0,XMM3;乘法和加法获得=的temp1一个* F1 + B * F2,TEMP2 = C * F3 + D * F4;提取摹
    psrld将xmm1,8;旋转克至低字节
    PAND将xmm1,XMM4;提取Tg值G000
    psrld xmm5,8;旋转克至低字节
    PAND xmm5,XMM4;提取Tg值G000
    PACKSSDW将xmm1,xmm5;包下降到16位值    PMADDWD将xmm1,XMM3;相乘并加;提取物 - [R
    psrld XMM2,16;旋转b,来低字节
    PAND XMM2,XMM4;提取物B值B000
    psrld xmm6,16;旋转b,来低字节
    PAND xmm6,XMM4;提取物B值B000
    PACKSSDW XMM2,xmm6;包下降到16位值    PMADDWD XMM2,XMM3;相乘并加;添加和的temp1 TEMP2留给我们RRRR XXXX XXXX RRRR
    psrld XMM0,14;缩减到范围
    pshufd XMM3,XMM0,00110001b;提取TEMP2
    paddd XMM0,XMM3;添加回temp1目录    psrld将xmm1,14;缩减到范围
    pshufd XMM3,将xmm1,00110001b
    paddd将xmm1,XMM3;加    psrld XMM2,14;缩减到范围
    pshufd XMM3,XMM2,00110001b
    paddd XMM2,XMM3;加;重组到2 RGBA值    PSLLD将xmm1,8
    陈健波XMM0,xmm1中
    PSLLD XMM2,16
    陈健波XMM0,XMM2
    pshufd XMM0,XMM0,00001000b;向下洗牌    MOVQ [EBP],XMM0;输出32位* 2
    添加EBP,8    MOV EDI,[EBX]指数    子ECX,1
    JNZ REPEATLOOPX2;清理    流行EBP
    流行ESI
    流行EDI
    流行EBX
    MOV ESP,EBP
    流行EBP
    RET


解决方案

两点建议:


  • 在 Core 2 上，用一个像样的性能分析器（例如 Zoom）在测试框架中运行这段代码，看看热点以及依赖/其他停顿（stall）出现在哪里


  • 使用 intrinsics（编译器内部函数）重写 SIMD 代码，然后让编译器处理寄存器分配、指令调度和其他优化——像 ICC 甚至 GCC 这样的像样编译器会比你手写的汇编做得好得多。另外一个好处是，你还可以针对不同的 x86 处理器家族重新生成代码，而无需重写代码。


I am looking for some help to improve this bilinear scaling sse2 code on core2 cpus

On my Atom N270 and on an i7 this code is about 2x faster than the mmx code. But under core2 cpus it is only equal to the mmx code.

Code follows

/*
 * Scale the pixels of `from` into `to` with bilinear filtering, one output
 * row at a time, using the SSE2 row kernel _ScaleSSE2X2.
 *
 * Positions are tracked in 7-bit fixed point: for every output column/row the
 * integer part selects the source pixel/row and the fractional part (0..127)
 * becomes the blend weight.  The per-column and per-row tables are built once
 * and shared by all kernel calls.
 *
 * The elapsed rdtsc() count is stored in timeTaken when it is lower than the
 * value already there.
 *
 * Nothing is written (and timeTaken is left alone) when either bitmap is
 * NULL, when the geometry cannot be handled by the kernel, or when a table
 * allocation fails.
 *
 * NOTE: the kernel emits two pixels per iteration (length >> 1 iterations),
 * so when the destination width is odd the right-most column is not written.
 */
void ConversionProcess::convert_SSE2(BBitmap *from, BBitmap *to)
{
    ULLint start = rdtsc();
    ULLint stop;

    if (!from || !to) {
        return;
    }

    uint32 width = from->Bounds().IntegerWidth() + 1;
    uint32 height = from->Bounds().IntegerHeight() + 1;
    uint32 toWidth = to->Bounds().IntegerWidth() + 1;
    uint32 toHeight = to->Bounds().IntegerHeight() + 1;

    // The kernel always fetches pixel x+1 and row y+1, so the source needs at
    // least 2x2 pixels.  A destination narrower than 2 pixels would give the
    // kernel a zero iteration count, and a zero destination size would divide
    // by zero in the ratio computation below.
    if (width < 2 || height < 2 || toWidth < 2 || toHeight < 1) {
        return;
    }

    uint32 fromBPR = from->BytesPerRow();
    uint32 toBPR = to->BytesPerRow();

    // Source step per destination pixel, 7 fractional bits.
    uint32 x_ratio = ((width - 1) << 7) / toWidth;
    uint32 y_ratio = ((height - 1) << 7) / toHeight;

    uint8 *toPtr = (uint8 *)to->Bits();
    uint8 *fromPtr1 = (uint8 *)from->Bits();            // upper source row
    uint8 *fromPtr2 = (uint8 *)from->Bits() + fromBPR;  // row below it

    struct FilterInfo {
        uint16 one_minus_diff;      // weight of the nearer pixel/row
        uint16 diff;                // weight of the following pixel/row
        uint16 one_minus_diff_rep;  // copy of one_minus_diff (kernel loads both pairs)
        uint16 diff_rep;            // copy of diff
    };

    // 16-byte alignment: the kernel reads xWeights with movdqa.
    FilterInfo *xWeights = (FilterInfo *)memalign(16, toWidth * sizeof(FilterInfo));
    FilterInfo *yWeights = (FilterInfo *)memalign(16, toHeight * sizeof(FilterInfo));
    // Two extra entries: the kernel loads index values past the last column.
    uint32 *xIndexes = (uint32 *)memalign(16, (toWidth + 2) * sizeof(uint32));
    uint32 *yIndexes = (uint32 *)memalign(16, toHeight * sizeof(uint32));

    if (!xWeights || !yWeights || !xIndexes || !yIndexes) {
        free(xWeights);
        free(yWeights);
        free(xIndexes);
        free(yIndexes);
        return;
    }

    // Horizontal table: byte offset of the left source pixel plus weights.
    // NOTE(review): the weights sum to 127, not 128, while the kernel scales
    // the result back with a 14-bit shift, so output is slightly darker than
    // the input (255 -> 251 at diff == 0).  Left unchanged; confirm whether
    // 128 - diff was intended before altering the output.
    uint32 x = 0;
    for (uint32 j = 0; j < toWidth; j++) {
        uint32 xr = x >> 7;
        xWeights[j].diff = x - (xr << 7);
        xWeights[j].one_minus_diff = 127 - xWeights[j].diff;
        xWeights[j].one_minus_diff_rep = xWeights[j].one_minus_diff;
        xWeights[j].diff_rep = xWeights[j].diff;
        xIndexes[j] = xr << 2;      // 4 bytes per pixel

        x += x_ratio;
    }
    // Give the padding entries a defined value instead of heap garbage.
    xIndexes[toWidth] = xIndexes[toWidth - 1];
    xIndexes[toWidth + 1] = xIndexes[toWidth - 1];

    // Vertical table: byte offset of the upper source row plus weights.
    // Only the first two fields are read by the kernel (movd of 32 bits).
    uint32 y = 0;
    for (uint32 j = 0; j < toHeight; j++) {
        uint32 yr = y >> 7;
        yWeights[j].diff = y - (yr << 7);
        yWeights[j].one_minus_diff = 127 - yWeights[j].diff;
        yIndexes[j] = (yr * fromBPR);
        y += y_ratio;
    }

    for (uint32 i = 0; i < toHeight; i++) {
        _ScaleSSE2X2(toPtr, fromPtr1 + yIndexes[i], fromPtr2 + yIndexes[i], xIndexes, xWeights, &yWeights[i], toWidth);
        toPtr += toBPR;
    }

    free(xWeights);
    free(yWeights);
    free(xIndexes);
    free(yIndexes);

    // Keep the best (lowest) timing seen so far.
    stop = rdtsc() - start;
    if (stop < timeTaken) {
        timeTaken = stop;
    }
}

;
; Copyright (C) 2011 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised bilinear scaler

; cglobal <name>
; Declares an exported, 16-byte aligned entry point.  The symbol is emitted
; with a leading underscore (_<name>), and <name> is %define'd to that
; underscored form so later uses of the plain name resolve to the same label.
%macro  cglobal 1
    global  _%1
    %define %1 _%1
    align 16
%1:
%endmacro

SECTION .data align=16

; 128-bit mask that keeps only the lowest byte of each of the four 32-bit
; lanes (bytes ff 00 00 00, repeated four times).
RGB_AND times 4 dd 0x000000ff

; void  _ScaleSSE2X2(void *toPtr, void *fromPtr1, void *fromPtr2, void* xIndexPtr, void *xWeightPtr, void *yWeightPtr, uint32 length);

; Stack-argument addresses, in prototype order, relative to ebp once the
; routine has executed "push ebp / mov ebp, esp".  Each argument occupies
; 4 bytes, starting at ebp+8.  They are only valid while ebp still holds the
; frame pointer; the routine later reuses ebp as the destination pointer.
length      equ ebp+32
yWeightPtr  equ ebp+28
xWeightPtr  equ ebp+24
xIndexPtr   equ ebp+20
fromPtr2    equ ebp+16
fromPtr1    equ ebp+12
toPtr       equ ebp+8

SECTION .text align=16
cglobal ScaleSSE2X2
; Bilinear-scale one output row, two output pixels per loop iteration.
;
;   toPtr       destination row (8 bytes written per iteration)
;   fromPtr1    upper source row
;   fromPtr2    lower source row
;   xIndexPtr   one 32-bit byte offset into the source rows per output pixel
;   xWeightPtr  four 16-bit weights per output pixel (1-xDiff, xDiff, 1-xDiff,
;               xDiff); read 16 bytes at a time with movdqa, so it must be
;               16-byte aligned
;   yWeightPtr  two 16-bit weights (1-yDiff, yDiff) for this row
;   length      output pixels in the row; length >> 1 iterations are run, so
;               an odd trailing pixel is not written
;
; For each of bytes 0, 1 and 2 of the 32-bit pixels the result is
;   a*F1 + b*F2 + c*F3 + d*F4
; where a,b are the two neighbouring pixels of the upper row, c,d those of the
; lower row, and F1..F4 are the products of the x and y weights.  Byte 3 of
; every output pixel is written as zero.
;
; Register use inside the loop:
;   eax = weight cursor    ebx = index cursor     ecx = iterations left
;   esi = upper row        edx = lower row        edi = current byte offset
;   ebp = destination cursor (frame pointer is saved on the stack)
;   xmm7 = y weights       xmm3 = F1..F4 for both pixels

; reserve registers. eax, ecx, edx automatically available
    push ebp
    mov ebp, esp
    push ebx
    push edi
    push esi

    mov esi, [fromPtr1]
    mov edx, [fromPtr2]
    mov eax, [xWeightPtr]
    mov ebx, [yWeightPtr]
    mov ecx, [length]

; calculate y weights and cache
    movd xmm7, [ebx]                ; get 1-yDiff and yDiff
    pshuflw xmm7, xmm7, 01010000b   ; 1-yDiff, 1-yDiff, yDiff, yDiff
    pshufd xmm7, xmm7, 01000100b    ; duplicate into the upper 64 bits

    mov ebx, [xIndexPtr]

    push ebp                        ; reuse frame ptr for toPtr
    mov ebp, [toPtr]                ; Cannot use parameter refs anymore

    shr ecx, 1                      ; two pixels per iteration
    jz SCALEX2_DONE                 ; length < 2: nothing to do; without this
                                    ; the counter would wrap below zero

    ; offset of the first pixel pair
    mov edi, [ebx]

    align 16
REPEATLOOPX2:

    ; load the x weights of both output pixels and combine with the y weights
    movdqa xmm3, [eax]              ; 1-xDiff, xDiff, 1-xDiff, xDiff (x2)
    pmullw xmm3, xmm7               ; F1, F2, F3, F4 (x2)
    add eax, 16

    ; load the 2x2 source pixels for the first output pixel
    movq xmm0, [esi+edi]            ; upper row: pixel at offset, pixel after it
    movq xmm1, [edx+edi]            ; lower row: pixel at offset, pixel after it
    punpcklqdq xmm0, xmm1           ; combine all 4 pixels into xmm0

    sub edi, [ebx+4]                ; same offset for the second output pixel?
    jz SKIP                         ; yes: reuse the pixels already loaded

    ; load the 2x2 source pixels for the second output pixel
    mov edi, [ebx+4]
    movq xmm4, [esi+edi]
    movq xmm5, [edx+edi]
    punpcklqdq xmm4, xmm5           ; combine all 4 pixels into xmm4

    jmp NEXT
align 16
SKIP:
    movdqa xmm4, xmm0               ; second pixel set = first pixel set
NEXT:
    ; One working copy per colour byte: xmm0/xmm4 byte 0, xmm1/xmm5 byte 1,
    ; xmm2/xmm6 byte 2.  Plain register moves replace the identity shuffles
    ; (pshufd ..., 0xE4) that were used as copies.
    ; NOTE(review): expected to be cheaper on CPUs with a slow 128-bit
    ; shuffle unit - confirm with a profiler on the target CPU.
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    movdqa xmm5, xmm4
    movdqa xmm6, xmm4

;   prefetchnta [edx+edi+16]

    add ebx, 8                      ; advance two index entries

; calculate dest values using color = a * F1 + b * F2 + c * F3 + d * F4

; byte 0 of every pixel
    pand xmm0, [RGB_AND]            ; keep only the low byte of each dword
    pand xmm4, [RGB_AND]
    packssdw xmm0, xmm4             ; pack down to 16 bit values

    movdqa xmm4, [RGB_AND]          ; xmm4 is now free, keep the mask in it
    pmaddwd xmm0, xmm3              ; temp1 = a * F1 + b * F2, temp2 = c * F3 + d * F4

; byte 1 of every pixel
    psrld xmm1, 8                   ; move byte 1 down to the low byte
    pand xmm1, xmm4
    psrld xmm5, 8
    pand xmm5, xmm4
    packssdw xmm1, xmm5             ; pack down to 16 bit values

    pmaddwd xmm1, xmm3              ; multiply and add

; byte 2 of every pixel
    psrld xmm2, 16                  ; move byte 2 down to the low byte
    pand xmm2, xmm4
    psrld xmm6, 16
    pand xmm6, xmm4
    packssdw xmm2, xmm6             ; pack down to 16 bit values

    pmaddwd xmm2, xmm3              ; multiply and add

;   Add temp1 and temp2; the sums end up in dwords 0 and 2
    psrld xmm0, 14                  ; scale back to range
    pshufd xmm3, xmm0, 00110001b    ; bring temp2 next to temp1
    paddd xmm0, xmm3                ; temp1 + temp2

    psrld xmm1, 14                  ; scale back to range
    pshufd xmm3, xmm1, 00110001b
    paddd xmm1, xmm3

    psrld xmm2, 14                  ; scale back to range
    pshufd xmm3, xmm2, 00110001b
    paddd xmm2, xmm3

;   recombine the three bytes into 2 output pixels

    pslld xmm1, 8
    por xmm0, xmm1
    pslld xmm2, 16
    por xmm0, xmm2
    pshufd xmm0, xmm0, 00001000b    ; move dwords 0 and 2 into the low 64 bits

    movq [ebp], xmm0                ; output 32bit * 2
    add ebp, 8

    mov edi, [ebx]                  ; offset for the next iteration

    sub ecx, 1
    jnz REPEATLOOPX2

; Cleanup
SCALEX2_DONE:
    pop ebp                         ; restore the frame pointer saved above
    pop esi
    pop edi
    pop ebx
    mov esp, ebp
    pop ebp
    ret

解决方案

Two suggestions:

  • run this code in a test harness under a decent profiler on Core 2 (e.g. Zoom) to see where the hotspots and dependency/other stalls are

  • re-write the SIMD code using intrinsics and then let the compiler handle register allocation, instruction scheduling and other optimisations - a decent compiler such as ICC, or even gcc will do a lot better job than your hand-coded assembly. And as a bonus you can also re-target for different x86 CPU families without having to re-write your code.

这篇关于帮助我提高一些SSE2 code的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆