如何将此代码重写为sse内在函数 [英] How to rewrite this code to sse intrinsics

查看:155
本文介绍了如何将此代码重写为sse内在函数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述



我有这样的代码

  for(int k = 0; k <= n-4; k + = 4)
{

xc0 = 512 +((idx + k * iddx)> 6);
int yc0 = 512 +((idy + k * iddy)>> 6);

int xc1 = 512 +((idx +(k + 1)* iddx)>> 6);
int yc1 = 512 +((idy +(k + 1)* iddy)>> 6);

int xc2 = 512 +((idx +(k + 2)* iddx)> 6);
int yc2 = 512 +((idy +(k + 2)* iddy)>> 6);

int xc3 = 512 +((idx +(k + 3)* iddx)> 6);
int yc3 = 512 +((idy +(k + 3)* iddy)> 6);

无符号color0 = working_buffer [yc0 * working_buffer_size_x + xc0];
unsigned color1 = working_buffer [yc1 * working_buffer_size_x + xc1];
unsigned color2 = working_buffer [yc2 * working_buffer_size_x + xc2];
unsigned color3 = working_buffer [yc3 * working_buffer_size_x + xc3];

int adr = base_adr + k;

frame_bitmap [adr] = color0;
frame_bitmap [adr + 1] = color1;
frame_bitmap [adr + 2] = color2;
frame_bitmap [adr + 3] = color3;
}

这里用到的全部都是 int / unsigned 类型,这是循环中的关键部分。我不确定整数 SSE 能否提升这里的速度,但想知道它究竟能不能用?有人能帮忙吗?



(我使用的是 mingw32)

解决方案

我的 SSE 知识有些生疏了,但你应该这样做:

  xmm0:[k,k + 1,k + 2,k + 3] // xc0,xc1,... 
xmm1:[k,k + 1,k + 2,k + 3] // yc0,yc1,....
// 在循环之前初始化
xmm2:[512,512,512,512]
xmm3:[idx,idx,idx,idx]
xmm4:[iddx,iddx,iddx,iddx]
xmm5:[idy,idy,idy,idy]
xmm6:[iddy,iddy,iddy,iddy]
xmm7:[working_buffer_size_x,working_buffer_size_x,working_buffer_size_x,working_buffer_size_x]

计算:

  xmm0 * xmm4 
xmm0 + xmm3
xmm0 >> 6
xmm0 + xmm2

xmm0:[xc0,xc1,xc2,xc3]
/////////////////// ////////////

xmm1 * xmm6
xmm1 + xmm5
xmm1 >> 6
xmm1 + xmm2

xmm1:[yc0,yc1,yc2,yc3]

xmm1 * xmm7
xmm1 + xmm0

现在 xmm1 是:

  xmm1:[yc0 * working_buffer_size_x + xc0,yc1 * working_buffer_size_x + xc1,yc2 * working_buffer_size_x + xc2,yc3 * working_buffer_size_x + xc3] 

您在每次循环中都要读写内存(working_buffer、frame_bitmap 数组),这些操作比计算本身慢得多,因此速度提升不会像您预期的那么大。



您需要让 working_buffer 和 frame_bitmap 数组对齐,并且需要 SSE4.1 支持:

  #include< emmintrin.h> 
#include< smmintrin.h> //SSE4.1

int a [4] __attribute __((aligned(16)));
__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;

xmm2 = _mm_set1_epi32(512);
xmm3 = _mm_set1_epi32(idx);
xmm4 = _mm_set1_epi32(iddx);
xmm5 = _mm_set1_epi32(idy);
xmm6 = _mm_set1_epi32(iddy);
xmm7 = _mm_set1_epi32(working_buffer_size_x);

for(k = 0;k≤n-4; k + = 4){
xmm0 = _mm_set_epi32(k + 3,k + 2,k + 1,k) ;
xmm1 = _mm_set_epi32(k + 3,k + 2,k + 1,k);

// xmm0 * xmm4
xmm0 = _mm_mullo_epi32(xmm0,xmm4);

// xmm0 + xmm3
xmm0 = _mm_add_epi32(xmm0,xmm3)

// xmm0>> 6
xmm0 = _mm_srai_epi32(xmm0,6);

// xmm0 + xmm2
xmm0 = _mm_add_epi32(xmm0,xmm2);



// xmm1 * xmm6
xmm1 = _mm_mullo_epi32(xmm1,xmm6);

// xmm1 + xmm5
xmm1 = _mm_add_epi32(xmm1,xmm5);

// xmm1>> 6
xmm1 = _mm_srai_epi32(xmm1,6)

// xmm1 + xmm2
xmm1 = _mm_add_epi32(xmm1,xmm2);


// xmm1 * xmm7
xmm1 = _mm_mullo_epi32(xmm1,xmm7);
// xmm1 + xmm0
xmm1 = _mm_add_epi32(xmm1,xmm0);


// a [0] = yc0 * working_buffer_size_x + xc0
// a [1] = yc1 * working_buffer_size_x + xc1
// a [2] = yc2 * working_buffer_size_x + xc2
// a [3] = yc3 * working_buffer_size_x + xc3
_mm_store_si128((__ m128i *)& a [0],xmm1)

unsigned color0 = working_buffer [a [0]];
unsigned color1 = working_buffer [a [1]];
unsigned color2 = working_buffer [a [2]];
unsigned color3 = working_buffer [a [3]];

int adr = base_adr + k;

frame_bitmap [adr] = color0;
frame_bitmap [adr + 1] = color1;
frame_bitmap [adr + 2] = color2;
frame_bitmap [adr + 3] = color3;
}



您还可以通过使用汇编直接操作内存,避免 _mm_store_si128((__m128i *)&a[0], xmm1); 或 int adr = base_adr + k; 这两步,从而进一步优化。


I'm new to SSE intrinsics and would appreciate some hints and assistance in using them (as this is still foggy to me).

I got such code

/*
 * Copy pixels from working_buffer into frame_bitmap in groups of four.
 *
 * Step s (s = k .. k + 3) reads
 *     working_buffer[py * working_buffer_size_x + px]
 * with px = 512 + ((idx + s * iddx) >> 6) and
 *      py = 512 + ((idy + s * iddy) >> 6),
 * and writes the value to frame_bitmap[base_adr + s].
 *
 * Only complete groups of four are processed (k <= n - 4).
 */
for (int k = 0; k <= n - 4; k += 4) {
    unsigned fetched[4];

    /* Read pass: all four source pixels are loaded before any store. */
    for (int lane = 0; lane < 4; ++lane) {
        int step = k + lane;
        int px = 512 + ((idx + step * iddx) >> 6);
        int py = 512 + ((idy + step * iddy) >> 6);

        fetched[lane] = working_buffer[py * working_buffer_size_x + px];
    }

    /* Write pass: four consecutive destination slots starting at adr. */
    int adr = base_adr + k;
    for (int lane = 0; lane < 4; ++lane) {
        frame_bitmap[adr + lane] = fetched[lane];
    }
}

Everything here is int/unsigned. This is the critical part of the loop; I'm not sure whether integer SSE would help with speed here, but I wonder if it would work at all. Could someone help with this?

(im using mingw32)

解决方案

My sse is a bit rusty, but what you should do is:

xmm0: [k, k+1, k+2, k+3] //xc0, xc1,....
xmm1: [k, k+1, k+2, k+3] //yc0, yc1,....
//initialize before the loop
xmm2: [512, 512, 512, 512]
xmm3: [idx, idx, idx, idx]
xmm4: [iddx, iddx, iddx, iddx]
xmm5: [idy, idy, idy, idy]
xmm6: [iddy, iddy, iddy, iddy]
xmm7: [working_buffer_size_x, working_buffer_size_x, working_buffer_size_x, working_buffer_size_x]

Calculations:

xmm0 * xmm4
xmm0 + xmm3
xmm0 >> 6
xmm0 + xmm2

xmm0: [xc0, xc1, xc2, xc3]
///////////////////////////////

xmm1 * xmm6
xmm1 + xmm5
xmm1 >> 6
xmm1 + xmm2

xmm1: [yc0, yc1, yc2, yc3]

xmm1 * xmm7
xmm1 + xmm0

Now xmm1 is:

xmm1: [yc0*working_buffer_size_x + xc0, yc1*working_buffer_size_x + xc1, yc2*working_buffer_size_x + xc2, yc3*working_buffer_size_x + xc3]

You are reading and writing memory in each iteration (the working_buffer and frame_bitmap arrays), and those operations are much slower than the calculations themselves, so the speed improvement won't be as large as you might expect.

EDIT

You need the working_buffer and frame_bitmap arrays to be aligned, and you need SSE4.1 support:

#include <emmintrin.h>
#include <smmintrin.h> //SSE4.1

int a[4] __attribute__((aligned(16)));
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

xmm2 = _mm_set1_epi32(512);
xmm3 = _mm_set1_epi32(idx);
xmm4 = _mm_set1_epi32(iddx);
xmm5 = _mm_set1_epi32(idy);
xmm6 = _mm_set1_epi32(iddy);
xmm7 = _mm_set1_epi32(working_buffer_size_x);

for(k = 0; k <= n - 4; k +=4){
    xmm0 = _mm_set_epi32(k + 3, k + 2, k + 1, k);
    xmm1 = _mm_set_epi32(k + 3, k + 2, k + 1, k);

    //xmm0 * xmm4
    xmm0 = _mm_mullo_epi32(xmm0, xmm4);

    //xmm0 + xmm3
    xmm0 = _mm_add_epi32(xmm0, xmm3);

    //xmm0 >> 6
    xmm0 = _mm_srai_epi32(xmm0, 6);

    //xmm0 + xmm2
    xmm0 = _mm_add_epi32(xmm0, xmm2);



    //xmm1 * xmm6
    xmm1 = _mm_mullo_epi32(xmm1, xmm6);

    //xmm1 + xmm5
    xmm1 = _mm_add_epi32(xmm1, xmm5);

    //xmm1 >> 6
    xmm1 = _mm_srai_epi32(xmm1, 6);

    //xmm1 + xmm2
    xmm1 = _mm_add_epi32(xmm1, xmm2);


    //xmm1 * xmm7
    xmm1 = _mm_mullo_epi32(xmm1, xmm7);
    //xmm1 + xmm0
    xmm1 = _mm_add_epi32(xmm1, xmm0);


    //a[0] = yc0*working_buffer_size_x + xc0
    //a[1] = yc1*working_buffer_size_x + xc1
    //a[2] = yc2*working_buffer_size_x + xc2
    //a[3] = yc3*working_buffer_size_x + xc3
    _mm_store_si128((__m128i *)&a[0], xmm1);

    unsigned color0 =  working_buffer[ a[0] ]; 
    unsigned color1 =  working_buffer[ a[1] ]; 
    unsigned color2 =  working_buffer[ a[2] ]; 
    unsigned color3 =  working_buffer[ a[3] ]; 

    int adr = base_adr + k; 

    frame_bitmap[adr]  = color0; 
    frame_bitmap[adr+1]= color1; 
    frame_bitmap[adr+2]= color2; 
    frame_bitmap[adr+3]= color3; 
}

You can optimize it even more by avoiding the _mm_store_si128((__m128i *)&a[0], xmm1); or the int adr = base_adr + k; using assembly with direct manipulation of memory.

这篇关于如何将此代码重写为sse内在函数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆