Speeding up some SSE2 Intrinsics for color conversion


Problem description


I'm trying to perform image colour conversion from YCbCr to BGRA (Don't ask about the A bit, such a headache).

Anyway, this needs to perform as fast as possible, so I've written it using compiler intrinsics to take advantage of SSE2. This is my first venture into SIMD land, I'm basically a beginner and so I'm sure there's plenty I'm doing inefficiently.

My arithmetic code for doing the actual colour conversion turns out to be particularly slow, and Intel's VTune is showing it up as a significant bottleneck.

So, is there any way I can speed up the following code? It's being done in 32-bit, 4 pixels at a time. I originally tried doing it in 8 bits, 16 pixels at a time (as in the upper loop), but the calculations cause integer overflow and a broken conversion. This whole process, including the Intel JPEG decode, is taking ~14 ms for a single field of full HD. It'd be great if I could get it down to at least 12 ms, ideally 10 ms.
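(For a sense of scale on the overflow: with the coefficients used below, the (Y - 16) * 299 term alone can reach 239 * 299 = 71,461 for full-range input, which already exceeds 16 bits before the chroma terms are added, so the intermediates need 32-bit lanes.)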

Any help or tips gratefully appreciated. Thanks!

const __m128i s128_8    = _mm_set1_epi8((char)128);


const int nNumPixels = roi.width * roi.height;

for (int i=0; i<nNumPixels; i+=32)
{
    // Go ahead and prefetch our packed UV Data.
    // As long as the load remains directly next, this saves us time.
    _mm_prefetch((const char*)&pSrc8u[2][i],_MM_HINT_T0); 

    // We need to fetch and blit out our k before we write over it with UV data.
    __m128i sK1 = _mm_load_si128((__m128i*)&pSrc8u[2][i]);
    __m128i sK2 = _mm_load_si128((__m128i*)&pSrc8u[2][i+16]);

    // Using the destination buffer temporarily here so we don't need to waste time doing a memory allocation.
    _mm_store_si128 ((__m128i*)&m_pKBuffer[i],      sK1);
    _mm_store_si128 ((__m128i*)&m_pKBuffer[i+16],   sK2);

    // In theory, this prefetch needs to be some cycles ahead of the first read. It isn't, yet it does appear to save us time. Worth investigating.
    _mm_prefetch((const char*)&pSrc8u[1][i],_MM_HINT_T0); 

    __m128i sUVI1 = _mm_load_si128((__m128i*)&pSrc8u[1][i]);
    __m128i sUVI2 = _mm_load_si128((__m128i*)&pSrc8u[1][i+16]);  

    // Subtract the 128 here ahead of our YCbCr -> BGRA conversion so we can go 16 pixels at a time rather than 4.
    sUVI1 = _mm_sub_epi8(sUVI1, s128_8);
    sUVI2 = _mm_sub_epi8(sUVI2, s128_8);

    // Swizzle and double up UV data from interleaved 8x1 byte blocks into planar
    __m128i sU1 = _mm_unpacklo_epi8(sUVI1, sUVI1);
    __m128i sV1 = _mm_unpackhi_epi8(sUVI1, sUVI1);

    __m128i sU2 = _mm_unpacklo_epi8(sUVI2, sUVI2);  
    __m128i sV2 = _mm_unpackhi_epi8(sUVI2, sUVI2);  

    _mm_store_si128((__m128i*)&pSrc8u[1][i],        sU1);
    _mm_store_si128((__m128i*)&pSrc8u[1][i+16],     sU2); 

    _mm_store_si128((__m128i*)&pSrc8u[2][i],        sV1);
    _mm_store_si128((__m128i*)&pSrc8u[2][i+16],     sV2); 
}

const __m128i s16   = _mm_set1_epi32(16);
const __m128i s299  = _mm_set1_epi32(299);
const __m128i s410  = _mm_set1_epi32(410);
const __m128i s518  = _mm_set1_epi32(518);
const __m128i s101  = _mm_set1_epi32(101);
const __m128i s209  = _mm_set1_epi32(209);

Ipp8u* pDstP = pDst8u;
for (int i=0; i<nNumPixels; i+=4, pDstP+=16)
{
    __m128i sK = _mm_set_epi32(m_pKBuffer[i],           m_pKBuffer[i+1],            m_pKBuffer[i+2],            m_pKBuffer[i+3]);

    __m128i sY = _mm_set_epi32(pSrc8u[0][i],            pSrc8u[0][i+1],             pSrc8u[0][i+2],             pSrc8u[0][i+3]);
    __m128i sU = _mm_set_epi32((char)pSrc8u[1][i],      (char)pSrc8u[1][i+1],       (char)pSrc8u[1][i+2],       (char)pSrc8u[1][i+3]);
    __m128i sV = _mm_set_epi32((char)pSrc8u[2][i],      (char)pSrc8u[2][i+1],       (char)pSrc8u[2][i+2],       (char)pSrc8u[2][i+3]);

    // N.b. - Attempted to do the sub 16 in 8 bits similar to the sub 128 for U and V - however doing it here is quicker
    // as the time saved on the arithmetic is less than the time taken by the additional loads/stores needed in the swizzle loop
    sY = _mm_mullo_epi32(_mm_sub_epi32(sY, s16), s299);

    __m128i sR  = _mm_srli_epi32(_mm_add_epi32(sY,_mm_mullo_epi32(s410, sV)), 8);
    __m128i sG  = _mm_srli_epi32(_mm_sub_epi32(_mm_sub_epi32(sY, _mm_mullo_epi32(s101, sU)),_mm_mullo_epi32(s209, sV)), 8);
    __m128i sB  = _mm_srli_epi32(_mm_add_epi32(sY, _mm_mullo_epi32(s518, sU)), 8);

    //Microsoft's YUV Conversion
    //__m128i sC = _mm_sub_epi32(sY, s16);
    //__m128i sD = _mm_sub_epi32(sU, s128);
    //__m128i sE = _mm_sub_epi32(sV, s128);
    //
    //__m128i sR =  _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(s298, sC), _mm_mullo_epi32(s409, sE)), s128), 8);
    //__m128i sG    = _mm_srli_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_mullo_epi32(s298, sC), _mm_sub_epi32(_mm_mullo_epi32(s100, sD), _mm_mullo_epi32(s208, sE))), s128), 8);
    //__m128i sB    = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(s298, sC), _mm_mullo_epi32(s516, sD)), s128), 8);

    __m128i sKGl = _mm_unpacklo_epi32(sK, sG);
    __m128i sKGh = _mm_unpackhi_epi32(sK, sG);

    __m128i sRBl = _mm_unpacklo_epi32(sR, sB);
    __m128i sRBh = _mm_unpackhi_epi32(sR, sB);

    __m128i sKRGB1 = _mm_unpackhi_epi32(sKGh,sRBh);
    __m128i sKRGB2 = _mm_unpacklo_epi32(sKGh,sRBh);
    __m128i sKRGB3 = _mm_unpackhi_epi32(sKGl,sRBl);
    __m128i sKRGB4 = _mm_unpacklo_epi32(sKGl,sRBl);

    __m128i p1 = _mm_packus_epi16(sKRGB1, sKRGB2);
    __m128i p2 = _mm_packus_epi16(sKRGB3, sKRGB4);

    __m128i po = _mm_packus_epi16(p1, p2);

    _mm_store_si128((__m128i*)pDstP, po);
}

Solution

You may be bandwidth limited here, as there is very little computation relative to the number of loads and stores.

One suggestion: get rid of the _mm_prefetch intrinsics - they are almost certainly not helping and may even hinder operation on more recent CPUs (which already do a pretty good job with automatic prefetching).

Another area to look at:

__m128i sK = _mm_set_epi32(m_pKBuffer[i],           m_pKBuffer[i+1],            m_pKBuffer[i+2],            m_pKBuffer[i+3]);
__m128i sY = _mm_set_epi32(pSrc8u[0][i],            pSrc8u[0][i+1],             pSrc8u[0][i+2],             pSrc8u[0][i+3]);
__m128i sU = _mm_set_epi32((char)pSrc8u[1][i],      (char)pSrc8u[1][i+1],       (char)pSrc8u[1][i+2],       (char)pSrc8u[1][i+3]);
__m128i sV = _mm_set_epi32((char)pSrc8u[2][i],      (char)pSrc8u[2][i+1],       (char)pSrc8u[2][i+2],       (char)pSrc8u[2][i+3]);

This is generating a lot of unnecessary instructions - you should be using _mm_load_xxx and _mm_unpackxx_xxx here. It will look like more code, but it will be a lot more efficient. And you should probably be processing 16 pixels per iteration of the loop, rather than 4 - that way you load a vector of 8 bit values once, and unpack to get each set of 4 values as a vector of 32 bit ints as needed.
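A minimal sketch of what that could look like (SSE2 only), assuming the loop is reworked to 16 pixels per iteration and reusing the buffers from the question (pSrc8u, i); the variable names are illustrative, not from the original code:

const __m128i zero = _mm_setzero_si128();

// One aligned load pulls in 16 Y samples instead of 16 scalar byte reads.
__m128i y8 = _mm_load_si128((__m128i*)&pSrc8u[0][i]);

// Widen 8-bit -> 16-bit (zero-extend; Y is unsigned).
__m128i y16lo = _mm_unpacklo_epi8(y8, zero);
__m128i y16hi = _mm_unpackhi_epi8(y8, zero);

// Widen 16-bit -> 32-bit: four vectors of four pixels each, ready for the
// existing 32-bit arithmetic.
__m128i y32_0 = _mm_unpacklo_epi16(y16lo, zero);   // pixels i+0 .. i+3
__m128i y32_1 = _mm_unpackhi_epi16(y16lo, zero);   // pixels i+4 .. i+7
__m128i y32_2 = _mm_unpacklo_epi16(y16hi, zero);   // pixels i+8 .. i+11
__m128i y32_3 = _mm_unpackhi_epi16(y16hi, zero);   // pixels i+12 .. i+15

The K plane would widen the same way. For the U and V planes, which have already had 128 subtracted and are therefore signed, the zero vector would be replaced with a sign mask (e.g. _mm_cmpgt_epi8(zero, u8) for the 8-to-16 step and _mm_srai_epi16(u16, 15) for the 16-to-32 step) so the values sign-extend correctly. Also note that unpacking yields elements in memory order, which is the reverse of the _mm_set_epi32 ordering above, so the final unpack/pack sequence writing to pDstP would need to be checked against it.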
