快速转换矢量从RGB到BGRA [英] Fast vectorized conversion from RGB to BGRA

查看：438 发布时间：2016/8/21 21:32:46 c opengl sse simd vectorization

本文介绍了快速转换矢量从RGB到BGRA的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

在后续对RGB转换为RGBA一些previous问题，ARGB到BGR，我想一个 RGB是 SSE <加快BGRA 转换/ STRONG>。假设一个32位机，并想使用内在。我有困难对准源和目标缓冲区与128位寄存器工作，并寻求其他精明的矢量化解决方案。

要实现矢量化的程序如下：......

 无效RGB8ToBGRX8（INT W，常量无效*中，无效*总分）
    {
        INT I;
        INT宽度= W;
        const的无符号字符* SRC =（const的无符号字符*）的;
        无符号整型* DST =（unsigned int类型*）出;
        unsigned int类型invalue，outvalue;        对于（i = 0; I＆LT;宽度;我++，SRC + = 3，DST ++）
        {
                invalue = SRC [0];
                outvalue =（invalue＆LT;＆LT; 16）;
                invalue = SRC [1];
                outvalue | =（invalue＆所述;＆下; 8）;
                invalue = SRC [2];
                outvalue | =（invalue）;
                * DST = outvalue | 0xff000000;
        }
      }

这个程序被primarly用于大型纹理（512KB），所以如果我可以并行的一些操作的，它可能是有益的一去处理更多的像素。当然，我需要的资料。：）

编辑：

我的编译参数...

 的gcc -O2的main.c

解决方案

这是使用SSE3内在函数来执行请求的操作的一个例子。输入和输出指针必须为16字节对齐，并且它的16个像素的块上一次操作
我不认为你会得到一个显著的速度提升，虽然。在像素上执行的操作非常简单，因此内存带宽占主导地位。
的#include＆LT; tmmintrin.h＆GT;/ *进出必须对准16字节* / 无效rgb_to_bgrx_sse（无符号W，常量无效*中，无效*总分） { 常量__m128i * in_vec =中; __m128i * out_vec =出来; W / = 16; 而（w--大于0）{ / * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * in_vec [0]镭嘎巴铷千兆降B RC公元前了Gc钆路重新屋宇署戈为RF * in_vec [1] GF的Bf了Rg千兆克博伽梵歌的Rh GH了Bh日GI碧GJ的Rj BJ RK GK * in_vec [2]浅滩器Rl胃肠B1中罗通用汽车家蚕氡GN BN滚装转到博卢比的Gp BP * / __m128i IN1，IN2，IN3; __m128i出来; IN1 = in_vec [0]; OUT = _mm_shuffle_epi8（IN1， _mm_set_epi8（0xff的，9，10，11，0xff的，6，7，8，0xff的，3，4，5，0xff的，0，1，2））; OUT = _mm_or_si128（出， _mm_set_epi8（0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0））; out_vec [0] = OUT; IN2 = in_vec [1]; IN1 = _mm_and_si128（IN1， _mm_set_epi8（0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0，0，0，0，0，0，0，0））; OUT = _mm_and_si128（IN2， _mm_set_epi8（0，0，0，0，0，0，0，0，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的））; OUT = _mm_or_si128（满分，IN1）; OUT = _mm_shuffle_epi8（出， _mm_set_epi8（0xff的，5，6，7，0xff的，2，3，4，0xff的，15，0，1，0xff的，12，13，14））; OUT = _mm_or_si128（出， _mm_set_epi8（0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0））; out_vec [1] = OUT; 立方英寸= in_vec [2]; in_vec + = 3; IN2 = _mm_and_si128（IN2， _mm_set_epi8（0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0，0，0，0，0，0，0，0））; OUT = _mm_and_si128（立方英寸， _mm_set_epi8（0，0，0，0，0，0，0，0，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的，0xff的））; OUT = _mm_or_si128（出来，平方英寸）; OUT = _mm_shuffle_epi8（出， _mm_set_epi8（0xff的，1，2，3，0xff的，14，15，0，0xff的，11，12，13，0xff的，8，9，10））; OUT = _mm_or_si128（出， _mm_set_epi8（0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0））; out_vec [2] = OUT; OUT = _mm_shuffle_epi8（立方英寸， _mm_set_epi8（0xff的，13，14，15，0xff的，10，11，12，0xff的，7，8，9，0xff的，4，5，6））; OUT = _mm_or_si128（出， _mm_set_epi8（0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0，0xff的，0，0，0））; out_vec [3] = OUT; out_vec + = 4; } }
In a follow-up to some previous questions on converting RGB to RGBA, and ARGB to BGR, I would like to speed up a RGB to BGRA conversion with SSE. Assume a 32-bit machine, and would like to use intrinsics. I'm having difficulty aligning both source and destination buffers to work with 128-bit registers, and seek for other savvy vectorization solutions.

The routine to be vectorized is as follows...
void RGB8ToBGRX8(int w, const void *in, void *out) { int i; int width = w; const unsigned char *src= (const unsigned char*) in; unsigned int *dst= (unsigned int*) out; unsigned int invalue, outvalue; for (i=0; i<width; i++, src+=3, dst++) { invalue = src[0]; outvalue = (invalue<<16); invalue = src[1]; outvalue |= (invalue<<8); invalue = src[2]; outvalue |= (invalue); *dst = outvalue | 0xff000000; } }
This routine gets used primarly for large textures (512KB), so if I can parallelize some of the operations, it may be beneficial to process more pixels at a go. Of course, I'll need to profile. :)

Edit:

My compilation arguments...
gcc -O2 main.c

解决方案
This is an example of using SSE3 intrinsics to perform the requested operation. The input and output pointers must be 16-byte aligned, and it operates on a block of 16 pixels at a time.

I don't think you will get a significant speed boost, though. The operations performed on the pixels are so simple that memory bandwidth dominates.
#include <tmmintrin.h> /* in and out must be 16-byte aligned */ void rgb_to_bgrx_sse(unsigned w, const void *in, void *out) { const __m128i *in_vec = in; __m128i *out_vec = out; w /= 16; while (w-- > 0) { /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * in_vec[0] Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf * in_vec[1] Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk * in_vec[2] Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp */ __m128i in1, in2, in3; __m128i out; in1 = in_vec[0]; out = _mm_shuffle_epi8(in1, _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2)); out = _mm_or_si128(out, _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0)); out_vec[0] = out; in2 = in_vec[1]; in1 = _mm_and_si128(in1, _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0)); out = _mm_and_si128(in2, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff)); out = _mm_or_si128(out, in1); out = _mm_shuffle_epi8(out, _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14)); out = _mm_or_si128(out, _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0)); out_vec[1] = out; in3 = in_vec[2]; in_vec += 3; in2 = _mm_and_si128(in2, _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0)); out = _mm_and_si128(in3, _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff)); out = _mm_or_si128(out, in2); out = _mm_shuffle_epi8(out, _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10)); out = _mm_or_si128(out, _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0)); out_vec[2] = out; out = _mm_shuffle_epi8(in3, _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6)); out = _mm_or_si128(out, _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0)); out_vec[3] = out; out_vec += 4; } }

这篇关于快速转换矢量从RGB到BGRA的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

快速转换矢量从RGB到BGRA [英] Fast vectorized conversion from RGB to BGRA

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录关闭

快速转换矢量从RGB到BGRA [英] Fast vectorized conversion from RGB to BGRA

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录 关闭

登录关闭