快速转换矢量从RGB到BGRA [英] Fast vectorized conversion from RGB to BGRA

查看:438
本文介绍了快速转换矢量从RGB到BGRA的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

在后续对RGB转换为RGBA一些previous问题,ARGB到BGR,我想一个 RGB是 SSE <加快BGRA 转换/ STRONG>。假设一个32位机,并想使用内在。我有困难对准源和目标缓冲区与128位寄存器工作,并寻求其他精明的矢量化解决方案。

要实现矢量化的程序如下:......

 无效RGB8ToBGRX8(INT W,常量无效*中,无效*总分)
    {
        INT I;
        INT宽度= W;
        const的无符号字符* SRC =(const的无符号字符*)的;
        无符号整型* DST =(unsigned int类型*)出;
        unsigned int类型invalue,outvalue;        对于(i = 0; I&LT;宽度;我++,SRC + = 3,DST ++)
        {
                invalue = SRC [0];
                outvalue =(invalue&LT;&LT; 16);
                invalue = SRC [1];
                outvalue | =(invalue&所述;&下; 8);
                invalue = SRC [2];
                outvalue | =(invalue);
                * DST = outvalue | 0xff000000;
        }
      }

这个程序被primarly用于大型纹理(512KB),所以如果我可以并行的一些操作的,它可能是有益的一去处理更多的像素。当然,我需要的资料。 :)

编辑:

我的编译参数...

 的gcc -O2的main.c


解决方案

这是使用SSE3内在函数来执行请求的操作的一个例子。输入和输出指针必须为16字节对齐,并且它的16个像素的块上一次操作

我不认为你会得到一个显著的速度提升,虽然。在像素上执行的操作非常简单,因此内存带宽占主导地位。

 的#include&LT; tmmintrin.h&GT;/ *进出必须对准16字节* /
无效rgb_to_bgrx_sse(无符号W,常量无效*中,无效*总分)
{
    常量__m128i * in_vec =中;
    __m128i * out_vec =出来;    W / = 16;    而(w--大于0){
        / * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
         * in_vec [0]镭嘎巴铷千兆降B RC公元前了Gc钆路重新屋宇署戈为RF
         * in_vec [1] GF的Bf了Rg千兆克博伽梵歌的Rh GH了Bh日GI碧GJ的Rj BJ RK GK
         * in_vec [2]浅滩器Rl胃肠B1中罗通用汽车家蚕氡GN BN滚装转到博卢比的Gp BP
         * /
        __m128i IN1,IN2,IN3;
        __m128i出来;        IN1 = in_vec [0];        OUT = _mm_shuffle_epi8(IN1,
            _mm_set_epi8(0xff的,9,10,11,0xff的,6,7,8,0xff的,3,4,5,0xff的,0,1,2));
        OUT = _mm_or_si128(出,
            _mm_set_epi8(0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0));
        out_vec [0] = OUT;        IN2 = in_vec [1];        IN1 = _mm_and_si128(IN1,
            _mm_set_epi8(0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0,0,0,0,0,0,0,0));
        OUT = _mm_and_si128(IN2,
            _mm_set_epi8(0,0,0,0,0,0,0,0,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的));
        OUT = _mm_or_si128(满分,IN1);
        OUT = _mm_shuffle_epi8(出,
            _mm_set_epi8(0xff的,5,6,7,0xff的,2,3,4,0xff的,15,0,1,0xff的,12,13,14));
        OUT = _mm_or_si128(出,
            _mm_set_epi8(0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0));
        out_vec [1] = OUT;        立方英寸= in_vec [2];
        in_vec + = 3;        IN2 = _mm_and_si128(IN2,
            _mm_set_epi8(0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0,0,0,0,0,0,0,0));
        OUT = _mm_and_si128(立方英寸,
            _mm_set_epi8(0,0,0,0,0,0,0,0,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的,0xff的));
        OUT = _mm_or_si128(出来,平方英寸);
        OUT = _mm_shuffle_epi8(出,
            _mm_set_epi8(0xff的,1,2,3,0xff的,14,15,0,0xff的,11,12,13,0xff的,8,9,10));
        OUT = _mm_or_si128(出,
            _mm_set_epi8(0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0));
        out_vec [2] = OUT;        OUT = _mm_shuffle_epi8(立方英寸,
            _mm_set_epi8(0xff的,13,14,15,0xff的,10,11,12,0xff的,7,8,9,0xff的,4,5,6));
        OUT = _mm_or_si128(出,
            _mm_set_epi8(0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0,0xff的,0,0,0));
        out_vec [3] = OUT;        out_vec + = 4;
    }
}

In a follow-up to some previous questions on converting RGB to RGBA, and ARGB to BGR, I would like to speed up a RGB to BGRA conversion with SSE. Assume a 32-bit machine, and would like to use intrinsics. I'm having difficulty aligning both source and destination buffers to work with 128-bit registers, and seek for other savvy vectorization solutions.

The routine to be vectorized is as follows...

    void RGB8ToBGRX8(int w, const void *in, void *out)
    {
        int i;
        int width = w;
        const unsigned char *src= (const unsigned char*) in;
        unsigned int *dst= (unsigned int*) out;
        unsigned int invalue, outvalue;

        for (i=0; i<width; i++, src+=3, dst++)
        {
                invalue = src[0];
                outvalue = (invalue<<16);
                invalue = src[1];
                outvalue |= (invalue<<8);
                invalue = src[2];
                outvalue |= (invalue);
                *dst = outvalue | 0xff000000;
        }
      }

This routine gets used primarly for large textures (512KB), so if I can parallelize some of the operations, it may be beneficial to process more pixels at a go. Of course, I'll need to profile. :)

Edit:

My compilation arguments...

gcc -O2 main.c

解决方案

This is an example of using SSE3 intrinsics to perform the requested operation. The input and output pointers must be 16-byte aligned, and it operates on a block of 16 pixels at a time.

I don't think you will get a significant speed boost, though. The operations performed on the pixels are so simple that memory bandwidth dominates.

#include <tmmintrin.h>

/* in and out must be 16-byte aligned */
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    const __m128i *in_vec = in;
    __m128i *out_vec = out;

    w /= 16;

    while (w-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
         * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        __m128i in1, in2, in3;
        __m128i out;

        in1 = in_vec[0];

        out = _mm_shuffle_epi8(in1,
            _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[0] = out;

        in2 = in_vec[1];

        in1 = _mm_and_si128(in1,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in2,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in1);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[1] = out;

        in3 = in_vec[2];
        in_vec += 3;

        in2 = _mm_and_si128(in2,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in3,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in2);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[2] = out;

        out = _mm_shuffle_epi8(in3,
            _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[3] = out;

        out_vec += 4;
    }
}

这篇关于快速转换矢量从RGB到BGRA的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆