包装10位值与SIMD字节流 [英] packing 10 bit values into a byte stream with SIMD

查看：139 发布时间：2016/8/7 19:45:16 c++ x86 bit-manipulation simd

本文介绍了包装10位值与SIMD字节流的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我想使用 SIMD 指令把 10 位像素打包到一个连续的字节流中。下面的代码“原则上”可以做到，但 SIMD 版本比标量版本还要慢。

问题似乎在于，我找不到能够高效装载寄存器的聚集/分散（gather/scatter）操作。

改进建议？

  // SIMD_test.cpp：定义控制台应用程序的入口点。
//的#includestdafx.h中＃包括WINDOWS.H
＃包括LT＆;＆tmmintrin.h GT;
＃包括LT＆;＆stdint.h GT;
＃包括LT＆;＆string.h中GT;//参考非SIMD执行该工程
// 4 UINT16在一个时间作为输入，并5 UINT8按循环迭代输出无效packSlow（uint16_t * PTR，uint8_t有* streamBuffer，uint32_t的NCOL）
{
    对于（uint32_t的J = 0; J＆LT; NCOL; J + = 4）
    {
        streamBuffer [0] =（uint8_t有）（ptr的[0]）;
        streamBuffer [1] =（uint8_t有）（（（PTR [0]＆放大器; 0x3FF处）GT;→8）|（（PTR [1]＆安培; 0x3F的）下; 2））;
        streamBuffer [2] =（uint8_t有）（（（PTR [1]＆安培; 0x3FF处）GT;＆→6）|（（PTR [2]＆安培;为0x0F）所述; 4;））;
        streamBuffer [3] =（uint8_t有）（（（PTR [2]＆放大器; 0x3FF处）GT;→4）|（（PTR [3]＆放大器; 0×03）所述; 6;））;
        streamBuffer [4] =（uint8_t有）（（PTR [3]＆放大器; 0x3FF处）GT;＆→2）;
        streamBuffer + = 5;
        PTR + = 4;
    }
}
//写得不好的SIMD执行。尝试做同样的
//作为packSlow，但在时间8次迭代无效packFast（uint16_t * PTR，uint8_t有* streamBuffer，uint32_t的NCOL）
{
    常量__m128i MASKA = _mm_set_epi16（0x3FF处，0x3FF处，0x3FF处，0x3FF处，0x3FF处，0x3FF处，0x3FF处，0x3FF处）;
    常量__m128i maskb = _mm_set_epi16（0x3F的，0x3F的，0x3F的，0x3F的，0x3F的，0x3F的，0x3F的，0x3F的）;
    常量__m128i maskc = _mm_set_epi16（为0x0F，为0x0F，为0x0F，为0x0F，为0x0F，为0x0F，为0x0F，为0x0F）;
    常量__m128i maskd = _mm_set_epi16（0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03）;    为（uint32_t的J = 0; J＆下; NCOL; J + = 4 * 8）
    {
        _mm_ prefetch（（为const char *）（PTR + J），_ MM_HINT_T0）;
    }    为（uint32_t的J = 0; J＆下; NCOL; J + = 4 * 8）
    {
        //这个取的舞台是昂贵的。每学期需要2个周期
        __m128i ptr0 = _mm_set_epi16（PTR [0]，PTR [4]，PTR [8]，PTR [12]，PTR [16]，PTR [20]，PTR [24]，PTR [28]）;
        __m128i ptr1的= _mm_set_epi16（PTR [1]，PTR [5]，PTR [9]，PTR [13]，PTR [17]，PTR [21]，PTR [25]，PTR [29]）;
        __m128i PTR2 = _mm_set_epi16（PTR [2]，PTR [6]，PTR [10]，PTR [14]，PTR [18]，PTR [22]，PTR [26]，PTR [30]）;
        __m128i ptr3 = _mm_set_epi16（PTR [3]，PTR [7]，PTR [11]，PTR [15]，PTR [19]，PTR [23]，PTR [27]，PTR [31]）;           //我觉得这部分是相当不错的优化
        __m128i streamBuffer0 = ptr0;
        __m128i streamBuffer1 = _mm_or_si128（_mm_srl_epi16（_mm_and_si128（ptr0，MASKA），_mm_set_epi32（0,0，0,8）），_mm_sll_epi16（_mm_and_si128（ptr1的，maskb），_mm_set_epi32（0,0，0,2）））;
        __m128i streamBuffer2 = _mm_or_si128（_mm_srl_epi16（_mm_and_si128（ptr1的，MASKA），_mm_set_epi32（0,0，0,6）），_mm_sll_epi16（_mm_and_si128（PTR2，maskc），_mm_set_epi32（0,0，0,4）））;
        __m128i streamBuffer3 = _mm_or_si128（_mm_srl_epi16（_mm_and_si128（PTR2，MASKA），_mm_set_epi32（0,0，0,4）），_mm_sll_epi16（_mm_and_si128（ptr3，maskd），_mm_set_epi32（0,0，0,6）））;
        __m128i streamBuffer4 = _mm_srl_epi16（_mm_and_si128（ptr3，MASKA），_mm_set_epi32（0,0，0,2））;        //这又是非常缓慢的。 〜每字节输出2个周期
        对于（INT J = 15; J＆GT; = 0; J- = 2）
        {
            streamBuffer [0] = streamBuffer0.m128i_u8 [J]。
            streamBuffer [1] = streamBuffer1.m128i_u8 [J]。
            streamBuffer [2] = streamBuffer2.m128i_u8 [J]。
            streamBuffer [3] = streamBuffer3.m128i_u8 [J]。
            streamBuffer [4] = streamBuffer4.m128i_u8 [J]。
            streamBuffer + = 5;
        }
        PTR + = 32;
    }}INT _tmain（INT ARGC，_TCHAR *的argv []）
{    uint16_t像素[512];
    uint8_t有packed1 [512 * 10/8]。
    uint8_t有packed2 [512 * 10/8]。    的for（int i = 0; I＆LT; 512;我++）
    {
        像素[我] =我;
    }    LARGE_INTEGER T0，T1，T2;    QueryPerformanceCounter的（安培; T0）;
    对于（INT K = 0; K＆LT; 1000; k ++）packSlow（像素，packed1,512）;
    QueryPerformanceCounter的（安培T1）;
    对于（INT K = 0; K＆LT; 1000; k ++）packFast（像素，packed2,512）;
    QueryPerformanceCounter的（安培; T2）;    的printf（％D \\ n，t1.QuadPart-t0.QuadPart，t2.QuadPart-t1.QuadPart）;    如果（memcmp（packed1，packed2，sizeof的（packed1）））
    {
        的printf（失败\\ n）;
    }
    返回0;
}

解决方案

我在 SSE 方面没有专门的经验。不过，我会尝试按如下方式优化代码：

  //警告。这个程序需要streamBuffer至少有3个额外的备用字节
//在结束时被用作暂存空间。它会写0到这些字节。
//例如，streamBuffer必须是640 + 3个字节分配的内存如果
// 512 10位采样输出。无效packSlow1（uint16_t * PTR，uint8_t有* streamBuffer，uint32_t的NCOL）
{
    为（uint32_t的J = 0; J＆下; NCOL; J + = 4 * 4）
    {
        uint64_t中* DST;
        uint64_t中的src [4] [4];        // __m128i S01 = _mm_set_epi64（PTR [0]，PTR [1]）;
        // __m128i S23 = _mm_set_epi64（PTR [2]，PTR [3]）;
        //  -  -  要么  -  - 
        // __m128i s0123 = _mm_load_si128（PTR [0]）
        // __m128i S01 = _ ????? _（s0123）//一些指令，从s0123提取S01
        // __m128i S23 = _ ????? _（s0123）//一些指令来提取S23        SRC [0] [0] = ptr的[0]＆放大器; 0x3FF处;
        SRC [0] [1] = ptr的[1]＆放大器; 0x3FF处;
        SRC [0] [2] = ptr的[2]＆放大器; 0x3FF处;
        SRC [0] [3] = ptr的[3]＆放大器; 0x3FF处;        SRC [1] [0] = ptr的[4]＆放大器; 0x3FF处;
        SRC [1] [1] = ptr的[5]＆安培; 0x3FF处;
        SRC [1] [2] = ptr的[6]＆放大器; 0x3FF处;
        SRC [1] [3] = ptr的[7]＆放大器; 0x3FF处;        SRC [2] [0] = ptr的[8]＆放大器; 0x3FF处;
        SRC [2] [1] = ptr的[9]＆放大器; 0x3FF处;
        SRC [2] [2] = ptr的[10]＆安培; 0x3FF处;
        SRC [2] [3] = ptr的[11]＆放大器; 0x3FF处;        SRC [3] [0] = ptr的[12]＆放大器; 0x3FF处;
        SRC [3] [1] = ptr的[13]＆放大器; 0x3FF处;
        SRC [3] [2] = ptr的[14]＆放大器; 0x3FF处;
        SRC [3] [3] = ptr的[15]＆安培; 0x3FF处;        //貌似_mm_maskmoveu_si128可以储存导致效率
        DST =（uint64_t中*）streamBuffer;
        DST [0] = SRC [0] [0] | （源[0] [1];小于10）| （源[0] [2]所述;＆下; 20）| （源[0] [3]所述，小于30）;        DST =（uint64_t中*）（streamBuffer + 5）;
        DST [0] = SRC [1] [0] | （源[1] [1];小于10）| （源[1] [2]所述;＆下; 20）| （源[1] [3]所述，小于30）;        DST =（uint64_t中*）（streamBuffer + 10）;
        DST [0] = SRC [2] [0] | （源[2] [1];小于10）| （源[2] [2]所述;＆下; 20）| （源[2] [3]所述，小于30）;        DST =（uint64_t中*）（streamBuffer + 15）;
        DST [0] = SRC [3] [0] | （源[3] [1];小于10）| （源[3] [2]所述;＆下; 20）| （源[3] [3]所述，小于30）;        streamBuffer + = 5 * 4;
        PTR + = 4 * 4;
    }
}

更新：

测试：

Ubuntu 12.04，x86_64 GNU/Linux，gcc v4.6.3（Virtual Box）
Intel Core i7（Macbook Pro）
使用 -O3 编译

5717633386 (1X):   packSlow
3868744491 (1.4X): packSlow1（本帖中的版本）
4471858853 (1.2X): packFast2（来自 Mark Lakata 的帖子）
1820784764 (3.1X): packFast3（本帖中的版本）

Windows 8.1，x64，VS2012 Express
Intel Core i5（Asus）
使用标准 'Release' 选项编译，并启用 SSE2

00413185 (1X)   packSlow
00782005 (0.5X) packSlow1
00236639 (1.7X) packFast2
00148906 (2.8X) packFast3

我在装有 Windows 8.1 和 VS Express 2012 的华硕笔记本上看到了完全不同的结果（代码以 -O2 编译）。packSlow1 比原始的 packSlow 慢 2 倍，而 packFast2 只比 packSlow 快 1.7 倍（而不是 2.9 倍）。研究这个问题之后，我明白了原因：VC 编译器无法把 packFast2 用到的所有常量都保存在 XMM 寄存器中，因此在循环内插入了额外的内存访问（参见生成的汇编代码）。缓慢的内存访问解释了性能下降。

为了获得更稳定的结果，我把像素缓冲区增大到 256x512，并把循环次数从 1000 增加到 10000000/256。

下面是我的SSE优化的函数的版本。

  //警告。这个程序需要streamBuffer至少有3个额外的备用字节
//在结束时被用作暂存空间。它会写0到这些字节。
//例如，streamBuffer必须是640 + 3个字节分配的内存如果
// 512 10位采样输出。无效packFast3（uint16_t * PTR，uint8_t有* streamBuffer，uint32_t的NCOL）
{
    常量__m128i M0 = _mm_set_epi16（0，0x3FF对，0，0x3FF对，0，0x3FF对，0，0x3FF处）;
    常量__m128i M1 = _mm_set_epi16（0x3FF处，0，为0x3FF，0，为0x3FF，0，为0x3FF，0）;
    常量__m128i 2 = _mm_set_epi32（0，为0xFFFFFFFF，0，为0xFFFFFFFF）;
    常量__m128i立方米= _mm_set_epi32（0xFFFFFFFF的，0，为0xFFFFFFFF，0）;
    常量__m128i M4 = _mm_set_epi32（0，0，为0xFFFFFFFF，为0xFFFFFFFF）;
    常量__m128i M5 = _mm_set_epi32（0xFFFFFFFF的，为0xFFFFFFFF，0，0）;
    __m128i S0，T0，R0，X0，X1;    //展开和正常循环给出了相同的结果
    对于（uint32_t的J = 0; J＆LT; NCOL; J + = 8）
    {
        //装载8个样品为S0
        S0 = _mm_loadu_si128（（__ m128i *）PTR）; // S0 = 00070006_00050004_00030002_00010000        //加入16位采样为32位的字
        X0 = _mm_and_si128（S0，M0）; // X0 = 00000006_00000004_00000002_00000000
        X1 = _mm_and_si128（S0，M1）; // X1 = 00070000_00050000_00030000_00010000
        T0 = _mm_or_si128（X0，_mm_srli_epi32（X1，6））; // T0 = 00001c06_00001404_00000c02_00000400        //加入32位字为64位的双字
        X0 = _mm_and_si128（T0，2）; // X0 = 00000000_00001404_00000000_00000400
        X1 = _mm_and_si128（T0，M3）; // X1 = 00001c06_00000000_00000c02_00000000
        T0 = _mm_or_si128（X0，_mm_srli_epi64（X1，12））; // T0 = 00000001_c0601404_00000000_c0200400        //加入64位双字
        X0 = _mm_and_si128（T0，M4）; // X0 = 00000000_00000000_00000000_c0200400
        X1 = _mm_and_si128（T0，M5）; // X1 = 00000001_c0601404_00000000_00000000
        R0 = _mm_or_si128（X0，_mm_srli_si128（X1，3））; // R0 = 00000000_000001c0_60140400_c0200400        //结果存储
        _mm_storeu_si128（（__ m128i *）streamBuffer，R0）;        streamBuffer + = 10;
        PTR + = 8;
    }
}

I'm trying to pack 10-bit pixels into a continuous byte stream using SIMD instructions. The code below does it "in principle", but the SIMD version is slower than the scalar version.

The problem seems to be that I can't find good gather/scatter operations that load the register efficiently.

Any suggestions for improvement?

// SIMD_test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>

// Scalar reference packer.
// Each pass of the loop consumes 4 uint16 samples and emits 5 bytes: the low
// 10 bits of every sample are laid end to end, least-significant bits first.
//
// ptr          - input samples
// streamBuffer - output bytes, 5 for every 4 input samples
// NCOL         - sample count; the loop advances 4 samples per pass

void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const uint16_t *in = ptr;
    uint8_t *out = streamBuffer;
    uint32_t consumed = 0;

    while (consumed < NCOL)
    {
        // keep only the 10 payload bits of each sample
        const unsigned s0 = in[0] & 0x3FFu;
        const unsigned s1 = in[1] & 0x3FFu;
        const unsigned s2 = in[2] & 0x3FFu;
        const unsigned s3 = in[3] & 0x3FFu;

        // the uint8_t casts drop whatever spills past bit 7 of each byte
        out[0] = (uint8_t)s0;
        out[1] = (uint8_t)((s0 >> 8) | (s1 << 2));
        out[2] = (uint8_t)((s1 >> 6) | (s2 << 4));
        out[3] = (uint8_t)((s2 >> 4) | (s3 << 6));
        out[4] = (uint8_t)(s3 >> 2);

        in += 4;
        out += 5;
        consumed += 4;
    }
}


// SIMD packer that produces the same byte stream as the scalar reference, but
// handles eight 4-sample groups (32 samples in, 40 bytes out) per iteration.
//
// ptr          - input samples; only the low 10 bits of each are packed
// streamBuffer - output bytes, 5 for every 4 input samples
// NCOL         - sample count; the loop advances 32 samples per pass, so a
//                count that is not a multiple of 32 is rounded up to the next
//                full pass
//
// Still slow by design of the data flow: the strided element-wise gather and
// the byte-wise scatter dominate the cost.

void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i maska = _mm_set1_epi16(0x3FF);
    const __m128i maskb = _mm_set1_epi16(0x3F);
    const __m128i maskc = _mm_set1_epi16(0x0F);
    const __m128i maskd = _mm_set1_epi16(0x03);

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        _mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
    }

    for(uint32_t j=0;j<NCOL;j+=4*8)
    {
        // Gather: ptrK holds the K-th sample of each of the 8 groups.
        // _mm_set_epi16 takes its arguments from the highest lane down, so
        // group 0 ends up in lane 7 and group 7 in lane 0.
        __m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
        __m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
        __m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
        __m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);

        // Output byte K of every group, computed in the LOW byte of each
        // 16-bit lane (the high byte is don't-care / zero).
        __m128i streamBuffer0 = ptr0;
        __m128i streamBuffer1 = _mm_or_si128(_mm_srli_epi16(_mm_and_si128(ptr0, maska), 8),
                                             _mm_slli_epi16(_mm_and_si128(ptr1, maskb), 2));
        __m128i streamBuffer2 = _mm_or_si128(_mm_srli_epi16(_mm_and_si128(ptr1, maska), 6),
                                             _mm_slli_epi16(_mm_and_si128(ptr2, maskc), 4));
        __m128i streamBuffer3 = _mm_or_si128(_mm_srli_epi16(_mm_and_si128(ptr2, maska), 4),
                                             _mm_slli_epi16(_mm_and_si128(ptr3, maskd), 6));
        __m128i streamBuffer4 = _mm_srli_epi16(_mm_and_si128(ptr3, maska), 2);

        // Spill the vectors to memory so individual bytes can be read without
        // relying on the MSVC-only __m128i union members.
        uint8_t bytes[5][16];
        _mm_storeu_si128((__m128i*)bytes[0], streamBuffer0);
        _mm_storeu_si128((__m128i*)bytes[1], streamBuffer1);
        _mm_storeu_si128((__m128i*)bytes[2], streamBuffer2);
        _mm_storeu_si128((__m128i*)bytes[3], streamBuffer3);
        _mm_storeu_si128((__m128i*)bytes[4], streamBuffer4);

        // Scatter: walk lanes from 7 down to 0 (group 0 first) and take the
        // low byte of each lane, i.e. byte index 2*lane.
        for(int lane=7;lane>=0;lane--)
        {
            const int lo = 2*lane;
            streamBuffer[0] = bytes[0][lo];
            streamBuffer[1] = bytes[1][lo];
            streamBuffer[2] = bytes[2][lo];
            streamBuffer[3] = bytes[3][lo];
            streamBuffer[4] = bytes[4][lo];
            streamBuffer += 5;
        }
        ptr += 32;
    }

}

// Benchmark driver: packs the same 512 samples 1000 times with packSlow and
// 1000 times with packFast, prints the two elapsed QueryPerformanceCounter
// tick counts, and prints "failed" if the two packed outputs differ.
int _tmain(int argc, _TCHAR* argv[])
{
    (void)argc;
    (void)argv;

    uint16_t pixels[512];
    uint8_t packed1[512*10/8];
    uint8_t packed2[512*10/8];

    for(int i=0;i<512;i++)
    {
        pixels[i] = (uint16_t)i;
    }

    LARGE_INTEGER t0,t1,t2;

    QueryPerformanceCounter(&t0);
    for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
    QueryPerformanceCounter(&t1);
    for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
    QueryPerformanceCounter(&t2);

    // QuadPart is a 64-bit integer; "%d" would mismatch the argument size.
    printf("%lld %lld\n",
           (long long)(t1.QuadPart-t0.QuadPart),
           (long long)(t2.QuadPart-t1.QuadPart));

    if (memcmp(packed1,packed2,sizeof(packed1)))
    {
        printf("failed\n");
    }


    return 0;
}

解决方案

I have no experience specifically in SSE. But I would have tried to optimize the code as follows.

// Scalar packer that assembles each group of 4 samples into one 40-bit word
// (sample k occupies bits 10*k .. 10*k+9) and then emits that word as 5 bytes,
// least-significant byte first.
//
// ptr          - input samples; only the low 10 bits of each are packed
// streamBuffer - output bytes, exactly 5 for every 4 input samples; no spare
//                bytes past the packed data are needed or touched
// NCOL         - sample count; the loop advances 4 samples per pass
//
// The bytes are written one at a time instead of through a casted uint64_t
// pointer: that avoids unaligned/aliasing stores, does not depend on host
// byte order, and never writes beyond the 5 bytes of the group.

void packSlow1(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for(uint32_t j=0;j<NCOL;j+=4)
    {
        const uint64_t packed =  (uint64_t)(ptr[0] & 0x3ff)
                              | ((uint64_t)(ptr[1] & 0x3ff) << 10)
                              | ((uint64_t)(ptr[2] & 0x3ff) << 20)
                              | ((uint64_t)(ptr[3] & 0x3ff) << 30);

        streamBuffer[0] = (uint8_t)(packed);
        streamBuffer[1] = (uint8_t)(packed >> 8);
        streamBuffer[2] = (uint8_t)(packed >> 16);
        streamBuffer[3] = (uint8_t)(packed >> 24);
        streamBuffer[4] = (uint8_t)(packed >> 32);

        streamBuffer += 5;
        ptr += 4;
    }
}

UPDATE:

Benchmarks:

Ubuntu 12.04, x86_64 GNU/Linux, gcc v4.6.3 (Virtual Box)
Intel Core i7 (Macbook pro)
compiled with -O3

5717633386 (1X):   packSlow
3868744491 (1.4X): packSlow1 (version from the post)
4471858853 (1.2X): packFast2 (from Mark Lakata's post)
1820784764 (3.1X): packFast3 (version from the post)

Windows 8.1, x64, VS2012 Express
Intel Core i5 (Asus)
compiled with standard 'Release' options and SSE2 enabled

00413185 (1X)   packSlow
00782005 (0.5X) packSlow1
00236639 (1.7X) packFast2
00148906 (2.8X) packFast3

I see completely different results on an Asus notebook with Windows 8.1 and VS Express 2012 (code compiled with -O2). packSlow1 is 2x slower than the original packSlow, while packFast2 is 1.7X (not 2.9X) faster than packSlow. After researching this problem, I understood the reason. The VC compiler was unable to keep all the constants in XMM registers for packFast2, so it inserted additional memory accesses into the loop (see the generated assembly). Slow memory access explains the performance degradation.

In order to get more stable results I increased pixels buffer to 256x512 and increased loop counter from 1000 to 10000000/256.

Here is my version of SSE optimized function.

// SSE2 packer: loads 8 samples at once, squeezes them into 80 contiguous bits
// with three mask/shift/or merge steps, and stores exactly 10 bytes.
//
// ptr          - input samples; only the low 10 bits of each are packed.
//                Read with unaligned 16-byte loads, 8 samples at a time.
// streamBuffer - output bytes, exactly 10 for every 8 input samples; no spare
//                bytes past the packed data are needed or touched
// NCOL         - sample count; the loop advances 8 samples per pass, so a
//                count that is not a multiple of 8 is rounded up to the next
//                full pass
//
// The trace comments show register contents for the input samples 0,1,...,7.

void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    const __m128i m0 = _mm_set_epi16(0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF);  // even 16-bit lanes
    const __m128i m1 = _mm_set_epi16(0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0);  // odd 16-bit lanes
    const __m128i m2 = _mm_set_epi32(0, -1, 0, -1);                            // even 32-bit lanes
    const __m128i m3 = _mm_set_epi32(-1, 0, -1, 0);                            // odd 32-bit lanes
    const __m128i m4 = _mm_set_epi32(0, 0, -1, -1);                            // low 64 bits
    const __m128i m5 = _mm_set_epi32(-1, -1, 0, 0);                            // high 64 bits

    for(uint32_t j=0;j<NCOL;j+=8)
    {
        // load 8 samples
        const __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);   // s0=00070006_00050004_00030002_00010000

        // join pairs of 10-bit samples into 20 bits per 32-bit word
        __m128i x0 = _mm_and_si128(s0, m0);                        // x0=00000006_00000004_00000002_00000000
        __m128i x1 = _mm_and_si128(s0, m1);                        // x1=00070000_00050000_00030000_00010000
        __m128i t0 = _mm_or_si128(x0, _mm_srli_epi32(x1, 6));      // t0=00001c06_00001404_00000c02_00000400

        // join pairs of 20-bit fields into 40 bits per 64-bit half
        x0 = _mm_and_si128(t0, m2);                                // x0=00000000_00001404_00000000_00000400
        x1 = _mm_and_si128(t0, m3);                                // x1=00001c06_00000000_00000c02_00000000
        t0 = _mm_or_si128(x0, _mm_srli_epi64(x1, 12));             // t0=00000001_c0601404_00000000_c0200400

        // join the two 40-bit halves: move the upper one down by 3 bytes so
        // it starts at byte 5, right after the lower one
        x0 = _mm_and_si128(t0, m4);                                // x0=00000000_00000000_00000000_c0200400
        x1 = _mm_and_si128(t0, m5);                                // x1=00000001_c0601404_00000000_00000000
        const __m128i r0 = _mm_or_si128(x0, _mm_srli_si128(x1, 3));// r0=00000000_000001c0_60140400_c0200400

        // store exactly the 10 payload bytes: the low 8 bytes in one go, then
        // bytes 8..9 taken from 16-bit lane 4
        _mm_storel_epi64((__m128i*)streamBuffer, r0);
        const int tail = _mm_extract_epi16(r0, 4);
        streamBuffer[8] = (uint8_t)tail;
        streamBuffer[9] = (uint8_t)(tail >> 8);

        streamBuffer += 10;
        ptr += 8;
    }
}

这篇关于包装10位值与SIMD字节流的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

包装10位值与SIMD字节流 [英] packing 10 bit values into a byte stream with SIMD

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

包装10位值与SIMD字节流 [英] packing 10 bit values into a byte stream with SIMD

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭