将32位数据打包为32字节SIMD向量的最快方法 [英] Fastest way to unpack 32 bits to a 32 byte SIMD vector

查看:196
本文介绍了将32位数据打包为32字节SIMD向量的最快方法的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

将32位存储在内存中的uint32_t中,最快的方法是将每个位解压缩到AVX寄存器的单独字节元素中?这些位可以在其各自字节内的任何位置.

为澄清起见,我的意思是位0到字节0,位1到字节1.显然,该字节内的所有其他位都为零.目前,我能做到的最好是2 PSHUFB,并且每个位置都有一个掩码寄存器.

如果uint32_t是位图,则相应的矢量元素应为0或非0. (也就是说,我们可以获得一个vpcmpeqb相对于全零矢量的矢量蒙版).

https://software.intel.com/zh-CN/forums/topic/283382

解决方案

要将32位整数x的32位广播"到256位YMM寄存器z的32字节或16位的两个128位XMM寄存器z_lowz_high,您可以执行以下操作.

使用AVX2:

__m256i y = _mm256_set1_epi32(x);
__m256i z = _mm256_shuffle_epi8(y,mask1);
z = _mm256_and_si256(z,mask2);

在没有AVX2的情况下,最好使用SSE做到这一点:

__m128i y = _mm_set1_epi32(x);      
__m128i z_low  = _mm_shuffle_epi8(y,mask_low);
__m128i z_high = _mm_shuffle_epi8(y,mask_high); 
z_low  = _mm_and_si128(z_low ,mask2);
z_high = _mm_and_si128(z_high,mask2);

遮罩和工作示例如下所示.如果您打算多次进行此操作,则可能应该 在主循环之外定义遮罩.

#include <immintrin.h>
#include <stdio.h>

int main() {
    int x = 0x87654321;

    static const char mask1a[32] = {
        0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00,
        0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01,
        0x02, 0x02, 0x02, 0x02,
        0x02, 0x02, 0x02, 0x02,
        0x03, 0x03, 0x03, 0x03,
        0x03, 0x03, 0x03, 0x03
    };

    static const char mask2a[32] = {
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
    };

char out[32];

#if defined ( __AVX2__ )
    __m256i mask2 = _mm256_loadu_si256((__m256i*)mask2a);
    __m256i mask1  = _mm256_loadu_si256((__m256i*)mask1a);

    __m256i y =    _mm256_set1_epi32(x);
    __m256i z =    _mm256_shuffle_epi8(y,mask1);
    z = _mm256_and_si256(z,mask2);

    _mm256_storeu_si256((__m256i*)out,z);

#else
    __m128i mask2 = _mm_loadu_si128((__m128i*)mask2a);
    __m128i mask_low  = _mm_loadu_si128((__m128i*)&mask1a[ 0]);
    __m128i mask_high = _mm_loadu_si128((__m128i*)&mask1a[16]);    

    __m128i y = _mm_set1_epi32(x); 
    __m128i z_low  = _mm_shuffle_epi8(y,mask_low);
    __m128i z_high = _mm_shuffle_epi8(y,mask_high);
    z_low  = _mm_and_si128(z_low,mask2);
    z_high = _mm_and_si128(z_high,mask2);

    _mm_storeu_si128((__m128i*)&out[ 0],z_low);
    _mm_storeu_si128((__m128i*)&out[16],z_high);
#endif
    for(int i=0; i<8; i++) {
        for(int j=0; j<4; j++) {        
            printf("%x ", out[4*i+j]);
        }printf("\n");
    } printf("\n");
}


要在每个向量元素中获取0或-1:

对全零采取额外的步骤_mm256_cmpeq_epi8.任何非零将变为0,零将变为-1.如果我们不希望这种反转,请使用andnot而不是and.它将其第一个操作数取反.

__m256i expand_bits_to_bytes(uint32_t x)
{
    __m256i xbcast = _mm256_set1_epi32(x);    // we only use the low 32bits of each lane, but this is fine with AVX2

    // Each byte gets the source byte containing the corresponding bit
    __m256i shufmask = _mm256_set_epi64x(
        0x0303030303030303, 0x0202020202020202,
        0x0101010101010101, 0x0000000000000000);
    __m256i shuf  = _mm256_shuffle_epi8(xbcast, shufmask);

    __m256i andmask  = _mm256_set1_epi64x(0x8040201008040201);  // every 8 bits -> 8 bytes, pattern repeats.
    __m256i isolated_inverted = _mm256_andnot_si256(shuf, andmask);

    // this is the extra step: compare each byte == 0 to produce 0 or -1
    return _mm256_cmpeq_epi8(isolated_inverted, _mm256_setzero_si256());
     // alternative: compare against the AND mask to get 0 or -1,
     // avoiding the need for a vector zero constant.
}

Godbolt编译器资源管理器.

另请参阅是否存在对于其他元素大小,是否要与intel avx2中的movemask指令相反?.

Having 32 bits stored in a uint32_t in memory, what's the fastest way to unpack each bit to a separate byte element of an AVX register? The bits can be in any position within their respective byte.

Edit: to clarify, I mean bit 0 goes to byte 0, bit 1 to byte 1. Obviously all other bits within the byte on zero. Best I could at the moment is 2 PSHUFB and having a mask register for each position.

If the uint32_t is a bitmap, then the corresponding vector elements should be 0 or non-0. (i.e. so we could get a vector mask with a vpcmpeqb against a vector of all-zero).

https://software.intel.com/en-us/forums/topic/283382

解决方案

To "broadcast" the 32 bits of a 32-bit integer x to 32 bytes of a 256-bit YMM register z or 16 bytes of a two 128-bit XMM registers z_low and z_high you can do the following.

With AVX2:

__m256i y = _mm256_set1_epi32(x);
__m256i z = _mm256_shuffle_epi8(y,mask1);
z = _mm256_and_si256(z,mask2);

Without AVX2 it's best to do this with SSE:

__m128i y = _mm_set1_epi32(x);      
__m128i z_low  = _mm_shuffle_epi8(y,mask_low);
__m128i z_high = _mm_shuffle_epi8(y,mask_high); 
z_low  = _mm_and_si128(z_low ,mask2);
z_high = _mm_and_si128(z_high,mask2);

The masks and a working example are shown below. If you plan to do this several times you should probably define the masks outside of the main loop.

#include <immintrin.h>
#include <stdio.h>

int main() {
    int x = 0x87654321;

    static const char mask1a[32] = {
        0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00,
        0x01, 0x01, 0x01, 0x01,
        0x01, 0x01, 0x01, 0x01,
        0x02, 0x02, 0x02, 0x02,
        0x02, 0x02, 0x02, 0x02,
        0x03, 0x03, 0x03, 0x03,
        0x03, 0x03, 0x03, 0x03
    };

    static const char mask2a[32] = {
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08,
        0x10, 0x20, 0x40, 0x80,
    };

char out[32];

#if defined ( __AVX2__ )
    __m256i mask2 = _mm256_loadu_si256((__m256i*)mask2a);
    __m256i mask1  = _mm256_loadu_si256((__m256i*)mask1a);

    __m256i y =    _mm256_set1_epi32(x);
    __m256i z =    _mm256_shuffle_epi8(y,mask1);
    z = _mm256_and_si256(z,mask2);

    _mm256_storeu_si256((__m256i*)out,z);

#else
    __m128i mask2 = _mm_loadu_si128((__m128i*)mask2a);
    __m128i mask_low  = _mm_loadu_si128((__m128i*)&mask1a[ 0]);
    __m128i mask_high = _mm_loadu_si128((__m128i*)&mask1a[16]);    

    __m128i y = _mm_set1_epi32(x); 
    __m128i z_low  = _mm_shuffle_epi8(y,mask_low);
    __m128i z_high = _mm_shuffle_epi8(y,mask_high);
    z_low  = _mm_and_si128(z_low,mask2);
    z_high = _mm_and_si128(z_high,mask2);

    _mm_storeu_si128((__m128i*)&out[ 0],z_low);
    _mm_storeu_si128((__m128i*)&out[16],z_high);
#endif
    for(int i=0; i<8; i++) {
        for(int j=0; j<4; j++) {        
            printf("%x ", out[4*i+j]);
        }printf("\n");
    } printf("\n");
}


To get 0 or -1 in each vector element:

It takes one extra step _mm256_cmpeq_epi8 against all-zeros. Any non-zero turns into 0, and zero turns into -1. If we don't want this inversion, use andnot instead of and. It inverts its first operand.

__m256i expand_bits_to_bytes(uint32_t x)
{
    __m256i xbcast = _mm256_set1_epi32(x);    // we only use the low 32bits of each lane, but this is fine with AVX2

    // Each byte gets the source byte containing the corresponding bit
    __m256i shufmask = _mm256_set_epi64x(
        0x0303030303030303, 0x0202020202020202,
        0x0101010101010101, 0x0000000000000000);
    __m256i shuf  = _mm256_shuffle_epi8(xbcast, shufmask);

    __m256i andmask  = _mm256_set1_epi64x(0x8040201008040201);  // every 8 bits -> 8 bytes, pattern repeats.
    __m256i isolated_inverted = _mm256_andnot_si256(shuf, andmask);

    // this is the extra step: compare each byte == 0 to produce 0 or -1
    return _mm256_cmpeq_epi8(isolated_inverted, _mm256_setzero_si256());
     // alternative: compare against the AND mask to get 0 or -1,
     // avoiding the need for a vector zero constant.
}

See it on the Godbolt Compiler Explorer.

Also see is there an inverse instruction to the movemask instruction in intel avx2? for other element sizes.

这篇关于将32位数据打包为32字节SIMD向量的最快方法的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆