压缩AVX2寄存器,使根据掩码选中的整数连续排列 [英] Compact AVX2 register so selected integers are contiguous according to mask

查看:638
本文介绍了如何压缩AVX2寄存器、使根据掩码选中的整数连续排列的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

在“优化数组压缩”(Optimizing Array Compaction)这个问题中,得票最高的回答指出:


  

借助最新指令集的 SSE/AVX 寄存器可以采用更好的方法。我们可以直接使用 PMOVMSKB 的结果,把它转换成 PSHUFB 之类指令所需的控制寄存器。


这在 Haswell(AVX2)上可行吗?还是需要 AVX512 的某个变种?

我有一个包含 int32 的 AVX2 向量,以及一个存放比较结果的对应向量。我想以某种方式对它进行重排(shuffle),使掩码中对应最高位被置位(比较为真)的元素在向量的低端连续排列。

我能想到的最好办法是用 _mm256_movemask_ps / vmovmskps 得到位掩码(没有 *d 变体吗?),然后用它去查一张由 256 个 AVX2 向量组成的查找表,得到供跨通道的 _mm256_permutevar8x32_epi32 / vpermd 使用的重排掩码。

解决方案

首先要做的是找到一个快速的标量函数。下面是一个不使用分支的版本。

 /*
  * Branch-free scalar compaction.
  *
  * Copies the non-zero elements of x[0..n-1] to the front of y, keeping
  * their order, and returns how many elements were kept.
  *
  * Every iteration stores to y[cnt], including for zero elements (which
  * store 0), so the slot just past the kept elements may be overwritten.
  * The write index never exceeds the read index.
  *
  * Declared static inline: in C a plain `inline` definition does not emit
  * an external definition, so a non-inlined call would fail to link.
  */
static inline int compact(int *x, int *y, const int n) {
    int cnt = 0;
    for (int i = 0; i < n; i++) {
        int cut = x[i] != 0;    /* 1 when the element is kept, 0 otherwise */
        y[cnt] = cut * x[i];
        cnt += cut;             /* advance only past kept elements */
    }
    return cnt;
}

SIMD 的最佳结果可能取决于零的分布,即分布是稀疏还是稠密。下面的代码对于稀疏或稠密的分布(例如连续出现很长的零段和非零段)应该效果不错。如果分布比较均匀,我不确定这段代码是否还有优势,但无论如何它都会给出正确的结果。

下面是我测试过的 AVX2 版本。

  /*
   * AVX2 compaction of non-zero ints.
   *
   * Copies the non-zero elements of x[0..n-1] to the front of y in their
   * original order and returns the number copied. Input is examined eight
   * ints at a time:
   *   - a group that is entirely zero is skipped,
   *   - a group with no zeros is written with one unaligned 256-bit store,
   *   - a mixed group is handed to the scalar compact().
   * Fewer than eight trailing elements are handled by compact().
   */
int compact_AVX2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    /* n - i >= 8 keeps every full group of eight in the vector loop. */
    for (; n - i >= 8; i += 8) {
        __m256i v = _mm256_loadu_si256((__m256i *)&x[i]);
        /* Lanes equal to zero become all-ones, other lanes all-zeros. */
        __m256i is_zero = _mm256_cmpeq_epi32(v, _mm256_setzero_si256());
        /* One bit per byte: -1 means all eight lanes are zero, 0 means none. */
        int mask = _mm256_movemask_epi8(is_zero);
        if (mask == -1) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 8);
        } else {
            _mm256_storeu_si256((__m256i *)&y[cnt], v);
            cnt += 8;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 8) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

下面是我测试过的 SSE2 版本。

  /*
   * SSE2 compaction of non-zero ints.
   *
   * Copies the non-zero elements of x[0..n-1] to the front of y in their
   * original order and returns the number copied. Input is examined four
   * ints at a time:
   *   - a group that is entirely zero is skipped,
   *   - a group with no zeros is written with one unaligned 128-bit store,
   *   - a mixed group is handed to the scalar compact().
   * Fewer than four trailing elements are handled by compact().
   */
int compact_SSE2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    /* n - i >= 4 keeps every full group of four in the vector loop. */
    for (; n - i >= 4; i += 4) {
        __m128i v = _mm_loadu_si128((__m128i *)&x[i]);
        /* Lanes equal to zero become all-ones, other lanes all-zeros. */
        __m128i is_zero = _mm_cmpeq_epi32(v, _mm_setzero_si128());
        /* One bit per byte: 0xffff means all four lanes are zero, 0 means none. */
        int mask = _mm_movemask_epi8(is_zero);
        if (mask == 0xffff) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 4);
        } else {
            _mm_storeu_si128((__m128i *)&y[cnt], v);
            cnt += 4;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 4) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

下面是完整的测试程序。

 #include <stdio.h>
#include <stdlib.h>
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
#include <x86intrin.h>
#else
#include <immintrin.h>
#endif

#define N 50

/*
 * Branch-free scalar compaction: copies the non-zero elements of
 * x[0..n-1] to the front of y in order and returns how many were kept.
 * Every iteration stores to y[cnt]; zero elements store 0 there.
 * static inline so that a non-inlined call still links in C.
 */
static inline int compact(int *x, int *y, const int n) {
    int cnt = 0;
    for (int i = 0; i < n; i++) {
        int cut = x[i] != 0;    /* 1 when the element is kept */
        y[cnt] = cut * x[i];
        cnt += cut;
    }
    return cnt;
}

/*
 * SSE2 variant: examines four ints at a time. All-zero groups are skipped,
 * groups without zeros are stored whole, mixed groups and the tail go
 * through compact(). Returns the number of elements written to y.
 */
int compact_SSE2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    for (; n - i >= 4; i += 4) {
        __m128i v = _mm_loadu_si128((__m128i *)&x[i]);
        __m128i is_zero = _mm_cmpeq_epi32(v, _mm_setzero_si128());
        /* One bit per byte: 0xffff means all four lanes are zero. */
        int mask = _mm_movemask_epi8(is_zero);
        if (mask == 0xffff) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 4);
        } else {
            _mm_storeu_si128((__m128i *)&y[cnt], v);
            cnt += 4;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 4) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

/*
 * AVX2 variant: examines eight ints at a time. All-zero groups are skipped,
 * groups without zeros are stored whole, mixed groups and the tail go
 * through compact(). Returns the number of elements written to y.
 */
int compact_AVX2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    for (; n - i >= 8; i += 8) {
        __m256i v = _mm256_loadu_si256((__m256i *)&x[i]);
        __m256i is_zero = _mm256_cmpeq_epi32(v, _mm256_setzero_si256());
        /* One bit per byte: -1 means all eight lanes are zero. */
        int mask = _mm256_movemask_epi8(is_zero);
        if (mask == -1) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 8);
        } else {
            _mm256_storeu_si256((__m256i *)&y[cnt], v);
            cnt += 8;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 8) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

/* Fills x with values in 0..9, compacts it into y and prints both arrays. */
int main() {
    int x[N], y[N];
    for (int i = 0; i < N; i++) {
        x[i] = rand() % 10;
    }
    /* compact_SSE2(x, y, N) can be substituted to exercise the SSE2 variant. */
    int cnt = compact_AVX2(x, y, N);
    for (int i = 0; i < N; i++) {
        printf("%d ", x[i]);
    }
    printf("\n");
    for (int i = 0; i < cnt; i++) {
        printf("%d ", y[i]);
    }
    printf("\n");
}

In the question Optimizing Array Compaction, the top answer states:

SSE/AVX registers with latest instruction sets allow a better approach. We can use the result of PMOVMSKB directly, transforming it to the control register for something like PSHUFB.

Is this possible with Haswell (AVX2)? Or does it require one of the flavors of AVX512?

I've got an AVX2 vector containing int32s, and a corresponding vector holding the result of a compare. I want to shuffle it somehow so that the elements with the corresponding msb set in the mask (compare true) are contiguous in the low end of the vector.

The best I can see is get a bit mask with _mm256_movemask_ps/vmovmskps (no *d variant?) and then use that in a 256 AVX2 vector lookup table to get a shuffle mask for the cross-lane _mm256_permutevar8x32_epi32/vpermd

解决方案

The first thing to do is find a fast scalar function. Here is a version which does not use a branch.

/*
 * Branch-free scalar compaction.
 *
 * Copies the non-zero elements of x[0..n-1] to the front of y, keeping
 * their order, and returns how many elements were kept.
 *
 * Every iteration stores to y[cnt], including for zero elements (which
 * store 0), so the slot just past the kept elements may be overwritten.
 * The write index never exceeds the read index.
 *
 * Declared static inline: in C a plain `inline` definition does not emit
 * an external definition, so a non-inlined call would fail to link.
 */
static inline int compact(int *x, int *y, const int n) {
    int cnt = 0;
    for (int i = 0; i < n; i++) {
        int cut = x[i] != 0;    /* 1 when the element is kept, 0 otherwise */
        y[cnt] = cut * x[i];
        cnt += cut;             /* advance only past kept elements */
    }
    return cnt;
}

The best result with SIMD probably depends on the distribution of zeros, i.e. whether it is sparse or dense. The following code should work well for distributions which are sparse or dense, for example long runs of zeros and non-zeros. If the distribution is more even, I don't know if this code will have any benefit, but it will give the correct result anyway.

Here is an AVX2 version I tested.

/*
 * AVX2 compaction of non-zero ints.
 *
 * Copies the non-zero elements of x[0..n-1] to the front of y in their
 * original order and returns the number copied. Input is examined eight
 * ints at a time:
 *   - a group that is entirely zero is skipped,
 *   - a group with no zeros is written with one unaligned 256-bit store,
 *   - a mixed group is handed to the scalar compact().
 * Fewer than eight trailing elements are handled by compact().
 */
int compact_AVX2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    /*
     * n - i >= 8 keeps every full group of eight in the vector loop, so the
     * scalar cleanup only sees a true remainder.
     */
    for (; n - i >= 8; i += 8) {
        __m256i v = _mm256_loadu_si256((__m256i *)&x[i]);
        /* Lanes equal to zero become all-ones, other lanes all-zeros. */
        __m256i is_zero = _mm256_cmpeq_epi32(v, _mm256_setzero_si256());
        /* One bit per byte: -1 means all eight lanes are zero, 0 means none. */
        int mask = _mm256_movemask_epi8(is_zero);
        if (mask == -1) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 8);
        } else {
            _mm256_storeu_si256((__m256i *)&y[cnt], v);
            cnt += 8;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 8) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

Here is the SSE2 version I tested.

/*
 * SSE2 compaction of non-zero ints.
 *
 * Copies the non-zero elements of x[0..n-1] to the front of y in their
 * original order and returns the number copied. Input is examined four
 * ints at a time:
 *   - a group that is entirely zero is skipped,
 *   - a group with no zeros is written with one unaligned 128-bit store,
 *   - a mixed group is handed to the scalar compact().
 * Fewer than four trailing elements are handled by compact().
 */
int compact_SSE2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    /*
     * n - i >= 4 keeps every full group of four in the vector loop, so the
     * scalar cleanup only sees a true remainder.
     */
    for (; n - i >= 4; i += 4) {
        __m128i v = _mm_loadu_si128((__m128i *)&x[i]);
        /* Lanes equal to zero become all-ones, other lanes all-zeros. */
        __m128i is_zero = _mm_cmpeq_epi32(v, _mm_setzero_si128());
        /* One bit per byte: 0xffff means all four lanes are zero, 0 means none. */
        int mask = _mm_movemask_epi8(is_zero);
        if (mask == 0xffff) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 4);
        } else {
            _mm_storeu_si128((__m128i *)&y[cnt], v);
            cnt += 4;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 4) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

Here is a full test

#include <stdio.h>
#include <stdlib.h>
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
#include <x86intrin.h>                
#else
#include <immintrin.h>                
#endif

#define N 50

/*
 * Branch-free scalar compaction.
 *
 * Copies the non-zero elements of x[0..n-1] to the front of y, keeping
 * their order, and returns how many elements were kept.
 *
 * Every iteration stores to y[cnt], including for zero elements (which
 * store 0), so the slot just past the kept elements may be overwritten.
 * The write index never exceeds the read index.
 *
 * Declared static inline: in C a plain `inline` definition does not emit
 * an external definition, so a non-inlined call would fail to link.
 */
static inline int compact(int *x, int *y, const int n) {
    int cnt = 0;
    for (int i = 0; i < n; i++) {
        int cut = x[i] != 0;    /* 1 when the element is kept, 0 otherwise */
        y[cnt] = cut * x[i];
        cnt += cut;             /* advance only past kept elements */
    }
    return cnt;
}

/*
 * SSE2 compaction of non-zero ints.
 *
 * Copies the non-zero elements of x[0..n-1] to the front of y in their
 * original order and returns the number copied. Input is examined four
 * ints at a time:
 *   - a group that is entirely zero is skipped,
 *   - a group with no zeros is written with one unaligned 128-bit store,
 *   - a mixed group is handed to the scalar compact().
 * Fewer than four trailing elements are handled by compact().
 */
int compact_SSE2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    /*
     * n - i >= 4 keeps every full group of four in the vector loop, so the
     * scalar cleanup only sees a true remainder.
     */
    for (; n - i >= 4; i += 4) {
        __m128i v = _mm_loadu_si128((__m128i *)&x[i]);
        /* Lanes equal to zero become all-ones, other lanes all-zeros. */
        __m128i is_zero = _mm_cmpeq_epi32(v, _mm_setzero_si128());
        /* One bit per byte: 0xffff means all four lanes are zero, 0 means none. */
        int mask = _mm_movemask_epi8(is_zero);
        if (mask == 0xffff) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 4);
        } else {
            _mm_storeu_si128((__m128i *)&y[cnt], v);
            cnt += 4;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 4) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

/*
 * AVX2 compaction of non-zero ints.
 *
 * Copies the non-zero elements of x[0..n-1] to the front of y in their
 * original order and returns the number copied. Input is examined eight
 * ints at a time:
 *   - a group that is entirely zero is skipped,
 *   - a group with no zeros is written with one unaligned 256-bit store,
 *   - a mixed group is handed to the scalar compact().
 * Fewer than eight trailing elements are handled by compact().
 */
int compact_AVX2(int *x, int *y, int n) {
    int i = 0, cnt = 0;
    /*
     * n - i >= 8 keeps every full group of eight in the vector loop, so the
     * scalar cleanup only sees a true remainder.
     */
    for (; n - i >= 8; i += 8) {
        __m256i v = _mm256_loadu_si256((__m256i *)&x[i]);
        /* Lanes equal to zero become all-ones, other lanes all-zeros. */
        __m256i is_zero = _mm256_cmpeq_epi32(v, _mm256_setzero_si256());
        /* One bit per byte: -1 means all eight lanes are zero, 0 means none. */
        int mask = _mm256_movemask_epi8(is_zero);
        if (mask == -1) {
            continue;
        }
        if (mask) {
            cnt += compact(&x[i], &y[cnt], 8);
        } else {
            _mm256_storeu_si256((__m256i *)&y[cnt], v);
            cnt += 8;
        }
    }
    /* Scalar cleanup for the remaining n - i (< 8) elements. */
    cnt += compact(&x[i], &y[cnt], n - i);
    return cnt;
}

/*
 * Test driver: fills an N-element input with values in 0..9, compacts it
 * with compact_AVX2 and prints the input line followed by the kept elements.
 */
int main() {
    int x[N], y[N];
    int k;

    for (k = 0; k < N; ++k) {
        x[k] = rand() % 10;
    }

    /* compact_SSE2(x, y, N) can be substituted to exercise the SSE2 variant. */
    int cnt = compact_AVX2(x, y, N);

    /* Input array, space separated, on one line. */
    k = 0;
    while (k < N) {
        printf("%d ", x[k]);
        ++k;
    }
    printf("\n");

    /* Only the first cnt entries of y hold compacted output. */
    k = 0;
    while (k < cnt) {
        printf("%d ", y[k]);
        ++k;
    }
    printf("\n");
}

这篇关于压缩AVX2寄存器、使根据掩码选中的整数连续排列的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆