SSE 4 popcount用于16个8位值? [英] SSE 4 popcount for 16 8-bit values?

查看:252
本文介绍了SSE 4 popcount用于16个8位值?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

下面的代码可以用GCC加 -msse4 标志编译,但问题是popcount只统计了转换后的 __m128i 类型中的最后四个8位值。基本上,我想统计 __m128i 类型中的全部16个数字,但我不确定在创建变量 popA 之后应该调用哪个内在函数(intrinsic)。是不是必须设法把 popA 转换成一个包含全部128位信息的整数?我想可以用 _mm_cvtsi128_si64 再加上几次shuffle操作,但我的操作系统是32位的。是否只能用shuffle方法并配合 _mm_cvtsi128_si32?



编辑:如果shuffle方法是唯一的选择,请帮我在32位操作系统上实现它。



代码如下。

#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>

int main(void)
{
    int A = 1;
    __m128i popA = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);

    unsigned int integer = _mm_cvtsi128_si32(popA);
    //long long LONG = _mm_cvtsi128_si64(popA);//我的操作系统是32位,所以这里用不了

    printf("integer = %d\n", integer);
    int pop = _mm_popcnt_u32(integer);
    //int popLONG = _mm_popcnt_u64(LONG);
    printf("popcount = %d\n", pop);
    //printf("popcount LONG = %d\n", popLONG);

    return 0;
}

编辑2:这个版本终于可以运行了(使用GCC编译器标志 -msse -msse2 -msse3 -msse4),不过我不确定 pop_count1() 的输出是否正确。



输出:
pop_count1():1799 1799 1799 1799 1799 1799 1799 1799


pop_count2():每个字节的填充数:1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7

#include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>

void print128_num(__m128i var)
{
    uint16_t *val = (uint16_t*) &var;
    printf("pop_count1(): %i %i %i %i %i %i %i %i \n",
           val[0], val[1], val[2], val[3], val[4], val[5],
           val[6], val[7]);
}
static __m128i parallelPopcnt16bytes (__m128i xmm)//for pop_count2
{
    const __m128i mask4 = _mm_set1_epi8 (0x0F);
    const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   __m128i low, high, count;

   low = _mm_and_si128 (mask4, xmm);
   high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
   count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
   return count;
}
void pop_count1()
{
    int A = 1;
    __m128i in = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    __m128i bit0 = _mm_set1_epi8( 0x80 );
    __m128i mask0 = _mm_and_si128( in, bit0 );
    __m128i sum = _mm_cmpeq_epi8( mask0, _mm_setzero_si128() );

/* 一般模式 */
    __m128i bit1 = _mm_set1_epi8( 0x40 );
    __m128i mask1 = _mm_and_si128( in, bit1 );
    mask1 = _mm_cmpeq_epi8( mask1, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask1 );

/* 下一位 */
    __m128i bit2 = _mm_set1_epi8( 0x20 );
    __m128i mask2 = _mm_and_si128( in, bit2 );
    mask2 = _mm_cmpeq_epi8( mask2, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask2 );

    __m128i bit3 = _mm_set1_epi8( 0x10 );
    __m128i mask3 = _mm_and_si128( in, bit3 );
    mask3 = _mm_cmpeq_epi8( mask3, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask3 );

    __m128i bit4 = _mm_set1_epi8( 0x08 );
    __m128i mask4 = _mm_and_si128( in, bit4 );
    mask4 = _mm_cmpeq_epi8( mask4, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask4 );

    __m128i bit5 = _mm_set1_epi8( 0x04 );
    __m128i mask5 = _mm_and_si128( in, bit5 );
    mask5 = _mm_cmpeq_epi8( mask5, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask5 );

    __m128i bit6 = _mm_set1_epi8( 0x02 );
    __m128i mask6 = _mm_and_si128( in, bit6 );
    mask6 = _mm_cmpeq_epi8( mask6, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask6 );

    __m128i bit7 = _mm_set1_epi8( 0x01 );
    __m128i mask7 = _mm_and_si128( in, bit7 );
    mask7 = _mm_cmpeq_epi8( mask7, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask7 );

/* 完成 */
    sum = _mm_sub_epi8( _mm_setzero_si128(), sum );

    print128_num(sum);
}
void pop_count2()
{
    int index;
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);

    printf ("pop_count2():每个字节的填充数:");
    for (index = 15; index >= 0; index--)
        {
        uint8_t *bytes = (void *) &counts;
        printf (" %d", bytes [index]);
        }
    printf ("\n");
}
int main(void)
{
    pop_count1();
    pop_count2();

    return 0;
}


解决方案

对16个8位值的SSE 4 popcount可以这样并行完成:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

//----------------------------------------------------------------------------
//
// parallelPopcnt16bytes - 统计xmm中每个8位组(共16组)的置位位数(population count)
//                         xmm结果的每个字节包含范围从0到8的值
//
static __m128i parallelPopcnt16bytes (__m128i xmm)
   {
    const __m128i mask4 = _mm_set1_epi8 (0x0F);
    const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   __m128i low, high, count;

   low = _mm_and_si128 (mask4, xmm);
   high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
   count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
   return count;
   }

//----------------------------------------------------------------------------

int main (void)
    {
    int index;
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);

    printf ("每个字节的填充数量:");
    for (index = 15; index >= 0; index--)
        {
        uint8_t *bytes = (void *) &counts;
        printf (" %d", bytes [index]);
        }
    printf ("\n");
    return 0;
    }

// ----------------------------------- -----------------------------------------


I have the following code, which compiles with GCC using the flag -msse4, but the problem is that the pop count only covers the last four 8-bit values of the converted __m128i type. Basically, what I want is to count all 16 numbers inside the __m128i type, but I'm not sure which intrinsic function to call after creating the variable popA. Somehow popA has to be converted into an integer that contains all 128 bits of information? I suppose there's _mm_cvtsi128_si64 together with a few shuffle operations, but my OS is 32-bit. Is the only option the shuffle method combined with _mm_cvtsi128_si32?

EDIT: If the shuffle method is the only option I need help implementing it for my 32-bit OS, please.

Here's the code.

#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>

/*
 * Question's first attempt: population count of a 128-bit vector using the
 * scalar 32-bit popcount instruction.
 */
int main(void)
{
    /* Put the same byte value into every one of the 16 lanes. */
    int lane_value = 1;
    __m128i vec = _mm_set1_epi8((char)lane_value);

    /* _mm_cvtsi128_si32 hands back only the lowest 32 bits of the vector
       (4 of its 16 bytes), so the popcount below covers just those lanes.
       The 64-bit pair _mm_cvtsi128_si64 / _mm_popcnt_u64 was ruled out
       because the target OS is 32-bit. */
    unsigned int low32 = _mm_cvtsi128_si32(vec);
    int bits_set = _mm_popcnt_u32(low32);

    printf("integer = %d\n", low32);
    printf("popcount = %d\n", bits_set);

    return 0;
}

EDIT 2: This one finally runs (with GCC compiler flags -msse -msse2 -msse3 -msse4) although I'm not sure if the output for pop_count1() is correct.

Output: pop_count1(): 1799 1799 1799 1799 1799 1799 1799 1799

pop_count2():population count for each byte: 1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7

  #include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>

/*
 * Print the 16 byte lanes of `var` as unsigned decimal numbers, lowest byte
 * first, on one line prefixed with "pop_count1():".
 *
 * The vector handed in by pop_count1() carries one 8-bit count per lane.
 * Printing it as eight 16-bit words fuses each pair of neighbouring counts
 * into a single number (two lanes holding 7 show up as 0x0707 == 1799), so
 * every lane is printed on its own.
 */
void print128_num(__m128i var)
{
    uint8_t lanes[16];
    int i;

    /* Copy the vector into a plain byte array instead of reading `var`
       through a casted pointer; the unaligned store has no alignment
       requirement on `lanes`. */
    _mm_storeu_si128((__m128i *)lanes, var);

    printf("pop_count1():");
    for (i = 0; i < 16; i++) {
        printf(" %u", (unsigned)lanes[i]);
    }
    printf(" \n");
}
/*
 * Per-byte population count of a 128-bit vector (helper for pop_count2).
 *
 * Each byte is split into its two 4-bit halves; each half selects an entry
 * of a 16-entry table of bit counts through _mm_shuffle_epi8, and the two
 * lookups are added.  Every byte of the result holds the number of set bits
 * (0..8) of the byte at the same position in `xmm`.
 */
static __m128i parallelPopcnt16bytes(__m128i xmm)
{
    /* nibble_bits[i] == number of 1 bits in the 4-bit value i */
    const __m128i nibble_bits = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                              1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i low_nibble = _mm_set1_epi8(0x0F);

    /* The shift works on 16-bit lanes, so the mask also discards the bits
       that were shifted in from the neighbouring byte. */
    const __m128i lo_idx = _mm_and_si128(xmm, low_nibble);
    const __m128i hi_idx = _mm_and_si128(_mm_srli_epi16(xmm, 4), low_nibble);

    const __m128i lo_cnt = _mm_shuffle_epi8(nibble_bits, lo_idx);
    const __m128i hi_cnt = _mm_shuffle_epi8(nibble_bits, hi_idx);

    return _mm_add_epi8(lo_cnt, hi_cnt);
}
/*
 * Demonstrate a per-byte population count built from SSE2 operations only.
 *
 * All 16 byte lanes of the input are set to A.  For each of the eight bit
 * positions (0x80 down to 0x01) the bit is isolated with an AND and compared
 * for equality with the single-bit mask: _mm_cmpeq_epi8 yields 0xFF (-1) in
 * every lane where the bit is set and 0 elsewhere, so subtracting the
 * comparison result adds 1 to that lane's running count.
 *
 * The comparison must be against the mask, not against zero: comparing the
 * isolated bit with zero flags the lanes where the bit is CLEAR and would
 * count zero bits (7 for the input value 1) instead of set bits (1).
 *
 * The 16 resulting counts are handed to print128_num().
 */
void pop_count1(void)
{
    int A = 1;
    __m128i in = _mm_set1_epi8((char)A);
    __m128i sum = _mm_setzero_si128();
    int bit;

    for (bit = 0x80; bit != 0; bit >>= 1) {
        __m128i mask = _mm_set1_epi8((char)bit);
        __m128i is_set = _mm_cmpeq_epi8(_mm_and_si128(in, mask), mask);

        /* is_set is -1 per matching lane; sum - (-1) == sum + 1 */
        sum = _mm_sub_epi8(sum, is_set);
    }

    print128_num(sum);
}
/*
 * Run parallelPopcnt16bytes() on a fixed test vector and print the 16
 * per-byte counts, highest byte first, so the printed order follows the
 * argument order of the _mm_set_epi8 call below.
 */
void pop_count2()
{
    __m128i input = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, 128,
                                 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i result = parallelPopcnt16bytes(input);
    uint8_t *lane = (void *) &result;   /* byte-wise view of the counts */
    int remaining = 16;

    printf("pop_count2():population count for each byte:");
    while (remaining-- > 0) {
        printf(" %d", lane[remaining]);
    }
    printf("\n");
}
/* Entry point: runs both population-count demonstrations in turn. */
int main(void)
{
    pop_count1();   /* prints its result through print128_num() */
    pop_count2();   /* prints one count per byte of its test vector */

    return 0;
}

解决方案

SSE 4 popcount for 16 8-bit values can be done in parallel this way:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

//----------------------------------------------------------------------------
//
// parallelPopcnt16bytes - find population count for 8-bit groups in xmm (16 groups)
//                         each byte of xmm result contains a value ranging from 0 to 8
//
static __m128i parallelPopcnt16bytes(__m128i xmm)
{
    /* Table of bit counts for every 4-bit value: entry i holds the number
       of 1 bits in i. */
    const __m128i bits_per_nibble = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                                  1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i nibble_mask = _mm_set1_epi8(0x0F);

    /* Low half of every byte, and high half moved down into the low four
       bits.  The shift operates on 16-bit lanes; the mask removes whatever
       crossed over from the adjacent byte. */
    const __m128i lower = _mm_and_si128(xmm, nibble_mask);
    const __m128i upper = _mm_and_si128(_mm_srli_epi16(xmm, 4), nibble_mask);

    /* Look both halves up in the table and add the two partial counts. */
    return _mm_add_epi8(_mm_shuffle_epi8(bits_per_nibble, lower),
                        _mm_shuffle_epi8(bits_per_nibble, upper));
}

//----------------------------------------------------------------------------

/*
 * Test driver: counts the set bits of each byte of a fixed vector with
 * parallelPopcnt16bytes() and prints the 16 results, highest byte first,
 * which is the same order in which the bytes are listed in _mm_set_epi8.
 */
int main (void)
{
    __m128i input = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, 128,
                                 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i result = parallelPopcnt16bytes(input);
    uint8_t *lane = (void *) &result;   /* byte-wise view of the counts */
    int remaining = 16;

    printf("population count for each byte:");
    while (remaining-- > 0) {
        printf(" %d", lane[remaining]);
    }
    printf("\n");
    return 0;
}

//----------------------------------------------------------------------------

这篇关于SSE 4 popcount用于16个8位值?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆