SSE 4 popcount用于16个8位值? [英] SSE 4 popcount for 16 8-bit values?
问题描述
下面的代码可以用 GCC 加上标志 -msse4 编译,但问题是 popcount 只统计了转换后的 __m128i 类型的最后四个 8 位值。我想要的是统计 __m128i 类型中的全部 16 个数字,但我不确定在创建变量 popA 之后应该调用哪个内在函数(intrinsic)。popA 是否必须以某种方式转换成一个包含全部 128 位信息的整数?我猜可以使用 _mm_cvtsi128_si64 再配合几次 shuffle 操作,但我的操作系统是 32 位的。是不是只能采用 shuffle 方法并使用 _mm_cvtsi128_si32?
编辑:如果 shuffle 方法是唯一的选择,请帮我在 32 位操作系统上实现它。
代码如下。
#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>
/*
 * Demo from the question: broadcasts the value 1 into all sixteen 8-bit lanes
 * of a vector, then population-counts it.
 *
 * Only the low 32 bits (four of the sixteen lanes) are extracted and counted
 * here; that limitation is what the question asks about.
 */
int main(void)
{
    int A = 1;
    /* _mm_set_epi8 takes exactly sixteen lane values. */
    __m128i popA = _mm_set_epi8(A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    /* _mm_cvtsi128_si32 copies out only the lowest 32 bits of the vector. */
    unsigned int integer = (unsigned int)_mm_cvtsi128_si32(popA);
    // long long LONG = _mm_cvtsi128_si64(popA); // my OS is 32-bit, so no luck here
    /* %u matches the unsigned argument. */
    printf("integer = %u\n", integer);
    int pop = _mm_popcnt_u32(integer);
    // int popLONG = _mm_popcnt_u64(LONG);
    printf("popcount = %d\n", pop);
    // printf("popcount LONG = %d\n", popLONG);
    return 0;
}
编辑2:下面这个版本终于可以运行了(使用 GCC 编译器标志 -msse -msse2 -msse3 -msse4),不过我不确定 pop_count1() 的输出是否正确。
输出:
pop_count1():1799 1799 1799 1799 1799 1799 1799 1799
pop_count2():每个字节的填充数:1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7
#include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>
/*
 * print128_num - print a 128-bit vector as eight 16-bit values.
 *
 * var: the vector to dump.
 *
 * Each printed number spans TWO adjacent bytes of the vector, so per-byte
 * results appear fused together: two neighbouring bytes that both hold 7
 * print as 0x0707 = 1799.
 */
void print128_num(__m128i var)
{
    uint16_t lanes[8];

    /* Copy the vector into a plain array with an unaligned store rather than
       reading it through a casted uint16_t pointer. */
    _mm_storeu_si128((__m128i *)lanes, var);
    printf("pop_count1(): %i %i %i %i %i %i %i %i \n",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}
/*
 * parallelPopcnt16bytes - population count of each of the 16 bytes in xmm.
 *
 * xmm: input vector, treated as sixteen independent 8-bit values.
 *
 * Returns a vector whose byte i holds the number of set bits (0..8) in
 * byte i of xmm.  Used by pop_count2.
 */
static __m128i parallelPopcnt16bytes(__m128i xmm)
{
    const __m128i mask4 = _mm_set1_epi8(0x0F);
    /* lookup[n] is the number of set bits in the 4-bit value n. */
    const __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    __m128i low, high, count;

    low = _mm_and_si128(mask4, xmm);
    /* The shift works on 16-bit lanes, so bits from the neighbouring byte
       slide into the top nibble; the mask removes them again. */
    high = _mm_and_si128(mask4, _mm_srli_epi16(xmm, 4));
    /* Byte shuffle used as a 16-entry table lookup, once per nibble. */
    count = _mm_add_epi8(_mm_shuffle_epi8(lookup, low), _mm_shuffle_epi8(lookup, high));
    return count;
}
/*
 * pop_count1 - per-byte population count, testing one bit position at a time.
 *
 * Builds a vector whose sixteen bytes all equal 1, counts the set bits of
 * every byte, and hands the result to print128_num.
 *
 * A bit is counted when (in & mask) == mask.  Comparing against zero instead
 * would flag the lanes where the bit is CLEAR and yield 8 - popcount.
 *
 * print128_num prints 16-bit values, so two adjacent byte counts of 1 show
 * up as 257 (0x0101).
 */
void pop_count1()
{
    int A = 1;
    __m128i in = _mm_set_epi8(A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    __m128i sum = _mm_setzero_si128();

    /* Walk the eight bit positions from 0x80 down to 0x01. */
    for (int bit = 0x80; bit != 0; bit >>= 1) {
        __m128i mask = _mm_set1_epi8((char)bit);
        /* Lanes with the bit set compare equal and become 0xFF (-1). */
        __m128i is_set = _mm_cmpeq_epi8(_mm_and_si128(in, mask), mask);
        /* Subtracting -1 adds one to the count of every such lane. */
        sum = _mm_sub_epi8(sum, is_set);
    }

    print128_num(sum);
}
/*
 * pop_count2 - run parallelPopcnt16bytes on a fixed test vector and print the
 * sixteen per-byte counts, starting with byte 15 and ending with byte 0.
 */
void pop_count2()
{
    int index;
    __m128i testVector = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes(testVector);
    /* View the result vector as sixteen individual bytes. */
    uint8_t *bytes = (void *)&counts;

    printf("pop_count2():每个字节的填充数:");
    for (index = 15; index >= 0; index--)
    {
        printf(" %d", bytes[index]);
    }
    printf("\n");
}
/* Program entry point: runs both population-count demos in turn. */
int main(void)
{
    pop_count1();
    pop_count2();
    return 0;
}
SSE 4 popcount for 16 8位值可以这样并行完成:
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
// ---------------------------------------- ------------------------------------
//
// parallelPopcnt16bytes - 在xmm(16组)中查找8位组的总体数量
// xmm结果的每个字节包含范围从0到8的值
//
static __m128i parallelPopcnt16bytes(__m128i xmm)
{
    const __m128i mask4 = _mm_set1_epi8(0x0F);
    /* lookup[n] is the number of set bits in the 4-bit value n. */
    const __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    __m128i low, high, count;

    /* Low nibble of every byte. */
    low = _mm_and_si128(mask4, xmm);
    /* High nibble of every byte: the shift works on 16-bit lanes, so the mask
       also removes the bits that slid in from the neighbouring byte. */
    high = _mm_and_si128(mask4, _mm_srli_epi16(xmm, 4));
    /* Byte shuffle used as a 16-entry table lookup, once per nibble. */
    count = _mm_add_epi8(_mm_shuffle_epi8(lookup, low), _mm_shuffle_epi8(lookup, high));
    return count;
}
// ----------------------------------- -----------------------------------------
/*
 * Demo driver: computes the per-byte population counts of a fixed test vector
 * and prints them, starting with byte 15 and ending with byte 0.
 */
int main(void)
{
    int index;
    __m128i testVector = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes(testVector);
    /* View the result vector as sixteen individual bytes. */
    uint8_t *bytes = (void *)&counts;

    printf("每个字节的填充数量:");
    for (index = 15; index >= 0; index--)
    {
        printf(" %d", bytes[index]);
    }
    printf("\n");
    return 0;
}
// ----------------------------------- -----------------------------------------
I have the following code, which compiles with GCC using the flag -msse4,
but the problem is that the pop count only covers the last four 8-bit values of the converted __m128i
type. Basically, what I want is to count all 16 numbers inside the __m128i
type, but I'm not sure which intrinsic function to call after creating the variable popA
. Somehow popA
has to be converted into an integer that contains all 128 bits of information? I suppose there's _mm_cvtsi128_si64
combined with a few shuffle operations, but my OS is 32-bit. Is the shuffle method together with _mm_cvtsi128_si32 the only option
?
EDIT: If the shuffle method is the only option, I need help implementing it for my 32-bit OS, please.
Here's the code.
#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>
/*
 * Demo from the question: broadcasts the value 1 into all sixteen 8-bit lanes
 * of a vector, then population-counts it.
 *
 * Only the low 32 bits (four of the sixteen lanes) are extracted and counted
 * here; that limitation is what the question asks about.
 */
int main(void)
{
    int A = 1;
    __m128i popA = _mm_set_epi8(A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    /* _mm_cvtsi128_si32 copies out only the lowest 32 bits of the vector. */
    unsigned int integer = (unsigned int)_mm_cvtsi128_si32(popA);
    //long long LONG = _mm_cvtsi128_si64(popA);//my OS is 32-bits so no luck here
    /* %u matches the unsigned argument. */
    printf("integer = %u\n", integer);
    int pop = _mm_popcnt_u32(integer);
    //int popLONG = _mm_popcnt_u64(LONG);
    printf("popcount = %d\n", pop);
    //printf("popcount LONG = %d\n", popLONG);
    return 0;
}
EDIT 2: This one finally runs (with GCC compiler flags -msse -msse2 -msse3 -msse4
) although I'm not sure if the output for pop_count1()
is correct.
Output:
pop_count1(): 1799 1799 1799 1799 1799 1799 1799 1799
pop_count2():population count for each byte: 1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7
#include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>
/*
 * print128_num - print a 128-bit vector as eight 16-bit values.
 *
 * var: the vector to dump.
 *
 * Each printed number spans TWO adjacent bytes of the vector, so per-byte
 * results appear fused together: two neighbouring bytes that both hold 7
 * print as 0x0707 = 1799.
 */
void print128_num(__m128i var)
{
    uint16_t lanes[8];

    /* Copy the vector into a plain array with an unaligned store rather than
       reading it through a casted uint16_t pointer. */
    _mm_storeu_si128((__m128i *)lanes, var);
    printf("pop_count1(): %i %i %i %i %i %i %i %i \n",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}
/*
 * parallelPopcnt16bytes - population count of each of the 16 bytes in xmm.
 *
 * xmm: input vector, treated as sixteen independent 8-bit values.
 *
 * Returns a vector whose byte i holds the number of set bits (0..8) in
 * byte i of xmm.  Used by pop_count2.
 */
static __m128i parallelPopcnt16bytes (__m128i xmm)
{
    const __m128i nibble_mask = _mm_set1_epi8 (0x0F);
    /* nibble_bits[n] is the number of set bits in the 4-bit value n. */
    const __m128i nibble_bits = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);

    /* Split every byte into its two nibbles.  The shift works on 16-bit
       lanes, so the mask also removes bits that slid in from the
       neighbouring byte. */
    const __m128i lo_nibbles = _mm_and_si128 (nibble_mask, xmm);
    const __m128i hi_nibbles = _mm_and_si128 (nibble_mask, _mm_srli_epi16 (xmm, 4));

    /* Byte shuffle used as a 16-entry table lookup, once per nibble. */
    const __m128i lo_counts = _mm_shuffle_epi8 (nibble_bits, lo_nibbles);
    const __m128i hi_counts = _mm_shuffle_epi8 (nibble_bits, hi_nibbles);

    return _mm_add_epi8 (lo_counts, hi_counts);
}
/*
 * pop_count1 - per-byte population count, testing one bit position at a time.
 *
 * Builds a vector whose sixteen bytes all equal 1, counts the set bits of
 * every byte, and hands the result to print128_num.
 *
 * A bit is counted when (in & mask) == mask.  Comparing against zero instead
 * would flag the lanes where the bit is CLEAR and yield 8 - popcount.
 *
 * print128_num prints 16-bit values, so two adjacent byte counts of 1 show
 * up as 257 (0x0101).
 */
void pop_count1()
{
    int A = 1;
    __m128i in = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    __m128i sum = _mm_setzero_si128();

    /* Walk the eight bit positions from 0x80 down to 0x01. */
    for (int bit = 0x80; bit != 0; bit >>= 1) {
        __m128i mask = _mm_set1_epi8( (char)bit );
        /* Lanes with the bit set compare equal and become 0xFF (-1). */
        __m128i is_set = _mm_cmpeq_epi8( _mm_and_si128( in, mask ), mask );
        /* Subtracting -1 adds one to the count of every such lane. */
        sum = _mm_sub_epi8( sum, is_set );
    }

    print128_num(sum);
}
/*
 * pop_count2 - run parallelPopcnt16bytes on a fixed test vector and print the
 * sixteen per-byte counts, starting with byte 15 and ending with byte 0.
 */
void pop_count2()
{
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);
    /* View the result vector as sixteen individual bytes. */
    uint8_t *lane = (void *) &counts;
    int remaining = 16;

    printf ("pop_count2():population count for each byte:");
    /* remaining is decremented before use, so indices run 15, 14, ..., 0. */
    while (remaining-- > 0)
    {
        printf (" %d", lane [remaining]);
    }
    printf ("\n");
}
/* Program entry point: runs both population-count demos in turn. */
int main(void)
{
/* Bit-by-bit variant; prints through print128_num. */
pop_count1();
/* Lookup-table variant built on parallelPopcnt16bytes. */
pop_count2();
return 0;
}
SSE 4 popcount for 16 8-bit values can be done in parallel this way:
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
//----------------------------------------------------------------------------
//
// parallelPopcnt16bytes - find population count for 8-bit groups in xmm (16 groups)
// each byte of xmm result contains a value ranging from 0 to 8
//
static __m128i parallelPopcnt16bytes (__m128i xmm)
{
    /* bits_in_nibble[n] is the number of set bits in the 4-bit value n. */
    const __m128i bits_in_nibble = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m128i low_four = _mm_set1_epi8 (0x0F);

    /* Count for the low nibble of every byte: mask, then use the byte
       shuffle as a 16-entry table lookup. */
    __m128i total = _mm_shuffle_epi8 (bits_in_nibble, _mm_and_si128 (low_four, xmm));

    /* Same for the high nibble.  The shift works on 16-bit lanes, so the mask
       also removes bits that slid in from the neighbouring byte. */
    __m128i upper = _mm_and_si128 (low_four, _mm_srli_epi16 (xmm, 4));
    total = _mm_add_epi8 (total, _mm_shuffle_epi8 (bits_in_nibble, upper));

    return total;
}
//----------------------------------------------------------------------------
/*
 * Demo driver: computes the per-byte population counts of a fixed test vector
 * and prints them, starting with byte 15 and ending with byte 0.
 */
int main (void)
{
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);
    /* View the result vector as sixteen individual bytes. */
    uint8_t *lane = (void *) &counts;

    printf ("population count for each byte:");
    /* pos is decremented before use, so indices run 15, 14, ..., 0. */
    for (int pos = 16; pos-- > 0; )
    {
        printf (" %d", lane [pos]);
    }
    printf ("\n");
    return 0;
}
//----------------------------------------------------------------------------
这篇关于SSE 4 popcount用于16个8位值?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!