__m128i的清除高位字节 [英] Clear upper bytes of __m128i

查看:168
本文介绍了__m128i的清除高位字节的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我如何清除 __m128i 的高 16 - i 个字节?

我已经试过下面的写法;它可以正常工作,但我想知道是否有更好(更短、更快)的方式:

int i = ...  //  0 < i < 16

__m128i x = ...

__m128i mask = _mm_set_epi8(
    0,
    (i > 14) ? -1 : 0,
    (i > 13) ? -1 : 0,
    (i > 12) ? -1 : 0,
    (i > 11) ? -1 : 0,
    (i > 10) ? -1 : 0,
    (i >  9) ? -1 : 0,
    (i >  8) ? -1 : 0,
    (i >  7) ? -1 : 0,
    (i >  6) ? -1 : 0,
    (i >  5) ? -1 : 0,
    (i >  4) ? -1 : 0,
    (i >  3) ? -1 : 0,
    (i >  2) ? -1 : 0,
    (i >  1) ? -1 : 0,
    -1);

x = _mm_and_si128(x, mask);


解决方案

我尝试了几种不同的实现方式,并用几个不同的编译器分别在早期的 Core i7 @ 2.67 GHz 和较新的 Haswell @ 3.6 GHz 上对它们做了基准测试:

  //
// mask_shift_0
//
//使用PSHUFB(注:需要SSSE3)
//内嵌__m128i mask_shift_0(uint32_t的N)
{
  常量__m128i VMASK = _mm_set1_epi8(255);
  常量__m128i vperm = _mm_set_epi8(112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127);
  __m128i VP = _mm_add_epi8(vperm,_mm_set1_epi8(N));
  返回_mm_shuffle_epi8(VMASK,VP);
}//
// mask_shift_1
//
//使用16元LUT
//内嵌__m128i mask_shift_1(uint32_t的N)
{
  静态常量中int8_t mask_lut [16] [16] __attribute__((排列(16)))= {
    {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1},
    {0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1},
    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1},
    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1}
  };
  返回_mm_load_si128((__ m128i *)及mask_lut [N]);
}//
// mask_shift_2
//
//使用错位加载从2矢量LUT
//内嵌__m128i mask_shift_2(uint32_t的N)
{
  静态常量中int8_t mask_lut [32] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
  };
  返回_mm_loadu_si128((__ m128i *)(mask_lut + 16 - N));
}//
// mask_shift_3
//
//使用比较和单矢量LUT
//内嵌__m128i mask_shift_3(uint32_t的N)
{
  常量__m128i VM = _mm_setr_epi8(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16);
  __m128i VN = _mm_set1_epi8(N);
  返回_mm_cmpgt_epi8(VM,VN);
}//
// mask_shift_4
//
//使用跳转表和直接转移
//内嵌__m128i mask_shift_4(uint32_t的N)
{
  常量__m128i VMASK = _mm_set1_epi8(-1);
  开关(N)
  {
    情况下0:
      返回VMASK;
    情况1:
      返回_mm_slli_si128(VMASK,1);
    案例2:
      返回_mm_slli_si128(VMASK,2);
    案例3:
      返回_mm_slli_si128(VMASK,3);
    情况4:
      返回_mm_slli_si128(VMASK,4);
    情况5:
      返回_mm_slli_si128(VMASK,5);
    情况6:
      返回_mm_slli_si128(VMASK,6);
    案例7:
      返回_mm_slli_si128(VMASK,7);
    案例8:
      返回_mm_slli_si128(VMASK,8);
    案例9:
      返回_mm_slli_si128(VMASK,9);
    案例10:
      返回_mm_slli_si128(VMASK,10);
    案例11:
      返回_mm_slli_si128(VMASK,11);
    案例12:
      返回_mm_slli_si128(VMASK,12);
    案例13:
      返回_mm_slli_si128(VMASK,13);
    案例14:
      返回_mm_slli_si128(VMASK,14);
    案例15:
      返回_mm_slli_si128(VMASK,15);
  }
}//
// lsb_mask_0
//
//供稿由@ Leeor / @ DTB
//
//使用_mm_set_epi64x
//内嵌__m128i lsb_mask_0(INT N)
{
  如果(N GT; = 8)
    返回_mm_set_epi64x(〜(-1LL&所述;≤(N - 8)* 8),-1);
  其他
    返回_mm_set_epi64x(0,〜(-1LL&所述;≤(N - 0)* 8));
}//
// lsb_mask_1
//
//供稿由@ Leeor / @ DTB
//
//一样lsb_mask_0但使用条件运算符,而不是的if / else
//内嵌__m128i lsb_mask_1(INT N)
{
  返回_mm_set_epi64x(正&GT = 8〜(-1LL&所述;≤(N - 8)* 8):0,N GT; = 8 -1:?〜(-1LL&所述;≤(N - 0) * 8));
}

结果很有趣:

酷睿i7 @ 2.67 GHz的,苹果LLVM GCC 4.2.1(GCC -O3)

  mask_shift_0:2.23377 NS
mask_shift_1:2.14724 NS
mask_shift_2:2.14270 NS
mask_shift_3:2.15063 NS
mask_shift_4:2.98304 NS
lsb_mask_0:2.15782 NS
lsb_mask_1:2.96628 NS

酷睿i7 @ 2.67 GHz,Apple clang 4.2(clang -Os)

  mask_shift_0:1.35014 NS
mask_shift_1:1.12789 NS
mask_shift_2:1.04329 NS
mask_shift_3:1.09258 NS
mask_shift_4:2.01478 NS
lsb_mask_0:1.70573 NS
lsb_mask_1:1.84337 NS

的Haswell E3-1285 @ 3.6 GHz的,GCC 4.7.2(GCC -O2)

  mask_shift_0:0.851416毫微秒
mask_shift_1:0.575245毫微秒
mask_shift_2:0.577746毫微秒
mask_shift_3:0.850086毫微秒
mask_shift_4:1.398270毫微秒
lsb_mask_0:1.359660毫微秒
lsb_mask_1:1.709720毫微秒

所以 mask_shift_4(switch/case)在所有情况下似乎都是最慢的方法,而其他方法的表现相当接近。基于 LUT 的方法总体上似乎一直是最快的。

注:在使用 clang -O3 和 gcc -O3(仅 gcc 4.7.2)时,我得到了一些快得可疑的数字——我需要查看这些情况下生成的汇编代码,看看编译器到底做了什么,并确认它没有做任何"聪明"的事情,比如把计时测试框架的某些部分优化掉。

如果任何人有这个什么意见或者有其他mask_shift实现他们想试试我会很乐意将其添加到测试套件和更新的结果。

How do I clear the 16 - i upper bytes of a __m128i?

I've tried this; it works, but I'm wondering if there is a better (shorter, faster) way:

int i = ...  //  number of low bytes to keep; 0 < i < 16

__m128i x = ...

// _mm_set_epi8 takes its arguments from byte 15 (first argument) down to
// byte 0 (last argument), so the comparison (i > k) controls byte k:
// byte k is 0xFF when i > k and 0x00 otherwise.  Byte 0 is always 0xFF and
// byte 15 is always 0x00, which matches the stated range of i.
__m128i mask = _mm_set_epi8(
    0,
    (i > 14) ? -1 : 0,
    (i > 13) ? -1 : 0,
    (i > 12) ? -1 : 0,
    (i > 11) ? -1 : 0,
    (i > 10) ? -1 : 0,
    (i >  9) ? -1 : 0,
    (i >  8) ? -1 : 0,
    (i >  7) ? -1 : 0,
    (i >  6) ? -1 : 0,
    (i >  5) ? -1 : 0,
    (i >  4) ? -1 : 0,
    (i >  3) ? -1 : 0,
    (i >  2) ? -1 : 0,
    (i >  1) ? -1 : 0,
    -1);

// Keep the i low bytes of x and zero the 16 - i upper bytes.
x = _mm_and_si128(x, mask);

解决方案

I tried a few different ways of implementing this and benchmarked them with a couple of different compilers on an early Core i7 @ 2.67 GHz and a recent Haswell @ 3.6 GHz:

//
// mask_shift_0
//
// Builds the mask with a single PSHUFB (SSSE3 required).
//
// Returns a vector whose n lowest bytes are 0x00 and whose remaining
// 16 - n bytes are 0xFF.  Intended for n in 0..16; values from 16 up to
// 128 also yield an all-zero vector.  Only the low 8 bits of n are used,
// so larger values wrap and must not be passed.
//
// The target attribute makes the SSSE3 intrinsic available to this
// function even when the translation unit is not built with -mssse3; the
// code must still run on an SSSE3-capable CPU.
//

static inline __attribute__((target("ssse3"))) __m128i mask_shift_0(uint32_t n)
{
  // -1 instead of 255: _mm_set1_epi8 takes a char, and 255 does not fit in
  // a signed char.
  const __m128i all_ones = _mm_set1_epi8(-1);
  // Byte j holds 127 - j.  After adding n, byte j has its top bit set
  // exactly when j < n (for n in 0..16).
  const __m128i base = _mm_set_epi8(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
  const __m128i control = _mm_add_epi8(base, _mm_set1_epi8((char)n));
  // PSHUFB writes 0 where the control byte's top bit is set; everywhere
  // else it copies a source byte, and every source byte is 0xFF.
  return _mm_shuffle_epi8(all_ones, control);
}

//
// mask_shift_1
//
// Looks the mask up in a 16-row table, one aligned 16-byte row per n.
//
// Returns a vector whose n lowest bytes are 0x00 and whose remaining
// 16 - n bytes are 0xFF.  The table covers n in 0..15; any n >= 16 returns
// an all-zero vector instead of reading past the end of the table.
//

static inline __m128i mask_shift_1(uint32_t n)
{
  // Row n starts with n zero bytes followed by 16 - n bytes of 0xFF.  The
  // 16-byte alignment of the array (and the 16-byte row size) is what makes
  // the aligned load below legal for every row.
  static const int8_t mask_lut[16][16] __attribute__ ((aligned(16))) = {
    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 },
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1 }
  };

  // Shifting out all 16 bytes (or more) leaves nothing set.
  if (n >= 16) {
    return _mm_setzero_si128();
  }
  // The table is const, so keep the qualifier on the load pointer.
  return _mm_load_si128((const __m128i *)mask_lut[n]);
}

//
// mask_shift_2
//
// Slides a 16-byte window over a 32-byte table (16 zero bytes followed by
// 16 bytes of 0xFF) using an unaligned load.
//
// Returns a vector whose n lowest bytes are 0x00 and whose remaining
// 16 - n bytes are 0xFF.  The window stays inside the table for n in
// 0..16; any n > 16 returns an all-zero vector instead of reading before
// the start of the table.
//

static inline __m128i mask_shift_2(uint32_t n)
{
  static const int8_t mask_lut[32] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };

  if (n > 16) {
    return _mm_setzero_si128();
  }
  // Starting n bytes before the 0xFF half picks up exactly n zero bytes at
  // the low end of the vector.
  return _mm_loadu_si128((const __m128i *)(mask_lut + 16 - n));
}

//
// mask_shift_3
//
// Compares a constant vector of byte positions against n broadcast to
// every byte.
//
// Returns a vector whose n lowest bytes are 0x00 and whose remaining
// 16 - n bytes are 0xFF.  Intended for n in 0..16; values from 16 up to
// 127 also yield an all-zero vector.  The comparison is a signed 8-bit
// one on the low 8 bits of n, so larger values must not be passed.
//

static inline __m128i mask_shift_3(uint32_t n)
{
  // Byte j holds j + 1, so "position > n" is true exactly for j >= n.
  const __m128i position = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
  // Explicit narrowing: _mm_set1_epi8 takes a char.
  const __m128i count = _mm_set1_epi8((char)n);
  return _mm_cmpgt_epi8(position, count);
}

//
// mask_shift_4
//
// Dispatches through a switch to a byte shift with an immediate count
// (_mm_slli_si128 needs a compile-time constant, hence one case per n).
//
// Returns a vector whose n lowest bytes are 0x00 and whose remaining
// 16 - n bytes are 0xFF.  Any n >= 16 returns an all-zero vector, which is
// what shifting out every byte produces.
//

static inline __m128i mask_shift_4(uint32_t n)
{
  const __m128i all_ones = _mm_set1_epi8(-1);
  switch (n)
  {
    case 0:
      return all_ones;
    case 1:
      return _mm_slli_si128(all_ones, 1);
    case 2:
      return _mm_slli_si128(all_ones, 2);
    case 3:
      return _mm_slli_si128(all_ones, 3);
    case 4:
      return _mm_slli_si128(all_ones, 4);
    case 5:
      return _mm_slli_si128(all_ones, 5);
    case 6:
      return _mm_slli_si128(all_ones, 6);
    case 7:
      return _mm_slli_si128(all_ones, 7);
    case 8:
      return _mm_slli_si128(all_ones, 8);
    case 9:
      return _mm_slli_si128(all_ones, 9);
    case 10:
      return _mm_slli_si128(all_ones, 10);
    case 11:
      return _mm_slli_si128(all_ones, 11);
    case 12:
      return _mm_slli_si128(all_ones, 12);
    case 13:
      return _mm_slli_si128(all_ones, 13);
    case 14:
      return _mm_slli_si128(all_ones, 14);
    case 15:
      return _mm_slli_si128(all_ones, 15);
    default:
      // Without this, control would run off the end of a non-void
      // function for n >= 16.
      return _mm_setzero_si128();
  }
}

//
// lsb_mask_0
//
// Contributed by @Leeor/@dtb
//
// Builds the mask from two 64-bit halves with _mm_set_epi64x.
//
// Returns a vector whose n lowest bytes are 0xFF and whose remaining
// 16 - n bytes are 0x00 (the complement of what the mask_shift_* helpers
// return).  n <= 0 gives an all-zero vector and n >= 16 gives an all-ones
// vector.
//

static inline __m128i lsb_mask_0(int n)
{
  // Shift an unsigned all-ones value: left-shifting the negative -1LL is
  // undefined, as is any shift count outside 0..63.  The branches below
  // keep the count in 0..56.
  const unsigned long long all_ones = ~0ULL;

  if (n >= 16) {
    return _mm_set1_epi8(-1);
  }
  if (n >= 8) {
    // Low half fully set; high half gets its n - 8 low bytes set.
    return _mm_set_epi64x((long long)~(all_ones << ((n - 8) * 8)), -1);
  }
  if (n > 0) {
    // High half clear; low half gets its n low bytes set.
    return _mm_set_epi64x(0, (long long)~(all_ones << (n * 8)));
  }
  return _mm_setzero_si128();
}

//
// lsb_mask_1
//
// Contributed by @Leeor/@dtb
//
// Same result as lsb_mask_0, but each 64-bit half is selected with the
// conditional operator instead of if/else.
//
// Returns a vector whose n lowest bytes are 0xFF and whose remaining
// 16 - n bytes are 0x00.  n <= 0 gives an all-zero vector and n >= 16
// gives an all-ones vector.
//

static inline __m128i lsb_mask_1(int n)
{
  // Shift an unsigned all-ones value: left-shifting the negative -1LL is
  // undefined, as is any shift count outside 0..63.  Each shift below is
  // only evaluated with a count in 8..56.
  const unsigned long long all_ones = ~0ULL;
  const unsigned long long low_half =
      n >= 8 ? all_ones : n > 0 ? ~(all_ones << (n * 8)) : 0ULL;
  const unsigned long long high_half =
      n >= 16 ? all_ones : n > 8 ? ~(all_ones << ((n - 8) * 8)) : 0ULL;
  return _mm_set_epi64x((long long)high_half, (long long)low_half);
}

Results were interesting:

Core i7 @ 2.67 GHz, Apple LLVM gcc 4.2.1 (gcc -O3)

mask_shift_0: 2.23377 ns
mask_shift_1: 2.14724 ns
mask_shift_2: 2.14270 ns
mask_shift_3: 2.15063 ns
mask_shift_4: 2.98304 ns
lsb_mask_0:   2.15782 ns
lsb_mask_1:   2.96628 ns

Core i7 @ 2.67 GHz, Apple clang 4.2 (clang -Os)

mask_shift_0: 1.35014 ns
mask_shift_1: 1.12789 ns
mask_shift_2: 1.04329 ns
mask_shift_3: 1.09258 ns
mask_shift_4: 2.01478 ns
lsb_mask_0:   1.70573 ns
lsb_mask_1:   1.84337 ns

Haswell E3-1285 @ 3.6 GHz, gcc 4.7.2 (gcc -O2)

mask_shift_0: 0.851416 ns
mask_shift_1: 0.575245 ns
mask_shift_2: 0.577746 ns
mask_shift_3: 0.850086 ns
mask_shift_4: 1.398270 ns
lsb_mask_0:   1.359660 ns
lsb_mask_1:   1.709720 ns

So mask_shift_4 (switch/case) seems to be the slowest method in all cases, whereas the others are pretty similar. The LUT-based methods seem to be consistently the fastest overall.

NB: I get some suspiciously fast numbers with clang -O3 and gcc -O3 (gcc 4.7.2 only) - I need to look at the generated assembly for these cases to see what the compiler is doing, and make sure it is not doing anything "clever", such as optimising away some part of the timing test harness.

If anyone else has any further ideas on this or has another mask_shift implementation they'd like to try I would be happy to add it to the test suite and update the results.

这篇关于__m128i的清除高位字节的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆