What's the fastest stride-3 gather instruction sequence?



The question:

What is the most efficient sequence to generate a stride-3 gather of 32-bit elements from memory? If the memory is arranged as:

MEM = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 ...

We want to obtain three YMM registers where:

YMM0 = R0 R1 R2 R3 R4 R5 R6 R7
YMM1 = G0 G1 G2 G3 G4 G5 G6 G7
YMM2 = B0 B1 B2 B3 B4 B5 B6 B7

Motivation and discussion

The scalar C code is something like

template <typename T>
T Process(const T* Input) {
  T Result = 0;
  for (int i=0; i < 4096; ++i) {
    T R = Input[3*i];
    T G = Input[3*i+1];
    T B = Input[3*i+2];
    Result += some_parallelizable_algorithm<T>(R, G, B);  
  }
  return Result;
}

Let's say that some_parallelizable_algorithm was written in intrinsics and was tuned to the fastest implementation possible:

template <typename T>
__m256i some_parallelizable_algorithm(__m256i R, __m256i G, __m256i B);

So the vector implementation for T=int32_t can be something like:

    template <>
    int32_t Process<int32_t>(const int32_t* Input) {
      // The gather scale must be 1, 2, 4 or 8, so the stride of 3 has to go into
      // the index vector: indices 0, 3, 6, ..., 21 with a scale of 4 bytes.
      __m256i Step = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
      __m256i Result = _mm256_setzero_si256();
      for (int i = 0; i < 4096; i += 8) {
        // R = R0 R1 R2 R3 R4 R5 R6 R7
        __m256i R = _mm256_i32gather_epi32(Input + 3*i, Step, 4);
        // G = G0 G1 G2 G3 G4 G5 G6 G7
        __m256i G = _mm256_i32gather_epi32(Input + 3*i + 1, Step, 4);
        // B = B0 B1 B2 B3 B4 B5 B6 B7
        __m256i B = _mm256_i32gather_epi32(Input + 3*i + 2, Step, 4);
        Result = _mm256_add_epi32(Result,
                                  some_parallelizable_algorithm<int32_t>(R, G, B));
      }
      // Here should be the less interesting part:
      // Perform a reduction on Result and return the result
    }

First, this can be done because there are gather instructions for 32-bit elements, but there are none for 16-bit elements or 8-bit elements. Second, and more importantly, the gather instruction above should be entirely avoided for performance reasons. It is probably more efficient to use contiguous wide loads and shuffle the loaded values to obtain the R, G and B vectors.

    template <>
    int32_t Process<int32_t>(const int32_t* Input) {
      __m256i Result = _mm256_setzero_si256();
      for (int i = 0; i < 4096; i += 8) {
        // Three contiguous 256-bit loads cover 24 int32 values = 8 RGB triples.
        __m256i Ld0 = _mm256_lddqu_si256((const __m256i*)(Input + 3*i));
        __m256i Ld1 = _mm256_lddqu_si256((const __m256i*)(Input + 3*i + 8));
        __m256i Ld2 = _mm256_lddqu_si256((const __m256i*)(Input + 3*i + 16));
        __m256i R = ???
        __m256i G = ???
        __m256i B = ???
        Result = _mm256_add_epi32(Result,
                                  some_parallelizable_algorithm<int32_t>(R, G, B));
      }
      // Here should be the less interesting part:
      // Perform a reduction on Result and return the result
    }

It seems that for power-of-2 strides (2, 4, ...) there are known methods using UNPCKL/UNPCKH, but for stride-3 accesses I could not find any references.
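
For concreteness, here is a minimal stride-2 sketch of that unpack approach (SSE, float). This is my own illustration, not part of the original question; the pointer p and the variable names are assumed:

    float *p;                              // address of the first x0 y0 pair
    __m128 a  = _mm_loadu_ps(p);           // x0 y0 x1 y1
    __m128 b  = _mm_loadu_ps(p + 4);       // x2 y2 x3 y3
    __m128 lo = _mm_unpacklo_ps(a, b);     // x0 x2 y0 y2
    __m128 hi = _mm_unpackhi_ps(a, b);     // x1 x3 y1 y3
    __m128 x  = _mm_unpacklo_ps(lo, hi);   // x0 x1 x2 x3
    __m128 y  = _mm_unpackhi_ps(lo, hi);   // y0 y1 y2 y3

Each unpack round halves the interleaving, which is why the power-of-2 strides fall out so cleanly; stride 3 does not reduce the same way.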

I am interested in solving this for T=int32_t, T=int16_t and T=int8_t, but to remain focused let's only discuss the first case.

Solution

This article from Intel describes how to do exactly the 3x8 case that you want.

That article covers the float case. If you want int32, you'll need to cast the outputs since there's no integer version of _mm256_shuffle_ps().

Copying their solution verbatim:

float *p;  // address of first vector
__m128 *m = (__m128*) p;
__m256 m03;
__m256 m14; 
__m256 m25; 
m03  = _mm256_castps128_ps256(m[0]); // load lower halves
m14  = _mm256_castps128_ps256(m[1]);
m25  = _mm256_castps128_ps256(m[2]);
m03  = _mm256_insertf128_ps(m03 ,m[3],1);  // load upper halves
m14  = _mm256_insertf128_ps(m14 ,m[4],1);
m25  = _mm256_insertf128_ps(m25 ,m[5],1);

__m256 xy = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE( 2,1,3,2)); // upper x's and y's 
__m256 yz = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE( 1,0,2,1)); // lower y's and z's
__m256 x  = _mm256_shuffle_ps(m03, xy , _MM_SHUFFLE( 2,0,3,0)); 
__m256 y  = _mm256_shuffle_ps(yz , xy , _MM_SHUFFLE( 3,1,2,0)); 
__m256 z  = _mm256_shuffle_ps(yz , m25, _MM_SHUFFLE( 3,0,3,1)); 

So this is 11 instructions. (6 loads, 5 shuffles)
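
For the int32_t case asked about, the same shuffles can be reused by bitcasting, as noted above. A minimal sketch follows; the function name, the unaligned loads, and the pointer casts are my own assumptions rather than part of the quoted article:

    #include <immintrin.h>
    #include <stdint.h>

    // De-interleave 8 RGB int32 triples starting at p into R, G, B vectors.
    // The shuffles only move 32-bit lanes, so reinterpreting the integer data
    // as float bit patterns is safe here.
    static inline void deinterleave_rgb_epi32(const int32_t* p,
                                              __m256i* R, __m256i* G, __m256i* B)
    {
        const float* f = (const float*)p;
        __m256 m03 = _mm256_castps128_ps256(_mm_loadu_ps(f +  0));  // load lower halves
        __m256 m14 = _mm256_castps128_ps256(_mm_loadu_ps(f +  4));
        __m256 m25 = _mm256_castps128_ps256(_mm_loadu_ps(f +  8));
        m03 = _mm256_insertf128_ps(m03, _mm_loadu_ps(f + 12), 1);   // load upper halves
        m14 = _mm256_insertf128_ps(m14, _mm_loadu_ps(f + 16), 1);
        m25 = _mm256_insertf128_ps(m25, _mm_loadu_ps(f + 20), 1);

        __m256 xy = _mm256_shuffle_ps(m14, m25, _MM_SHUFFLE(2,1,3,2)); // upper x's and y's
        __m256 yz = _mm256_shuffle_ps(m03, m14, _MM_SHUFFLE(1,0,2,1)); // lower y's and z's
        *R = _mm256_castps_si256(_mm256_shuffle_ps(m03, xy,  _MM_SHUFFLE(2,0,3,0)));
        *G = _mm256_castps_si256(_mm256_shuffle_ps(yz,  xy,  _MM_SHUFFLE(3,1,2,0)));
        *B = _mm256_castps_si256(_mm256_shuffle_ps(yz,  m25, _MM_SHUFFLE(3,0,3,1)));
    }

The casts are pure reinterpretations and should compile away, so the count stays at 6 loads and 5 shuffles.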


In the general case, it's possible to do an S x W transpose in O(S*log(W)) instructions. Where:

  • S is the stride
  • W is the SIMD width

Assuming the existence of 2-vector permutes and half-vector insert-loads, then the formula becomes:

(S x W load-permute)  <=  S * (lg(W) + 1) instructions

Ignoring reg-reg moves. For degenerate cases like the 3 x 4, it may be possible to do better.
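
Plugging the two concrete cases into the formula: the 3 x 8 AVX sequence above is bounded by 3 * (lg(8) + 1) = 12 and actually uses 11 instructions (6 loads, 5 shuffles); the 3 x 16 AVX512 sequence below is bounded by 3 * (lg(16) + 1) = 15 and uses exactly that (6 loads, 3 shuffles, 6 blends).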

Here's the 3 x 16 load-transpose with AVX512: (6 loads, 3 shuffles, 6 blends)

FORCE_INLINE void transpose_f32_16x3_forward_AVX512(
    const float T[48],
    __m512& r0, __m512& r1, __m512& r2
){
    __m512 a0, a1, a2;

    //   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
    //  16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    //  32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47

    a0 = _mm512_castps256_ps512(_mm256_loadu_ps(T +  0));
    a1 = _mm512_castps256_ps512(_mm256_loadu_ps(T +  8));
    a2 = _mm512_castps256_ps512(_mm256_loadu_ps(T + 16));
    a0 = _mm512_insertf32x8(a0, ((const __m256*)T)[3], 1);
    a1 = _mm512_insertf32x8(a1, ((const __m256*)T)[4], 1);
    a2 = _mm512_insertf32x8(a2, ((const __m256*)T)[5], 1);

    //   0  1  2  3  4  5  6  7 24 25 26 27 28 29 30 31
    //   8  9 10 11 12 13 14 15 32 33 34 35 36 37 38 39
    //  16 17 18 19 20 21 22 23 40 41 42 43 44 45 46 47

    r0 = _mm512_mask_blend_ps(0xf0f0, a0, a1);
    r1 = _mm512_permutex2var_ps(a0, _mm512_setr_epi32(  4,  5,  6,  7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27), a2);
    r2 = _mm512_mask_blend_ps(0xf0f0, a1, a2);

    //   0  1  2  3 12 13 14 15 24 25 26 27 36 37 38 39
    //   4  5  6  7 16 17 18 19 28 29 30 31 40 41 42 43
    //   8  9 10 11 20 21 22 23 32 33 34 35 44 45 46 47

    a0 = _mm512_mask_blend_ps(0xcccc, r0, r1);
    a1 = _mm512_shuffle_ps(r0, r2, 78);
    a2 = _mm512_mask_blend_ps(0xcccc, r1, r2);

    //   0  1  6  7 12 13 18 19 24 25 30 31 36 37 42 43
    //   2  3  8  9 14 15 20 21 26 27 32 33 38 39 44 45
    //   4  5 10 11 16 17 22 23 28 29 34 35 40 41 46 47

    r0 = _mm512_mask_blend_ps(0xaaaa, a0, a1);
    r1 = _mm512_permutex2var_ps(a0, _mm512_setr_epi32(  1,  16,  3, 18,  5, 20,  7, 22,  9, 24, 11, 26, 13, 28, 15, 30), a2);
    r2 = _mm512_mask_blend_ps(0xaaaa, a1, a2);

    //   0  3  6  9 12 15 18 21 24 27 30 33 36 39 42 45
    //   1  4  7 10 13 16 19 22 25 28 31 34 37 40 43 46
    //   2  5  8 11 14 17 20 23 26 29 32 35 38 41 44 47
}

The inverse 3 x 16 transpose-store will be left as an exercise to the reader.

The pattern is not at all trivial to see since S = 3 is somewhat degenerate. But if you can see the pattern, you'll be able to generalize this to any odd integer S as well as any power-of-two W.
