使用AVX/AVX2转置8x8浮动 [英] Transpose an 8x8 float using AVX/AVX2

查看:112
本文介绍了使用AVX/AVX2转置8x8浮动的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

可以通过制作四个4x4矩阵并对每个矩阵进行转置来实现8x8矩阵的转置. 这不是我想要的.

Transposing a 8x8 matrix can be achieved by making four 4x4 matrices, and transposing each of them. This is not want I'm going for.

在另一个问题中,一个答案提供了解决方案,对于8x8矩阵仅需要24条指令即可.但是,这不适用于浮点数.

In another question, one answer gave a solution that would only require 24 instructions for an 8x8 matrix. However, this does not apply to floats.

由于AVX2包含256位的寄存器,因此每个寄存器都可以容纳8个32位的整数(浮点数).但是问题是:

Since the AVX2 contains registers of 256 bits, each register would fit eight 32 bits integers (floats). But the question is:

如何使用AVX/AVX2以尽可能小的指令转置8x8浮点矩阵?

推荐答案

我已经回答了这个问题

I already answered this question Fast memory transpose with SSE, AVX, and OpenMP.

让我重复一下用AVX转置8x8浮点矩阵的解决方案.让我知道这是否比使用4x4块和_MM_TRANSPOSE4_PS更快.我将它用于更大的矩阵转置中的内核,该转置受内存限制,因此可能不是一个公平的测试.

Let me repeat the solution for transposing an 8x8 float matrix with AVX. Let me know if this is any faster than using 4x4 blocks and _MM_TRANSPOSE4_PS. I used it for a kernel in a larger matrix transpose which was memory bound so that was probably not a fair test.

inline void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
__t0 = _mm256_unpacklo_ps(row0, row1);
__t1 = _mm256_unpackhi_ps(row0, row1);
__t2 = _mm256_unpacklo_ps(row2, row3);
__t3 = _mm256_unpackhi_ps(row2, row3);
__t4 = _mm256_unpacklo_ps(row4, row5);
__t5 = _mm256_unpackhi_ps(row4, row5);
__t6 = _mm256_unpacklo_ps(row6, row7);
__t7 = _mm256_unpackhi_ps(row6, row7);
__tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
__tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
__tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
__tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
__tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
__tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
__tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
__tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}


基于此评论我了解到有更有效的方法可以进行8x8换位.请参见英特尔优化手册下的"11.11处理端口5压力"部分.例11-19使用了相同数量的指令,但通过使用也要进入端口0的混合,降低了对端口5的压力.我可能会在某个时候使用内在函数来实现这一点,但目前我不需要这样做.


Based on this comment I learned that there are more efficient methods which to do the 8x8 transpose. See Example 11-19 and and 11-20 in the Intel optimization manual under section "11.11 Handling Port 5 Pressure". Example 11-19 uses the same number of instructions but reduces the pressure on port5 by using blends which go to port0 as well. I may implement this with intrinsics at some point but I don't have a need for this at this point.

在上面提到的英特尔手册中,我更加仔细地研究了示例11-19和11-20.事实证明,示例11-19使用了4次不必要的混洗操作.它具有8个解压缩包,12个混洗和8个128位排列.我的方法减少了4次混洗.它们将8个混洗替换为混纺.因此有4个混洗和8个混和.我怀疑这是否比仅用8个随机播放的方法更好.

I looked more carefully into Example 11-19 and 11-20 in the Intel Manuals I mentioned above. It turns out that example 11-19 uses 4 more shuffle operations than necessary. It has 8 unpack, 12 shuffles, and 8 128-bit permutes. My method uses 4 fewer shuffles. They replace 8 of the shuffles with blends. So 4 shuffles and 8 blends. I doubt that's better than my method with only eight shuffles.

但是,如果需要从内存中加载矩阵,则示例11-20是一个改进.它使用8个拆包,8个插入,8个随机播放,8个128位加载和8个存储. 128位负载降低了端口压力.我继续并使用内在函数实现了这一点.

Example 11-20 is, however, an improvement if you need to load the matrix from memory. This uses 8 unpacks, 8 inserts, 8 shuffles, 8 128-bit loads, and 8 stores. The 128-bit loads reduce the port pressure. I went ahead and implemented this using intrinsics.

//Example 11-20. 8x8 Matrix Transpose Using VINSERTF128 loads
void tran(float* mat, float* matT) {
  __m256  r0, r1, r2, r3, r4, r5, r6, r7;
  __m256  t0, t1, t2, t3, t4, t5, t6, t7;

  r0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+0])), _mm_load_ps(&mat[4*8+0]), 1);
  r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+0])), _mm_load_ps(&mat[5*8+0]), 1);
  r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+0])), _mm_load_ps(&mat[6*8+0]), 1);
  r3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+0])), _mm_load_ps(&mat[7*8+0]), 1);
  r4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+4])), _mm_load_ps(&mat[4*8+4]), 1);
  r5 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+4])), _mm_load_ps(&mat[5*8+4]), 1);
  r6 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+4])), _mm_load_ps(&mat[6*8+4]), 1);
  r7 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+4])), _mm_load_ps(&mat[7*8+4]), 1);

  t0 = _mm256_unpacklo_ps(r0,r1);
  t1 = _mm256_unpackhi_ps(r0,r1);
  t2 = _mm256_unpacklo_ps(r2,r3);
  t3 = _mm256_unpackhi_ps(r2,r3);
  t4 = _mm256_unpacklo_ps(r4,r5);
  t5 = _mm256_unpackhi_ps(r4,r5);
  t6 = _mm256_unpacklo_ps(r6,r7);
  t7 = _mm256_unpackhi_ps(r6,r7);

  r0 = _mm256_shuffle_ps(t0,t2, 0x44);
  r1 = _mm256_shuffle_ps(t0,t2, 0xEE);
  r2 = _mm256_shuffle_ps(t1,t3, 0x44);
  r3 = _mm256_shuffle_ps(t1,t3, 0xEE);
  r4 = _mm256_shuffle_ps(t4,t6, 0x44);
  r5 = _mm256_shuffle_ps(t4,t6, 0xEE);
  r6 = _mm256_shuffle_ps(t5,t7, 0x44);
  r7 = _mm256_shuffle_ps(t5,t7, 0xEE);

  _mm256_store_ps(&matT[0*8], r0);
  _mm256_store_ps(&matT[1*8], r1);
  _mm256_store_ps(&matT[2*8], r2);
  _mm256_store_ps(&matT[3*8], r3);
  _mm256_store_ps(&matT[4*8], r4);
  _mm256_store_ps(&matT[5*8], r5);
  _mm256_store_ps(&matT[6*8], r6);
  _mm256_store_ps(&matT[7*8], r7);
}


因此,我再次查看示例11-19.据我所知,基本思想是可以用1个shuffle和2个blends代替2个shuffle指令(shufps).例如


So I looked into example 11-19 again. The basic idea as far as I can tell is that two shuffle instructions (shufps) can be replaced by one shuffle and two blends. For example

r0 = _mm256_shuffle_ps(t0,t2, 0x44);
r1 = _mm256_shuffle_ps(t0,t2, 0xEE);

可以替换为

v = _mm256_shuffle_ps(t0,t2, 0x4E);
r0 = _mm256_blend_ps(t0, v, 0xCC);
r1 = _mm256_blend_ps(t2, v, 0x33);

这说明了为什么我的原始代码使用8个混洗,示例11-19使用4个混洗和8个混和.

This explains why my original code used 8 shuffles and Example 11-19 uses 4 shuffles and eight blends.

掺混物对吞吐量很有利,因为混排仅进入一个端口(在混排端口上形成瓶颈),但是混和可以在多个端口上运行,因此无法竞争.但是更好的是:8个混洗还是4个混洗和8个混和?

The blends are good for throughput because shuffles only go to one port (creating a bottleneck on the shuffle port), but blends can run on multiple ports and thus don't compete. But what is better: 8 shuffles or 4 shuffles and 8 blends?

这必须经过测试,并且可能取决于周围的代码.如果您在循环中使用 lot 不需要端口5的其他uops来限制总uop吞吐量,那么您可以使用纯随机播放版本.理想情况下,在转置数据已经存储在寄存器中之前,应该对其进行一些计算.请参见 https://agner.org/optimize/x86标签Wiki .

This has to be tested, and can depend on surrounding code. If you mostly bottleneck on total uop throughput with a lot of other uops in the loop that don't need port 5, you might go for the pure shuffle version. Ideally you should do some computation on the transposed data before storing it, while it's already in registers. See https://agner.org/optimize/ and other performance links in the x86 tag wiki.

但是,我没有找到一种用混合物替换解压说明的方法.

I don't, however, see a way to replace the unpack instructions with blends.

这里是完整的代码,其中结合了示例11-19将2个shuffle转换为1个shuffle和2个混合,以及示例11-20使用vinsertf128负载(在Intel Haswell/Skylake CPU上为2 uops:任何端口一个ALU) ,一个内存.不幸的是,它们没有微熔丝.所有寄存器操作数的vinsertf128对于Intel上的shuffle端口都是1 uop,所以这很好,因为编译器将负载折叠为vinsertf128的内存操作数.)这样做的优点是只需要将源数据对齐16字节即可获得最佳性能,从而避免了任何高速缓存行拆分.

Here is full code which combines Example 11-19 converting 2 shuffles to 1 shuffle and two blends and Example 11-20 which uses vinsertf128 loads (which on Intel Haswell/Skylake CPUs are 2 uops: one ALU for any port, one memory. They unfortunately don't micro-fuse. vinsertf128 with all register operands is 1 uop for the shuffle port on Intel, so this is good because the compiler folds the load into a memory operand for vinsertf128.) This has the advantage of only needing the source data 16-byte aligned for maximum performance, avoiding any cache-line splits.

#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>   

/*
void tran(float* mat, float* matT) {
  __m256  r0, r1, r2, r3, r4, r5, r6, r7;
  __m256  t0, t1, t2, t3, t4, t5, t6, t7;

  r0 = _mm256_load_ps(&mat[0*8]);
  r1 = _mm256_load_ps(&mat[1*8]);
  r2 = _mm256_load_ps(&mat[2*8]);
  r3 = _mm256_load_ps(&mat[3*8]);
  r4 = _mm256_load_ps(&mat[4*8]);
  r5 = _mm256_load_ps(&mat[5*8]);
  r6 = _mm256_load_ps(&mat[6*8]);
  r7 = _mm256_load_ps(&mat[7*8]);

  t0 = _mm256_unpacklo_ps(r0, r1);
  t1 = _mm256_unpackhi_ps(r0, r1);
  t2 = _mm256_unpacklo_ps(r2, r3);
  t3 = _mm256_unpackhi_ps(r2, r3);
  t4 = _mm256_unpacklo_ps(r4, r5);
  t5 = _mm256_unpackhi_ps(r4, r5);
  t6 = _mm256_unpacklo_ps(r6, r7);
  t7 = _mm256_unpackhi_ps(r6, r7);

  r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));  
  r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));
  r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));
  r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));
  r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));
  r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));
  r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));
  r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));

  t0 = _mm256_permute2f128_ps(r0, r4, 0x20);
  t1 = _mm256_permute2f128_ps(r1, r5, 0x20);
  t2 = _mm256_permute2f128_ps(r2, r6, 0x20);
  t3 = _mm256_permute2f128_ps(r3, r7, 0x20);
  t4 = _mm256_permute2f128_ps(r0, r4, 0x31);
  t5 = _mm256_permute2f128_ps(r1, r5, 0x31);
  t6 = _mm256_permute2f128_ps(r2, r6, 0x31);
  t7 = _mm256_permute2f128_ps(r3, r7, 0x31);

  _mm256_store_ps(&matT[0*8], t0);
  _mm256_store_ps(&matT[1*8], t1);
  _mm256_store_ps(&matT[2*8], t2);
  _mm256_store_ps(&matT[3*8], t3);
  _mm256_store_ps(&matT[4*8], t4);
  _mm256_store_ps(&matT[5*8], t5);
  _mm256_store_ps(&matT[6*8], t6);
  _mm256_store_ps(&matT[7*8], t7);
}
*/

void tran(float* mat, float* matT) {
  __m256  r0, r1, r2, r3, r4, r5, r6, r7;
  __m256  t0, t1, t2, t3, t4, t5, t6, t7;

  r0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+0])), _mm_load_ps(&mat[4*8+0]), 1);
  r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+0])), _mm_load_ps(&mat[5*8+0]), 1);
  r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+0])), _mm_load_ps(&mat[6*8+0]), 1);
  r3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+0])), _mm_load_ps(&mat[7*8+0]), 1);
  r4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+4])), _mm_load_ps(&mat[4*8+4]), 1);
  r5 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+4])), _mm_load_ps(&mat[5*8+4]), 1);
  r6 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+4])), _mm_load_ps(&mat[6*8+4]), 1);
  r7 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+4])), _mm_load_ps(&mat[7*8+4]), 1);

  t0 = _mm256_unpacklo_ps(r0,r1);
  t1 = _mm256_unpackhi_ps(r0,r1);
  t2 = _mm256_unpacklo_ps(r2,r3);
  t3 = _mm256_unpackhi_ps(r2,r3);
  t4 = _mm256_unpacklo_ps(r4,r5);
  t5 = _mm256_unpackhi_ps(r4,r5);
  t6 = _mm256_unpacklo_ps(r6,r7);
  t7 = _mm256_unpackhi_ps(r6,r7);

  __m256 v;

  //r0 = _mm256_shuffle_ps(t0,t2, 0x44);
  //r1 = _mm256_shuffle_ps(t0,t2, 0xEE);  
  v = _mm256_shuffle_ps(t0,t2, 0x4E);
  r0 = _mm256_blend_ps(t0, v, 0xCC);
  r1 = _mm256_blend_ps(t2, v, 0x33);

  //r2 = _mm256_shuffle_ps(t1,t3, 0x44);
  //r3 = _mm256_shuffle_ps(t1,t3, 0xEE);
  v = _mm256_shuffle_ps(t1,t3, 0x4E);
  r2 = _mm256_blend_ps(t1, v, 0xCC);
  r3 = _mm256_blend_ps(t3, v, 0x33);

  //r4 = _mm256_shuffle_ps(t4,t6, 0x44);
  //r5 = _mm256_shuffle_ps(t4,t6, 0xEE);
  v = _mm256_shuffle_ps(t4,t6, 0x4E);
  r4 = _mm256_blend_ps(t4, v, 0xCC);
  r5 = _mm256_blend_ps(t6, v, 0x33);

  //r6 = _mm256_shuffle_ps(t5,t7, 0x44);
  //r7 = _mm256_shuffle_ps(t5,t7, 0xEE);
  v = _mm256_shuffle_ps(t5,t7, 0x4E);
  r6 = _mm256_blend_ps(t5, v, 0xCC);
  r7 = _mm256_blend_ps(t7, v, 0x33);

  _mm256_store_ps(&matT[0*8], r0);
  _mm256_store_ps(&matT[1*8], r1);
  _mm256_store_ps(&matT[2*8], r2);
  _mm256_store_ps(&matT[3*8], r3);
  _mm256_store_ps(&matT[4*8], r4);
  _mm256_store_ps(&matT[5*8], r5);
  _mm256_store_ps(&matT[6*8], r6);
  _mm256_store_ps(&matT[7*8], r7);
}


int verify(float *mat) {
    int i,j;
    int error = 0;
    for(i=0; i<8; i++) {
      for(j=0; j<8; j++) {
        if(mat[j*8+i] != 1.0f*i*8+j) error++;
      }
    }
    return error;
}

void print_mat(float *mat) {
    int i,j;
    for(i=0; i<8; i++) {
      for(j=0; j<8; j++) printf("%2.0f ", mat[i*8+j]);
      puts("");
    }
    puts("");
}

int main(void) {
    int i,j, rep;
    float mat[64] __attribute__((aligned(64)));
    float matT[64] __attribute__((aligned(64)));
    double dtime;

    rep = 10000000;
    for(i=0; i<64; i++) mat[i] = i;
    print_mat(mat);

    tran(mat,matT);
    //dtime = -omp_get_wtime();
    //tran(mat, matT, rep);
    //dtime += omp_get_wtime();
    printf("errors %d\n", verify(matT));
    //printf("dtime %f\n", dtime);
    print_mat(matT);
}

这篇关于使用AVX/AVX2转置8x8浮动的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆