用SSE更好的8x8字节矩阵转置? [英] A better 8x8 bytes matrix transpose with SSE?

查看:200
本文介绍了用SSE更好的8x8字节矩阵转置?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我发现了这篇文章,它解释了如何通过24次操作转置8x8字节矩阵,以及几卷之后的内容实现转置的代码.但是,这种方法并没有利用我们可以将8x8转置为四个4x4转置的事实,并且每个只能在一条shuffle指令中完成(

I found this post that explains how to transpose an 8x8 bytes matrix with 24 operations, and a few scrolls later there's the code that implements the transpose. However, this method does not exploit the fact that we can block the 8x8 transpose into four 4x4 transposes, and each one can be done in one shuffle instruction only (this post is the reference). So I came out with this solution:

__m128i transpose4x4mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13,  9, 5, 1, 12,  8, 4, 0);
__m128i shuffle8x8Mask = _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 4,  5, 6, 7, 12,  13, 14, 15);

void TransposeBlock8x8(uint8_t *src, uint8_t *dst, int srcStride, int dstStride) {
    __m128i load0 = _mm_set_epi64x(*(uint64_t*)(src + 1 * srcStride), *(uint64_t*)(src + 0 * srcStride));
    __m128i load1 = _mm_set_epi64x(*(uint64_t*)(src + 3 * srcStride), *(uint64_t*)(src + 2 * srcStride));
    __m128i load2 = _mm_set_epi64x(*(uint64_t*)(src + 5 * srcStride), *(uint64_t*)(src + 4 * srcStride));
    __m128i load3 = _mm_set_epi64x(*(uint64_t*)(src + 7 * srcStride), *(uint64_t*)(src + 6 * srcStride));

    __m128i shuffle0 = _mm_shuffle_epi8(load0, shuffle8x8Mask);
    __m128i shuffle1 = _mm_shuffle_epi8(load1, shuffle8x8Mask);
    __m128i shuffle2 = _mm_shuffle_epi8(load2, shuffle8x8Mask);
    __m128i shuffle3 = _mm_shuffle_epi8(load3, shuffle8x8Mask);

    __m128i block0 = _mm_unpacklo_epi64(shuffle0, shuffle1);
    __m128i block1 = _mm_unpackhi_epi64(shuffle0, shuffle1);
    __m128i block2 = _mm_unpacklo_epi64(shuffle2, shuffle3);
    __m128i block3 = _mm_unpackhi_epi64(shuffle2, shuffle3);

    __m128i transposed0 = _mm_shuffle_epi8(block0, transpose4x4mask);   
    __m128i transposed1 = _mm_shuffle_epi8(block1, transpose4x4mask);   
    __m128i transposed2 = _mm_shuffle_epi8(block2, transpose4x4mask);   
    __m128i transposed3 = _mm_shuffle_epi8(block3, transpose4x4mask);   

    __m128i store0 = _mm_unpacklo_epi32(transposed0, transposed2);
    __m128i store1 = _mm_unpackhi_epi32(transposed0, transposed2);
    __m128i store2 = _mm_unpacklo_epi32(transposed1, transposed3);
    __m128i store3 = _mm_unpackhi_epi32(transposed1, transposed3);

    *((uint64_t*)(dst + 0 * dstStride)) = _mm_extract_epi64(store0, 0);
    *((uint64_t*)(dst + 1 * dstStride)) = _mm_extract_epi64(store0, 1);
    *((uint64_t*)(dst + 2 * dstStride)) = _mm_extract_epi64(store1, 0);
    *((uint64_t*)(dst + 3 * dstStride)) = _mm_extract_epi64(store1, 1);
    *((uint64_t*)(dst + 4 * dstStride)) = _mm_extract_epi64(store2, 0);
    *((uint64_t*)(dst + 5 * dstStride)) = _mm_extract_epi64(store2, 1);
    *((uint64_t*)(dst + 6 * dstStride)) = _mm_extract_epi64(store3, 0);
    *((uint64_t*)(dst + 7 * dstStride)) = _mm_extract_epi64(store3, 1);
}

不包括加载/存储操作,此过程仅包含16条指令,而不是24条指令.

Excluding load/store operations this procedure consists of only 16 instructions instead of 24.

我想念什么?

推荐答案

除了要读取和写入内存的加载,存储和pinsrq -s之外,步幅可能不等于8个字节, 您只需12条指令即可进行转置(此代码可轻松与Z玻色子的测试代码结合使用):

Apart from the loads, stores and pinsrq-s to read from and write to memory, with possibly a stride not equal to 8 bytes, you can do the transpose with only 12 instructions (this code can easily be used in combination with Z boson's test code):

void tran8x8b_SSE_v2(char *A, char *B) {
  __m128i pshufbcnst = _mm_set_epi8(15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0);

  __m128i B0, B1, B2, B3, T0, T1, T2, T3;
  B0 = _mm_loadu_si128((__m128i*)&A[ 0]);
  B1 = _mm_loadu_si128((__m128i*)&A[16]);
  B2 = _mm_loadu_si128((__m128i*)&A[32]);
  B3 = _mm_loadu_si128((__m128i*)&A[48]);


  T0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B0),_mm_castsi128_ps(B1),0b10001000));
  T1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B2),_mm_castsi128_ps(B3),0b10001000));
  T2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B0),_mm_castsi128_ps(B1),0b11011101));
  T3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B2),_mm_castsi128_ps(B3),0b11011101));

  B0 = _mm_shuffle_epi8(T0,pshufbcnst);
  B1 = _mm_shuffle_epi8(T1,pshufbcnst);
  B2 = _mm_shuffle_epi8(T2,pshufbcnst);
  B3 = _mm_shuffle_epi8(T3,pshufbcnst);

  T0 = _mm_unpacklo_epi32(B0,B1);
  T1 = _mm_unpackhi_epi32(B0,B1);
  T2 = _mm_unpacklo_epi32(B2,B3);
  T3 = _mm_unpackhi_epi32(B2,B3);

  _mm_storeu_si128((__m128i*)&B[ 0], T0);
  _mm_storeu_si128((__m128i*)&B[16], T1);
  _mm_storeu_si128((__m128i*)&B[32], T2);
  _mm_storeu_si128((__m128i*)&B[48], T3);
}


在这里,我们使用32位浮点混洗,它比epi32混洗更灵活. 强制转换不会生成额外的指令(使用gcc 5.4生成的代码):

Here we use the 32 bit floating point shuffle which is more flexible than the epi32 shuffle. The casts do not generate extra instructions (code generated with gcc 5.4):

tran8x8b_SSE_v2:
.LFB4885:
    .cfi_startproc
    vmovdqu 48(%rdi), %xmm5
    vmovdqu 32(%rdi), %xmm2
    vmovdqu 16(%rdi), %xmm0
    vmovdqu (%rdi), %xmm1
    vshufps $136, %xmm5, %xmm2, %xmm4
    vshufps $221, %xmm5, %xmm2, %xmm2
    vmovdqa .LC6(%rip), %xmm5
    vshufps $136, %xmm0, %xmm1, %xmm3
    vshufps $221, %xmm0, %xmm1, %xmm1
    vpshufb %xmm5, %xmm3, %xmm3
    vpshufb %xmm5, %xmm1, %xmm0
    vpshufb %xmm5, %xmm4, %xmm4
    vpshufb %xmm5, %xmm2, %xmm1
    vpunpckldq  %xmm4, %xmm3, %xmm5
    vpunpckldq  %xmm1, %xmm0, %xmm2
    vpunpckhdq  %xmm4, %xmm3, %xmm3
    vpunpckhdq  %xmm1, %xmm0, %xmm0
    vmovups %xmm5, (%rsi)
    vmovups %xmm3, 16(%rsi)
    vmovups %xmm2, 32(%rsi)
    vmovups %xmm0, 48(%rsi)
    ret
    .cfi_endproc



在某些(但不是全部)较旧的cpus上,可能存在较小的旁路延迟(0到2个周期之间),以便在 整数和浮点单位.这会增加功能的延迟时间,但不一定会影响 代码的吞吐量.



On some, but not all, older cpus there might be a small bypass delay (between 0 and 2 cycles) for moving data between the integer and the floating point units. This increases the latency of the function, but it does not necessarily affect the throughput of the code.

具有1e9转录的简单潜伏期测试:

A simple latency test with 1e9 tranpositions:

  for (int i=0;i<500000000;i++){
     tran8x8b_SSE(A,C);
     tran8x8b_SSE(C,A);
  }
  print8x8b(A);

使用tran8x8b_SSE大约需要5.5秒(19.7e9周期),而使用tran8x8b_SSE_v2(Intel核心i5-6500)大约需要4.5秒(16.0e9周期).注意 尽管在for循环中内联了函数,但是编译器并没有消除装入和存储.

This takes about 5.5 seconds (19.7e9 cycles) with tran8x8b_SSE and 4.5 seconds (16.0e9 cycles) with tran8x8b_SSE_v2 (Intel core i5-6500). Note that the load and stores were not eliminated by the compiler, although the functions were inlined in the for loop.


更新:带有混合功能的AVX2-128/SSE 4.1解决方案.

随机播放"(拆包,随机播放)由端口5处理,现代cpus上每个CPU周期有1条指令. 有时,用两种混合物代替一种洗牌"是值得的.在Skylake上,32位混合指令可以在端口0、1或5上运行.

The 'shuffles' (unpack, shuffle) are handled by port 5, with 1 instruction per cpu cycle on modern cpus. Sometimes it pays off to replace one 'shuffle' with two blends. On Skylake the 32 bit blend instructions can run on either port 0, 1 or 5.

很遗憾,_mm_blend_epi32仅是AVX2-128.结合使用_mm_blend_ps是有效的SSE 4.1替代方案 有一些演员表(通常是免费的). 12个随机播放"被替换为 8个混洗和8个混和组合.

Unfortunately, _mm_blend_epi32 is only AVX2-128. An efficient SSE 4.1 alternative is _mm_blend_ps in combination with a few casts (which are usually free). The 12 'shuffles' are replaced by 8 shuffles in combination with 8 blends.

现在,简单的等待时间测试大约需要3.6秒(13e9 cpu周期),比tran8x8b_SSE_v2的结果快18%.

The simple latency test now runs in about 3.6 seconds (13e9 cpu cycles), which is 18 % faster than the results with tran8x8b_SSE_v2.

代码:

/* AVX2-128 version, sse 4.1 version see ---------------->       SSE 4.1 version of tran8x8b_AVX2_128()                                                              */
void tran8x8b_AVX2_128(char *A, char *B) {                   /*  void tran8x8b_SSE4_1(char *A, char *B) {                                                            */                                    
  __m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3,  
               13, 5, 9, 1,  14, 6,10, 2,  12, 4, 8, 0);     /*    __m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3,  13, 5, 9, 1,  14, 6,10, 2,  12, 4, 8, 0);       */                                    
  __m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1,  
               15, 7,11, 3,  12, 4, 8, 0,  14, 6,10, 2);     /*    __m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1,  15, 7,11, 3,  12, 4, 8, 0,  14, 6,10, 2);       */                                    
  __m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7,  
                9, 1,13, 5,  10, 2,14, 6,   8, 0,12, 4);     /*    __m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7,   9, 1,13, 5,  10, 2,14, 6,   8, 0,12, 4);       */                                    
  __m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5,  
               11, 3,15, 7,   8, 0,12, 4,  10, 2,14, 6);     /*    __m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5,  11, 3,15, 7,   8, 0,12, 4,  10, 2,14, 6);       */                                    
  __m128i B0, B1, B2, B3, T0, T1, T2, T3;                    /*    __m128 B0, B1, B2, B3, T0, T1, T2, T3;                                                            */                                    
                                                             /*                                                                                                      */                                    
  B0 = _mm_loadu_si128((__m128i*)&A[ 0]);                    /*    B0 = _mm_loadu_ps((float*)&A[ 0]);                                                                */                                    
  B1 = _mm_loadu_si128((__m128i*)&A[16]);                    /*    B1 = _mm_loadu_ps((float*)&A[16]);                                                                */                                    
  B2 = _mm_loadu_si128((__m128i*)&A[32]);                    /*    B2 = _mm_loadu_ps((float*)&A[32]);                                                                */                                    
  B3 = _mm_loadu_si128((__m128i*)&A[48]);                    /*    B3 = _mm_loadu_ps((float*)&A[48]);                                                                */                                    
                                                             /*                                                                                                      */                                    
  B1 = _mm_shuffle_epi32(B1,0b10110001);                     /*    B1 = _mm_shuffle_ps(B1,B1,0b10110001);                                                            */                                    
  B3 = _mm_shuffle_epi32(B3,0b10110001);                     /*    B3 = _mm_shuffle_ps(B3,B3,0b10110001);                                                            */                                    
  T0 = _mm_blend_epi32(B0,B1,0b1010);                        /*    T0 = _mm_blend_ps(B0,B1,0b1010);                                                                  */                                    
  T1 = _mm_blend_epi32(B2,B3,0b1010);                        /*    T1 = _mm_blend_ps(B2,B3,0b1010);                                                                  */                                    
  T2 = _mm_blend_epi32(B0,B1,0b0101);                        /*    T2 = _mm_blend_ps(B0,B1,0b0101);                                                                  */                                    
  T3 = _mm_blend_epi32(B2,B3,0b0101);                        /*    T3 = _mm_blend_ps(B2,B3,0b0101);                                                                  */                                    
                                                             /*                                                                                                      */                                    
  B0 = _mm_shuffle_epi8(T0,pshufbcnst_0);                    /*    B0 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T0),pshufbcnst_0));                       */                                    
  B1 = _mm_shuffle_epi8(T1,pshufbcnst_1);                    /*    B1 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T1),pshufbcnst_1));                       */                                    
  B2 = _mm_shuffle_epi8(T2,pshufbcnst_2);                    /*    B2 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T2),pshufbcnst_2));                       */                                    
  B3 = _mm_shuffle_epi8(T3,pshufbcnst_3);                    /*    B3 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T3),pshufbcnst_3));                       */                                    
                                                             /*                                                                                                      */                                    
  T0 = _mm_blend_epi32(B0,B1,0b1010);                        /*    T0 = _mm_blend_ps(B0,B1,0b1010);                                                                  */                                    
  T1 = _mm_blend_epi32(B0,B1,0b0101);                        /*    T1 = _mm_blend_ps(B0,B1,0b0101);                                                                  */                                    
  T2 = _mm_blend_epi32(B2,B3,0b1010);                        /*    T2 = _mm_blend_ps(B2,B3,0b1010);                                                                  */                                    
  T3 = _mm_blend_epi32(B2,B3,0b0101);                        /*    T3 = _mm_blend_ps(B2,B3,0b0101);                                                                  */                                    
  T1 = _mm_shuffle_epi32(T1,0b10110001);                     /*    T1 = _mm_shuffle_ps(T1,T1,0b10110001);                                                            */                                    
  T3 = _mm_shuffle_epi32(T3,0b10110001);                     /*    T3 = _mm_shuffle_ps(T3,T3,0b10110001);                                                            */                                    
                                                             /*                                                                                                      */                                    
  _mm_storeu_si128((__m128i*)&B[ 0], T0);                    /*    _mm_storeu_ps((float*)&B[ 0], T0);                                                                */                                    
  _mm_storeu_si128((__m128i*)&B[16], T1);                    /*    _mm_storeu_ps((float*)&B[16], T1);                                                                */                                    
  _mm_storeu_si128((__m128i*)&B[32], T2);                    /*    _mm_storeu_ps((float*)&B[32], T2);                                                                */                                    
  _mm_storeu_si128((__m128i*)&B[48], T3);                    /*    _mm_storeu_ps((float*)&B[48], T3);                                                                */                                    
}                                                            /*  }                                                                                                   */                                    

这篇关于用SSE更好的8x8字节矩阵转置?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆