How to improve memory performance/data locality of 64-bit C/Intel assembly program


Problem Description


I am using a hobby program to teach myself high performance computing techniques.

My PC has an Intel Ivy Bridge Core i7 3770 processor with 32 GB of memory and the free version of the Microsoft Visual Studio 2010 C compiler.

The 64-bit program needs about 20 GB of memory because it has five 4 GB lookup tables (bytevecM ... bytevecX below). The inner loop of this search program was written as a separate C file (since I may want to replace it later with an assembler version), shown below:

#define H_PRIME 1000003
int inner(
   const char* bytevecM,
   const char* bytevecD,
   const char* bytevecC,
   const char* bytevecL,
   const char* bytevecX,
   int startval, int endval,
   int m2, int d2, int c2, int l2, int x2,
   int* ps
)
{
   int* psin = ps;
   int qqm;
   int m3, m4, m5, m6, m7;
   int d3, d4, d5, d6, d7;
   int c3, c4, c5, c6, c7;
   int l3, l4, l5, l6, l7;
   int x3, x4, x5, x6, x7;
   int q3, q4, q5, q6, q7, q8;
   for (q3 = startval; q3 < endval; ++q3)
   {
      if (q3 == 10 || q3 == 13) continue;
      m3 = (m2 ^ q3) * H_PRIME;
      d3 = (d2 ^ q3) * H_PRIME;
      c3 = (c2 ^ q3) * H_PRIME;
      l3 = (l2 ^ q3) * H_PRIME;
      x3 = (x2 ^ q3) * H_PRIME;
      for (q4 = 1; q4 < 128; ++q4)
      {
         if (q4 == 10 || q4 == 13) continue;
         m4 = (m3 ^ q4) * H_PRIME;
         d4 = (d3 ^ q4) * H_PRIME;
         c4 = (c3 ^ q4) * H_PRIME;
         l4 = (l3 ^ q4) * H_PRIME;
         x4 = (x3 ^ q4) * H_PRIME;
         for (q5 = 1; q5 < 128; ++q5)
         {
            if (q5 == 10 || q5 == 13) continue;
            m5 = (m4 ^ q5) * H_PRIME;
            d5 = (d4 ^ q5) * H_PRIME;
            c5 = (c4 ^ q5) * H_PRIME;
            l5 = (l4 ^ q5) * H_PRIME;
            x5 = (x4 ^ q5) * H_PRIME;
            for (q6 = 1; q6 < 128; ++q6)
            {
               if (q6 == 10 || q6 == 13) continue;
               m6 = (m5 ^ q6) * H_PRIME;
               d6 = (d5 ^ q6) * H_PRIME;
               c6 = (c5 ^ q6) * H_PRIME;
               l6 = (l5 ^ q6) * H_PRIME;
               x6 = (x5 ^ q6) * H_PRIME;
               for (q7 = 1; q7 < 128; ++q7)
               {
                  if (q7 == 10 || q7 == 13) continue;
                  m7 = (m6 ^ q7) * H_PRIME;
                  d7 = (d6 ^ q7) * H_PRIME;
                  c7 = (c6 ^ q7) * H_PRIME;
                  l7 = (l6 ^ q7) * H_PRIME;
                  x7 = (x6 ^ q7) * H_PRIME;
                  for (q8 = 1; q8 < 128; ++q8)
                  {
                     if (q8 == 10 || q8 == 13) continue;
                     qqm = bytevecM[(unsigned int)(m7 ^ q8)];
                     if (qqm != 0
                     &&  qqm == bytevecD[(unsigned int)(d7 ^ q8)]
                     &&  qqm == bytevecC[(unsigned int)(c7 ^ q8)]
                     &&  qqm == bytevecL[(unsigned int)(l7 ^ q8)]
                     &&  qqm == bytevecX[(unsigned int)(x7 ^ q8)])
                     {
                        *ps++ = q3; *ps++ = q4; *ps++ = q5;
                        *ps++ = q6; *ps++ = q7; *ps++ = q8;
                        *ps++ = qqm;
                     }
                  }
               }
            }
         }
      }
   }
   return (int)(ps - psin);
}
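
For reference, each level of the nesting is just extending a running hash by one more 7-bit character. Restated from the code above as a plain function (H_PRIME = 1000003, as defined in the program):

static int extend_hash(int h, int c)
{
   /* One step of the hash chain above: m4 = extend_hash(m3, q4),
      m5 = extend_hash(m4, q5), ... and likewise for the d/c/l/x chains. */
   return (h ^ c) * H_PRIME;
}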

Note, by the way, that the above algorithm is easily parallelizable by running one copy of it in each thread with different start and end ranges.
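
As a minimal sketch of that partitioning, assuming MSVC/Win32 to match the build environment above (the thread count, the result-buffer size, and the externs standing in for the host program's setup are illustrative assumptions, not part of the original program):

#include <windows.h>
#include <stdlib.h>

#define NTHREADS 4                        /* illustrative; tune to core count */

/* Assumed to be set up by the host program before run_parallel() is called. */
extern const char *bytevecM, *bytevecD, *bytevecC, *bytevecL, *bytevecX;
extern int m2, d2, c2, l2, x2;
int inner(const char*, const char*, const char*, const char*, const char*,
          int, int, int, int, int, int, int, int*);

typedef struct {
   int startval, endval;
   int* ps;                               /* per-thread output: no locks needed */
   int nout;
} workitem;

static DWORD WINAPI worker(LPVOID arg)
{
   workitem* w = (workitem*)arg;
   w->nout = inner(bytevecM, bytevecD, bytevecC, bytevecL, bytevecX,
                   w->startval, w->endval, m2, d2, c2, l2, x2, w->ps);
   return 0;
}

int run_parallel(void)
{
   HANDLE h[NTHREADS];
   workitem w[NTHREADS];
   int t, total = 0;
   int step = (127 + NTHREADS - 1) / NTHREADS;    /* q3 covers [1, 128) */
   for (t = 0; t < NTHREADS; ++t) {
      w[t].startval = 1 + t * step;
      w[t].endval   = (t == NTHREADS - 1) ? 128 : 1 + (t + 1) * step;
      w[t].ps       = (int*)malloc(1 << 20);      /* result buffer: a guess */
      h[t] = CreateThread(NULL, 0, worker, &w[t], 0, NULL);
   }
   WaitForMultipleObjects(NTHREADS, h, TRUE, INFINITE);
   for (t = 0; t < NTHREADS; ++t) { total += w[t].nout; CloseHandle(h[t]); }
   return total;
}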

Using intuition, Intel intrinsics, and benchmarking each change individually, I was able to reduce the running time from around five hours down to three, as shown below:

#include <emmintrin.h>
#include <smmintrin.h>
#define H_PRIME 1000003
#define UNROLL(q8) qqm = bytevecM[(unsigned int)(m7 ^ q8)];    \
  if (qqm != 0                                                 \
  &&  qqm == bytevecD[(unsigned int)(s7.m128i_i32[0] ^ q8)]    \
  &&  qqm == bytevecC[(unsigned int)(s7.m128i_i32[1] ^ q8)]    \
  &&  qqm == bytevecL[(unsigned int)(s7.m128i_i32[2] ^ q8)]    \
  &&  qqm == bytevecX[(unsigned int)(s7.m128i_i32[3] ^ q8)]) { \
    ps[j++] = _mm_set_epi16(0, qqm, q8, q7, q6, q5, q4, q3); }
int inner(
   const char* bytevecM,
   const char* bytevecD,
   const char* bytevecC,
   const char* bytevecL,
   const char* bytevecX,
   int startval, int endval,
   int m2, int d2, int c2, int l2, int x2,
   __m128i* ps
)
{
   __m128i s2 = _mm_set_epi32(x2, l2, c2, d2);
   __m128i hp = _mm_set1_epi32(H_PRIME);
   __m128i xt[128];
   __m128i s3, s4, s5, s6, s7;
   int qqm;
   int m3, m4, m5, m6, m7;
   int q3, q4, q5, q6, q7;
   int j = 0;
   int z; for (z = 1; z < 128; ++z) { xt[z] = _mm_set1_epi32(z); }
   for (q3 = startval; q3 < endval; ++q3)
   {
      if (q3 == 10 || q3 == 13) continue;
      m3 = (m2 ^ q3) * H_PRIME;
      s3 = _mm_mullo_epi32(_mm_xor_si128(s2, xt[q3]), hp);
      for (q4 = 1; q4 < 128; ++q4)
      {
         if (q4 == 10 || q4 == 13) continue;
         m4 = (m3 ^ q4) * H_PRIME;
         s4 = _mm_mullo_epi32(_mm_xor_si128(s3, xt[q4]), hp);
         for (q5 = 1; q5 < 128; ++q5)
         {
            if (q5 == 10 || q5 == 13) continue;
            m5 = (m4 ^ q5) * H_PRIME;
            s5 = _mm_mullo_epi32(_mm_xor_si128(s4, xt[q5]), hp);
            for (q6 = 1; q6 < 128; ++q6)
            {
               if (q6 == 10 || q6 == 13) continue;
               m6 = (m5 ^ q6) * H_PRIME;
               s6 = _mm_mullo_epi32(_mm_xor_si128(s5, xt[q6]), hp);
               for (q7 = 1; q7 < 128; ++q7)
               {
                  if (q7 == 10 || q7 == 13) continue;
                  m7 = (m6 ^ q7) * H_PRIME;
                  s7 = _mm_mullo_epi32(_mm_xor_si128(s6, xt[q7]), hp);
                  UNROLL(1)
                  UNROLL(96)
                  UNROLL(2)
                  UNROLL(3)
                  UNROLL(4)
                  UNROLL(5)
                  UNROLL(6)
                  UNROLL(7)
                  UNROLL(8)
                  UNROLL(9)
                  UNROLL(11)
                  UNROLL(12)
                  UNROLL(14)
                  // ... snipped UNROLL(15) .. UNROLL(125)
                  UNROLL(126)
                  UNROLL(127)
               }
            }
         }
      }
   }
   return j;
}

Most of this speed up was from the manual unroll of the inner loop.

Since I am very new to Intel SSE/AVX instructions, please let me know if you just saw something above that made you pull a face.

Intel VTune reports the biggest hot spot occurs at the line:

UNROLL(1)

In the corresponding assembly code, the hot spots are shown below:

mov     eax, ecx                         0.917s
mov     edx, ecx                         0.062s
xor     rax, 0x1
movdqa  xmmword ptr [rsp+0x20], xmm0
mov     ebx, dword ptr [rsp+0x2c]        0.155s
mov     r11d, dword ptr [rsp+0x28]       0.949s
movsx   ecx, byte ptr [rax+rdi*1]        0.156s
mov     r9d, dword ptr [rsp+0x24]       91.132s
mov     r8d, dword ptr [rsp+0x20]        0.233s
test    ecx, ecx
jz      0x14000225b
---
mov     eax, r8d                         0.342s
xor     rax, 0x1                         0.047s
movsx   eax, byte ptr [rax+r13*1]        0.124s
cmp     ecx, eax                        12.631s
jnz     0x14000225b
---
mov     eax, r9d
xor     rax, 0x1
movsx   eax, byte ptr [rax+r12*1]
cmp     ecx, eax                         0.016s
jnz     0x14000225b

This seems like a "data locality" problem to me. Each time through the inner loop, the value of m7 varies wildly and unpredictably, in a 4 GB range, so you will likely get a cache miss for the first UNROLL(1) when looking up qqm=bytevecM[m7^1].

Since the subsequent UNROLL(2) .. UNROLL(127) XOR m7 with 2..127, you will usually get cache hits for the rest of the UNROLLs. Curiously, changing the order of the UNROLLs by moving UNROLL(96) to right after UNROLL(1) gave a significant speed up (presumably because q8 values of 64 and above fall in the other 64-byte line of the 128-byte block, so touching it early overlaps that line's miss with the first).

I understand that reading a byte from memory results in filling the (64-byte) cache line that contains the byte.
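
To make that concrete: since 1 <= q8 < 128, the XOR only flips the low seven bits of m7, so every bytevecM[m7 ^ q8] probe for a given m7 lands in the same 128-byte-aligned block, i.e. in at most two 64-byte cache lines. A tiny sanity check of this property (just the arithmetic, not part of the program):

#include <assert.h>

void check_block_locality(unsigned int m7)
{
   /* q8 in [1, 128) only changes bits 0..6, so the 128-byte-aligned block
      containing m7 ^ q8 is the same for every q8. */
   unsigned int block = m7 & ~127u;
   int q8;
   for (q8 = 1; q8 < 128; ++q8)
      assert(((m7 ^ (unsigned int)q8) & ~127u) == block);
}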

Since I am very new to this area, I welcome any advice or good references on how to speed up memory lookups, especially when dealing with large tables (in my case, 4 GB tables).

I can't see an obvious way to improve data locality with the above algorithm; suggestions on how that might be achieved are also welcome.

Update 29 March 2013

Since this post was written, I've been able to further reduce the running time, from three hours down to 20 minutes, as shown below.

Adding a 4 MB bitmap for each 4 GB bytevec reduced the running time to around 40 minutes; adding some _mm_prefetch calls then halved that again.

Note that the basic algorithm remains unchanged: data locality was improved by adding the bitmaps; latency was reduced by adding the _mm_prefetch calls.
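
Concretely, the bitmaps below pack one presence bit per 128-byte block of a bytevec into 64-bit words: bit (a >> 7) & 63 of word a >> 13 records whether block a >> 7 contains any nonzero byte, so a 4 GB table needs 2^32 / 128 bits = 4 MB of bitmap. The build step is not shown in the original; a sketch of it, assuming a 64-bit build where sizeof(size_t) == 8:

#include <stddef.h>

void build_bitmap(const unsigned char* bytevec, size_t nbytes, size_t* pbv)
{
   /* One bit per 128-byte block, 64 blocks per size_t word:
      a 4 GB table yields a 4 MB bitmap. */
   size_t a;
   for (a = 0; a < nbytes / (128 * 64); ++a) pbv[a] = 0;
   for (a = 0; a < nbytes; ++a)
      if (bytevec[a] != 0)
         pbv[a >> 13] |= (size_t)1 << ((a >> 7) & 63);
}

/* The matching test, as it appears in the inner loop below: */
int block_may_be_nonzero(const size_t* pbv, unsigned int a)
{
   return (pbv[a >> 13] & ((size_t)1 << ((a >> 7) & 63))) != 0;
}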

Suggestions for further performance improvements welcome. The improved program follows:

#include <emmintrin.h>
#include <smmintrin.h>
#define H_PRIME 1000003

#define UNROLL(qx) qqm = bytevecM[m7 ^ qx];         \
  if (qqm != 0                                      \
  &&  qqm == bytevecD[d7 ^ qx]) {                   \
    _mm_prefetch(&bytevecC[c7 ^ qx], _MM_HINT_T0);  \
    qd[nqd++] = qqm; qd[nqd++] = qx; }

int inner(
   const unsigned char* bitvecM,
   const unsigned char* bitvecD,
   const unsigned char* bitvecC,
   const unsigned char* bitvecL,
   const unsigned char* bitvecX,
   const unsigned char* bitvecV,
   const unsigned char* bitvecI,
   const unsigned char* bytevecM,
   const unsigned char* bytevecD,
   const unsigned char* bytevecC,
   const unsigned char* bytevecL,
   const unsigned char* bytevecX,
   int startval, int endval,
   int m2, int d2, int c2, int l2, int x2, int v2, int i2,
   __m128i* ps
)
{
   __declspec(align(16)) __m128i s2 = _mm_set_epi32(i2, v2, x2, l2);
   __declspec(align(16)) __m128i hp = _mm_set1_epi32(H_PRIME);
   __declspec(align(16)) __m128i xt[128];
   __declspec(align(16)) __m128i s3, s4, s5, s6;
   int m3, m4, m5, m6;
   int d3, d4, d5, d6;
   int c3, c4, c5, c6;
   unsigned int m7, d7, c7, l7, x7, v7, i7;
   int qqm;
   int q3, q4, q5, q6, q7, q8;
   int iret = 0;
   unsigned int qf[128*4];
   int nqf;
   int qz;
   int qd[128*2];
   int nqd;
   int cnt;
   int qh[128*3];
   int nqh;
   int qi[128*5];
   int nqi;
   unsigned int m7arr[128];
   unsigned int d7arr[128];
   const size_t* pbvM = (size_t*)bitvecM;
   const size_t* pbvD = (size_t*)bitvecD;
   const size_t* pbvC = (size_t*)bitvecC;
   const size_t* pbvL = (size_t*)bitvecL;
   const size_t* pbvX = (size_t*)bitvecX;
   const size_t* pbvV = (size_t*)bitvecV;
   const size_t* pbvI = (size_t*)bitvecI;
   int z; for (z = 1; z < 128; ++z) { xt[z] = _mm_set1_epi32(z); }

   for (q3 = startval; q3 < endval; ++q3)
   {
      if (q3 == 10 || q3 == 13) continue;
      m3 = (m2 ^ q3) * H_PRIME;
      d3 = (d2 ^ q3) * H_PRIME;
      c3 = (c2 ^ q3) * H_PRIME;
      s3 = _mm_mullo_epi32(_mm_xor_si128(s2, xt[q3]), hp);
      for (q4 = 1; q4 < 128; ++q4)
      {
         if (q4 == 10 || q4 == 13) continue;
         m4 = (m3 ^ q4) * H_PRIME;
         d4 = (d3 ^ q4) * H_PRIME;
         c4 = (c3 ^ q4) * H_PRIME;
         s4 = _mm_mullo_epi32(_mm_xor_si128(s3, xt[q4]), hp);
         for (q5 = 1; q5 < 128; ++q5)
         {
            if (q5 == 10 || q5 == 13) continue;
            m5 = (m4 ^ q5) * H_PRIME;
            d5 = (d4 ^ q5) * H_PRIME;
            c5 = (c4 ^ q5) * H_PRIME;
            s5 = _mm_mullo_epi32(_mm_xor_si128(s4, xt[q5]), hp);
            for (q6 = 1; q6 < 128; ++q6)
            {
               if (q6 == 10 || q6 == 13) continue;
               m6 = (m5 ^ q6) * H_PRIME;
               d6 = (d5 ^ q6) * H_PRIME;
               c6 = (c5 ^ q6) * H_PRIME;
               s6 = _mm_mullo_epi32(_mm_xor_si128(s5, xt[q6]), hp);
               for (q7 = 1; q7 < 128; ++q7)
               {
                  if (q7 == 10 || q7 == 13) continue;
                  m7arr[q7] = (unsigned int)( (m6 ^ q7) * H_PRIME );
                  _mm_prefetch((const char*)(&pbvM[m7arr[q7] >> 13]), _MM_HINT_T0);
                  d7arr[q7] = (unsigned int)( (d6 ^ q7) * H_PRIME );
                  _mm_prefetch((const char*)(&pbvD[d7arr[q7] >> 13]), _MM_HINT_T0);
               }
               nqh = 0;
               for (q7 = 1; q7 < 128; ++q7)
               {
                  if (q7 == 10 || q7 == 13) continue;
                  if ( (pbvM[m7arr[q7] >> 13] & ((size_t)1 << ((m7arr[q7] >> 7) & 63))) == 0 ) continue;
                  if ( (pbvD[d7arr[q7] >> 13] & ((size_t)1 << ((d7arr[q7] >> 7) & 63))) == 0 ) continue;
                  c7 = (unsigned int)( (c6 ^ q7) * H_PRIME );
                  _mm_prefetch((const char*)(&pbvC[c7 >> 13]), _MM_HINT_T0);
                  l7 = (unsigned int)( (s6.m128i_i32[0] ^ q7) * H_PRIME );
                  _mm_prefetch((const char*)(&pbvL[l7 >> 13]), _MM_HINT_T0);
                  qh[nqh++] = q7;
                  qh[nqh++] = c7;
                  qh[nqh++] = l7;
               }
               nqi = 0;
               cnt = 0;
               while (cnt < nqh)
               {
                  q7 = qh[cnt++];
                  c7 = qh[cnt++];
                  l7 = qh[cnt++];
                  if ( (pbvC[c7 >> 13] & ((size_t)1 << ((c7 >> 7) & 63))) == 0 ) continue;
                  if ( (pbvL[l7 >> 13] & ((size_t)1 << ((l7 >> 7) & 63))) == 0 ) continue;
                  x7 = (unsigned int)( (s6.m128i_i32[1] ^ q7) * H_PRIME );
                  _mm_prefetch((const char*)(&pbvX[x7 >> 13]), _MM_HINT_T0);
                  v7 = (unsigned int)( (s6.m128i_i32[2] ^ q7) * H_PRIME );
                  _mm_prefetch((const char*)(&pbvV[v7 >> 13]), _MM_HINT_T0);
                  qi[nqi++] = q7;
                  qi[nqi++] = c7;
                  qi[nqi++] = l7;
                  qi[nqi++] = x7;
                  qi[nqi++] = v7;
               }
               nqf = 0;
               cnt = 0;
               while (cnt < nqi)
               {
                  q7 = qi[cnt++];
                  c7 = qi[cnt++];
                  l7 = qi[cnt++];
                  x7 = qi[cnt++];
                  v7 = qi[cnt++];
                  if ( (pbvX[x7 >> 13] & ((size_t)1 << ((x7 >> 7) & 63))) == 0 ) continue;
                  if ( (pbvV[v7 >> 13] & ((size_t)1 << ((v7 >> 7) & 63))) == 0 ) continue;
                  i7 = (unsigned int)( (s6.m128i_i32[3] ^ q7) * H_PRIME );
                  if ( (pbvI[i7 >> 13] & ((size_t)1 << ((i7 >> 7) & 63))) == 0 ) continue;
                  _mm_prefetch(&bytevecD[d7arr[q7] & 0xffffff80], _MM_HINT_T0);
                  _mm_prefetch(&bytevecD[64+(d7arr[q7] & 0xffffff80)], _MM_HINT_T0);
                  qf[nqf++] = q7;
                  qf[nqf++] = c7;
                  qf[nqf++] = l7;
                  qf[nqf++] = x7;
               }
               cnt = 0;
               while (cnt < nqf)
               {
                  q7 = qf[cnt];
                  cnt += 4;
                  _mm_prefetch(&bytevecM[m7arr[q7] & 0xffffff80], _MM_HINT_T0);
                  _mm_prefetch(&bytevecM[64+(m7arr[q7] & 0xffffff80)], _MM_HINT_T0);
               }
               qz = 0;
               while (qz < nqf)
               {
                  q7 = qf[qz++];
                  c7 = qf[qz++];
                  l7 = qf[qz++];
                  x7 = qf[qz++];
                  m7 = m7arr[q7];
                  d7 = d7arr[q7];
                  nqd = 0;
                  UNROLL(1)
                  UNROLL(96)
                  UNROLL(2)
                  UNROLL(3)
                  UNROLL(4)
                  UNROLL(5)
                  UNROLL(6)
                  UNROLL(7)
                  UNROLL(8)
                  UNROLL(9)
                  UNROLL(11)
                  UNROLL(12)
                  UNROLL(14)
                  // ... snipped UNROLL(15) .. UNROLL(125)
                  UNROLL(126)
                  UNROLL(127)
                  if (nqd == 0) continue;
                  cnt = 0;
                  while (cnt < nqd)
                  {
                     qqm = qd[cnt++];
                     q8  = qd[cnt++];
                     if (qqm == bytevecC[c7 ^ q8]
                     &&  qqm == bytevecL[l7 ^ q8]
                     &&  qqm == bytevecX[x7 ^ q8])
                     {
                        ps[iret++] = _mm_set_epi16(0, qqm, q8, q7, q6, q5, q4, q3);
                     }
                  }
               }
            }
         }
      }
   }
   return iret;
}

Solution

If you truly need to access memory in a random fashion, then there is really only window dressing you can do to save time on main-memory stalls. The hardware simply cannot jump to an arbitrary memory address and have the data available within a single cycle. Of course there are always 'tricks', often dependent on the exact chipset, but none springs to mind for this question.

For very high performance systems, you often need to look carefully at the algorithm for tricks that avoid doing the work at all. Taking this question in the context of your other question, Using C/Intel assembly, what is the fastest way to test if a 128-byte memory block contains all zeros? (http://stackoverflow.com/questions/15172102/using-c-intel-assembly-what-is-the-fastest-way-to-test-if-a-128-byte-memory-blo#comment21372091_15172102), and using the fact that, most of the time, the memory is zero, you could simply set 1 bit per 128 bytes (or even per 512 bytes) and use that as a short circuit when testing a complete block. This can be seen as a Bloom filter (http://en.wikipedia.org/wiki/Bloom_filter), although I prefer to think of it as an array with 1 bit per entry.

For a 4 GB lookup table, you will need 4 MB of presence bits at 1 bit per 128 bytes (2^32 / 128 = 32 Mibit), or 1 MB at 1 bit per 4 × 128 bytes, and so on. The aim here is to get the smaller presence indicator to be much more likely to sit in cache. Of course, the lookup-table writer is going to incur an additional memory write.

An alternative technique, which might not be suitable at all depending on how data is written to the lookup arrays, would be to store the addresses of the values in a sorted array rather than in the array itself. This would let you use about 320 MB for the address values (if my maths is correct) and use binary search, which can give good cache effectiveness for the initial probes.
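
A sketch of that alternative (the layout is invented for illustration: the sorted occupied table indices in addrs, with the matching byte values in a parallel vals array):

#include <stddef.h>

int lookup_sorted(const unsigned int* addrs, const unsigned char* vals,
                  size_t n, unsigned int key)
{
   /* Binary search over the sorted occupied indices.  The first few probes
      always touch the same middle elements, which therefore stay cached. */
   size_t lo = 0, hi = n;                 /* half-open search range [lo, hi) */
   while (lo < hi) {
      size_t mid = lo + (hi - lo) / 2;
      if (addrs[mid] < key)      lo = mid + 1;
      else if (addrs[mid] > key) hi = mid;
      else return vals[mid];              /* present: the stored byte */
   }
   return 0;                              /* absent, like a zero bytevec entry */
}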

Or, rather than doing the final inner loop immediately, save the input variables it requires and proceed to the next iteration of the enclosing loop. When finished, sort this list of inner-loop hits by memory address and then do them all in their own loop. There are a lot of gotchas with this technique, and it only really helps if you are likely to reuse the same memory block, but the method is used in the real world.
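
A minimal sketch of that batching idea (the struct layout and names are invented for illustration): defer each probe, sort the deferred batch by target address so the walk through bytevecM is monotonic, then resolve the whole batch in one pass.

#include <stdlib.h>

typedef struct {
   unsigned int addr;                     /* index into bytevecM to probe */
   int q3, q4, q5, q6, q7, q8;            /* loop state to re-associate a hit */
} pending;

static int by_addr(const void* a, const void* b)
{
   unsigned int x = ((const pending*)a)->addr;
   unsigned int y = ((const pending*)b)->addr;
   return (x > y) - (x < y);
}

void resolve_batch(const unsigned char* bytevecM, pending* batch, size_t n)
{
   size_t i;
   /* Address order means consecutive probes tend to share cache lines and
      TLB pages instead of jumping all over the 4 GB table. */
   qsort(batch, n, sizeof *batch, by_addr);
   for (i = 0; i < n; ++i) {
      if (bytevecM[batch[i].addr] != 0) {
         /* ... continue with the bytevecD/C/L/X comparisons as before ... */
      }
   }
}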
