什么是我的编译器在做什么？（优化的memcpy） [英] What is my compiler doing? (optimizing memcpy)

查看：378 发布时间：2016/7/18 20:42:38 c++ assembly sse compiler-optimization

本文介绍了什么是我的编译器在做什么？（优化的memcpy）的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我在VC中使用以下设置编译位code ++的2010 / O2 / OB2 /爱/ OT

I'm compiling a bit of code using the following settings in VC++2010: /O2 /Ob2 /Oi /Ot

不过我有一些无法理解产生的汇编的一些地区，我已经把在code一些问题为注释。

However I'm having some trouble understanding some parts of the assembly generated, I have put some questions in the code as comments.

此外，什么prefetching距离，一般建议在现代的CPU？我可以在我自己的CPU测试OFC，但我希望的一些价值，将在更广泛的CPU很好地工作。也许人们可以使用动态prefetching距离？

Also, what prefetching distance is generally recommended on modern cpus? I can ofc test on my own cpu, but I was hoping for some value that will work well on a wider range of cpus. Maybe one could use dynamic prefetching distances?

＆LT; - 编辑：

<--

我很惊讶，另一件事是，编译器不会以某种形式交织MOVDQA和movntdq说明？由于这些指令都是从我的理解某种意义上异步的。

Another thing I'm surprised about is that the compiler does not interleave in some form the movdqa and movntdq instructions? Since these instructions are in some sense asynchronous from my understanding.

这code还假定32字节的高速缓存线时prefetching，但它似乎是高端CPU有64字节的超高速缓存行，所以prefetches大概可以去掉2。

This code also assumes 32 byte cache lines when prefetching, however it seems that high-end cpus have 64 byte cachelines, so 2 of the prefetches can probably be removed.

- >

void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{ 
0052AC20  push        ebp  
0052AC21  mov         ebp,esp  
 const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);

 for(size_t n = 0; n < size/16; n += 8) 
0052AC23  mov         edx,dword ptr [size]  
0052AC26  mov         ecx,dword ptr [dest]  
0052AC29  mov         eax,dword ptr [source]  
0052AC2C  shr         edx,4  
0052AC2F  test        edx,edx  
0052AC31  je          copy+9Eh (52ACBEh)  
 __m128i xmm0 = _mm_setzero_si128();
 __m128i xmm1 = _mm_setzero_si128();
 __m128i xmm2 = _mm_setzero_si128();
 __m128i xmm3 = _mm_setzero_si128();
 __m128i xmm4 = _mm_setzero_si128();
 __m128i xmm5 = _mm_setzero_si128();
 __m128i xmm6 = _mm_setzero_si128();
 __m128i xmm7 = _mm_setzero_si128();

 __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
0052AC37  push        esi  
0052AC38  push        edi  
0052AC39  lea         edi,[edx-1]  
0052AC3C  shr         edi,3  
0052AC3F  inc         edi  
 {
  _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);

  xmm0 = _mm_load_si128(source_128++);
  xmm1 = _mm_load_si128(source_128++);
  xmm2 = _mm_load_si128(source_128++);
  xmm3 = _mm_load_si128(source_128++);
  xmm4 = _mm_load_si128(source_128++);
  xmm5 = _mm_load_si128(source_128++);
  xmm6 = _mm_load_si128(source_128++);
  xmm7 = _mm_load_si128(source_128++);
0052AC40  movdqa      xmm6,xmmword ptr [eax+70h]  // 1. Why is this moved before the pretecthes?
0052AC45  prefetchnta [eax+80h]  
0052AC4C  prefetchnta [eax+0A0h]  
0052AC53  prefetchnta [eax+0C0h]  
0052AC5A  prefetchnta [eax+0E0h]  
0052AC61  movdqa      xmm0,xmmword ptr [eax+10h]  
0052AC66  movdqa      xmm1,xmmword ptr [eax+20h]  
0052AC6B  movdqa      xmm2,xmmword ptr [eax+30h]  
0052AC70  movdqa      xmm3,xmmword ptr [eax+40h]  
0052AC75  movdqa      xmm4,xmmword ptr [eax+50h]  
0052AC7A  movdqa      xmm5,xmmword ptr [eax+60h]  
0052AC7F  lea         esi,[eax+70h]  // 2. What is happening in these 2 lines?
0052AC82  mov         edx,eax        //
0052AC84  movdqa      xmm7,xmmword ptr [edx]  // 3. Why edx? and not simply eax?

  _mm_stream_si128(dest_128++, xmm0);
0052AC88  mov         esi,ecx  // 4. Is esi never used?
0052AC8A  movntdq     xmmword ptr [esi],xmm7  
  _mm_stream_si128(dest_128++, xmm1);
0052AC8E  movntdq     xmmword ptr [ecx+10h],xmm0  
  _mm_stream_si128(dest_128++, xmm2);
0052AC93  movntdq     xmmword ptr [ecx+20h],xmm1  
  _mm_stream_si128(dest_128++, xmm3);
0052AC98  movntdq     xmmword ptr [ecx+30h],xmm2  
  _mm_stream_si128(dest_128++, xmm4);
0052AC9D  movntdq     xmmword ptr [ecx+40h],xmm3  
  _mm_stream_si128(dest_128++, xmm5);
0052ACA2  movntdq     xmmword ptr [ecx+50h],xmm4  
  _mm_stream_si128(dest_128++, xmm6);
0052ACA7  movntdq     xmmword ptr [ecx+60h],xmm5  
  _mm_stream_si128(dest_128++, xmm7);
0052ACAC  lea         edx,[ecx+70h]  
0052ACAF  sub         eax,0FFFFFF80h  
0052ACB2  sub         ecx,0FFFFFF80h  
0052ACB5  dec         edi  
0052ACB6  movntdq     xmmword ptr [edx],xmm6  // 5. Why not simply ecx?
0052ACBA  jne         copy+20h (52AC40h)  
0052ACBC  pop         edi  
0052ACBD  pop         esi  
 }
}

原来的code：

original code:

void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{ 
 assert(dest != nullptr);
 assert(source != nullptr);
 assert(source != dest);
 assert(size % 128 == 0);

 __m128i xmm0 = _mm_setzero_si128();
 __m128i xmm1 = _mm_setzero_si128();
 __m128i xmm2 = _mm_setzero_si128();
 __m128i xmm3 = _mm_setzero_si128();
 __m128i xmm4 = _mm_setzero_si128();
 __m128i xmm5 = _mm_setzero_si128();
 __m128i xmm6 = _mm_setzero_si128();
 __m128i xmm7 = _mm_setzero_si128();

 __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
 const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);

 for(size_t n = 0; n < size/16; n += 8) 
 {
  _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);

  xmm0 = _mm_load_si128(source_128++);
  xmm1 = _mm_load_si128(source_128++);
  xmm2 = _mm_load_si128(source_128++);
  xmm3 = _mm_load_si128(source_128++);
  xmm4 = _mm_load_si128(source_128++);
  xmm5 = _mm_load_si128(source_128++);
  xmm6 = _mm_load_si128(source_128++);
  xmm7 = _mm_load_si128(source_128++);

  _mm_stream_si128(dest_128++, xmm0);
  _mm_stream_si128(dest_128++, xmm1);
  _mm_stream_si128(dest_128++, xmm2);
  _mm_stream_si128(dest_128++, xmm3);
  _mm_stream_si128(dest_128++, xmm4);
  _mm_stream_si128(dest_128++, xmm5);
  _mm_stream_si128(dest_128++, xmm6);
  _mm_stream_si128(dest_128++, xmm7);
 }
}

什么是我的编译器在做什么？（优化的memcpy） [英] What is my compiler doing? (optimizing memcpy)

问题描述

推荐答案

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

什么是我的编译器在做什么？ （优化的memcpy） [英] What is my compiler doing? (optimizing memcpy)

问题描述

推荐答案

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

什么是我的编译器在做什么？（优化的memcpy） [英] What is my compiler doing? (optimizing memcpy)

登录关闭