什么是我的编译器在做什么? (优化的memcpy) [英] What is my compiler doing? (optimizing memcpy)

查看:378
本文介绍了什么是我的编译器在做什么? (优化的memcpy)的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我在VC中使用以下设置编译位code ++的2010 / O2 / OB2 /爱/ OT

I'm compiling a bit of code using the following settings in VC++2010: /O2 /Ob2 /Oi /Ot

不过我有一些无法理解产生的汇编的一些地区,我已经把在code一些问题为注释。

However I'm having some trouble understanding some parts of the assembly generated, I have put some questions in the code as comments.

此外,什么prefetching距离,一般建议在现代的CPU?我可以在我自己的CPU测试OFC,但我希望的一些价值,将在更广泛的CPU很好地工作。也许人们可以使用动态prefetching距离?

Also, what prefetching distance is generally recommended on modern cpus? I can ofc test on my own cpu, but I was hoping for some value that will work well on a wider range of cpus. Maybe one could use dynamic prefetching distances?

< - 编辑:

<--

我很惊讶,另一件事是,编译器不会以某种形式交织MOVDQA和movntdq说明?由于这些指令都是从我的理解某种意义上异步的。

Another thing I'm surprised about is that the compiler does not interleave in some form the movdqa and movntdq instructions? Since these instructions are in some sense asynchronous from my understanding.

这code还假定32字节的高速缓存线时prefetching,但它似乎是高端CPU有64字节的超高速缓存行,所以prefetches大概可以去掉2。

This code also assumes 32 byte cache lines when prefetching, however it seems that high-end cpus have 64 byte cachelines, so 2 of the prefetches can probably be removed.

- >

void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{ 
0052AC20  push        ebp  
0052AC21  mov         ebp,esp  
 const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);

 for(size_t n = 0; n < size/16; n += 8) 
0052AC23  mov         edx,dword ptr [size]  
0052AC26  mov         ecx,dword ptr [dest]  
0052AC29  mov         eax,dword ptr [source]  
0052AC2C  shr         edx,4  
0052AC2F  test        edx,edx  
0052AC31  je          copy+9Eh (52ACBEh)  
 __m128i xmm0 = _mm_setzero_si128();
 __m128i xmm1 = _mm_setzero_si128();
 __m128i xmm2 = _mm_setzero_si128();
 __m128i xmm3 = _mm_setzero_si128();
 __m128i xmm4 = _mm_setzero_si128();
 __m128i xmm5 = _mm_setzero_si128();
 __m128i xmm6 = _mm_setzero_si128();
 __m128i xmm7 = _mm_setzero_si128();

 __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
0052AC37  push        esi  
0052AC38  push        edi  
0052AC39  lea         edi,[edx-1]  
0052AC3C  shr         edi,3  
0052AC3F  inc         edi  
 {
  _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);

  xmm0 = _mm_load_si128(source_128++);
  xmm1 = _mm_load_si128(source_128++);
  xmm2 = _mm_load_si128(source_128++);
  xmm3 = _mm_load_si128(source_128++);
  xmm4 = _mm_load_si128(source_128++);
  xmm5 = _mm_load_si128(source_128++);
  xmm6 = _mm_load_si128(source_128++);
  xmm7 = _mm_load_si128(source_128++);
0052AC40  movdqa      xmm6,xmmword ptr [eax+70h]  // 1. Why is this moved before the pretecthes?
0052AC45  prefetchnta [eax+80h]  
0052AC4C  prefetchnta [eax+0A0h]  
0052AC53  prefetchnta [eax+0C0h]  
0052AC5A  prefetchnta [eax+0E0h]  
0052AC61  movdqa      xmm0,xmmword ptr [eax+10h]  
0052AC66  movdqa      xmm1,xmmword ptr [eax+20h]  
0052AC6B  movdqa      xmm2,xmmword ptr [eax+30h]  
0052AC70  movdqa      xmm3,xmmword ptr [eax+40h]  
0052AC75  movdqa      xmm4,xmmword ptr [eax+50h]  
0052AC7A  movdqa      xmm5,xmmword ptr [eax+60h]  
0052AC7F  lea         esi,[eax+70h]  // 2. What is happening in these 2 lines?
0052AC82  mov         edx,eax        //
0052AC84  movdqa      xmm7,xmmword ptr [edx]  // 3. Why edx? and not simply eax?

  _mm_stream_si128(dest_128++, xmm0);
0052AC88  mov         esi,ecx  // 4. Is esi never used?
0052AC8A  movntdq     xmmword ptr [esi],xmm7  
  _mm_stream_si128(dest_128++, xmm1);
0052AC8E  movntdq     xmmword ptr [ecx+10h],xmm0  
  _mm_stream_si128(dest_128++, xmm2);
0052AC93  movntdq     xmmword ptr [ecx+20h],xmm1  
  _mm_stream_si128(dest_128++, xmm3);
0052AC98  movntdq     xmmword ptr [ecx+30h],xmm2  
  _mm_stream_si128(dest_128++, xmm4);
0052AC9D  movntdq     xmmword ptr [ecx+40h],xmm3  
  _mm_stream_si128(dest_128++, xmm5);
0052ACA2  movntdq     xmmword ptr [ecx+50h],xmm4  
  _mm_stream_si128(dest_128++, xmm6);
0052ACA7  movntdq     xmmword ptr [ecx+60h],xmm5  
  _mm_stream_si128(dest_128++, xmm7);
0052ACAC  lea         edx,[ecx+70h]  
0052ACAF  sub         eax,0FFFFFF80h  
0052ACB2  sub         ecx,0FFFFFF80h  
0052ACB5  dec         edi  
0052ACB6  movntdq     xmmword ptr [edx],xmm6  // 5. Why not simply ecx?
0052ACBA  jne         copy+20h (52AC40h)  
0052ACBC  pop         edi  
0052ACBD  pop         esi  
 }
}

原来的code:

original code:

void memcpy_aligned_x86(void* dest, const void* source, size_t size)
{ 
 assert(dest != nullptr);
 assert(source != nullptr);
 assert(source != dest);
 assert(size % 128 == 0);

 __m128i xmm0 = _mm_setzero_si128();
 __m128i xmm1 = _mm_setzero_si128();
 __m128i xmm2 = _mm_setzero_si128();
 __m128i xmm3 = _mm_setzero_si128();
 __m128i xmm4 = _mm_setzero_si128();
 __m128i xmm5 = _mm_setzero_si128();
 __m128i xmm6 = _mm_setzero_si128();
 __m128i xmm7 = _mm_setzero_si128();

 __m128i* dest_128 = reinterpret_cast<__m128i*>(dest);
 const __m128i* source_128 = reinterpret_cast<const __m128i*>(source);

 for(size_t n = 0; n < size/16; n += 8) 
 {
  _mm_prefetch(reinterpret_cast<const char*>(source_128+8), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+10), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+12), _MM_HINT_NTA);
  _mm_prefetch(reinterpret_cast<const char*>(source_128+14), _MM_HINT_NTA);

  xmm0 = _mm_load_si128(source_128++);
  xmm1 = _mm_load_si128(source_128++);
  xmm2 = _mm_load_si128(source_128++);
  xmm3 = _mm_load_si128(source_128++);
  xmm4 = _mm_load_si128(source_128++);
  xmm5 = _mm_load_si128(source_128++);
  xmm6 = _mm_load_si128(source_128++);
  xmm7 = _mm_load_si128(source_128++);

  _mm_stream_si128(dest_128++, xmm0);
  _mm_stream_si128(dest_128++, xmm1);
  _mm_stream_si128(dest_128++, xmm2);
  _mm_stream_si128(dest_128++, xmm3);
  _mm_stream_si128(dest_128++, xmm4);
  _mm_stream_si128(dest_128++, xmm5);
  _mm_stream_si128(dest_128++, xmm6);
  _mm_stream_si128(dest_128++, xmm7);
 }
}


推荐答案

EAX + 70H读向上移动,因为EAX + 70H是在从eax中不同的高速缓存行,并且编译器可能想硬件prefetcher得到忙于该行尽快

eax+70h read is moved up because eax+70h is in a different cache line from eax, and the compiler probably wants the hardware prefetcher to get busy getting that line as soon as possible.

它不交织或者是因为它要最大限度地避免负载至存储依赖(即使AMD的优化导向明确说交错)的性能,或只是因为它是不能确定存储不会覆盖载荷。它是否改变的行为,如果你添加__restrict关键字源和目标?

It does not interleave either because it wants to maximize performance by avoiding load-to-store dependencies (even though the AMD optimization guide explicitly says to interleave), or simply because it is not sure that stores won't overwrite loads. Does it change the behavior if you add __restrict keywords to source and dest?

它的其余部分的目的,逃避我。可能是一些不起眼的指令解码或硬件prefetcher的考虑,无论是AMD还是英特尔,但我找不到任何理由。我不知道code变得更快或更慢,当你删除这些指令?

The purpose of the rest of it eludes me too. Could be some obscure instruction decoding or hardware prefetcher considerations, either for AMD or Intel, but I can't find any justification for that. I wonder if the code gets faster or slower when you remove those instructions?

推荐prefetching距离取决于环的大小。需要是足够远,该数据具有时间由它的所需的时间从存储器到达。我认为,你通常需要给它至少100个时钟周期。

The recommended prefetching distance depends on the loop size. Needs to be far enough that the data has time to arrive from the memory by the time it's needed. I think that you usually need to give it at least 100 clock ticks.

这篇关于什么是我的编译器在做什么? (优化的memcpy)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆