Visual Studio 2013 Update 2和Update 3生成的SSE 4指令 [英] SSE 4 instructions generated by Visual Studio 2013 Update 2 and Update 3

查看:106
本文介绍了Visual Studio 2013 Update 2和Update 3生成的SSE 4指令的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

如果我在VS 2013 Update 2或Update 3中编译此代码,(以下来自Update 3)

#include "stdafx.h"
#include <iostream>
#include <random>

struct Buffer
{
  long* data;
  int   count;
};

#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

long Code(long* data, int count)
{
  long nMaxY = data[0];

  for (int nNode = 0; nNode < count; nNode++)
  {
    nMaxY = max(data[nNode], nMaxY);
  }

  return(nMaxY);
}

int _tmain(int argc, _TCHAR* argv[])
{
#ifdef __AVX__
  static_assert(false, "AVX should be disabled");
#endif
#ifdef __AVX2__
  static_assert(false, "AVX2 should be disabled");
#endif
  static_assert(_M_IX86_FP == 2, "SSE2 instructions should be enabled");
  Buffer buff;
  std::mt19937 engine;
  engine.seed(std::random_device{}());
  std::uniform_int_distribution<int> distribution(0, 100);

  buff.count = 1;
  buff.data = new long[1];
  buff.data[0] = distribution(engine);

  long result = Code(buff.data, buff.count);
  std::cout << result; // 确保使用结果
  return result;
}

已启用SSE2指令,但未启用AVX / AVX2,发行版中的编译器会生成:

  {
nMaxY = max(data [nNode],nMaxY);
010612E1 movdqu xmm0,xmmword ptr [eax]
010612E5 add esi,8
010612E8 lea eax,[eax + 20h]
010612EB pmaxsd xmm1,xmm0
010612F0 movdqu xmm0,xmmword ptr [eax-10h]
010612F5 pmaxsd xmm2,xmm0
010612FA cmp esi,ebx
010612FC jl Code + 41h(010612E1h)
010612FE pmaxsd xmm1,xmm2
01061303 movdqa xmm0,xmm1
01061307 psrldq xmm0,8
0106130C pmaxsd xmm1,xmm0
01061311 movdqa xmm0,xmm1
01061315 psrldq xmm0,4
0106131A pmaxsd xmm1,xmm0
0106131F movd eax,xmm1
01061323 pop ebx
long nMaxY = data [0];

其中包含 pmaxsd



pmaxsd 指令是 SSE4_1指令或AVX指令,而不是SSE2指令。



Intel Core2 处理器支持 SSE3,但不支持 SSE4,也不支持 pmaxsd 。



在VS2013更新1或更新0中不会发生。



是否有一种方法可以使Visual Studio生成SSE2指令,但不能生成诸如 pmaxsd 之类的SSE4指令?这是Visual Studio更新2/3中的已知错误吗?有解决方法吗? Visual Studio不再支持Core2处理器吗?






以下是上述代码的一个更复杂的版本,它(在默认的 Release 设置下)编译出的代码会使 Core2 CPU 崩溃:

#include "stdafx.h"
#include <iostream>
#include <random>
#include <array>

enum unused_name {
  _nNumPolygons = 10,
};


#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

struct Buffer
{
  std::array<long*, _nNumPolygons> data;
  std::array<int, _nNumPolygons>   count;
};

long Code(Buffer* buff)
{
  long  nMaxY = buff->data[0][0];


  for (int nPoly = 0; nPoly < _nNumPolygons; nPoly++)
  {
    for (int nNode = 0; nNode < buff->count[nPoly]; nNode++)
    {
      nMaxY = max(buff->data[nPoly][nNode], nMaxY);
    }
  }

  return(nMaxY);
}

extern "C" __int32 __isa_available;

int _tmain(int argc, _TCHAR* argv[])
{
#ifdef __AVX__
  static_assert(false, "AVX should be disabled");
#endif
#ifdef __AVX2__
  static_assert(false, "AVX2 should be disabled");
#endif
#if !( defined( _M_AMD64 ) || defined( _M_X64 ) )
  static_assert(_M_IX86_FP == 2, "SSE2 instructions should be enabled");
#endif
  // __isa_available = 1; // 强制代码按 SSE4_2 不可用的方式运行
  Buffer buff;
  std::mt19937 engine;
  engine.seed(std::random_device{}());
  std::uniform_int_distribution<int> distribution(0, 100);

  for (int i = 0; i < _nNumPolygons; ++i) {
    buff.count[i] = 10;
    buff.data[i] = new long[10];
    for (int k = 0; k < 10; ++k)
    {
      buff.data[i][k] = distribution(engine);
    }
  }

  long result = Code(&buff);
  std::cout << result; // 确保使用结果
  return result;
}

此处是此问题的错误链接,在我发布此问题的同时有人打开了。



这是生成的.asm:

 ?Code2 @@ YAJPAUBuffer @@@ Z PROC;代码2,COMDAT 
; _buff $ = ecx
;文件c:\users\adam.nevraumont.corelcorp.000\文档\visual studio 2013\项目\consoleapplication1\consoleapplication1\consoleapplication1.cpp
;第22行
push ebp
mov ebp,esp
sub esp,12; 0000000cH
push ebx
push esi
push edi
mov edi,ecx
;第26行
xor ebx,ebx
mov DWORD PTR _buff $ 1 $ [ebp],edi
mov DWORD PTR _nPoly $ 1 $ [ebp],ebx
mov eax,DWORD PTR [ edi]
mov edx,DWORD PTR [eax]
;第28行
movd xmm0,edx
pshufd xmm1,xmm0,0
movdqa xmm2,xmm1
npad 12
$ LL6 @ Code2:
lea ecx, DWORD PTR [ebx * 4]
xor eax,eax
mov esi,DWORD PTR [ecx + edi + 40]
mov DWORD PTR tv443 [ebp],ecx
test esi ,esi
jle SHORT $ LN5 @ Code2
cmp esi,8
jb SHORT $ LN25 @ Code2
cmp DWORD PTR ___isa_available,2
jl SHORT $ LN25 @ Code2
;第26行
mov ebx,DWORD PTR [ecx + edi]
mov ecx,esi
and ecx,-2147483641; 80000007H
jns SHORT $ LN33 @ Code2
dec ecx
or ecx,-8; fffffff8H
inc ecx
$ LN33 @ C​​ode2:
mov edi,esi
sub edi,ecx
npad 8
$ LL3 @ C​​ode2:
;第30行
movdqu xmm0,XMMWORD PTR [ebx + eax * 4]
pmaxsd xmm1,xmm0
movdqu xmm0,XMMWORD PTR [ebx + eax * 4 + 16]
add eax,8
pmaxsd xmm2,xmm0
cmp eax,edi
jl SHORT $ LL3 @ C​​ode2
mov ebx,DWORD PTR _nPoly $ 1 $ [ebp]
mov ecx, DWORD PTR tv443 [ebp]
mov edi,DWORD PTR _buff $ 1 $ [ebp]
$ LN25 @ Code2:
;第28行
cmp eax,esi
jge SHORT $ LN5 @ Code2
;第26行
mov edi,DWORD PTR [ecx + edi]
npad 4
$ LL23 @ C​​ode2:
;第30行
cmp DWORD PTR [edi + eax * 4],edx
cmovg edx,DWORD PTR [edi + eax * 4]
inc eax
cmp eax,esi
jl SHORT $ LL23 @ Code2
$ LN5 @ Code2:
;第26行
mov edi,DWORD PTR _buff $ 1 $ [ebp]
inc ebx
mov DWORD PTR _nPoly $ 1 $ [ebp],ebx
cmp ebx,10; 0000000aH
jl $ LL6 @ Code2
;第28行
movd xmm0,edx
pshufd xmm0,xmm0,0
pmaxsd xmm1,xmm0
pmaxsd xmm1,xmm2
movdqa xmm0,xmm1
psrldq xmm0 ,8
pmaxsd xmm1,xmm0
movdqa xmm0,xmm1
pop edi
psrldq xmm0,4
pmaxsd xmm1,xmm0
pop esi
movd eax,xmm1
pop ebx
;第35行
mov esp,ebp
pop ebp
ret 0

这里:

  cmp esi,8 
jb SHORT $ LN25 @ Code2
cmp DWORD PTR ___isa_available, 2
jl SHORT $ LN25 @ Code2

这里的测试会在以下任一情况下跳转到“单步”版本:(A)循环长度小于 8,或(B)不支持 SSE3 / SSE4。



单步版本为:

  $ LN5 @ Code2:
;第26行
mov edi,DWORD PTR _buff $ 1 $ [ebp]
inc ebx
mov DWORD PTR _nPoly $ 1 $ [ebp],ebx
cmp ebx,10; 0000000aH
jl $ LL6 @ Code2

其中没有SSE指令。但是,重要的是之后的直落(fall through)部分。如果 ebx (外层循环计数器)达到 10 ,执行将落入:

 ;第28行
movd xmm0,edx
pshufd xmm0,xmm0,0
pmaxsd xmm1,xmm0

是查找单步版本结果和SSE4结果的最大值的代码。第三条指令是 pmaxsd ,这是一条SSE4_1指令,它不受 __ isa_available 的保护。



是否有编译器设置或变通办法可以保留自动矢量化,同时不在启用Core2 SSE2的计算机上调用SSE4_1指令?



请注意,我删除循环的双重嵌套性质的尝试似乎使问题消失了。



解决方案

这是记录的行为


自动矢量化程序还会使用更新的 SSE4.2 指令集(如果您的计算机支持)。


如果仔细查看编译器生成的代码,您会发现使用SSE4.2指令取决于运行时测试:

  cmp DWORD PTR ___isa_available,2 
jl SHORT $ LN11 @Code

此处的值2 显然是指SSE4.2



不过,我确实在您的第二个示例中确认了该错误。事实证明,我使用的 Core 2 PC 支持 SSE4.1 和 PMAXSD 指令,因此我必须在一台配有 Pentium 4 CPU 的 PC 上进行测试,才得到非法指令异常。您应该将错误报告提交给 Microsoft Connect 。请务必提及您的示例代码无法在其上运行的具体 Core 2 CPU 型号。



至于解决方法,我只能建议更改受影响函数的优化级别。从“优化速度”切换到“优化大小”似乎会生成与仅使用 SSE2 指令时大致相同的代码。您可以像下面这样使用 #pragma optimize 切换优化级别:

#pragma optimize("s", on)

long Code(Buffer* buff)
{
     ...
}

#pragma optimize("", on)

记录在此错误报告中的 / d2Qvec-sse2only 是未记录的标志,可在更新3(可能还有更新2)上使用,以防止编译器输出SSE4指令。这自然可以防止某些循环被矢量化。 / d2Qvec-sse2only 可能会在任何时候停止工作(随时更改,恕不另行通知),可能会在将来的VC版本上使用。



Microsoft声称此问题已在Update 4和Update 4 CTP 2中得到解决(不适用于生产用途)。


If I compile this code in VS 2013 Update 2 or Update 3: (below comes from Update 3)

#include "stdafx.h"
#include <iostream>
#include <random>

// Plain aggregate pairing a pointer to long values with an element count.
// _tmain() fills both fields and passes them to Code().
struct Buffer
{
  // Pointer to the first value; allocated with new[] by _tmain().
  long* data;
  // Number of elements that `data` points to.
  int   count;
};

// Function-like max macro, defined only when no `max` macro already exists.
// Each argument appears twice in the expansion, so an argument expression may
// be evaluated twice.
#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

// Returns the largest of the first `count` elements of `data`.
//
// data  - array of long values. data[0] is read unconditionally to seed the
//         running maximum, so it must be readable even when count <= 0 (in
//         which case data[0] is returned).
// count - number of elements to scan.
//
// This loop is the statement that appears in the disassembly quoted in the
// question.
long Code(long* data, int count)
{
  // Seed the running maximum with the first element.
  long nMaxY = data[0];

  // The scan starts at index 0, so data[0] is compared against itself once.
  for (int nNode = 0; nNode < count; nNode++)
  {
    nMaxY = max(data[nNode], nMaxY);
  }

  return(nMaxY);
}

// Test driver: checks the build configuration at compile time, fills a
// one-element Buffer with a random value drawn from
// uniform_int_distribution(0, 100), runs Code() on it, prints the result and
// returns it as the exit code.
int _tmain(int argc, _TCHAR* argv[])
{
  // Fail the build if the compiler defines __AVX__ or __AVX2__.
#ifdef __AVX__
  static_assert(false, "AVX should be disabled");
#endif
#ifdef __AVX2__
  static_assert(false, "AVX2 should be disabled");
#endif
  // Require _M_IX86_FP == 2; the message identifies this as SSE2 being enabled.
  static_assert(_M_IX86_FP == 2, "SSE2 instructions should be enabled");
  Buffer buff;
  // Mersenne Twister engine seeded from std::random_device.
  std::mt19937 engine;
  engine.seed(std::random_device{}());
  std::uniform_int_distribution<int> distribution(0, 100);

  // Single-element input. The array is never deleted in this function.
  buff.count = 1;
  buff.data = new long[1];
  buff.data[0] = distribution(engine);

  long result = Code(buff.data, buff.count);
  std::cout << result; // ensure result is used
  // The long result is converted to the int return type.
  return result;
}

with SSE2 instructions enabled, but not AVX/AVX2, the compiler in release generates:

  {
    nMaxY = max(data[nNode], nMaxY);
010612E1  movdqu      xmm0,xmmword ptr [eax]  
010612E5  add         esi,8  
010612E8  lea         eax,[eax+20h]  
010612EB  pmaxsd      xmm1,xmm0  
010612F0  movdqu      xmm0,xmmword ptr [eax-10h]  
010612F5  pmaxsd      xmm2,xmm0  
010612FA  cmp         esi,ebx  
010612FC  jl          Code+41h (010612E1h)  
010612FE  pmaxsd      xmm1,xmm2  
01061303  movdqa      xmm0,xmm1  
01061307  psrldq      xmm0,8  
0106130C  pmaxsd      xmm1,xmm0  
01061311  movdqa      xmm0,xmm1  
01061315  psrldq      xmm0,4  
0106131A  pmaxsd      xmm1,xmm0  
0106131F  movd        eax,xmm1  
01061323  pop         ebx  
  long nMaxY = data[0];

which contains, among other things, pmaxsd instructions.

pmaxsd instructions are SSE4_1 instructions or AVX instructions as far as I can tell, not SSE2 instructions.

Intel core2s support sse3, but not sse4, and not pmaxsd.

This does not occur in VS2013 update 1 or update 0.

Is there a way to get Visual Studio to generate SSE2 instructions but not SSE4 instructions like pmaxsd? Is this a known bug in Visual Studio update 2/3? Is there a workaround? Does Visual Studio no longer support Core2 processors?


Here is a more complex version of the above code that compiles (under default release settings) to code that crashes a Core2 CPU:

#include "stdafx.h"
#include <iostream>
#include <random>
#include <array>

// Compile-time constant: used below as the size of both std::array members of
// Buffer and as the bound of the loops over polygons.
enum unused_name {
  _nNumPolygons = 10,
};


// Function-like max macro, defined only when no `max` macro already exists.
// Each argument appears twice in the expansion, so an argument expression may
// be evaluated twice.
#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

// Holds _nNumPolygons arrays of long values together with their lengths:
// count[i] is the number of elements scanned from data[i] by Code().
struct Buffer
{
  // One pointer per polygon; each is allocated with new[] by _tmain().
  std::array<long*, _nNumPolygons> data;
  // Element count for the matching entry of `data`.
  std::array<int, _nNumPolygons>   count;
};

// Returns the largest value found across all _nNumPolygons arrays in `buff`,
// scanning buff->count[p] elements of buff->data[p] for each polygon p.
//
// buff - buffer to scan. buff->data[0][0] is read unconditionally to seed the
//        running maximum, so it must be readable even if count[0] <= 0.
//
// The inner-loop statement is the one that appears in the generated .asm
// listing quoted in the question.
long Code(Buffer* buff)
{
  // Seed the running maximum with the very first value.
  long  nMaxY = buff->data[0][0];


  // Outer loop: one pass per polygon.
  for (int nPoly = 0; nPoly < _nNumPolygons; nPoly++)
  {
    // Inner loop: the bound is re-read from buff->count on each iteration.
    for (int nNode = 0; nNode < buff->count[nPoly]; nNode++)
    {
      nMaxY = max(buff->data[nPoly][nNode], nMaxY);
    }
  }

  return(nMaxY);
}

// Declares the variable that the generated code compares against 2 before
// taking the vectorized path, so that _tmain() can optionally overwrite it.
// NOTE(review): presumably defined by the MSVC runtime -- confirm.
extern "C" __int32 __isa_available;

// Test driver: checks the build configuration at compile time, fills every
// polygon of a Buffer with 10 random values drawn from
// uniform_int_distribution(0, 100), runs Code() on it, prints the result and
// returns it as the exit code.
int _tmain(int argc, _TCHAR* argv[])
{
  // Fail the build if the compiler defines __AVX__ or __AVX2__.
#ifdef __AVX__
  static_assert(false, "AVX should be disabled");
#endif
#ifdef __AVX2__
  static_assert(false, "AVX2 should be disabled");
#endif
  // The _M_IX86_FP check is skipped when _M_AMD64 or _M_X64 is defined.
#if !( defined( _M_AMD64 ) || defined( _M_X64 ) )
  static_assert(_M_IX86_FP == 2, "SSE2 instructions should be enabled");
#endif
  // __isa_available = 1; // to force code to act as if SSE4_2 is not available
  Buffer buff;
  // Mersenne Twister engine seeded from std::random_device.
  std::mt19937 engine;
  engine.seed(std::random_device{}());
  std::uniform_int_distribution<int> distribution(0, 100);

  // Give every polygon 10 values. The arrays are never deleted in this
  // function.
  for (int i = 0; i < _nNumPolygons; ++i) {
    buff.count[i] = 10;
    buff.data[i] = new long[10];
    for (int k = 0; k < 10; ++k)
    {
      buff.data[i][k] = distribution(engine);
    }
  }

  long result = Code(&buff);
  std::cout << result; // ensure result is used
  // The long result is converted to the int return type.
  return result;
}

Here is a link to a bug for this issue that someone else opened around the same time I posted this question.

Here is the generated .asm:

?Code2@@YAJPAUBuffer@@@Z PROC        ; Code2, COMDAT
; _buff$ = ecx
; File c:\users\adam.nevraumont.corelcorp.000\documents\visual studio 2013\projects\consoleapplication1\consoleapplication1\consoleapplication1.cpp
; Line 22
  push  ebp
  mov  ebp, esp
  sub  esp, 12          ; 0000000cH
  push  ebx
  push  esi
  push  edi
  mov  edi, ecx
; Line 26
  xor  ebx, ebx
  mov  DWORD PTR _buff$1$[ebp], edi
  mov  DWORD PTR _nPoly$1$[ebp], ebx
  mov  eax, DWORD PTR [edi]
  mov  edx, DWORD PTR [eax]
; Line 28
  movd  xmm0, edx
  pshufd  xmm1, xmm0, 0
  movdqa  xmm2, xmm1
  npad  12
$LL6@Code2:
  lea  ecx, DWORD PTR [ebx*4]
  xor  eax, eax
  mov  esi, DWORD PTR [ecx+edi+40]
  mov  DWORD PTR tv443[ebp], ecx
  test  esi, esi
  jle  SHORT $LN5@Code2
  cmp  esi, 8
  jb  SHORT $LN25@Code2
  cmp  DWORD PTR ___isa_available, 2
  jl  SHORT $LN25@Code2
; Line 26
  mov  ebx, DWORD PTR [ecx+edi]
  mov  ecx, esi
  and  ecx, -2147483641      ; 80000007H
  jns  SHORT $LN33@Code2
  dec  ecx
  or  ecx, -8          ; fffffff8H
  inc  ecx
$LN33@Code2:
  mov  edi, esi
  sub  edi, ecx
  npad  8
$LL3@Code2:
; Line 30
  movdqu  xmm0, XMMWORD PTR [ebx+eax*4]
  pmaxsd  xmm1, xmm0
  movdqu  xmm0, XMMWORD PTR [ebx+eax*4+16]
  add  eax, 8
  pmaxsd  xmm2, xmm0
  cmp  eax, edi
  jl  SHORT $LL3@Code2
  mov  ebx, DWORD PTR _nPoly$1$[ebp]
  mov  ecx, DWORD PTR tv443[ebp]
  mov  edi, DWORD PTR _buff$1$[ebp]
$LN25@Code2:
; Line 28
  cmp  eax, esi
  jge  SHORT $LN5@Code2
; Line 26
  mov  edi, DWORD PTR [ecx+edi]
  npad  4
$LL23@Code2:
; Line 30
  cmp  DWORD PTR [edi+eax*4], edx
  cmovg  edx, DWORD PTR [edi+eax*4]
  inc  eax
  cmp  eax, esi
  jl  SHORT $LL23@Code2
$LN5@Code2:
; Line 26
  mov  edi, DWORD PTR _buff$1$[ebp]
  inc  ebx
  mov  DWORD PTR _nPoly$1$[ebp], ebx
  cmp  ebx, 10          ; 0000000aH
  jl  $LL6@Code2
; Line 28
  movd  xmm0, edx
  pshufd  xmm0, xmm0, 0
  pmaxsd  xmm1, xmm0
  pmaxsd  xmm1, xmm2
  movdqa  xmm0, xmm1
  psrldq  xmm0, 8
  pmaxsd  xmm1, xmm0
  movdqa  xmm0, xmm1
  pop  edi
  psrldq  xmm0, 4
  pmaxsd  xmm1, xmm0
  pop  esi
  movd  eax, xmm1
  pop  ebx
; Line 35
  mov  esp, ebp
  pop  ebp
  ret  0

Here:

  cmp  esi, 8
  jb  SHORT $LN25@Code2
  cmp  DWORD PTR ___isa_available, 2
  jl  SHORT $LN25@Code2

we have the test that branches to the "single step" version if either (A) the loop is less than 8 long, or (B) we don't have SSE3/SSE4 support.

The single step version is:

$LN5@Code2:
; Line 26
  mov  edi, DWORD PTR _buff$1$[ebp]
  inc  ebx
  mov  DWORD PTR _nPoly$1$[ebp], ebx
  cmp  ebx, 10          ; 0000000aH
  jl  $LL6@Code2

which has no SSE instructions. However, the important part is the fall through. If ebx (the outer-loop counter, nPoly) reaches 10, it falls through into:

; Line 28
  movd  xmm0, edx
  pshufd  xmm0, xmm0, 0
  pmaxsd  xmm1, xmm0

which is code that finds the max of both the single step version results and the SSE4 results. The 3rd instruction is pmaxsd, which is an SSE4_1 instruction, and it is not guarded by __isa_available.

Is there a compiler setting or workaround that can leave the auto-vectorization intact, while not invoking SSE4_1 instructions on Core2 SSE2 enabled computers? Is there a bug in my code that is causing this to happen?

Note that my attempts to remove the double-nested nature of the loop seem to make the problem go away.

解决方案

This is documented behaviour:

The Auto-Vectorizer also uses the newer, SSE4.2 instruction set if your computer supports it.

If you look closer at the code the compiler generates you'll see that the use of the SSE4.2 instructions is dependent on a runtime test:

cmp DWORD PTR ___isa_available, 2
jl  SHORT $LN11@Code

The value 2 here apparently means SSE4.2.

I was, however, able to confirm the bug in your second example. It turns out the Core 2 PC I was using supports SSE4.1 and the PMAXSD instruction, so I had to test it on a PC with a Pentium 4 CPU to get the illegal instruction exception. You should submit a bug report to Microsoft Connect. Be sure to mention the specific Core 2 CPU model your example code fails on.

As for a workaround I can only suggest changing the optimization level for the affected function. Switching from optimizing for speed to optimizing for size seems to generate much the same code as would be used with only SSE2 instructions. You can use #pragma optimize to switch the optimization level like this:

#pragma optimize("s", on)

long Code(Buffer* buff)
{
     ...
}

#pragma optimize("", on)

As documented on this bug report, /d2Qvec-sse2only is an undocumented flag that works on update 3 (and possibly update 2) to prevent the compiler from outputting SSE4 instructions. This can prevent some loops from being vectorized, naturally. /d2Qvec-sse2only may cease to work at any point (it is "subject to future change without notice"), possibly on future versions of VC.

Microsoft claims that this problem is fixed in Update 4, and in the Update 4 CTP 2 (not for production use).

这篇关于Visual Studio 2013 Update 2和Update 3生成的SSE 4指令的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆