SSE vector wrapper type performance compared to bare __m128
Question
I found an interesting Gamasutra article about SIMD pitfalls, which states that it is not possible to reach the performance of the "pure" __m128 type with wrapper types. Well, I was skeptical, so I downloaded the project files and fabricated a comparable test case.

It turned out (to my surprise) that the wrapper version is significantly slower. Since I don't want to just talk into thin air, the test cases are the following:
In the 1st case Vec4 is a simple alias of the __m128 type with some operators:

#include <xmmintrin.h>
#include <emmintrin.h>

using Vec4 = __m128;

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4& operator+=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
};

inline Vec4& operator*=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
};

inline Vec4 operator+(Vec4 va, Vec4 vb)
{
    return _mm_add_ps(va, vb);
};

inline Vec4 operator-(Vec4 va, Vec4 vb)
{
    return _mm_sub_ps(va, vb);
};

inline Vec4 operator*(Vec4 va, Vec4 vb)
{
    return _mm_mul_ps(va, vb);
};
In the 2nd case Vec4 is a lightweight wrapper around __m128. It is not a complete wrapper, just a short sketch which covers the issue. The operators wrap exactly the same intrinsics, the only difference is (since 16-byte alignment cannot be applied on arguments) that they take Vec4 as const reference:

#include <xmmintrin.h>
#include <emmintrin.h>

struct Vec4
{
    __m128 simd;

    inline Vec4() = default;
    inline Vec4(const Vec4&) = default;
    inline Vec4& operator=(const Vec4&) = default;

    inline Vec4(__m128 s)
        : simd(s)
    {}

    inline operator __m128() const
    {
        return simd;
    }

    inline operator __m128&()
    {
        return simd;
    }
};

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline Vec4 VAdd(const Vec4 &va, const Vec4 &vb)
{
    return _mm_add_ps(va, vb);
    // return _mm_add_ps(va.simd, vb.simd); // doesn't make difference
};

inline Vec4 VSub(const Vec4 &va, const Vec4 &vb)
{
    return _mm_sub_ps(va, vb);
    // return _mm_sub_ps(va.simd, vb.simd); // doesn't make difference
};

inline Vec4 VMul(const Vec4 &va, const Vec4 &vb)
{
    return _mm_mul_ps(va, vb);
    // return _mm_mul_ps(va.simd, vb.simd); // doesn't make difference
};
And here is the test kernel which produces different performance with different versions of Vec4:

#include <xmmintrin.h>
#include <emmintrin.h>

struct EQSTATE
{
    // Filter #1 (Low band)
    Vec4  lf;    // Frequency
    Vec4  f1p0;  // Poles ...
    Vec4  f1p1;
    Vec4  f1p2;
    Vec4  f1p3;

    // Filter #2 (High band)
    Vec4  hf;    // Frequency
    Vec4  f2p0;  // Poles ...
    Vec4  f2p1;
    Vec4  f2p2;
    Vec4  f2p3;

    // Sample history buffer
    Vec4  sdm1;  // Sample data minus 1
    Vec4  sdm2;  //                   2
    Vec4  sdm3;  //                   3

    // Gain Controls
    Vec4  lg;    // low gain
    Vec4  mg;    // mid gain
    Vec4  hg;    // high gain
};

static float vsaf = (1.0f / 4294967295.0f);  // Very small amount (Denormal Fix)
static Vec4 vsa = VLoad(vsaf);

Vec4 TestEQ(EQSTATE* es, Vec4& sample)
{
    // Locals
    Vec4  l, m, h;  // Low / Mid / High - Sample Values

    // Filter #1 (lowpass)
    es->f1p0 += (es->lf * (sample - es->f1p0)) + vsa;
    //es->f1p0 = VAdd(es->f1p0, VAdd(VMul(es->lf, VSub(sample, es->f1p0)), vsa));

    es->f1p1 += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = VAdd(es->f1p1, VMul(es->lf, VSub(es->f1p0, es->f1p1)));

    es->f1p2 += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = VAdd(es->f1p2, VMul(es->lf, VSub(es->f1p1, es->f1p2)));

    es->f1p3 += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = VAdd(es->f1p3, VMul(es->lf, VSub(es->f1p2, es->f1p3)));

    l = es->f1p3;

    // Filter #2 (highpass)
    es->f2p0 += (es->hf * (sample - es->f2p0)) + vsa;
    //es->f2p0 = VAdd(es->f2p0, VAdd(VMul(es->hf, VSub(sample, es->f2p0)), vsa));

    es->f2p1 += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = VAdd(es->f2p1, VMul(es->hf, VSub(es->f2p0, es->f2p1)));

    es->f2p2 += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = VAdd(es->f2p2, VMul(es->hf, VSub(es->f2p1, es->f2p2)));

    es->f2p3 += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = VAdd(es->f2p3, VMul(es->hf, VSub(es->f2p2, es->f2p3)));

    h = es->sdm3 - es->f2p3;
    //h = VSub(es->sdm3, es->f2p3);

    // Calculate midrange (signal - (low + high))
    m = es->sdm3 - (h + l);
    //m = VSub(es->sdm3, VAdd(h, l));

    // Scale, Combine and store
    l *= es->lg;
    m *= es->mg;
    h *= es->hg;
    //l = VMul(l, es->lg);
    //m = VMul(m, es->mg);
    //h = VMul(h, es->hg);

    // Shuffle history buffer
    es->sdm3 = es->sdm2;
    es->sdm2 = es->sdm1;
    es->sdm1 = sample;

    // Return result
    return(l + m + h);
    //return(VAdd(l, VAdd(m, h)));
}

//make these as globals to enforce the function call;
static Vec4 sample[1024], result[1024];
static EQSTATE es;

#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii=0; ii<1024; ii++)
    {
        result[ii] = TestEQ(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "timing: " << t << '\n';

    std::cin.get();
    return 0;
}
Link to working code
MSVC 2015 generated assembly for the 1st version:
; COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z _TEXT SEGMENT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC ; TestEQ, COMDAT ; _es$dead$ = ecx ; _sample$ = edx vmovaps xmm0, XMMWORD PTR [edx] vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16 vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64 vmulps xmm0, xmm0, xmm2 vaddps xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64 vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80 vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4 vmovaps xmm0, XMMWORD PTR [edx] vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144 vsubps xmm2, xmm1, xmm0 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0 vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0 vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0 vmovaps xmm0, XMMWORD PTR [edx] vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0 vaddps xmm0, xmm4, xmm2 vsubps xmm0, xmm1, xmm0 vmulps xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224 vmulps xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240 vaddps xmm1, xmm1, xmm0 vmulps xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208 vaddps xmm0, xmm1, xmm0 ret 0 ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP ; TestEQ
MSVC 2015 generated assembly for the 2nd version:
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT ; ___$ReturnUdt$ = ecx ; _es$dead$ = edx push ebx mov ebx, esp sub esp, 8 and esp, -8 ; fffffff8H add esp, 4 push ebp mov ebp, DWORD PTR [ebx+4] mov eax, DWORD PTR _sample$[ebx] vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192 mov DWORD PTR [esp+4], ebp vmovaps xmm0, XMMWORD PTR [eax] vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64 vmulps xmm0, xmm0, xmm2 vaddps xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64 vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4 vmovaps xmm0, XMMWORD PTR [eax] vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0 vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144 vsubps xmm2, xmm1, xmm0 vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0 vaddps xmm0, xmm2, xmm4 vsubps xmm0, xmm1, xmm0 vmulps xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224 vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176 vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0 vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160 vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0 vmovdqu xmm0, XMMWORD PTR [eax] vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0 vmulps xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208 vaddps xmm1, xmm0, xmm1 vmulps xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240 vaddps xmm0, xmm1, xmm0 vmovaps XMMWORD PTR [ecx], xmm0 mov eax, ecx pop ebp mov esp, ebx pop ebx ret 0 ?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ
The assembly produced for the 2nd version is significantly longer and slower. This is not strictly a Visual Studio issue, since Clang 3.8 produces similar performance results.
Clang 3.8 generated assembly for the 1st version:
"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z" Lfunc_begin0: Ltmp0: # BB#0: # %entry movl 8(%esp), %eax movl 4(%esp), %ecx vmovaps _vsa, %xmm0 vmovaps (%ecx), %xmm1 vmovaps 16(%ecx), %xmm2 vmovaps (%eax), %xmm3 vsubps %xmm2, %xmm3, %xmm3 vmulps %xmm3, %xmm1, %xmm3 vaddps %xmm3, %xmm0, %xmm3 vaddps %xmm3, %xmm2, %xmm2 vmovaps %xmm2, 16(%ecx) vmovaps 32(%ecx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm2, %xmm1, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 32(%ecx) vmovaps 48(%ecx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm2, %xmm1, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 48(%ecx) vmovaps 64(%ecx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm2, %xmm1, %xmm1 vaddps %xmm1, %xmm3, %xmm1 vmovaps %xmm1, 64(%ecx) vmovaps 80(%ecx), %xmm2 vmovaps 96(%ecx), %xmm3 vmovaps (%eax), %xmm4 vsubps %xmm3, %xmm4, %xmm4 vmulps %xmm4, %xmm2, %xmm4 vaddps %xmm4, %xmm0, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 96(%ecx) vmovaps 112(%ecx), %xmm3 vsubps %xmm3, %xmm0, %xmm0 vmulps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 112(%ecx) vmovaps 128(%ecx), %xmm3 vsubps %xmm3, %xmm0, %xmm0 vmulps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 128(%ecx) vmovaps 144(%ecx), %xmm3 vsubps %xmm3, %xmm0, %xmm0 vmulps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 144(%ecx) vmovaps 192(%ecx), %xmm2 vsubps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm1, %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps 208(%ecx), %xmm1, %xmm1 vmulps 224(%ecx), %xmm2, %xmm2 vmulps 240(%ecx), %xmm0, %xmm0 vmovaps 176(%ecx), %xmm3 vmovaps %xmm3, 192(%ecx) vmovaps 160(%ecx), %xmm3 vmovaps %xmm3, 176(%ecx) vmovaps (%eax), %xmm3 vmovaps %xmm3, 160(%ecx) vaddps %xmm2, %xmm0, %xmm0 vaddps %xmm0, %xmm1, %xmm0 retl Lfunc_end0:
Clang 3.8 generated assembly for the 2nd version:
"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z" Lfunc_begin0: Ltmp0: # BB#0: # %entry movl 12(%esp), %ecx movl 8(%esp), %edx vmovaps (%edx), %xmm0 vmovaps 16(%edx), %xmm1 vmovaps (%ecx), %xmm2 vsubps %xmm1, %xmm2, %xmm2 vmulps %xmm0, %xmm2, %xmm2 vaddps _vsa, %xmm2, %xmm2 vaddps %xmm2, %xmm1, %xmm1 vmovaps %xmm1, 16(%edx) vmovaps 32(%edx), %xmm2 vsubps %xmm2, %xmm1, %xmm1 vmulps %xmm0, %xmm1, %xmm1 vaddps %xmm1, %xmm2, %xmm1 vmovaps %xmm1, 32(%edx) vmovaps 48(%edx), %xmm2 vsubps %xmm2, %xmm1, %xmm1 vmulps %xmm0, %xmm1, %xmm1 vaddps %xmm1, %xmm2, %xmm1 vmovaps %xmm1, 48(%edx) vmovaps 64(%edx), %xmm2 vsubps %xmm2, %xmm1, %xmm1 vmulps %xmm0, %xmm1, %xmm0 vaddps %xmm0, %xmm2, %xmm0 vmovaps %xmm0, 64(%edx) vmovaps 80(%edx), %xmm1 vmovaps 96(%edx), %xmm2 vmovaps (%ecx), %xmm3 vsubps %xmm2, %xmm3, %xmm3 vmulps %xmm1, %xmm3, %xmm3 vaddps _vsa, %xmm3, %xmm3 vaddps %xmm3, %xmm2, %xmm2 vmovaps %xmm2, 96(%edx) vmovaps 112(%edx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm1, %xmm2, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 112(%edx) vmovaps 128(%edx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm1, %xmm2, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 128(%edx) vmovaps 144(%edx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm1, %xmm2, %xmm1 vaddps %xmm1, %xmm3, %xmm1 vmovaps %xmm1, 144(%edx) vmovaps 192(%edx), %xmm2 vsubps %xmm1, %xmm2, %xmm1 vaddps %xmm1, %xmm0, %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps 208(%edx), %xmm0, %xmm0 vmulps 224(%edx), %xmm2, %xmm2 movl 4(%esp), %eax vmulps 240(%edx), %xmm1, %xmm1 vmovaps 176(%edx), %xmm3 vmovaps %xmm3, 192(%edx) vmovaps 160(%edx), %xmm3 vmovaps %xmm3, 176(%edx) vmovaps (%ecx), %xmm3 vmovaps %xmm3, 160(%edx) vaddps %xmm2, %xmm0, %xmm0 vaddps %xmm0, %xmm1, %xmm0 vmovaps %xmm0, (%eax) retl Lfunc_end0:
Although the number of instructions is the same, the 1st version is still about 50% faster.
I tried to identify the cause of the issue, without success. There are suspicious things, like those ugly vmovdqu instructions in the 2nd MSVC assembly. Construction, the copy assignment operator and the pass-by-reference can also unnecessarily move the data from SSE registers back to memory; however, all my attempts to solve or exactly identify the issue were unsuccessful.

I really don't think that such a simple wrapper cannot reach the same performance as the bare __m128; whatever causes the overhead, it should be possible to eliminate it.

So what is going on there?
Answer

As it turned out, the problem is not with the user defined struct Vec4. It is deeply related to the x86 calling conventions.

The default x86 calling convention in Visual C++ is __cdecl, which

    Pushes parameters on the stack, in reverse order (right to left)

Now this is a problem, since Vec4 should be kept and passed in an XMM register. But let's see what is actually happening.
1st case
In the first case Vec4 is a simple type alias of __m128.

using Vec4 = __m128;

/* ... */

Vec4 TestEQ(EQSTATE* es, Vec4 &sample)
{
    ...
}
The generated function header of TestEQ in assembly is

?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC    ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
Nice.
2nd case
In the second case Vec4 is not an alias of __m128, it is a user defined type now. Here I investigate compilation for both the x86 and x64 platforms.
x86 (32-bit compilation)
Since __cdecl (which is the default calling convention in x86) doesn't allow passing aligned values to functions (that would emit Error C2719: 'sample': formal parameter with requested alignment of 16 won't be aligned), we pass it by const reference:

struct Vec4
{
    __m128 simd;
    /* ... */
};

/* ... */

Vec4 TestEQ(EQSTATE* es, const Vec4 &sample)
{
    ...
}
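For comparison, the kind of by-value declaration that this rule forbids would look something like the following. This is my own hypothetical sketch (the name TestEQ_byvalue is not part of the original test case); it only illustrates the diagnostic quoted above:

// Hypothetical sketch: Vec4 contains an __m128 and therefore requires
// 16-byte alignment, which __cdecl's stack-based parameter passing
// cannot guarantee on x86, so MSVC rejects the by-value parameter
// with error C2719.
Vec4 TestEQ_byvalue(EQSTATE* es, Vec4 sample);  // error C2719 on x86 under __cdecl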
The const reference version generates the function header for TestEQ as

?TestEQ@@YA?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC    ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$ = edx
push ebx
mov  ebx, esp
sub  esp, 8
and  esp, -8    ; fffffff8H
add  esp, 4
push ebp
mov  ebp, DWORD PTR [ebx+4]
mov  eax, DWORD PTR _sample$[ebx]
...
This is not as simple as the one in the 1st case. The arguments are moved to the stack. There are also some additional mov instructions between the first few SSE instructions, which are not listed here. Overall these instructions are enough to hurt the performance somewhat.

x64 (64-bit compilation)
Windows on x64 uses a different calling convention as part of the x64 Application Binary Interface (ABI). This convention tries to keep the data in registers if possible, in a way that floating-point data is kept in XMM registers.
From MSDN Overview of x64 Calling Conventions:
The x64 Application Binary Interface (ABI) is a 4 register fast-call calling convention, with stack-backing for those registers. There is a strict one-to-one correspondence between arguments in a function, and the registers for those arguments. Any argument that doesn't fit in 8 bytes, or is not 1, 2, 4, or 8 bytes, must be passed by reference. (...) All floating point operations are done using the 16 XMM registers. The arguments are passed in registers RCX, RDX, R8, and R9. If the arguments are float/double, they are passed in XMM0L, XMM1L, XMM2L, and XMM3L. 16 byte arguments are passed by reference.
From the Wikipedia page on x86-64 calling conventions:
The Microsoft x64 calling convention is followed on Windows and pre-boot UEFI (for long mode on x86-64). It uses registers RCX, RDX, R8, R9 for the first four integer or pointer arguments (in that order), and XMM0, XMM1, XMM2, XMM3 are used for floating point arguments. Additional arguments are pushed onto the stack (right to left). Integer return values (similar to x86) are returned in RAX if 64 bits or less. Floating point return values are returned in XMM0.
So the second case in x64 mode generates the function header for TestEQ as

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC    ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
This is exactly the same as the 1st case!
Solution
For x86 mode the presented behavior should clearly be fixed.

The simplest solution is to inline the function. Although inline is just a hint which the compiler can completely ignore, you can tell the compiler to always inline the function. However, sometimes this is not desired because of the function size or any other reason.
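As a sketch of the forced-inlining route (my own illustration, not code from the original post): MSVC's keyword for this is __forceinline, and GCC/Clang offer an equivalent attribute, so a portable helper macro might look like this:

// Forcing inlining removes the call entirely, and with it the __cdecl
// argument passing that spills Vec4 to the stack on x86.
#if defined(_MSC_VER)
#define VEC_INLINE __forceinline
#else
#define VEC_INLINE inline __attribute__((always_inline))
#endif

VEC_INLINE Vec4 TestEQ(EQSTATE* es, const Vec4 &sample)
{
    /* ... same body as above ... */
}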
Fortunately, Microsoft introduced the __vectorcall convention in Visual Studio 2013 and above (available in both x86 and x64 mode). This is very similar to the default Windows x64 calling convention, but with more registers usable for passing arguments.

Let's rewrite the 2nd case with __vectorcall:

Vec4 __vectorcall TestEQ(EQSTATE* es, const Vec4 &sample)
{
    ...
}
Now the generated assembly function header for TestEQ is

?TestEQ@@YQ?AUVec4@@PAUEQSTATE@@ABU1@@Z PROC    ; TestEQ, COMDAT
; _es$ = ecx
; _sample$ = edx
...
which is finally the same as the 1st case and the 2nd case in x64.
As Peter Cordes pointed out, to take full advantage of __vectorcall, the Vec4 argument should be passed by value instead of by constant reference. To do this the passed type should meet some requirements: it must be trivially copy constructible (no user defined copy constructors) and it shouldn't contain any union. A sketch of this is shown below; more info in the comments below and here.
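A minimal sketch of the by-value variant (my illustration, assuming the Vec4 wrapper from the 2nd case with its defaulted copy operations, which keeps it trivially copyable):

// With __vectorcall, a trivially copyable 16-byte aggregate like Vec4
// can be passed by value directly in an XMM register, so the reference
// indirection (and the associated load from memory) disappears.
Vec4 __vectorcall TestEQ(EQSTATE* es, Vec4 sample)
{
    /* ... same body as above ... */
}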
Final words

It looks like MSVC automatically applies the __vectorcall convention under the hood as an optimization when it detects an __m128 argument. Otherwise it uses the default calling convention __cdecl (you can change this behavior with compiler options).

People told me in the comments that they didn't see much difference between the GCC and Clang generated assembly of the two cases. This is because those compilers, with the optimization flag -O2, simply inline the TestEQ function into the test loop body (see). It is also possible that they are simply smarter than MSVC and perform a better optimization of the function call.
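On that last point, if you want the cross-compiler comparison to keep measuring the call itself, one option (my suggestion, not part of the original post) is to forbid inlining of the kernel, so every compiler is forced to emit the real call and the calling-convention cost stays visible:

// Hypothetical tweak to the benchmark: keep TestEQ as an out-of-line
// call on every compiler instead of letting -O2 fold it into the loop.
#if defined(_MSC_VER)
#define NO_INLINE __declspec(noinline)
#else
#define NO_INLINE __attribute__((noinline))
#endif

NO_INLINE Vec4 TestEQ(EQSTATE* es, const Vec4 &sample)
{
    /* ... same body as above ... */
}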