_m128 SSE崩溃(OpenGL / C ++) [英] _m128 SSE crashing (OpenGL/C++)

查看:93
本文介绍了_m128 SSE崩溃(OpenGL / C ++)的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

您好!这是我在这里发表的第一篇文章,如果我遗漏或收集太多信息,请原谅我。


我正在尝试使用SSE卸载3D模型上的顶点变换(浮点)加快绘制模型所花费的时间。主要功能如下:

 __ m128 m4x4v_colSSE(const __m128 cols [4],const __m128 v)
{
__m128 u1 = _mm_shuffle_ps( v,v,_MM_SHUFFLE(0,0,0,0));
__m128 u2 = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,1,1,1));
__m128 u3 = _mm_shuffle_ps(v,v,_MM_SHUFFLE(2,2,2,2));
__m128 u4 = _mm_shuffle_ps(v,v,_MM_SHUFFLE(3,3,3,3));

__m128 prod1 = _mm_mul_ps(u1,cols [0]);
__ m128 prod2 = _mm_mul_ps(u2,cols [1]); //< ---此行产生错误!异常抛出0x0012F167
__m128 prod3 = _mm_mul_ps(u3,cols [2]);
__m128 prod4 = _mm_mul_ps(u4,cols [3]);

返回_mm_add_ps(_mm_add_ps(prod1,prod2),_ mm_add_ps(prod3,prod4));
}

现在,当执行mat4转换时,它会从下一个函数调用:

 

void md5_transform_vertices_sse(MD5mesh * msh,epi :: mat4_c * posemats,basevert * dst)
{
epi :: mat4_c * mats = posemats + 1;
MD5vertex * vs = msh-> verts;
MD5weight * ws = msh->权重;

int i,j;

for(i = 0; i< msh-> vertcnt; i ++)
{
MD5vertex * v = vs + i;
basevert * cv = dst + i;

__m128 pos = _mm_set1_ps(0);
__m128 norm = _mm_set1_ps(0);

MD5weight * w = ws + v-> firstweight;


for(j = 0; j< v-> weightcnt; j ++)
{
__m128 wpos = _mm_setr_ps(w [j] .pos [0],w [j] .pos [1],w [j] .pos [2],1);
__m128 wnorm = _mm_setr_ps(w [j] .normal [0],w [j] .normal [1],w [j] .normal [2],0);
wpos = m4x4v_colSSE((__ m128 *)& mats [w [j] .jointidx],wpos);
wnorm = m4x4v_colSSE((__ m128 *)& mats [w [j] .jointidx],wnorm);

__m128 weight = _mm_set1_ps(w [j] .weight);
pos = _mm_mul_ps(_mm_add_ps(wpos,pos),weight);
norm = _mm_mul_ps(_mm_add_ps(wnorm,norm),weight);
}

_mm_store_ps((float *)& cv-> pos,pos);
_mm_store_ps((float *)& cv-> norm,norm);
}

}

当游戏引擎加载并解码MD5模型后,我立即崩溃:


EDGE.exe中0x0045F167处抛出异常:


0xC0000005:访问冲突读取位置0xFFFFFFFF。


(_m128 m4x4v_colSSE()失败)


所以,我想要收集的是这里存在某种对齐问题(也许我错了),但我似乎无法解决导致错误的原因。为了记录,引擎被编译(在VS2017中)与/ O2和/ Ob2优化,
并且它被设置为生成SSE2。 


我写了md5_transform_vertices_sse ()函数代替普通的非SSE2版本(供参考):

 void md5_transform_vertices(MD5mesh * msh,epi :: mat4_c * posemats,basevert * dst)
{
epi :: mat4_c * mats = posemats + 1;
MD5vertex * vs = msh-> verts;
MD5weight * ws = msh->权重;
int i,j;

for(i = 0; i< msh-> vertcnt; i ++)
{
MD5vertex * v = vs + i;
basevert * cv = dst + i;

cv-> pos = epi :: vec3_c(0,0,0);
cv-> norm = epi :: vec3_c(0,0,0);
cv-> tan = epi :: vec3_c(0,0,0);
cv-> uv = epi :: vec2_c(v-> uv [0],v-> uv [1]);

MD5weight * w = ws + v-> firstweight;
for(j = 0; j< v-> weightcnt; j ++)
{
#if PREMULTIPLY
cv-> pos + =(mats [w [j ] .jointidx] * epi :: vec4_c(w [j] .pos,w [j] .weight))。Get3D();
cv-> norm + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .normal,0))。Get3D();
cv-> tan + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .tan,0))。Get3D();
#else
cv-> pos + = mats [w [j] .jointidx] * epi :: vec3_c(w [j] .pos)* w [j] .weight;
cv-> norm + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .normal,0))。Get3D()* w [j] .weight;
cv-> tan + =(mats [w [j] .jointidx] * epi :: vec4_c(w [j] .tan,0))。Get3D()* w [j] .weight;
#endif
}

}
}

尽可能看,这些计算对于具有多个/ tris / etc的统一模型来说有点贵,因此尝试SSE的主要原因是加速这些计算。引擎是用OpenGL编写的,基于原始的DOOM引擎。


任何帮助,解释或建议我都会非常感激。



再次感谢您,


-Coraline




解决方案

您好,


感谢您在此处发帖。


>>在EDGE.exe中的0x0045F167处抛出异常:


0xC0000005:访问冲突读取位置0xFFFFFFFF。


(_m128 m4x4v_colSSE()失败)


你的vs版本是什么?您是否尝试将您的vs更新到最新版本?


也许您可以尝试按照以下情况关闭优化。


https://developercommunity.visualstudio.com/content/问题/ 174967 / read-access-violation-when-moving-16-byte-aligned.html


据我所知,变量类型_m128在16字节边界上自动对齐。根据这个
文档,您最好不要直接访问__m128字段。


此外,此论坛是关于桌面应用程序的c ++代码问题。对于与游戏开发更相关的案例,我建议您在下面的论坛上发帖以获得更好的支持。


https://www.gamedev.net/forums/forum/10-engines-and-middleware/


您的理解与合作将不胜感激。


最诚挚的问候,


Baron Bi



Hello! This is my first post here so please forgive me if I leave out or put in too much information.

I'm trying to use SSE to offload vertex transformation on 3D models (floating point) to speed up the amount of time it has to spend drawing the models. The primary function is below:

__m128 m4x4v_colSSE(const __m128 cols[4], const __m128 v) 
{
	__m128 u1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 u2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 u3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 u4 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));

	__m128 prod1 = _mm_mul_ps(u1, cols[0]);
	__m128 prod2 = _mm_mul_ps(u2, cols[1]); //<--- this line produces the error! Exception thrown at 0x0012F167 
	__m128 prod3 = _mm_mul_ps(u3, cols[2]);
	__m128 prod4 = _mm_mul_ps(u4, cols[3]);

	return _mm_add_ps(_mm_add_ps(prod1, prod2), _mm_add_ps(prod3, prod4));
}

Now, it gets called from the very next function, when performing mat4 transformation:

void md5_transform_vertices_sse(MD5mesh *msh, epi::mat4_c *posemats, basevert *dst) { epi::mat4_c *mats = posemats + 1; MD5vertex *vs = msh->verts; MD5weight *ws = msh->weights; int i,j; for(i = 0; i < msh->vertcnt; i++) { MD5vertex *v = vs + i; basevert *cv = dst + i; __m128 pos = _mm_set1_ps(0); __m128 norm = _mm_set1_ps(0); MD5weight *w = ws + v->firstweight;

for(j = 0; j < v->weightcnt; j++) { __m128 wpos = _mm_setr_ps(w[j].pos[0], w[j].pos[1], w[j].pos[2], 1); __m128 wnorm = _mm_setr_ps(w[j].normal[0],w[j].normal[1], w[j].normal[2], 0); wpos = m4x4v_colSSE((__m128*)&mats[w[j].jointidx], wpos); wnorm = m4x4v_colSSE((__m128*)&mats[w[j].jointidx], wnorm); __m128 weight = _mm_set1_ps(w[j].weight); pos = _mm_mul_ps(_mm_add_ps(wpos,pos),weight); norm = _mm_mul_ps(_mm_add_ps(wnorm,norm),weight); } _mm_store_ps((float*)&cv->pos, pos); _mm_store_ps((float*)&cv->norm, norm); } }

When the game engine loads and after decoding the MD5 model, I get an immediate crash:

Exception thrown at 0x0045F167 in EDGE.exe:

0xC0000005: Access violation reading location 0xFFFFFFFF.

(fails in _m128 m4x4v_colSSE())

So, what I'm trying to gather is that there is some sort of alignment issues here (and maybe I'm wrong), but I can't seem to wrap my head around what is causing the error. For the record, the engine is compiled (in VS2017) with /O2 and /Ob2 optimizations, and it is set to generate SSE2. 

I have written the md5_transform_vertices_sse() function in place of the normal, non-SSE2 version (for reference):

void md5_transform_vertices(MD5mesh *msh, epi::mat4_c *posemats, basevert *dst) 
{
	epi::mat4_c *mats = posemats + 1;
	MD5vertex *vs = msh->verts;
	MD5weight *ws = msh->weights;
	int i,j;
	
	for(i = 0; i < msh->vertcnt; i++) 
	{
		MD5vertex *v = vs + i;
		basevert *cv = dst + i;
		
		cv->pos = epi::vec3_c(0,0,0);
		cv->norm = epi::vec3_c(0,0,0);
		cv->tan = epi::vec3_c(0,0,0);
		cv->uv = epi::vec2_c(v->uv[0],v->uv[1]);
		
		MD5weight *w = ws + v->firstweight;
		for(j = 0; j < v->weightcnt; j++) 
		{
#if PREMULTIPLY
			cv->pos += (mats[w[j].jointidx] * epi::vec4_c(w[j].pos,w[j].weight)).Get3D();
			cv->norm += (mats[w[j].jointidx] * epi::vec4_c(w[j].normal,0)).Get3D();
			cv->tan += (mats[w[j].jointidx] * epi::vec4_c(w[j].tan,0)).Get3D();
#else
			cv->pos += mats[w[j].jointidx] * epi::vec3_c(w[j].pos) * w[j].weight;
			cv->norm += (mats[w[j].jointidx] * epi::vec4_c(w[j].normal,0)).Get3D() * w[j].weight;
			cv->tan += (mats[w[j].jointidx] * epi::vec4_c(w[j].tan,0)).Get3D() * w[j].weight;
#endif
		}
		
	}
}

As you can see, these calculations are a tad expensive for unified models with several pieces/tris/etc, so the primary reason to attempt SSE was to speed up these calculations. The engine is written in OpenGL and is based on the original DOOM engine.

Any help, explanations, or suggestions I would be most thankful.

Thank you again,

-Coraline


解决方案

Hi,

thanks for posting here.

>>Exception thrown at 0x0045F167 in EDGE.exe:

0xC0000005: Access violation reading location 0xFFFFFFFF.

(fails in _m128 m4x4v_colSSE())

What's your vs version? Have you tried to update your vs to the latest version?

Maybe you could try to follow this case below to turn off optimizations.

https://developercommunity.visualstudio.com/content/problem/174967/read-access-violation-when-moving-16-byte-aligned.html

As far as I know, variables of type _m128 are automatically aligned on 16-byte boundaries. According to this document, you'd better not access the __m128 fields directly.

Besides, this forum is about c++ code issues with desktop application. For your case which is more related to game development, I suggest you post on this forum below for better support.

https://www.gamedev.net/forums/forum/10-engines-and-middleware/

Your understanding and cooperation will be grateful.

Best Regards,

Baron Bi


这篇关于_m128 SSE崩溃(OpenGL / C ++)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆