SSE内存访问 [英] SSE memory access
本文介绍了SSE内存访问的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
我需要使用SSE实现高斯消元,但不知道如何从128位寄存器(每个寄存器存放4个元素)中访问每一个32位元素。下面是原始代码(未使用SSE):
unsigned int i, j, k;
for (i = 0; i < num_elements; i ++) /* 将 A 矩阵的内容复制到 U 矩阵。 */
for(j = 0; j < num_elements; j++)
U[num_elements * i + j] = A[num_elements*i + j];
for (k = 0; k < num_elements; k++){ /* 在 U 矩阵上就地执行高斯消元。 */
for (j = (k + 1); j < num_elements; j++){ /* 化简当前行。 */
if (U[num_elements*k + k] == 0){
printf("Numerical instability detected. The principal diagonal element is zero. \n");
return 0;
}
/* 除法步骤。 */
U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
}
U[num_elements * k + k] = 1; /* 将 U 的主对角线元素设为 1。 */
for (i = (k+1); i < num_elements; i++){
for (j = (k+1); j < num_elements; j++)
/* 消元步骤。 */
U[num_elements * i + j] = U[num_elements * i + j] -\
(U[num_elements * i + k] * U[num_elements * k + j]);
U[num_elements * i + k] = 0;
}
}
好的,下面这段代码运行时出现了段错误(segmentation fault [core dumped])。我刚接触SSE,有人能帮忙吗?谢谢。
int i,j,k;
__m128 a_i,b_i,c_i,d_i;
for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
int index = num_rows * i + j;
__m128 v = _mm_loadu_ps(&A[index]); // 加载 4 个 float
_mm_storeu_ps(&U[index], v); // 存储 4 个 float
}
}
for (k = 0; k < num_rows; k++){
a_i= _mm_load_ss(&U[num_rows*k+k]);
for (j = (4*k + 1); j < num_rows; j+=4){
b_i= _mm_loadu_ps(&U[num_rows*k+j]);// 化简当前行。
if (U[num_rows*k+k] == 0){
printf("Numerical instability detected.);
}
/* 除法步骤。 */
b_i = _mm_div_ps(b_i, a_i);
}
a_i = _mm_set_ss(1);
for (i = (k+1); i < num_rows; i++){
d_i= _mm_load_ss(&U[num_rows*i+k]);
for (j = (4*k+1); j < num_rows; j+=4){
c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* 消元步骤。 */
b_i= _mm_loadu_ps(&U[num_rows*k+j]);
c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
}
d_i= _mm_set_ss(0);
}
}
解决方案
作为起步,你的第一个循环应该更像下面这样:
for (i = 0; i < num_elements; i++)
{
for (j = 0; j < num_elements; j += 4)
{
int index = num_elements * i + j;
__m128i v = _mm_loadu_ps((__m128i *)&A[index]); // 加载 4 个 float
_mm_storeu_ps((__m128i *)&U[index], v); // 存储 4 个 float
}
}
这假定 num_elements
是 4 的倍数,并且 A
和 U
都没有正确对齐(因此使用非对齐的加载/存储指令)。
I need to perform Gaussian elimination using SSE, and I am not sure how to access each element (32 bits) of the 128-bit registers (each storing 4 elements). This is the original code (without using SSE):
unsigned int i, j, k;
for (i = 0; i < num_elements; i ++) /* Copy the contents of the A matrix into the U matrix. */
for(j = 0; j < num_elements; j++)
U[num_elements * i + j] = A[num_elements*i + j];
for (k = 0; k < num_elements; k++){ /* Perform Gaussian elimination in place on the U matrix. */
for (j = (k + 1); j < num_elements; j++){ /* Reduce the current row. */
if (U[num_elements*k + k] == 0){
printf("Numerical instability detected. The principal diagonal element is zero. \n");
return 0;
}
/* Division step. */
U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
}
U[num_elements * k + k] = 1; /* Set the principal diagonal entry in U to be 1. */
for (i = (k+1); i < num_elements; i++){
for (j = (k+1); j < num_elements; j++)
/* Elimnation step. */
U[num_elements * i + j] = U[num_elements * i + j] -\
(U[num_elements * i + k] * U[num_elements * k + j]);
U[num_elements * i + k] = 0;
}
}
Okay, I'm getting a segmentation fault [core dumped] with this code. I'm new to SSE. Can someone help? Thanks.
int i,j,k;
__m128 a_i,b_i,c_i,d_i;
for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
int index = num_rows * i + j;
__m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
_mm_storeu_ps(&U[index], v); // store 4 x floats
}
}
for (k = 0; k < num_rows; k++){
a_i= _mm_load_ss(&U[num_rows*k+k]);
for (j = (4*k + 1); j < num_rows; j+=4){
b_i= _mm_loadu_ps(&U[num_rows*k+j]);// Reduce the currentrow.
if (U[num_rows*k+k] == 0){
printf("Numerical instability detected.);
}
/* Division step. */
b_i = _mm_div_ps(b_i, a_i);
}
a_i = _mm_set_ss(1);
for (i = (k+1); i < num_rows; i++){
d_i= _mm_load_ss(&U[num_rows*i+k]);
for (j = (4*k+1); j < num_rows; j+=4){
c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* Elimnation step. */
b_i= _mm_loadu_ps(&U[num_rows*k+j]);
c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
}
d_i= _mm_set_ss(0);
}
}
解决方案
In order to get you started, your first loop should be more like this:
for (i = 0; i < num_elements; i++)
{
for (j = 0; j < num_elements; j += 4)
{
int index = num_elements * i + j;
__m128i v = _mm_loadu_ps((__m128i *)&A[index]); // load 4 x floats
_mm_storeu_ps((__m128i *)&U[index], v); // store 4 x floats
}
}
This assumes that num_elements
is a multiple of 4, and that neither A
nor U
is guaranteed to be correctly aligned (hence the unaligned load/store intrinsics).
这篇关于SSE内存访问的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文