SSE内存访问 [英] SSE memory access

查看:240
本文介绍了SSE内存访问的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我需要高斯消元使用SSE,我不知道如何从128位寄存器(每个存储4个元素)访问每一个元素(32位)来执行。这是原来的code(不使用SSE):

  unsigned int类型I,J,K;对于(i = 0; I< num_elements;我++)/ * A矩阵的内容复制进U矩阵。 * /
    为(J = 0; J< num_elements; J ++)
        U [num_elements * I + J] = A [num_elements * I + J]。
对于(K = 0; K< num_elements; k ++){/ *到位对U矩阵进行高斯消元法。 * /
    为(J =(K + 1); J< num_elements; J ++){/ *降低当前行。 * /        如果(U [num_elements * K + K] == 0){
            的printf(检测数值不稳定的主要对角元素为零\\ n);
            返回0;
        }        / *司一步。 * /
        U [num_elements * K + J] =(浮点)(U [num_elements * K + J] / U [num_elements * K + K]);
    }    U [num_elements * K + K] = 1; / *设置U中的主对角线项为1 * /    为(ⅰ=(K + 1); I&下; num_elements;我++){
        为(J =(K + 1); J&下; num_elements; J ++)
            / * Elimnation一步。 * /
            U [num_elements * I + J] = U [num_elements * I + J] - \\
                                      (U [num_elements * I + K] * U [num_elements * K + J]);        U [num_elements * I + K] = 0;
    }
}

好吧我得到分割故障[核心转储]与此code。我是新来SSE。有人能帮忙吗?谢谢你。

  INT I,J,K;
 __m128 A_I,b_i,C_I,d_i;对于(i = 0; I< NUM_ROWS;我++)
{
为(J = 0; J< NUM_ROWS; J + = 4)
{
    INT指数= NUM​​_ROWS * I + J;
   __m128 V = _mm_loadu_ps(安培; A [指数]); //装载4×花车
   _mm_storeu_ps(安培; U [指数],V); //店4×花车}
}
对于(K = 0; K< NUM_ROWS; k ++){ A_I = _mm_load_ss(安培; U [NUM_ROWS * K + K]);
    为(J =(4 * K + 1); J&下; NUM_ROWS; J + = 4){
               b_i = _mm_loadu_ps(安培; U [NUM_ROWS * K + J]); //减少currentrow。    如果(U [NUM_ROWS * K + K] == 0){
    的printf(数值不稳定检测。);        }        / *司一步。 * /
        b_i = _mm_div_ps(b_i,A_I);
  }    A_I = _mm_set_ss(1);    对于(I =(K + 1); I< NUM_ROWS;我++){
  d_i = _mm_load_ss(安培; U [NUM_ROWS * I + K]);
        为(J =(4 * K + 1); J&下; NUM_ROWS; J + = 4){
           C_I = _mm_loadu_ps(安培; U [NUM_ROWS * I + J]); / * Elimnation一步。 * /
        b_i = _mm_loadu_ps(安培; U [NUM_ROWS * K + J]);
            C_I = _mm_sub_ps(C_I,_mm_mul_ss(b_i,d_i));
        }
       d_i = _mm_set_ss(0);
    }
  }


解决方案

为了让你开始,你的第一个循环应该更多这样的:

 为(i = 0; I< num_elements;我++)
{
    为(J = 0; J< num_elements; J + = 4)
    {
        INT指数= num_elements * I + J;
        __m128i V = _mm_loadu_ps((__ m128i *)及A [指数]); //装载4×花车
        _mm_storeu_ps((__ m128i *)及U [指数],V); //店4×花车
    }
}

这假定 num_elements 是4的倍数,这既不 A 也不 U 正确对齐。

I need to perform Gaussian Elimination using SSE and I am not sure how to access each element(32 bits) from the 128 bit registers(each storing 4 elements). This is the original code(without using SSE):

unsigned int i, j, k;

for (i = 0; i < num_elements; i ++)             /* Copy the contents of the A matrix into the U matrix. */
    for(j = 0; j < num_elements; j++)
        U[num_elements * i + j] = A[num_elements*i + j];


for (k = 0; k < num_elements; k++){             /* Perform Gaussian elimination in place on the U matrix. */
    for (j = (k + 1); j < num_elements; j++){   /* Reduce the current row. */

        if (U[num_elements*k + k] == 0){
            printf("Numerical instability detected. The principal diagonal element is zero. \n");
            return 0;
        }

        /* Division step. */
        U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
    }

    U[num_elements * k + k] = 1;             /* Set the principal diagonal entry in U to be 1. */

    for (i = (k+1); i < num_elements; i++){
        for (j = (k+1); j < num_elements; j++)
            /* Elimnation step. */
            U[num_elements * i + j] = U[num_elements * i + j] -\
                                      (U[num_elements * i + k] * U[num_elements * k + j]);

        U[num_elements * i + k] = 0; 
    } 
}

Okay I'm getting segmentation fault[core dumped] with this code. I'm new to SSE. Can someone help? Thanks.

 int i,j,k;
 __m128 a_i,b_i,c_i,d_i;

for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
    int index = num_rows * i + j;
   __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
   _mm_storeu_ps(&U[index], v);         // store 4 x floats

}
}
for (k = 0; k < num_rows; k++){  

 a_i= _mm_load_ss(&U[num_rows*k+k]);         


    for (j = (4*k + 1); j < num_rows; j+=4){
               b_i= _mm_loadu_ps(&U[num_rows*k+j]);// Reduce the currentrow. 

    if (U[num_rows*k+k] == 0){
    printf("Numerical instability detected.);

        }

        /* Division step. */
        b_i =    _mm_div_ps(b_i, a_i);
  }

    a_i = _mm_set_ss(1);           

    for (i = (k+1); i < num_rows; i++){
  d_i= _mm_load_ss(&U[num_rows*i+k]);
        for (j = (4*k+1); j < num_rows; j+=4){
           c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* Elimnation step. */
        b_i= _mm_loadu_ps(&U[num_rows*k+j]);    
            c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
        }
       d_i= _mm_set_ss(0); 
    } 
  }

解决方案

In order to get you started, your first loop should be more like this:

for (i = 0; i < num_elements; i++)
{
    for (j = 0; j < num_elements; j += 4)
    {
        int index = num_elements * i + j;
        __m128i v = _mm_loadu_ps((__m128i *)&A[index]); // load 4 x floats
        _mm_storeu_ps((__m128i *)&U[index], v);         // store 4 x floats
    }
}

This assumes that num_elements is a multiple of 4, and that neither A nor U is correctly aligned.

这篇关于SSE内存访问的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆