SSE内存访问 [英] SSE memory access

查看：240 发布时间：2016/8/24 12:48:06 c sse simd gaussian

本文介绍了SSE内存访问的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我需要使用SSE实现高斯消元，但不确定如何访问128位寄存器（每个寄存器存储4个元素）中的每一个32位元素。下面是原始代码（未使用SSE）：

unsigned int i, j, k;

for (i = 0; i < num_elements; i ++)             /* Copy the contents of the A matrix into the U matrix. */
    for(j = 0; j < num_elements; j++)
        U[num_elements * i + j] = A[num_elements*i + j];


for (k = 0; k < num_elements; k++){             /* Perform Gaussian elimination in place on the U matrix. */
    for (j = (k + 1); j < num_elements; j++){   /* Reduce the current row. */

        if (U[num_elements*k + k] == 0){
            printf("Numerical instability detected. The principal diagonal element is zero. \n");
            return 0;
        }

        /* Division step. */
        U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
    }

    U[num_elements * k + k] = 1;             /* Set the principal diagonal entry in U to be 1. */

    for (i = (k+1); i < num_elements; i++){
        for (j = (k+1); j < num_elements; j++)
            /* Elimination step. */
            U[num_elements * i + j] = U[num_elements * i + j] -\
                                      (U[num_elements * i + k] * U[num_elements * k + j]);

        U[num_elements * i + k] = 0;
    }
}

好的，运行下面这段代码时出现了段错误（segmentation fault，core dumped）。我刚接触SSE，有人能帮忙吗？谢谢。

 int i,j,k;
 __m128 a_i,b_i,c_i,d_i;

for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
    int index = num_rows * i + j;
   __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
   _mm_storeu_ps(&U[index], v);         // store 4 x floats

}
}
for (k = 0; k < num_rows; k++){

 a_i= _mm_load_ss(&U[num_rows*k+k]);


    for (j = (4*k + 1); j < num_rows; j+=4){
               b_i= _mm_loadu_ps(&U[num_rows*k+j]);// Reduce the current row.

    if (U[num_rows*k+k] == 0){
    printf("Numerical instability detected.);

        }

        /* Division step. */
        b_i =    _mm_div_ps(b_i, a_i);
  }

    a_i = _mm_set_ss(1);

    for (i = (k+1); i < num_rows; i++){
  d_i= _mm_load_ss(&U[num_rows*i+k]);
        for (j = (4*k+1); j < num_rows; j+=4){
           c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* Elimination step. */
        b_i= _mm_loadu_ps(&U[num_rows*k+j]);
            c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
        }
       d_i= _mm_set_ss(0);
    }
  }

解决方案

作为起步，你的第一个循环应该更像这样：

for (i = 0; i < num_elements; i++)
{
    for (j = 0; j < num_elements; j += 4)
    {
        int index = num_elements * i + j;
        __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
        _mm_storeu_ps(&U[index], v);         // store 4 x floats
    }
}

这里假定 num_elements 是4的倍数，并且 A 和 U 都不保证按16字节对齐（因此使用非对齐的加载/存储指令 _mm_loadu_ps / _mm_storeu_ps）。

I need to perform Gaussian elimination using SSE, and I am not sure how to access each 32-bit element of the 128-bit registers (each of which stores 4 elements). This is the original code (without SSE):

/*
 * Scalar reference version of the elimination, shown as a fragment: the
 * enclosing function (and the declarations of A, U and num_elements) are not
 * visible here.
 *
 * The fragment copies A into U and then reduces U in place. Both arrays are
 * indexed as num_elements * row + column. For each pivot index k it divides
 * the entries to the right of the diagonal in row k by the pivot, sets the
 * pivot to 1, then subtracts a multiple of row k from every row below it and
 * sets that row's column-k entry to 0.
 *
 * It returns 0 from the enclosing function when a zero pivot is detected.
 * NOTE(review): the (float) cast below suggests the elements are float;
 * confirm against the declarations.
 */
unsigned int i, j, k;

for (i = 0; i < num_elements; i ++)             /* Copy the contents of the A matrix into the U matrix. */
    for(j = 0; j < num_elements; j++)
        U[num_elements * i + j] = A[num_elements*i + j];


for (k = 0; k < num_elements; k++){             /* Perform Gaussian elimination in place on the U matrix. */
    for (j = (k + 1); j < num_elements; j++){   /* Reduce the current row. */

        /*
         * NOTE(review): this test does not depend on j, yet it is evaluated
         * on every iteration. Because it lives inside the j loop, it is not
         * evaluated at all when k == num_elements - 1 (the loop body never
         * runs), so the last diagonal element is never checked for zero.
         */
        if (U[num_elements*k + k] == 0){
            printf("Numerical instability detected. The principal diagonal element is zero. \n");
            return 0;
        }

        /* Division step: scale the entry in row k, column j by the pivot U[k][k]. */
        U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
    }

    U[num_elements * k + k] = 1;             /* Set the principal diagonal entry in U to be 1. */

    /* For every row i below the pivot row, eliminate its column-k entry. */
    for (i = (k+1); i < num_elements; i++){
        for (j = (k+1); j < num_elements; j++)
            /* Elimination step: U[i][j] -= U[i][k] * U[k][j]. */
            U[num_elements * i + j] = U[num_elements * i + j] -\
                                      (U[num_elements * i + k] * U[num_elements * k + j]);

        /* The multiplier U[i][k] is cleared only after the whole row has used it. */
        U[num_elements * i + k] = 0; 
    } 
}

Okay, I'm getting a segmentation fault (core dumped) with this code. I'm new to SSE. Can someone help? Thanks.

 /*
  * The asker's SSE attempt at the elimination, shown as a fragment: the
  * enclosing function and the declarations of A, U and num_rows are not
  * visible here. The code is kept exactly as posted. The NOTE(review)
  * comments mark where it differs from the scalar version quoted earlier in
  * the question.
  */
 int i,j,k;
 __m128 a_i,b_i,c_i,d_i;

/*
 * Copy A into U four floats at a time using unaligned loads and stores.
 * NOTE(review): the inner loop always moves 4 floats, so when num_rows is not
 * a multiple of 4 the last iteration of a row touches elements past the end
 * of that row.
 */
for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
    int index = num_rows * i + j;
   __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
   _mm_storeu_ps(&U[index], v);         // store 4 x floats

}
}
for (k = 0; k < num_rows; k++){  

 /*
  * NOTE(review): _mm_load_ss puts the pivot in the lowest lane only and
  * zeroes the upper three lanes. The _mm_div_ps below therefore divides
  * lanes 1..3 by zero. A broadcast of the pivot to all four lanes is what a
  * 4-wide division needs.
  */
 a_i= _mm_load_ss(&U[num_rows*k+k]);         


    /*
     * NOTE(review): j starts at 4*k + 1, whereas the scalar version starts at
     * k + 1. The loop condition only guarantees j < num_rows, so the 4-float
     * load below can extend past the end of row k, and for the last rows past
     * num_rows * num_rows elements of U.
     */
    for (j = (4*k + 1); j < num_rows; j+=4){
               b_i= _mm_loadu_ps(&U[num_rows*k+j]);// Reduce the current row. 

    /*
     * NOTE(review): as written, the string literal in the printf below has no
     * closing quote. Unlike the scalar version, there is also no return after
     * the message.
     */
    if (U[num_rows*k+k] == 0){
    printf("Numerical instability detected.);

        }

        /* Division step. */
        /* NOTE(review): the quotient stays in b_i; nothing in this loop stores it back to U. */
        b_i =    _mm_div_ps(b_i, a_i);
  }

    /* NOTE(review): this changes only the register a_i; U[num_rows*k+k] in memory is not set to 1. */
    a_i = _mm_set_ss(1);           

    for (i = (k+1); i < num_rows; i++){
  /* Loads the multiplier U[i][k] into the lowest lane only (upper lanes are zero). */
  d_i= _mm_load_ss(&U[num_rows*i+k]);
        /* NOTE(review): same 4*k + 1 start and same possible over-read as the loop above. */
        for (j = (4*k+1); j < num_rows; j+=4){
           c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* Elimination step. */
        b_i= _mm_loadu_ps(&U[num_rows*k+j]);    
            /*
             * NOTE(review): _mm_mul_ss multiplies only the lowest lane and
             * passes the upper three lanes of b_i through unchanged. The
             * result is left in c_i and never stored back to U.
             */
            c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
        }
       /* NOTE(review): this changes only the register d_i; U[num_rows*i+k] in memory is not set to 0. */
       d_i= _mm_set_ss(0); 
    } 
  }

解决方案

In order to get you started, your first loop should be more like this:

/*
 * Copy the num_elements x num_elements matrix A into U, four floats per
 * iteration. The unaligned load/store intrinsics are used, so A and U need
 * not be 16-byte aligned. num_elements must be a multiple of 4, otherwise the
 * last iteration of a row touches elements beyond it.
 *
 * _mm_loadu_ps takes a pointer to float and returns __m128 (four packed
 * floats); __m128i is the integer vector type and does not belong here.
 */
for (i = 0; i < num_elements; i++)
{
    for (j = 0; j < num_elements; j += 4)
    {
        int index = num_elements * i + j;
        __m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
        _mm_storeu_ps(&U[index], v);         // store 4 x floats
    }
}

This assumes that num_elements is a multiple of 4, and that neither A nor U is guaranteed to be 16-byte aligned (hence the unaligned `_mm_loadu_ps`/`_mm_storeu_ps` intrinsics).

这篇关于SSE内存访问的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

SSE内存访问 [英] SSE memory access

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录关闭

SSE内存访问 [英] SSE memory access

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录 关闭

登录关闭