使用AVX内在函数而不是SSE不能提高速度 - 为什么? [英] Using AVX intrinsics instead of SSE does not improve speed -- why?
问题描述
我已经使用英特尔的SSE内在函数了相当长的时间,良好的性能提升。因此,我期望AVX内在函数进一步加速我的程序。不幸的是,这不是现在的情况。可能我犯了一个愚蠢的错误,所以如果有人可以帮助我,我会非常感激。
我使用Ubuntu 11.10和g ++ 4.6.1。我用
编译我的程序(见下文):g++ simpleExample.cpp -O3 -march=native -o simpleExample
测试系统有一个Intel i7-2600 CPU。
这里是代表我的问题的代码。在我的系统上,我得到输出
98.715 ms, b[42] = 0.900038 // Naive
24.457 ms, b[42] = 0.900038 // SSE
24.646 ms, b[42] = 0.900038 // AVX
注意,计算sqrt(sqrt(sqrt(x)))只是为了确保内存带宽不限制执行速度;它只是一个例子。
simpleExample.cpp:
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <sys/time.h>
using namespace std;
// ---------------------------------------- -------------------------------------
//此函数返回当前时间,表示为自纪元以来的秒
// -------------------------------------- ---------------------------------------
// Returns the current wall-clock time as seconds since the Unix Epoch,
// with microsecond resolution (sufficient for these coarse benchmarks).
double getCurrentTime(){
  struct timeval curr;
  struct timezone tz;
  gettimeofday(&curr, &tz);
  // Combine seconds and microseconds into a single microsecond count,
  // then scale back to seconds.
  double tmp = static_cast<double>(curr.tv_sec) * static_cast<double>(1000000)
             + static_cast<double>(curr.tv_usec);
  return tmp * 1e-6;
}
// ----------------------------------- ------------------------------------------
//主例程
// -------------------------------------------- ---------------------------------
int main(){
srand48(0); // seed PRNG
double e,s; // timestamp variables
float * a,* b; // data pointers
float * pA,* pB; //工作指针
__m128 rA,rB; // SSE的变量
__m256 rA_AVX,rB_AVX; // AVX的变量
//定义向量大小
const int vector_size = 10000000;
//分配内存
a =(float *)_mm_malloc(vector_size * sizeof(float),32);
b =(float *)_mm_malloc(vector_size * sizeof(float),32);
//初始化向量//
for(int i = 0; i a [i] = fabs(drand48());
b [i] = 0.0f;
}
// +++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++
//简单实施
// ++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++
s = getCurrentTime();
for(int i = 0; i b [i] = sqrtf(sqrtf(sqrtf(a [i])));
}
e = getCurrentTime();
cout<< (e-s)* 1000 < ms< ,b [42] =< b [42]< endl;
// ---------------------------------------- -------------------------------------
for(int i = 0; i < vector_size; i ++){
b [i] = 0.0f;
}
// --------------------------------------- --------------------------------------
// ++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++
// SSE2实现
// +++++++++++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for(int i = 0; i rA = _mm_load_ps(pA);
rB = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
_mm_store_ps(pB,rB);
pA + = 4;
pB + = 4;
}
e = getCurrentTime();
cout<< (e-s)* 1000 < ms< ,b [42] =< b [42]< endl;
// ---------------------------------------- -------------------------------------
for(int i = 0; i < vector_size; i ++){
b [i] = 0.0f;
}
// --------------------------------------- --------------------------------------
// ++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++
// AVX实现
// +++++++++++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for(int i = 0; i rA_AVX = _mm256_load_ps(pA);
rB_AVX = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
_mm256_store_ps(pB,rB_AVX);
pA + = 8;
pB + = 8;
}
e = getCurrentTime();
cout<< (e-s)* 1000 < ms< ,b [42] =< b [42]< endl;
_mm_free(a);
_mm_free(b);
return 0;
}
感谢任何帮助!
这是因为 VSQRTPS(AVX指令)在 Sandy Bridge 处理器上所需的周期数恰好是 SQRTPS(SSE指令)的两倍。请参阅 Agner Fog 的优化指南:指令表,第88页。像平方根和除法这样的指令不能从AVX中受益;另一方面,加法、乘法等操作可以。
I've been using Intel's SSE intrinsics for quite some time with good performance gains. Hence, I expected the AVX intrinsics to further speed-up my programs. This, unfortunately, was not the case until now. Probably I am doing a stupid mistake, so I would be very grateful if somebody could help me out.
I use Ubuntu 11.10 with g++ 4.6.1. I compiled my program (see below) with
g++ simpleExample.cpp -O3 -march=native -o simpleExample
The test system has a Intel i7-2600 CPU.
Here is the code which exemplifies my problem. On my system, I get the output
98.715 ms, b[42] = 0.900038 // Naive
24.457 ms, b[42] = 0.900038 // SSE
24.646 ms, b[42] = 0.900038 // AVX
Note that the computation sqrt(sqrt(sqrt(x))) was only chosen to ensure that memory bandwidth does not limit execution speed; it is just an example.
simpleExample.cpp:
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <sys/time.h>
using namespace std;
// -----------------------------------------------------------------------------
// This function returns the current time, expressed as seconds since the Epoch
// -----------------------------------------------------------------------------
double getCurrentTime(){
  // Wall-clock time in seconds since the Epoch, microsecond resolution.
  // POSIX allows a null timezone argument to gettimeofday.
  struct timeval now;
  gettimeofday(&now, 0);
  return static_cast<double>(now.tv_sec)
       + static_cast<double>(now.tv_usec) * 1e-6;
}
// -----------------------------------------------------------------------------
// Main routine
// -----------------------------------------------------------------------------
// Benchmark driver: times sqrt(sqrt(sqrt(x))) over 10M floats three ways —
// scalar, SSE (__m128, 4 lanes), and AVX (__m256, 8 lanes) — and prints
// elapsed ms plus b[42] as a sanity check that each version computed the
// same result.
int main() {
srand48(0); // seed PRNG so input data (and b[42]) is reproducible
double e,s; // timestamp variables (start/end of each timed region)
float *a, *b; // data pointers: a = input, b = output
float *pA,*pB; // work pointers advanced through a/b in the SIMD loops
__m128 rA,rB; // variables for SSE
__m256 rA_AVX, rB_AVX; // variables for AVX
// define vector size (a multiple of 8, so both SIMD loops divide evenly)
const int vector_size = 10000000;
// allocate 32-byte aligned memory — required by the aligned
// _mm256_load_ps/_mm256_store_ps below (and sufficient for _mm_load_ps)
a = (float*) _mm_malloc (vector_size*sizeof(float),32);
b = (float*) _mm_malloc (vector_size*sizeof(float),32);
// initialize vectors: fabs keeps inputs in sqrt's domain //
for(int i=0;i<vector_size;i++) {
a[i]=fabs(drand48());
b[i]=0.0f;
}
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Naive (scalar) implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
s = getCurrentTime();
for (int i=0; i<vector_size; i++){
b[i] = sqrtf(sqrtf(sqrtf(a[i])));
}
e = getCurrentTime();
cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;
// -----------------------------------------------------------------------------
// clear the output buffer so each variant starts from the same state
for(int i=0;i<vector_size;i++) {
b[i]=0.0f;
}
// -----------------------------------------------------------------------------
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// SSE implementation — 4 floats per iteration
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for (int i=0; i<vector_size; i+=4){
rA = _mm_load_ps(pA);
rB = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
_mm_store_ps(pB,rB);
pA += 4;
pB += 4;
}
e = getCurrentTime();
cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;
// -----------------------------------------------------------------------------
// clear the output buffer again before the AVX run
for(int i=0;i<vector_size;i++) {
b[i]=0.0f;
}
// -----------------------------------------------------------------------------
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// AVX implementation — 8 floats per iteration; the article's point is that
// this is NOT ~2x faster than SSE here, because VSQRTPS (256-bit) takes
// about twice the cycles of SQRTPS (128-bit) on Sandy Bridge.
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for (int i=0; i<vector_size; i+=8){
rA_AVX = _mm256_load_ps(pA);
rB_AVX = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
_mm256_store_ps(pB,rB_AVX);
pA += 8;
pB += 8;
}
e = getCurrentTime();
cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;
_mm_free(a);
_mm_free(b);
return 0;
}
Any help is appreciated!
This is because VSQRTPS
(AVX instruction) takes exactly twice as many cycles as SQRTPS
(SSE instruction) on a Sandy Bridge processor. See Agner Fog's optimize guide: instruction tables, page 88.
Instructions like square root and division don't benefit from AVX. On the other hand, additions, multiplications, etc., do.
这篇关于使用AVX内在函数而不是SSE不能提高速度 - 为什么?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!