使用AVX内在函数而不是SSE不能提高速度 - 为什么? [英] Using AVX intrinsics instead of SSE does not improve speed -- why?
问题描述
我已经使用英特尔的SSE内在函数了相当长的时间,良好的性能提升。因此,我期望AVX内在函数进一步加速我的程序。不幸的是,这不是现在的情况。可能我犯了一个愚蠢的错误,所以如果有人可以帮助我,我会非常感激。
我使用Ubuntu 11.10和g ++ 4.6.1。我用
编译我的程序(见下文):g++ simpleExample.cpp -O3 -march=native -o simpleExample
测试系统有一个Intel i7-2600 CPU。
这里是代表我的问题的代码。在我的系统上,我得到输出
98.715 ms, b[42] = 0.900038 // Naive
24.457 ms, b[42] = 0.900038 // SSE
24.646 ms, b[42] = 0.900038 // AVX
注意,计算sqrt(sqrt(sqrt(x)))只是为了确保内存带宽不限制执行速度;它只是一个例子。
simpleExample.cpp:
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <sys/time.h>
using namespace std;
// ---------------------------------------- -------------------------------------
//此函数返回当前时间,表示为自纪元以来的秒
// -------------------------------------- ---------------------------------------
// Returns the current wall-clock time as seconds since the Unix Epoch,
// with microsecond resolution (sufficient for these coarse benchmarks).
double getCurrentTime(){
  struct timeval curr;
  struct timezone tz;
  gettimeofday(&curr, &tz);
  // Combine seconds and microseconds into a single microsecond count,
  // then scale back to seconds.
  double tmp = static_cast<double>(curr.tv_sec) * static_cast<double>(1000000)
             + static_cast<double>(curr.tv_usec);
  return tmp * 1e-6;
}
// ----------------------------------- ------------------------------------------
//主例程
// -------------------------------------------- ---------------------------------
int main(){
srand48(0); // seed PRNG
double e,s; // timestamp variables
float * a,* b; // data pointers
float * pA,* pB; //工作指针
__m128 rA,rB; // SSE的变量
__m256 rA_AVX,rB_AVX; // AVX的变量
//定义向量大小
const int vector_size = 10000000;
//分配内存
a =(float *)_mm_malloc(vector_size * sizeof(float),32);
b =(float *)_mm_malloc(vector_size * sizeof(float),32);
//初始化向量//
for(int i = 0; i a [i] = fabs(drand48());
b [i] = 0.0f;
}
// +++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++
//简单实施
// ++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++
s = getCurrentTime();
for(int i = 0; i b [i] = sqrtf(sqrtf(sqrtf(a [i])));
}
e = getCurrentTime();
cout<< (e-s)* 1000 < ms< ,b [42] =< b [42]< endl;
// ---------------------------------------- -------------------------------------
for(int i = 0; i < vector_size; i ++){
b [i] = 0.0f;
}
// --------------------------------------- --------------------------------------
// ++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++
// SSE2实现
// +++++++++++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for(int i = 0; i rA = _mm_load_ps(pA);
rB = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
_mm_store_ps(pB,rB);
pA + = 4;
pB + = 4;
}
e = getCurrentTime();
cout<< (e-s)* 1000 < ms< ,b [42] =< b [42]< endl;
// ---------------------------------------- -------------------------------------
for(int i = 0; i < vector_size; i ++){
b [i] = 0.0f;
}
// --------------------------------------- --------------------------------------
// ++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++
// AVX实现
// +++++++++++ +++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for(int i = 0; i rA_AVX = _mm256_load_ps(pA);
rB_AVX = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
_mm256_store_ps(pB,rB_AVX);
pA + = 8;
pB + = 8;
}
e = getCurrentTime();
cout<< (e-s)* 1000 < ms< ,b [42] =< b [42]< endl;
_mm_free(a);
_mm_free(b);
return 0;
}
感谢任何帮助!
这是因为 VSQRTPS(AVX指令)在 Sandy Bridge 处理器上所需的周期数恰好是 SQRTPS(SSE指令)的两倍。请参阅 Agner Fog 的优化指南:指令表,第88页。像平方根和除法这样的指令不能从AVX中受益;另一方面,加法、乘法等操作可以。
I've been using Intel's SSE intrinsics for quite some time with good performance gains. Hence, I expected the AVX intrinsics to further speed-up my programs. This, unfortunately, was not the case until now. Probably I am doing a stupid mistake, so I would be very grateful if somebody could help me out.
I use Ubuntu 11.10 with g++ 4.6.1. I compiled my program (see below) with
g++ simpleExample.cpp -O3 -march=native -o simpleExample
The test system has a Intel i7-2600 CPU.
Here is the code which exemplifies my problem. On my system, I get the output
98.715 ms, b[42] = 0.900038 // Naive
24.457 ms, b[42] = 0.900038 // SSE
24.646 ms, b[42] = 0.900038 // AVX
Note that the computation sqrt(sqrt(sqrt(x))) was only chosen to ensure that memory bandwidth does not limit execution speed; it is just an example.
simpleExample.cpp:
#include <immintrin.h>
#include <iostream>
#include <math.h>
#include <sys/time.h>
using namespace std;
// -----------------------------------------------------------------------------
// This function returns the current time, expressed as seconds since the Epoch
// -----------------------------------------------------------------------------
double getCurrentTime(){
  // Wall-clock time in seconds since the Epoch, microsecond resolution.
  // POSIX allows a null timezone argument to gettimeofday.
  struct timeval now;
  gettimeofday(&now, 0);
  return static_cast<double>(now.tv_sec)
       + static_cast<double>(now.tv_usec) * 1e-6;
}
// -----------------------------------------------------------------------------
// Main routine
// -----------------------------------------------------------------------------
// Benchmark driver: times sqrt(sqrt(sqrt(x))) over 10M floats three ways —
// scalar, SSE (__m128, 4 lanes), and AVX (__m256, 8 lanes) — and prints
// elapsed ms plus b[42] as a sanity check that each version computed the
// same result.
int main() {
srand48(0); // seed PRNG so input data (and b[42]) is reproducible
double e,s; // timestamp variables (start/end of each timed region)
float *a, *b; // data pointers: a = input, b = output
float *pA,*pB; // work pointers advanced through a/b in the SIMD loops
__m128 rA,rB; // variables for SSE
__m256 rA_AVX, rB_AVX; // variables for AVX
// define vector size (a multiple of 8, so both SIMD loops divide evenly)
const int vector_size = 10000000;
// allocate 32-byte aligned memory — required by the aligned
// _mm256_load_ps/_mm256_store_ps below (and sufficient for _mm_load_ps)
a = (float*) _mm_malloc (vector_size*sizeof(float),32);
b = (float*) _mm_malloc (vector_size*sizeof(float),32);
// initialize vectors: fabs keeps inputs in sqrt's domain //
for(int i=0;i<vector_size;i++) {
a[i]=fabs(drand48());
b[i]=0.0f;
}
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Naive (scalar) implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
s = getCurrentTime();
for (int i=0; i<vector_size; i++){
b[i] = sqrtf(sqrtf(sqrtf(a[i])));
}
e = getCurrentTime();
cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;
// -----------------------------------------------------------------------------
// clear the output buffer so each variant starts from the same state
for(int i=0;i<vector_size;i++) {
b[i]=0.0f;
}
// -----------------------------------------------------------------------------
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// SSE implementation — 4 floats per iteration
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for (int i=0; i<vector_size; i+=4){
rA = _mm_load_ps(pA);
rB = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
_mm_store_ps(pB,rB);
pA += 4;
pB += 4;
}
e = getCurrentTime();
cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;
// -----------------------------------------------------------------------------
// clear the output buffer again before the AVX run
for(int i=0;i<vector_size;i++) {
b[i]=0.0f;
}
// -----------------------------------------------------------------------------
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// AVX implementation — 8 floats per iteration; the article's point is that
// this is NOT ~2x faster than SSE here, because VSQRTPS (256-bit) takes
// about twice the cycles of SQRTPS (128-bit) on Sandy Bridge.
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
pA = a; pB = b;
s = getCurrentTime();
for (int i=0; i<vector_size; i+=8){
rA_AVX = _mm256_load_ps(pA);
rB_AVX = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
_mm256_store_ps(pB,rB_AVX);
pA += 8;
pB += 8;
}
e = getCurrentTime();
cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;
_mm_free(a);
_mm_free(b);
return 0;
}
Any help is appreciated!
This is because VSQRTPS
(AVX instruction) takes exactly twice as many cycles as SQRTPS
(SSE instruction) on a Sandy Bridge processor. See Agner Fog's optimize guide: instruction tables, page 88.
Instructions like square root and division don't benefit from AVX. On the other hand, additions, multiplications, etc., do.
这篇关于使用AVX内在函数而不是SSE不能提高速度 - 为什么?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!