std堆栈性能问题 [英] std stack performance issues

查看：555 发布时间：2016/11/3 1:26:04 c++ windows performance stack

本文介绍了std堆栈性能问题的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

最近我试图做一些性能基准，比较 std :: stack< int，std :: vector< int>> （使用预分配的内存）。现在我经历了一些奇怪的行为。

我想要的第一件事是是堆栈基准代码中的这一行：

  // std :: vector< int> magicVector（10）;

当我取消注释此行时，性能提升约17％（基准时间从6.5降至5.4秒）。但是该行应该对程序的其余部分没有影响，因为它不会修改任何其他成员。此外，它是无关紧要的向量的int或向量的双重...

 
 
  我想问的第二件事我的堆栈实现和 std :: stack 之间的性能差异。我被告知， std :: stack 应该像我的堆栈一样快，但结果表明，我的FastStack是两倍快。 
 
 
  结果（带有未注释的效果增加行）：
 
 stack 5.38979 
 
 stack 5.34406 < br> 
 stack 5.32404 
 
 stack 5.30519 
 
 FastStack 2.59635 
 
 FastStack 2.59204 
 
 FastStack 2.59713 
 
 FastStack 2.64814 
 
 
 这些结果来自VS2010用/ O2，/ Ot，/ Ob2和其他默认优化的版本。我的CPU是Intel i5 3570k，默认时钟（3.6 GHz为一个线程）。 
 
 
 我将所有代码放在一个文件中，这样任何人都可以轻松地对其进行测试。 
  #define _SECURE_SCL 0 
 
 #include< iostream> 
 #include< vector> 
 #include< stack> 
 #include< Windows.h> 
 
 using namespace std; 
 
 // ---------------------------------------- ----------------------------------------- 
 // --- -------------------------------------------------- ---------------------------- 
 //目的：高分辨率定时器
 // ----- -------------------------------------------------- -------------------------- 
 
 class HRTimer 
 {
 public：
 HRTimer（）; 
 double GetFrequency（void）; 
 void Start（void）; 
 double Stop（void）; 
 double GetTime（）; 
 
 private：
 LARGE_INTEGER start; 
 LARGE_INTEGER stop; 
 double frequency; 
}; 
 
 HRTimer :: HRTimer（）
 {
 frequency = this-> GetFrequency（）; 
} 
 
 double HRTimer :: GetFrequency（void）
 {
 LARGE_INTEGER proc_freq; 
 if（！:: QueryPerformanceFrequency（& proc_freq））
 return -1; 
 return proc_freq.QuadPart; 
} 
 
 void HRTimer :: Start（void）
 {
 DWORD_PTR oldmask = :: SetThreadAffinityMask（:: GetCurrentThread（），0）; 
 :: QueryPerformanceCounter（& start）; 
 :: SetThreadAffinityMask（:: GetCurrentThread（），oldmask）; 
} 
 
 double HRTimer :: Stop（void）
 {
 DWORD_PTR oldmask = :: SetThreadAffinityMask（:: GetCurrentThread（），0）; 
 :: QueryPerformanceCounter（& stop）; 
 :: SetThreadAffinityMask（:: GetCurrentThread（），oldmask）; 
 return（（stop.QuadPart  -  start.QuadPart）/ frequency）; 
} 
 
 HRTimer :: GetTime（）
 {
 LARGE_INTEGER time; 
 :: QueryPerformanceCounter（& time）; 
 return time.QuadPart / frequency; 
} 
 
 // ----------------------------------- ---------------------------------------------- 
 // ------------------------------------------------ --------------------------------- 
 //目的：应该比std :: stack $更快b $ b // --------------------------------------------- ------------------------------------ 
 
 template< class T> 
 
 class FastStack 
 {
 public：
 T * st; 
 int allocationSize; 
 int lastIndex; 
 
 public：
 FastStack（int stackSize）; 
〜FastStack（）; 
 
 inline void resize（int newSize）; 
 inline void push（T x）; 
 inline void pop（）; 
 inline T getAndRemove（）; 
 inline T getLast（）; 
 inline void clear（）; 
}; 
 
 template< class T> 
 FastStack< T> :: FastStack（int stackSize）
 {
 st = NULL; 
 this-> allocationSize = stackSize; 
 st = new T [stackSize]; 
 lastIndex = -1; 
} 
 
 template< class T> 
 FastStack< T> ::〜FastStack（）
 {
 delete [] st; 
} 
 
 template< class T> 
 void FastStack< T> :: clear（）
 {
 lastIndex = -1; 
} 
 
 template< class T> 
 T FastStack< T> :: getLast（）
 {
 return st [lastIndex]; 
} 
 
 template< class T> 
 T FastStack< T> :: getAndRemove（）
 {
 return st [lastIndex--]; 
} 
 
 template< class T> 
 void FastStack< T> :: pop（）
 {
 --lastIndex; 
} 
 
 template< class T> 
 void FastStack< T> :: push（T x）
 {
 st [++ lastIndex] = x; 
} 
 
 template< class T> 
 void FastStack< T> :: resize（int newSize）
 {
 if（st！= NULL）
 delete [] st; 
 st = new T [newSize]; 
} 
 // --------------------------------------- ------------------................................................ 
 //  - -------------------------------------------------- ----------------------------- 
 // --------------- -------------------------------------------------- ---------------- 
 //目的：std :: stack和FastStack的基准
 // ------------ -------------------------------------------------- ------------------- 
 
 
 int main（int argc，char * argv []）
 {
 #if 1 
 for（int it = 0; it< 4; it ++）
 {
 std :: stack< int，std :: vector< int> b堆叠; 
 int x; 
 
 for（int i = 0; i <100; i ++）//在这两个循环之后，bStack的容量将是141，所以不会再重新分配
 bStack.push ）; 
 for（int i = 0; i <100; i ++）
 bStack.pop（）; 
 // std :: vector< int> magicVector（10）; //当你取消注释这行时，性能将神奇地上升约18％
 
 HRTimer定时器; 
 timer.Start（）; 
 
 for（int i = 0; i <2000000000; i ++）
 {
 bStack.push（i）; 
 x = bStack.top（）; 
 if（i％100 == 0& i！= 0）
 for（int j = 0; j< 100; j ++）
 bStack.pop 
} 
 
 double totalTime = timer.Stop（）; 
 cout<< stack<< totalTime< endl; 
} 
 #endif 
 
 // ----------------------------- -------------------------------------------------- ----- 
 
 #if 1 
 for（int it = 0; it< 4; it ++）
 {
 FastStack< int> fstack（200）; 
 int x; 
 
 HRTimer定时器; 
 timer.Start（）; 
 
 for（int i = 0; i <2000000000; i ++）
 {
 fstack.push（i）; 
 x = fstack.getLast（）; 
 if（i％100 == 0& i！= 0）
 for（int j = 0; j< 100; j ++）
 fstack.pop 
} 
 
 double totalTime = timer.Stop（）; 
 cout<< FastStack< totalTime< endl; 
} 
 #endif 
 
 cout<< 完成; 
 cin.get（）; 
 return 0; 
} 
  
。
 
  EDIT：因为每个人都谈论我真正糟糕的实现我的堆栈我想设置的事情正确。我在几分钟内创建了这个堆栈，我只实现了我目前需要的几个功能。它从来不是要替换std :: stack :)或保存以在所有情况下使用。唯一的目标是实现最高速度和正确的结果。对此误解我很抱歉...我只想知道几个答案... 
解决方案
许多评论（甚至答案）专注于实施中的风险。但问题是。 
 
 
 如下面直接演示的，纠正感知的代码缺点会不更改关于性能的任何重要信息。 
 
 
 这里是OP的代码修改为（A）安全，和（B）支持与 std :: stack ，以及（C）为 std :: stack 保留缓冲区空间，以便为那些错误地认为这些东西对性能：
  #define _SECURE_SCL 0 
 #define _SCL_SECURE_NO_WARNINGS 
 
 #include< algorithm> // std :: swap 
 #include< iostream> 
 #include< vector> 
 #include< stack> 
 #include< stddef.h> // ptrdiff_t 
 #include< type_traits> // std :: is_pod 
 using namespace std; 
 
 #undef UNICODE 
 #define UNICODE 
 #include< Windows.h> 
 
 typedef ptrdiff_t大小; 
 typedef大小索引; 
 
 template<类类型，类Container> 
 void reserve（size const newBufSize，std :: stack< Type，Container>& st）
 {
 struct Access：std :: stack<类型，容器> 
 {
 static Container& container（std :: stack< Type，Container>& st）
 {
 return st。*& Access :: c; 
} 
}; 
 
 Access :: container（st）.reserve（newBufSize）; 
} 
 
 class HighResolutionTimer 
 {
 public：
 HighResolutionTimer（）; 
 double GetFrequency（）const; 
 void Start（）; 
 double Stop（）; 
 double GetTime（）const; 
 
 private：
 LARGE_INTEGER start; 
 LARGE_INTEGER stop; 
 double frequency; 
}; 
 
 HighResolutionTimer :: HighResolutionTimer（）
 {
 frequency = GetFrequency（）; 
} 
 
 double HighResolutionTimer :: GetFrequency（）const 
 {
 LARGE_INTEGER proc_freq; 
 if（！:: QueryPerformanceFrequency（& proc_freq））
 return -1; 
 return static_cast< double>（proc_freq.QuadPart）; 
} 
 
 void HighResolutionTimer :: Start（）
 {
 DWORD_PTR oldmask = :: SetThreadAffinityMask（:: GetCurrentThread（），0）; 
 :: QueryPerformanceCounter（& start）; 
 :: SetThreadAffinityMask（:: GetCurrentThread（），oldmask）; 
} 
 
 double HighResolutionTimer :: Stop（）
 {
 DWORD_PTR oldmask = :: SetThreadAffinityMask（:: GetCurrentThread（），0）; 
 :: QueryPerformanceCounter（& stop）; 
 :: SetThreadAffinityMask（:: GetCurrentThread（），oldmask）; 
 return（（stop.QuadPart  -  start.QuadPart）/ frequency）; 
} 
 
 double HighResolutionTimer :: GetTime（）const 
 {
 LARGE_INTEGER time; 
 :: QueryPerformanceCounter（& time）; 
 return time.QuadPart / frequency; 
} 
 
 template< class Type，bool elemTypeIsPOD = !! std :: is_pod<类型> :: value> 
 class FastStack; 
 
 template<类类型> 
 class FastStack< Type，true> 
 {
 private：
 Type * st_; 
索引lastIndex_; 
 Size capacity_; 
 
 public：
 size const size（）const {return lastIndex_ + 1; } 
 size const capacity（）const {return capacity_; } 
 
 void reserve（Size const newCapacity）
 {
 if（newCapacity> capacity_）
 {
 FastStack<类型>（* this，newCapacity）.swapWith（* this）; 
} 
} 
 
 void push（type const& x）
 {
 if（size（）== capacity（））
 {
 reserve（2 * capacity（））; 
} 
 st _ [++ lastIndex_] = x; 
} 
 
 void pop（）
 {
 --lastIndex_; 
} 
 
类型top（）const 
 {
 return st_ [lastIndex_]; 
} 
 
 void swapWith（FastStack& other）throw（）
 {
 using std :: swap; 
 swap（st_，other.st_）; 
 swap（lastIndex_，other.lastIndex_）; 
 swap（capacity_，other.capacity_）; 
} 
 
 void operator =（FastStack other）
 {
 other.swapWith（* this）; 
} 
 
〜FastStack（）
 {
 delete [] st_; 
} 
 
 FastStack（Size const aCapacity = 0）
：st_（new Type [aCapacity]）
，capacity_（aCapacity）
 {
 lastIndex_ = -1; 
} 
 
 FastStack（FastStack const& other，int const newBufSize = -1）
 {
 capacity_ =（newBufSize< other.size size（）：newBufSize）; 
 st_ = new Type [capacity_]; 
 lastIndex_ = other.lastIndex_; 
 copy（other.st_，other.st_ + other.size（），st_）; //不能抛出POD。 
} 
}; 
 
 template<类类型> 
 void reserve（Size const newCapacity，FastStack< Type>& st）
 {
 st.reserve（newCapacity）; 
} 
 
 template< class StackType> 
 void test（char const * const description）
 {
 for（int it = 0; it< 4; ++ it）
 {
 StackType st; 
储备（200，st）; 
 
 //在这两个循环之后，st的容量将是141，因此不会再有重新分配
（int i = 0; i <100; ++ i）{st。 push（i）; } 
 for（int i = 0; i <100; ++ i）{st.pop（）; } 
 
 //当你取消注释这一行时，std :: stack的性能将神奇地上升约18％
 // std :: vector< int> magicVector（10）; 
 
高分辨率定时器; 
 timer.Start（）; 
 
 for（Index i = 0; i <1000000000; ++ i）
 {
 st.push（i）; 
（void）st.top（）; 
 if（i％100 == 0& i！= 0）
 {
 for（int j = 0; j< 100; ++ j）{st.pop （）; } 
} 
} 
 
 double const totalTime = timer.Stop（）; 
 wcout<<描述<< ：<< totalTime< endl; 
} 
} 
 
 int main（）
 {
 typedef stack<索引，索引> > SStack; 
 typedef FastStack<索引> FStack; 
 
 test< SStack>（std :: stack）; 
 test< FStack>（FastStack）; 
 
 cout<< 完成; 
} 
  
这个慢慢消化的三星RC530笔记本电脑的结果：
 
 [D：\dev\test\so\12704314] 
> a 
 std :: stack：3.21319 
 std :: stack：3.16456 
 std :: stack：3.23298 
 std :: stack：3.20854 
 FastStack：1.97636 
 FastStack：1.97958 
 FastStack： 2.12977 
 FastStack：2.13507 
 Done 
 [D：\dev\test\so\12704314] 
> _ 
 
 
 
 
 现在让我们来看一个典型的实现 std :: vector :: push_back ，它由 std :: stack< T，std :: vector< T> :: push  ，我知道只有3个程序员谁曾经使用过这种缩进风格，即PJP，Petzold和我自己;我现在，从1998年或那里，认为这是可怕的）：
  void push_back（const value_type& _Val）
 {//在末尾插入元素
 if（_Inside（_STD addressof（_Val）））
 { / push back一个元素
 size_type _Idx = _STD addressof（_Val） -  this-> _Myfirst; 
 if（this-> _Mylast == this-> _Myend）
 _Reserve（1）; 
 _Orphan_range（this-> _Mylast，this-> _Mylast）; 
 this-> _Getal（）。construct（this-> _Mylast，
 this-> _Myfirst [_Idx]）; 
 ++ this-> _Mylast; 
} 
 else 
 {//推回非元素
 if（this-> _Mylast == this-> _Myend）
 _Reserve（1） ; 
 _Orphan_range（this-> _Mylast，this-> _Mylast）; 
 this-> _Getal（）。construct（this-> _Mylast，
 _Val）; 
 ++ this-> _Mylast; 
} 
} 
  
我怀疑测量的无效率至少部分地在所有的东西发生在那里，并且或许它也是一个自动生成的安全检查的问题。
 
 
 对于调试生成 std :: stack 性能是非常不好，我放弃了等待任何结果。
 
 
 
 
 
   EDIT ：根据以下Xeo的评论，我更新了 push 以检查自推缓冲区重新分配的情况，通过将其作为单独的函数分解：
  void push（type const& x）
 {
 if（size（）== capacity（））
 {
 reserveAndPush（x）; 
} 
 st _ [++ lastIndex_] = x; 
} 
  
神秘地，虽然 reserveAndPush 在这次测试中永远不会调用，它会影响性能–由于代码大小不适合缓存？
 
 [D：\dev\test\so\12704314] 
 > a 
 std :: stack：3.21623 
 std :: stack：3.30501 
 std :: stack：3.24337 
 std :: stack：3.27711 
 FastStack：2.52791 
 FastStack：2.44621 
 FastStack：2.44759 
 FastStack：2.47287 
 Done 
 [D：\dev\test\so\12704314] 
> _ 
 
 
 
  
 
  EDIT 2 ：DeadMG显示代码必须有错误。我相信问题是缺少 return ，加上表达式计算新的大小（两次零仍然为零）。他还指出，我忘了显示 reserveAndPush 。应为：
  void reserveAndPush（type const& x）
 {
 Type const xVal = x; 
 reserve（capacity_ == 0？1：2 * capacity_）; 
 push（xVal）; 
} 
 
 void push（type const& x）
 {
 if（size（）== capacity（））
 {
 return reserveAndPush（x）; //<  - 关键的回报。 
} 
 st _ [++ lastIndex_] = x; 
} 
  
 
Recently I was trying to do some performance benchmarks, comparing std::stack<int, std::vector<int>> and my own simple implementation of stack (that use pre-allocated memory). Now I’m experiencing some strange behavior.  

First thing I want to ask is this line in stack benchmark code:
//  std::vector<int> magicVector(10);
When I uncomment this line, performance rise about 17% (benchmark time drops from 6.5 to 5.4 seconds). But the line should have no impact on the rest of the program because it's not modifying any others members. Besides, it doesn't matter if it is vector of int or vector of double...  

Second thing I want to ask is big performance difference between my stack implementation and std::stack. I was told that std::stack should be as fast as my stack but results shows that my "FastStack" is twice as fast.  

Results (with uncommented performance increasing line):

stack 5.38979

stack 5.34406

stack 5.32404

stack 5.30519

FastStack 2.59635

FastStack 2.59204

FastStack 2.59713

FastStack 2.64814  

These results come from release build from VS2010 with /O2, /Ot, /Ob2 and others default optimizations. My CPU is Intel i5 3570k with default clock (3.6 GHz for one thread).  

I put all code in one file so anyone can easily test it.  
#define _SECURE_SCL 0

#include <iostream>
#include <vector>
#include <stack>
#include <Windows.h>

using namespace std;

//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
//  Purpose:    High Resolution Timer
//---------------------------------------------------------------------------------

class HRTimer
{
public:
    HRTimer();
    double GetFrequency(void);
    void Start(void) ;
    double Stop(void);
    double GetTime();

private:
    LARGE_INTEGER start;
    LARGE_INTEGER stop;
    double frequency;
};

HRTimer::HRTimer()
{
    frequency = this->GetFrequency();
}

double HRTimer::GetFrequency(void)
{
    LARGE_INTEGER proc_freq;
    if (!::QueryPerformanceFrequency(&proc_freq))
        return -1;
    return proc_freq.QuadPart;
}

void HRTimer::Start(void)
{
    DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
    ::QueryPerformanceCounter(&start);
    ::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
}

double HRTimer::Stop(void)
{
    DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
    ::QueryPerformanceCounter(&stop);
    ::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
    return ((stop.QuadPart - start.QuadPart) / frequency);
} 

double HRTimer::GetTime()
{
    LARGE_INTEGER time;
    ::QueryPerformanceCounter(&time);
    return time.QuadPart / frequency;
}

//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
//  Purpose:    Should be faster than std::stack
//---------------------------------------------------------------------------------

template <class T>

class FastStack
{
public:
    T* st;
    int allocationSize;
    int lastIndex;

public:
    FastStack(int stackSize);
    ~FastStack();

    inline void resize(int newSize);
    inline void push(T x);
    inline void pop();
    inline T getAndRemove();
    inline T getLast();
    inline void clear();
};

template <class T>
FastStack<T>::FastStack( int stackSize )
{
    st = NULL;
    this->allocationSize = stackSize;
    st = new T[stackSize];
    lastIndex = -1;
}

template <class T>
FastStack<T>::~FastStack()
{
    delete [] st;
}

template <class T>
void FastStack<T>::clear()
{
    lastIndex = -1;
}

template <class T>
T FastStack<T>::getLast()
{
    return st[lastIndex];
}

template <class T>
T FastStack<T>::getAndRemove()
{
    return st[lastIndex--];
}

template <class T>
void FastStack<T>::pop()
{
    --lastIndex;
}

template <class T>
void FastStack<T>::push( T x )
{
    st[++lastIndex] = x;
}

template <class T>
void FastStack<T>::resize( int newSize )
{
    if (st != NULL)
        delete [] st;
    st = new T[newSize];
}
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
//  Purpose:    Benchmark of std::stack and FastStack
//---------------------------------------------------------------------------------


int main(int argc, char *argv[])
{
#if 1
    for (int it = 0; it < 4; it++)
    {
        std::stack<int, std::vector<int>> bStack;
        int x;

        for (int i = 0; i < 100; i++)   // after this two loops, bStack's capacity will be 141 so there will be no more reallocating
            bStack.push(i);
        for (int i = 0; i < 100; i++)
            bStack.pop();
    //  std::vector<int> magicVector(10);           // when you uncomment this line, performance will magically rise about 18%

        HRTimer timer;
        timer.Start();

        for (int i = 0; i < 2000000000; i++)
        {
            bStack.push(i);
            x = bStack.top();
            if (i % 100 == 0 && i != 0)
                for (int j = 0; j < 100; j++)
                    bStack.pop();
        }

        double totalTime = timer.Stop();
        cout << "stack " << totalTime << endl;
    }
#endif

    //------------------------------------------------------------------------------------

#if 1
    for (int it = 0; it < 4; it++)
    {
        FastStack<int> fstack(200);
        int x;

        HRTimer timer;
        timer.Start();

        for (int i = 0; i < 2000000000; i++)
        {
            fstack.push(i);
            x = fstack.getLast();
            if (i % 100 == 0 && i != 0)
                for (int j = 0; j < 100; j++)
                    fstack.pop();
        }

        double totalTime = timer.Stop();
        cout << "FastStack " << totalTime << endl;
    }
#endif

    cout << "Done";
    cin.get();
    return 0;
}
.

EDIT: Since everybody talks about my really bad implementation of my stack I want to set things right. I created that stack in few minutes and I implemented just few features that I currently needed. It was never meant to be a replacement of std::stack :) or save to use in all cases. The only goal was to achieve maximum speed and correct results. I'm sorry about this misunderstanding… I just want to know few answers…
 解决方案 
The many comments (and even answers) focus on the risks in your implementation. Yet the question stands. 

As directly demonstrated below rectifying the perceived code shortcomings would not change anything significant about the performance. 

Here is the OP's code modified to be (A) safe, and (B) supporting the same operations as std::stack, and (C) reserving buffer space also for the std::stack, in order to clarify things for those who mistakenly believe that this stuff matters for the performance:
#define _SECURE_SCL 0
#define _SCL_SECURE_NO_WARNINGS

#include <algorithm>        // std::swap
#include <iostream>
#include <vector>
#include <stack>
#include <stddef.h>         // ptrdiff_t
#include <type_traits>      // std::is_pod
using namespace std;

#undef UNICODE
#define UNICODE
#include <Windows.h>

typedef ptrdiff_t   Size;
typedef Size        Index;

template< class Type, class Container >
void reserve( Size const newBufSize, std::stack< Type, Container >& st )
{
    struct Access: std::stack< Type, Container >
    {
        static Container& container( std::stack< Type, Container >& st )
        {
            return st.*&Access::c;
        }
    };

    Access::container( st ).reserve( newBufSize );
}

class HighResolutionTimer
{
public:
    HighResolutionTimer();
    double GetFrequency() const;
    void Start() ;
    double Stop();
    double GetTime() const;

private:
    LARGE_INTEGER start;
    LARGE_INTEGER stop;
    double frequency;
};

HighResolutionTimer::HighResolutionTimer()
{
    frequency = GetFrequency();
}

double HighResolutionTimer::GetFrequency() const
{
    LARGE_INTEGER proc_freq;
    if (!::QueryPerformanceFrequency(&proc_freq))
        return -1;
    return static_cast< double >( proc_freq.QuadPart );
}

void HighResolutionTimer::Start()
{
    DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
    ::QueryPerformanceCounter(&start);
    ::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
}

double HighResolutionTimer::Stop()
{
    DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
    ::QueryPerformanceCounter(&stop);
    ::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
    return ((stop.QuadPart - start.QuadPart) / frequency);
} 

double HighResolutionTimer::GetTime() const
{
    LARGE_INTEGER time;
    ::QueryPerformanceCounter(&time);
    return time.QuadPart / frequency;
}

template< class Type, bool elemTypeIsPOD = !!std::is_pod< Type >::value >
class FastStack;

template< class Type >
class FastStack< Type, true >
{
private:
    Type*   st_;
    Index   lastIndex_;
    Size    capacity_;

public:
    Size const size() const { return lastIndex_ + 1; }
    Size const capacity() const { return capacity_; }

    void reserve( Size const newCapacity )
    {
        if( newCapacity > capacity_ )
        {
            FastStack< Type >( *this, newCapacity ).swapWith( *this );
        }
    }

    void push( Type const& x )
    {
        if( size() == capacity() )
        {
            reserve( 2*capacity() );
        }
        st_[++lastIndex_] = x;
    }

    void pop()
    {
        --lastIndex_;
    }

    Type top() const
    {
        return st_[lastIndex_];
    }

    void swapWith( FastStack& other ) throw()
    {
        using std::swap;
        swap( st_, other.st_ );
        swap( lastIndex_, other.lastIndex_ );
        swap( capacity_, other.capacity_ );
    }

    void operator=( FastStack other )
    {
        other.swapWith( *this );
    }

    ~FastStack()
    {
        delete[] st_;
    }

    FastStack( Size const aCapacity = 0 )
        : st_( new Type[aCapacity] )
        , capacity_( aCapacity )
    {
        lastIndex_ = -1;
    }

    FastStack( FastStack const& other, int const newBufSize = -1 )
    {
        capacity_ = (newBufSize < other.size()? other.size(): newBufSize);
        st_ = new Type[capacity_];
        lastIndex_ = other.lastIndex_;
        copy( other.st_, other.st_ + other.size(), st_ );   // Can't throw for POD.
    }
};

template< class Type >
void reserve( Size const newCapacity, FastStack< Type >& st )
{
    st.reserve( newCapacity );
}

template< class StackType >
void test( char const* const description )
{
    for( int it = 0; it < 4; ++it )
    {
        StackType st;
        reserve( 200, st );

        // after this two loops, st's capacity will be 141 so there will be no more reallocating
        for( int i = 0; i < 100; ++i ) { st.push( i ); }
        for( int i = 0; i < 100; ++i ) { st.pop(); }

        // when you uncomment this line, std::stack performance will magically rise about 18%
        // std::vector<int> magicVector(10);

        HighResolutionTimer timer;
        timer.Start();

        for( Index i = 0; i < 1000000000; ++i )
        {
            st.push( i );
            (void) st.top();
            if( i % 100 == 0 && i != 0 )
            {
                for( int j = 0; j < 100; ++j ) { st.pop(); }
            }
        }

        double const totalTime = timer.Stop();
        wcout << description << ": "  << totalTime << endl;
    }
}

int main()
{
    typedef stack< Index, vector< Index > > SStack;
    typedef FastStack< Index >              FStack;

    test< SStack >( "std::stack" );
    test< FStack >( "FastStack" );

    cout << "Done";
}
Results on this slow-as-molasses Samsung RC530 laptop:
[D:\dev\test\so\12704314]
> a
std::stack: 3.21319
std::stack: 3.16456
std::stack: 3.23298
std::stack: 3.20854
FastStack: 1.97636
FastStack: 1.97958
FastStack: 2.12977
FastStack: 2.13507
Done
[D:\dev\test\so\12704314]
> _
And similarly for Visual C++.

Now let's look at a typical implementation of std::vector::push_back, which is called by std::stack<T, std::vector<T>>::push (in passing, I know of only 3 programmers who have ever used this indentation style, namely PJP, Petzold and myself; I now, since 1998 or thereabouts, think it's horrible!):
void push_back(const value_type& _Val)
    {   // insert element at end
    if (_Inside(_STD addressof(_Val)))
        {   // push back an element
        size_type _Idx = _STD addressof(_Val) - this->_Myfirst;
        if (this->_Mylast == this->_Myend)
            _Reserve(1);
        _Orphan_range(this->_Mylast, this->_Mylast);
        this->_Getal().construct(this->_Mylast,
            this->_Myfirst[_Idx]);
        ++this->_Mylast;
        }
    else
        {   // push back a non-element
        if (this->_Mylast == this->_Myend)
            _Reserve(1);
        _Orphan_range(this->_Mylast, this->_Mylast);
        this->_Getal().construct(this->_Mylast,
            _Val);
        ++this->_Mylast;
        }
    }
I suspect that the measured inefficiency lies at least partly in all the stuff going on there, and perhaps it's also a matter of automatically generated safety checks.

For a debug build the std::stack performance is so extremely ungood that I gave up waiting for any result.



EDIT: following Xeo’s comment below I updated push to check for "self-push" in the case of buffer reallocation, by factoring that out as a separate function:
void push( Type const& x )
{
    if( size() == capacity() )
    {
        reserveAndPush( x );
    }
    st_[++lastIndex_] = x;
}
Mysteriously, although reserveAndPush is never called in this testing, it affects the performance – due to code size not fitting cache?
[D:\dev\test\so\12704314]
> a
std::stack: 3.21623
std::stack: 3.30501
std::stack: 3.24337
std::stack: 3.27711
FastStack: 2.52791
FastStack: 2.44621
FastStack: 2.44759
FastStack: 2.47287
Done
[D:\dev\test\so\12704314]
> _

EDIT 2: DeadMG showed that the code must be buggy. I believe the problem was a missing return, plus the expression computing new size (twice zero is still zero). He also pointed out that I forgot to show reserveAndPush. Should be:
void reserveAndPush( Type const& x )
{
    Type const xVal = x;
    reserve( capacity_ == 0? 1 : 2*capacity_ );
    push( xVal );
}

void push( Type const& x )
{
    if( size() == capacity() )
    {
        return reserveAndPush( x );    // <-- The crucial "return".
    }
    st_[++lastIndex_] = x;
}


                        
这篇关于std堆栈性能问题的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！


                    
                        查看全文

std堆栈性能问题 [英] std stack performance issues

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

std堆栈性能问题 [英] std stack performance issues

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭