c ++ 11 regex比python慢 [英] c++11 regex slower than python

查看:303
本文介绍了c ++ 11 regex比python慢的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

hi我想知道为什么下面的代码使用regex拆分字符串

 #include< regex> 
#include< vector>
#include< string>

std :: vector< std :: string> split(const std :: string& s){
static const std::regex rsplit(" +");
auto rit = std :: sregex_token_iterator(s.begin(),s.end(),rsplit,-1);
auto rend = std :: sregex_token_iterator();
auto res = std :: vector< std :: string>(rit,rend);
return res;
}

int main(){
for(auto i = 0; i <10000; ++ i)
split("a b c");
return 0;
}

比下面的python代码更慢

  import re 
for i in range(10000):
re.split(' +', 'a b c')

这里

  > python test.py 0.05s用户0.01s系统94%cpu 0.070总计
./test 0.26s用户0.00s系统99%cpu 0.296总计

在osx上使用clang ++。



使用-O3编译会将其降至 0.09s用户0.00s系统99%cpu 0.109总计

解决方案

注意



另请参见此答案: http://stackoverflow.com/a/21708215 这是 EDIT 2






我已将循环增加到1000000



这是我的Python时间:

  real 0m2.038s 
user 0m2.009s
sys 0m0.024s

这里是一个等价的你的代码,只是有点漂亮:

  #include <regex>
#include< vector>
#include< string>

std :: vector< std :: string> split(const std :: string& s,const std :: regex& r)
{
return {
std::sregex_token_iterator(s.begin(), s.end(), r, -1),
std :: sregex_token_iterator()
};
}

int main()
{
const std::regex r(" +");
for(auto i = 0; i <1000000; ++ i)
split("a b c", r);
return 0;
}

计时:

  real 0m5.786s 
用户0m5.779s
sys 0m0.005s






这是一个优化,以避免构造/分配向量和字符串对象:

  #include< regex> 
#include< vector>
#include< string>

void split(const std :: string& s,const std :: regex& r,std :: vector< std :: string>& v)
{
auto rit = std :: sregex_token_iterator(s.begin(),s.end(),r,-1);
auto rend = std :: sregex_token_iterator();
v.clear();
while(rit!= rend)
{
v.push_back(* rit);
++ rit;
}
}

int main()
{
const std::regex r(" +");
std :: vector< std :: string> v;
for(auto i = 0; i <1000000; ++ i)
split("a b c", r, v);
return 0;
}

计时:

  real 0m3.034s 
用户0m3.029s
sys 0m0.004s

这是接近100%的性能提升。



向量是在循环之前创建的,并且可以在第一次迭代。之后, clear()没有内存释放,向量维护内存并构建字符串就地






另一个性能提高是完全避免构建/销毁 std :: string

这是朝这个方向的一个尝试：



#include <regex>
#include< vector>
#include< string>

void split(const char * s,const std :: regex& r,std :: vector< std :: string>& v)
{
auto rit = std :: cregex_token_iterator(s,s + std :: strlen(s),r,-1);
auto rend = std :: cregex_token_iterator();
v.clear();
while(rit!= rend)
{
v.push_back(* rit);
++ rit;
}
}

计时:


b $ b

  real 0m2.509s 
user 0m2.503s
sys 0m0.004s

一个最终的改进是返回一个 std::vector<const char *>，其中每个 char 指针指向原始 s C 字符串内部的子串。问题在于你不能这样做，因为这些子串不会以空字符结尾（关于这一点，请参见后面示例中 C++1y string_ref 的用法）。






最后的改进也可以通过这个实现:

  #include< regex> 
#include< vector>
#include< string>

void split(const std :: string& s,const std :: regex& r,std :: vector< std :: string>& v)
{
auto rit = std :: cregex_token_iterator(s.data(),s.data()+ s.length(),r,-1);
auto rend = std :: cregex_token_iterator();
v.clear();
while(rit!= rend)
{
v.push_back(* rit);
++ rit;
}
}

int main()
{
const std::regex r(" +");
std :: vector< std :: string> v;
for(auto i = 0; i <1000000; ++ i)
split("a b c", r, v); // 常量字符串("a b c")应该由编译器优化
//。我有相同的性能,
//如果它是一个循环外的对象
return 0;
}






ang 3.3(从树干)与-O3的样品。也许其他正则表达式库能够更好地执行,但在任何情况下,分配/释放通常是性能损失。






Boost.Regex



这是 boost :: regex 参数示例:

  real 0m1.284s 
用户0m1.278s
sys 0m0.005s

相同的代码, boost :: regex std :: regex 这个示例中的接口是完全相同的,只需要更改命名空间并包含





EDIT



为了完成起见,我试过这个(上面提到的极限改进建议),它没有提高性能的等效 std :: vector< std :: string> & v 版本:

  #include< regex> 
#include< vector>
#include< string>

template< typename Iterator> class intrusive_substring
{
private:
Iterator begin_, end_;

public:
intrusive_substring(Iterator begin,Iterator end):begin_(begin),end_(end){}

Iterator begin() {return begin_;}
Iterator end(){return end_;}
};

using intrusive_char_substring = intrusive_substring<const char *>;

void split(const std :: string& s,const std :: regex& r,std :: vector< intrusive_char_substring>& v)
{
auto rit = std :: cregex_token_iterator(s.data(),s.data()+ s.length(),r,-1);
auto rend = std :: cregex_token_iterator();
v.clear(); //这可以被编译器优化,因为
// intrusive_char_substring析构函数什么也不做,所以
//重置内部大小是唯一要做的事情。
//维护先前分配的内存。
while(rit!= rend)
{
v.emplace_back(rit-> first,rit-> second);
++ rit;
}
}

int main()
{
const std :: regex r(+);
std :: vector< intrusive_char_substring> v;
for(auto i = 0; i <1000000; ++ i)
split("a b c", r, v);

return 0;
}

这与array_ref和string_ref提案。下面是一个使用它的示例代码:

  #include< regex> 
#include< vector>
#include< string>
#include< string_ref>

void split(const std :: string& s,const std :: regex& r,std :: vector< std :: string_ref>& v)
{
auto rit = std :: cregex_token_iterator(s.data(),s.data()+ s.length(),r,-1);
auto rend = std :: cregex_token_iterator();
v.clear();
while(rit!= rend)
{
v.emplace_back(rit-> first,rit-> length());
++ rit;
}
}

int main()
{
const std :: regex r(+);
std :: vector< std :: string_ref> v;
for(auto i = 0; i <1000000; ++ i)
split(a b c,r,v)

return 0;
}

在 split 以 vector 作为返回值的情形下，返回 string_ref 的 vector 而不是 string 副本的 vector，开销也会更低。

EDIT 2



这个新解决方案能够通过返回获取输出。我使用Marshall Clow的 string_view string_ref 已重命名)libc ++实现在https://github.com/mclow/string_view

 # include< string> 
#include< string_view>
#include< boost / regex.hpp>
#include< boost / range / iterator_range.hpp>
#include< boost / iterator / transform_iterator.hpp>

using namespace std;
using namespace std :: experimental;
using namespace boost;

string_view stringfier(const cregex_token_iterator :: value_type& match){
return {match.first,static_cast< size_t>(match.length())};
}

using string_view_iterator =
    transform_iterator<decltype(&stringfier), cregex_token_iterator>;

iterator_range< string_view_iterator> split(string_view s,const regex& r){
return {
string_view_iterator(
cregex_token_iterator(s.begin(),s.end(),r,-1),
stringfier
),
string_view_iterator()
};
}

int main(){
const regex r(" +");
for(size_t i = 0; i <1000000; ++ i){
split("a b c", r);
}
}

计时:


b $ b

  real 0m0.385s 
用户0m0.385s
sys 0m0.000s

注意与之前的结果相比有多快。当然,它不是在循环中填充一个向量(也不能预先匹配任何东西),但你得到一个范围,你可以范围为基于范围 for ,或者甚至用它来填充向量



iterator_range 在原始字符串上创建 string_view $ c>(或一个空的终止字符串),这将变得非常轻量级,从不产生不必要的字符串分配。



code> split 实现,但实际上填充了向量我们可以这样做:

  int main(){
const regex r(+);
vector< string_view> v;
v.reserve(10);
for(size_t i = 0; i <1000000; ++ i){
copy(split("a b c", r), back_inserter(v));
v.clear();
}
}

这使用boost范围复制算法填充向量每个迭代,时间是:

  real 0m1.002s 
用户0m0.997s
sys 0m0。 004s

可以看出,与优化的 string_view 输出参数版本。



注意,还有 std :: split 的提案将会像这样工作。


hi i would like to understand why the following code which does a split string split using regex

#include<regex>
#include<vector>
#include<string>

/// Split `s` on runs of one or more spaces and return the pieces.
std::vector<std::string> split(const std::string &s){
    // Shared across calls: constructing a std::regex is expensive.
    static const std::regex rsplit(" +");
    // Submatch index -1 makes the token iterator yield the text *between*
    // matches, i.e. the split pieces.
    std::sregex_token_iterator first(s.begin(), s.end(), rsplit, -1), last;
    return std::vector<std::string>(first, last);
}

int main(){
    // NOTE(review): the original called split("a b c", " "), but split()
    // takes a single argument — the extra delimiter argument would not
    // compile against the declaration above.
    for(auto i=0; i< 10000; ++i)
       split("a b c");
    return 0;
}

is slower then the following python code

import re
for i in range(10000):
    re.split(' +', 'a b c')

here's

> python test.py  0.05s user 0.01s system 94% cpu 0.070 total
./test  0.26s user 0.00s system 99% cpu 0.296 total

Im using clang++ on osx.

compiling with -O3 brings it to down to 0.09s user 0.00s system 99% cpu 0.109 total

解决方案

Notice

See also this answer: http://stackoverflow.com/a/21708215 which was the base for the EDIT 2 at the bottom here.


I've augmented the loop to 1000000 to get a better timing measure.

This is my Python timing:

real    0m2.038s
user    0m2.009s
sys     0m0.024s

Here's an equivalent of your code, just a bit prettier:

#include <regex>
#include <vector>
#include <string>

/// Split `s` on matches of `r`, returning the non-matched pieces.
/// The regex is taken as a parameter so callers build it only once.
std::vector<std::string> split(const std::string &s, const std::regex &r)
{
    std::sregex_token_iterator first(s.begin(), s.end(), r, -1);
    std::sregex_token_iterator last;
    std::vector<std::string> pieces(first, last);
    return pieces;
}

// Benchmark driver: construct the regex once, then split one million times.
int main()
{
    const std::regex pattern(" +");
    int iteration = 0;
    while (iteration < 1000000)
    {
        split("a b c", pattern);
        ++iteration;
    }
    return 0;
}

Timing:

real    0m5.786s
user    0m5.779s
sys     0m0.005s


This is an optimization to avoid construction/allocation of vector and string objects:

#include <regex>
#include <vector>
#include <string>

/// Split `s` on matches of `r` into `v`, reusing `v`'s existing buffer so
/// repeated calls avoid per-call vector allocation.
void split(const std::string &s, const std::regex &r, std::vector<std::string> &v)
{
    std::sregex_token_iterator tok(s.begin(), s.end(), r, -1);
    const std::sregex_token_iterator end;
    v.clear();  // keeps capacity; strings are constructed in place below
    for (; tok != end; ++tok)
        v.push_back(*tok);
}

// Benchmark driver: the output vector lives outside the loop so its buffer
// is reused across all one million split() calls.
int main()
{
    const std::regex pattern(" +");
    std::vector<std::string> out;
    int iteration = 0;
    while (iteration < 1000000)
    {
        split("a b c", pattern, out);
        ++iteration;
    }
    return 0;
}

Timing:

real    0m3.034s
user    0m3.029s
sys     0m0.004s

This is near a 100% performance improvement.

The vector is created before the loop, and can grow its memory in the first iteration. Afterwards there's no memory deallocation by clear(), the vector maintains the memory and construct strings in-place.


Another performance increase would be to avoid construction/destruction std::string completely, and hence, allocation/deallocation of its objects.

This is a tentative in this direction:

#include <regex>
#include <vector>
#include <string>

/// Split C string `s` on matches of `r` into `v`.
/// Takes const char* so callers avoid constructing a std::string argument.
void split(const char *s, const std::regex &r, std::vector<std::string> &v)
{
    // std::char_traits<char>::length is equivalent to strlen but is declared
    // in <string>, which this listing already includes (the original used
    // std::strlen without including <cstring>).
    auto rit = std::cregex_token_iterator(s, s + std::char_traits<char>::length(s), r, -1);
    auto rend = std::cregex_token_iterator();
    v.clear();  // keeps capacity from previous calls
    while(rit != rend)
    {
        v.push_back(*rit);
        ++rit;
    }
}

Timing:

real    0m2.509s
user    0m2.503s
sys     0m0.004s

An ultimate improvement would be to have a std::vector of const char * as return, where each char pointer would point to a substring inside the original s c string itself. The problem is that, you can't do that because each of them would not be null terminated (for this, see usage of C++1y string_ref in a later sample).


This last improvement could also be achieved with this:

#include <regex>
#include <vector>
#include <string>

/// Split `s` on matches of `r` into `v`, iterating over the string's raw
/// character buffer (cregex_token_iterator) to avoid extra std::string work.
void split(const std::string &s, const std::regex &r, std::vector<std::string> &v)
{
    const char *first = s.data();
    const char *last = first + s.length();
    std::cregex_token_iterator tok(first, last, r, -1);
    const std::cregex_token_iterator end;
    v.clear();  // keeps capacity; tokens are appended in place
    for (; tok != end; ++tok)
        v.push_back(*tok);
}

int main()
{
    const std::regex pattern(" +");
    std::vector<std::string> out;
    // The constant "a b c" should be optimized by the compiler; measured
    // performance was the same as with an object hoisted out of the loop.
    for (int iteration = 0; iteration < 1000000; ++iteration)
        split("a b c", pattern, out);
    return 0;
}


I've built the samples with clang 3.3 (from trunk) with -O3. Maybe other regex libraries are able to perform better, but in any case, allocations/deallocations are frequently a performance hit.


Boost.Regex

This is the boost::regex timing for the c string arguments sample:

real    0m1.284s
user    0m1.278s
sys     0m0.005s

Same code, boost::regex and std::regex interface in this sample are identical, just needed to change the namespace and include.

Best wishes for it to get better over time, C++ stdlib regex implementations are in their infancy.

EDIT

For sake of completion, I've tried this (the above mentioned "ultimate improvement" suggestion) and it didn't improved performance of the equivalent std::vector<std::string> &v version in anything:

#include <regex>
#include <vector>
#include <string>

/// Non-owning view of a character range [begin_, end_): stores two iterators
/// into someone else's buffer, so construction allocates nothing and the
/// destructor is trivial.
template<typename Iterator> class intrusive_substring
{
public:
    intrusive_substring(Iterator begin, Iterator end) : begin_(begin), end_(end) {}

    Iterator begin() {return begin_;}
    Iterator end() {return end_;}

private:
    Iterator begin_, end_;
};

using intrusive_char_substring = intrusive_substring<const char *>;

/// Split `s` into non-owning character-range views rather than owning
/// std::string copies — no string allocation per token.
void split(const std::string &s, const std::regex &r, std::vector<intrusive_char_substring> &v)
{
    const char *first = s.data();
    const char *last = first + s.length();
    std::cregex_token_iterator tok(first, last, r, -1);
    const std::cregex_token_iterator end;
    // clear() can potentially be optimized away by the compiler: the element
    // destructor does nothing, so only the internal size is reset and
    // previously allocated memory is kept.
    v.clear();
    for (; tok != end; ++tok)
        v.emplace_back(tok->first, tok->second);
}

// Benchmark driver for the intrusive_char_substring variant.
int main()
{
    const std::regex pattern(" +");
    std::vector<intrusive_char_substring> views;
    for (int iteration = 0; iteration < 1000000; ++iteration)
        split("a b c", pattern, views);

    return 0;
}

This has to do with the array_ref and string_ref proposal. Here's a sample code using it:

#include <regex>
#include <vector>
#include <string>
#include <string_ref>

// Split `s` into std::string_ref views (pointer + length into `s`'s buffer)
// instead of owning std::string copies, so no per-token allocation occurs.
// NOTE(review): std::string_ref was only a proposal (later renamed
// string_view); this listing needs the non-standard <string_ref> header.
void split(const std::string &s, const std::regex &r, std::vector<std::string_ref> &v)
{
    auto rit = std::cregex_token_iterator(s.data(), s.data() + s.length(), r, -1);
    auto rend = std::cregex_token_iterator();
    v.clear();  // keeps capacity from previous calls
    while(rit != rend)
    {
        // sub_match: `first` is the token's start pointer, length() its size.
        v.emplace_back(rit->first, rit->length());
        ++rit;
    }
}

// Benchmark driver: one regex, one reusable vector of string_ref views.
int main()
{
    const std::regex r(" +");
    std::vector<std::string_ref> v;
    for(auto i=0; i < 1000000; ++i)
       split("a b c", r, v);

    return 0;
}

It will also be cheaper to return a vector of string_ref rather than string copies for the case of split with vector return.

EDIT 2

This new solution is able to get output by return. I have used Marshall Clow's string_view (string_ref got renamed) libc++ implementation found at https://github.com/mclow/string_view.

#include <string>
#include <string_view>
#include <boost/regex.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/iterator/transform_iterator.hpp>

using namespace std;
using namespace std::experimental;
using namespace boost;

// Convert a regex sub_match (pointer + length pair) into a non-owning
// string_view. Used as the transform step of the iterator adaptor below.
string_view stringfier(const cregex_token_iterator::value_type &match) {
    return {match.first, static_cast<size_t>(match.length())};
}

using string_view_iterator =
    transform_iterator<decltype(&stringfier), cregex_token_iterator>;

// Lazily split `s` on matches of `r`: returns a range of string_views that
// are produced on demand, so no vector and no string allocations happen here.
iterator_range<string_view_iterator> split(string_view s, const regex &r) {
    return {
        string_view_iterator(
            cregex_token_iterator(s.begin(), s.end(), r, -1),  // token source
            stringfier  // maps each sub_match to a string_view
        ),
        string_view_iterator()  // default-constructed == end of sequence
    };
}

// Benchmark driver: the returned range is lazy, so each iteration only
// builds the iterator pair; nothing is matched or stored up front.
int main() {
    const regex r(" +");
    for (size_t i = 0; i < 1000000; ++i) {
        split("a b c", r);
    }
}

Timing:

real    0m0.385s
user    0m0.385s
sys     0m0.000s

Note how faster this is compared to previous results. Of course, it's not filling a vector inside the loop (nor matching anything in advance probably too), but you get a range anyway, which you can range over with range-based for, or even use it to fill a vector.

As ranging over the iterator_range creates string_views over an original string(or a null terminated string), this gets very lightweight, never generating unnecessary string allocations.

Just to compare using this split implementation but actually filling a vector we could do this:

// Same lazy split, but materialized each iteration: boost range copy fills
// the vector; reserve(10) plus clear() keeps the buffer across iterations.
int main() {
    const regex r(" +");
    vector<string_view> v;
    v.reserve(10);
    for (size_t i = 0; i < 1000000; ++i) {
        copy(split("a b c", r), back_inserter(v));
        v.clear();  // keeps capacity for the next iteration
    }
}

This uses boost range copy algorithm to fill the vector in each iteration, the timing is:

real    0m1.002s
user    0m0.997s
sys     0m0.004s

As can be seen, no much difference in comparison with the optimized string_view output param version.

Note also there's a proposal for a std::split that would work like this.

这篇关于c ++ 11 regex比python慢的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆