使用Boost :: spirit编写的解析器的性能问题 [英] Performance issue with parser written with Boost::spirit

查看：822 发布时间：2016/11/5 23:15:51 c++ parsing c++11 boost boost-spirit-qi

本文介绍了使用Boost :: spirit编写的解析器的性能问题的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我要分析看起来像这样（FASTA般的文本格式）文件：

<预类=郎无prettyprint-覆盖>

> InfoHeader 
 
>，其中的每80个字符后是一个换行符某些文本序列; InfoHeader 
，其中的每80个字符后是一个换行符某些文本序列
 ...

例如：

<预class =lang-none prettyprint-override>

> gi | 31563518 | ref | NP_852610.1 |微管相关蛋白1A / 1B轻链3A亚型B〔智人] 
 MKMRFFSSPCGKAAVDPADRCKEVQQIRDQHPSKIPVIIERYKGEKQLPVLDKTKFLVPDHVNMSELVKI 
 IRRRLQLNPTQAFFLLVNQHSMVSVSTPIADIYEQEKDEDGFLYMVYASQETFGFIRENE

我用boost :: spirit写了一个解析器。解析器将标题行和以下文本序列正确地存储在 std :: vector < std :: pair< string，string>> ，但是对于更大的文件它需要很长时间（对于100MB文件来说是17秒）。作为比较，我写了一个程序没有boost ::精神（只是STL函数），简单地复制该100MB文件的每一行在 std :: vector 。整个过程需要不到一秒钟。用于比较的程序不是为了目的，但我不认为解析器应该花更长时间...

我知道有很多。其他FASTA解析器左右，但我相当好奇，为什么我的代码是慢

的

.HPP文件：

  #include< boost / filesystem / path.hpp> 
 
 namespace fs = boost :: filesystem; 
 
 
 class FastaReader {
 
 public：
 typedef std :: vector< std :: pair< std :: string，std :: string> > fastaVector; 
 
 private：
 fastaVector fV; 
 fs :: path file; 
 
 public：
 FastaReader（const fs :: path& f）; 
〜FastaReader（）; 
 
 const fs :: route& getFile（）const; 
 const fastaVector :: const_iterator getBeginIterator（）const; 
 const fastaVector :: const_iterator getEndIterator（）const; 
 
 private：
 void parse（）; 
 
};

和.cpp文件：

  #include< iomanip> 
 #include< boost / date_time / posix_time / posix_time.hpp> 
 #include< boost / filesystem / fstream.hpp> 
 #include< boost / filesystem / operations.hpp> 
 #include< boost / filesystem / path.hpp> 
 #include< boost / spirit / include / classic_position_iterator.hpp> 
 #include< boost / spirit / include / phoenix_bind.hpp> 
 #include< boost / spirit / include / phoenix_core.hpp> 
 #include< boost / spirit / include / phoenix_fusion.hpp> 
 #include< boost / spirit / include / phoenix_operator.hpp> 
 #include< boost / spirit / include / qi.hpp> 
 #includefastaReader.hpp
 
 
 using namespace std; 
 
 namespace fs = boost :: filesystem; 
 namespace qi = boost :: spirit :: qi; 
命名空间pt = boost :: posix_time; 
 
 template< typename Iterator，typename Skipper> 
结构FastaGrammar：补气::语法<迭代器，FastaReader :: fastaVector（），齐::当地人<串>中船长> {
 qi :: rule< Iterator> infoLineStart; 
 qi :: rule< Iterator> inputEnd; 
 qi :: rule< Iterator> lineEnd; 
 qi :: rule< Iterator，string（），Skipper> infoLine; 
 qi :: rule< Iterator，string（），Skipper> seqLine; 
 qi :: rule< Iterator，FastaReader :: fastaVector（），qi :: locals< string>，Skipper> fasta; 
 
 
 FastaGrammar（）：FastaGrammar :: base_type（FASTAFASTA）{
使用boost ::精神::标准:: char_; 
 using boost :: phoenix :: bind; 
使用qi :: eoi; 
使用qi :: eol; 
使用qi :: lexeme; 
使用qi :: _ 1; 
使用qi :: _ val; 
 using namespace qi :: labels; 
 
 infoLineStart = char _（'>'）; 
 inputEnd = eoi; 
 
 / * grammar * / 
 infoLine = lexeme [*（char_  -  eol）]; 
 seqLine = *（char_  -  infoLineStart）; 
 
 fasta = *（infoLineStart> infoLine [_a = _1] 
> seqLine [bind（& FastaGrammar :: addValue，_val，_a，_1）] 
） 
> inputEnd 
; 
 
 infoLineStart.name（>）; 
 infoLine.name（sequence identifier）; 
 seqLine.name（sequence）; 
 
} 
 
 static void addValue（FastaReader :: fastaVector& fa，const string& info，const string& seq）{
 fa.push_back make_pair（info，seq））; 
} 
}; 
 
 
 FastaReader :: FastaReader（const fs :: path& f）{
 this-> file = f; 
 this-> parse（）; 
} 
 
 
 FastaReader ::〜FastaReader（）{} 
 
 
 const fs :: path& FastaReader :: getFile（）const {
 return this-> file; 
} 
 
 
 const FastaReader :: fastaVector :: const_iterator FastaReader :: getBeginIterator（）const {
 return this-> fV.cbegin（）; 
} 
 
 
 const FastaReader :: fastaVector :: const_iterator FastaReader :: getEndIterator（）const {
 return this-> fV.cend（）; 
} 
 
 
 void FastaReader :: parse（）{
 if（> file.empty（））throw string（FastaReader： 。）; 
 if（！fs :: is_regular_file（this-> file））throw（string（FastaReader：File not found：）+ this-> file.string（））; 
 
 typedef boost :: spirit :: istream_iterator iterator_type; 
 typedef boost :: spirit :: classic :: position_iterator2< iterator_type> pos_iterator_type; 
 typedef FastaGrammar< pos_iterator_type，boost :: spirit :: ascii :: space_type> fastaGr; 
 
 fs :: ifstream fin（this-> file）; 
 if（！fin.is_open（））{
 throw（string（FastaReader：Access denied：）+ this-> file.string（））; 
} 
 
 fin.unsetf（ios :: skipws）; 
 
 iterator_type begin（fin）; 
 iterator_type end; 
 
 pos_iterator_type pos_begin（begin，end，this-> file.string（））; 
 pos_iterator_type pos_end; 
 
 fastaGr fG; 
 try {
 std :: cerr<< 测量：分析。 << std :: endl; 
 const pt :: ptime startMeasurement = pt :: microsec_clock :: universal_time（）; 
 
 qi :: phrase_parse（pos_begin，pos_end，fG，boost :: spirit :: ascii :: space，this-> fV）; 
 
 const pt :: ptime endMeasurement = pt :: microsec_clock :: universal_time（）; 
 pt :: time_duration duration（endMeasurement  -  startMeasurement）; 
 std :: cerr<<持续时间< std :: endl; 
} catch（std :: string str）{
 cerr<< error message：< str<< endl; 
} 
}

所以语法如下：
它会查找>符号，然后存储所有后续字符，直到检测到EOL。在EOL之后，当检测到>符号时，文本序列开始和结束。然后通过调用 FastaReader :: addValue（）将两个字符串（标题行和文本序列）存储在std :: vector中。

我编译我的程序使用g ++版本4.8.2与-O2和-std = c ++ 11标志。

所以，性能问题在我的代码？
解决方案
_{上一页：步骤3：带有零复制的MOAR快速}
_{返回步骤1.清理+分析}

步骤4：删除位置迭代器

因为你不使用它，我们可以删除状态迭代器，这可能会抑制很多优化（并且可以在探查器输出）

< a href =http://coliru.stacked-crooked.com/a/1782a8f773d27fdb =nofollow> Live on Coliru
#define BOOST_SPIRIT_USE_PHOENIX_V3 #include< boost / filesystem / path.hpp> #include< boost / utility / string_ref.hpp> #include< boost / iostreams / device / mapped_file.hpp> namespace io = boost :: iostreams; namespace fs = boost :: filesystem; class FastaReader { public： typedef std :: pair& boost :: string_ref，boost :: string_ref>条目; typedef std :: vector< Entry>数据; private：数据fV; fs :: path file; public： FastaReader（const fs :: path& f）; 〜FastaReader（）; const fs :: route& getFile（）const; const Data :: const_iterator begin（）const; const Data :: const_iterator end（）const; private： io :: mapped_file_source mmap; void parse（）; }; #include< iomanip> #include< boost / date_time / posix_time / posix_time.hpp> #include< boost / filesystem / fstream.hpp> #include< boost / filesystem / operations.hpp> #include< boost / filesystem / path.hpp> #include< boost / spirit / include / qi.hpp> #include< boost / spirit / include / phoenix.hpp> #include< boost / fusion / adapted / std_pair.hpp> //＃includefastaReader.hpp #include< boost / iostreams / device / mapped_file.hpp> using namespace std; namespace fs = boost :: filesystem; namespace qi = boost :: spirit :: qi; 命名空间pt = boost :: posix_time; namespace io = boost :: iostreams; namespace boost {namespace spirit {namespace traits { template< typename It> struct assign_to_attribute_from_iterators< boost :: string_ref，It，void> { static void call（It f，It l，boost :: string_ref& attr）{attr = boost :: string_ref {f，size_t（std :: distance（f，l））}; } }; }}} template< typename Iterator> struct FastaGrammar：qi :: grammar< Iterator，FastaReader :: Data（）> { FastaGrammar（）：FastaGrammar :: base_type（fasta）{ using namespace qi; using boost :: phoenix :: construct; using boost :: phoenix :: begin; using boost :: phoenix :: size; entry =（'>'>> raw [*〜char _（'\\\ '）]>'\\\ '> raw [*〜char_ （'>'）]）; fasta = * entry>> * eol>> eoi; BOOST_SPIRIT_DEBUG_NODES（（fasta）（entry））; } private： qi :: rule< Iterator，FastaReader :: Data（）> fasta; qi :: rule< Iterator，FastaReader :: Entry（）>条目; }; FastaReader :: FastaReader（const fs :: path& f）：file（f），mmap（file.c_str（））{ parse（）; } FastaReader ::〜FastaReader（）{} const fs :: path& FastaReader :: getFile（）const { return this-> file; } const FastaReader :: Data :: const_iterator FastaReader :: begin（）const { return this-> fV.cbegin（）; } const FastaReader :: Data :: const_iterator FastaReader :: end（）const { return this-> fV.cend（）; } void FastaReader :: parse（）{ if（this-> file.empty（））throw std :: runtime_error（FastaReader：）; if（！fs :: is_regular_file（this-> file））throw std :: runtime_error（string（FastaReader：File not found：）+ this-> file.string（））; typedef char const * iterator_type; typedef FastaGrammar< iterator_type> fastaGr; static const fastaGr fG {}; try { std :: cerr<< 测量：分析。 << std :: endl; const pt :: ptime startMeasurement = pt :: microsec_clock :: universal_time（）; iterator_type first（mmap.data（）），last（mmap.end（））; qi :: phrase_parse（first，last，fG，boost :: spirit :: ascii :: space，this-> fV）; const pt :: ptime endMeasurement = pt :: microsec_clock :: universal_time（）; pt :: time_duration duration（endMeasurement - startMeasurement）; std :: cerr<<持续时间< std :: endl; } catch（std :: exception const& e）{ cerr<< error message：< e.what（）< endl; } } int main（）{ FastaReader reader（input.txt）; for（auto& e：reader）std :: cout<< '>'<< e.first<< '\\\ '< e.second<< \\\ \\\ ; }
现在 74.8x 更快。
$ time ./test | head -n4 测量：分析。 00：00：00.194432

I want to parse a file that looks like this (FASTA-like text format):
>InfoHeader "Some text sequence that has a line break after every 80 characters" >InfoHeader "Some text sequence that has a line break after every 80 characters" ...
e.g.:
>gi|31563518|ref|NP_852610.1| microtubule-associated proteins 1A/1B light chain 3A isoform b [Homo sapiens] MKMRFFSSPCGKAAVDPADRCKEVQQIRDQHPSKIPVIIERYKGEKQLPVLDKTKFLVPDHVNMSELVKI IRRRLQLNPTQAFFLLVNQHSMVSVSTPIADIYEQEKDEDGFLYMVYASQETFGFIRENE
I wrote a parser for this with boost::spirit. The parser correctly stores the header line and the following text sequence in a std::vector< std::pair< string, string >> but it takes kind of long for bigger files (17sec for a 100MB file). As comparison I wrote a program without boost::spirit (just STL functions) that simply copies each line of that 100MB file in a std::vector. The whole process takes less than a second. The "program" used for the comparison is not serving the purpose but I don't think the parser should take that much longer...

I know there are plenty of other FASTA parsers around but I'm rather curious why my code is slow.

The .hpp file:
#include <boost/filesystem/path.hpp> namespace fs = boost::filesystem; class FastaReader { public: typedef std::vector< std::pair<std::string, std::string> > fastaVector; private: fastaVector fV; fs::path file; public: FastaReader(const fs::path & f); ~FastaReader(); const fs::path & getFile() const; const fastaVector::const_iterator getBeginIterator() const; const fastaVector::const_iterator getEndIterator() const; private: void parse(); };
And the .cpp file:
#include <iomanip> #include <boost/date_time/posix_time/posix_time.hpp> #include <boost/filesystem/fstream.hpp> #include <boost/filesystem/operations.hpp> #include <boost/filesystem/path.hpp> #include <boost/spirit/include/classic_position_iterator.hpp> #include <boost/spirit/include/phoenix_bind.hpp> #include <boost/spirit/include/phoenix_core.hpp> #include <boost/spirit/include/phoenix_fusion.hpp> #include <boost/spirit/include/phoenix_operator.hpp> #include <boost/spirit/include/qi.hpp> #include "fastaReader.hpp" using namespace std; namespace fs = boost::filesystem; namespace qi = boost::spirit::qi; namespace pt = boost::posix_time; template <typename Iterator, typename Skipper> struct FastaGrammar : qi::grammar<Iterator, FastaReader::fastaVector(), qi::locals<string>, Skipper> { qi::rule<Iterator> infoLineStart; qi::rule<Iterator> inputEnd; qi::rule<Iterator> lineEnd; qi::rule<Iterator, string(), Skipper> infoLine; qi::rule<Iterator, string(), Skipper> seqLine; qi::rule<Iterator, FastaReader::fastaVector(), qi::locals<string>, Skipper> fasta; FastaGrammar() : FastaGrammar::base_type(fasta, "fasta") { using boost::spirit::standard::char_; using boost::phoenix::bind; using qi::eoi; using qi::eol; using qi::lexeme; using qi::_1; using qi::_val; using namespace qi::labels; infoLineStart = char_('>'); inputEnd = eoi; /* grammar */ infoLine = lexeme[*(char_ - eol)]; seqLine = *(char_ - infoLineStart); fasta = *(infoLineStart > infoLine[_a = _1] > seqLine[bind(&FastaGrammar::addValue, _val, _a, _1)] ) > inputEnd ; infoLineStart.name(">"); infoLine.name("sequence identifier"); seqLine.name("sequence"); } static void addValue(FastaReader::fastaVector & fa, const string & info, const string & seq) { fa.push_back(make_pair(info, seq)); } }; FastaReader::FastaReader(const fs::path & f) { this->file = f; this->parse(); } FastaReader::~FastaReader() {} const fs::path & FastaReader::getFile() const { return this->file; } const FastaReader::fastaVector::const_iterator FastaReader::getBeginIterator() const { return this->fV.cbegin(); } const FastaReader::fastaVector::const_iterator FastaReader::getEndIterator() const { return this->fV.cend(); } void FastaReader::parse() { if ( this->file.empty() ) throw string("FastaReader: No file specified."); if ( ! fs::is_regular_file(this->file) ) throw (string("FastaReader: File not found: ") + this->file.string()); typedef boost::spirit::istream_iterator iterator_type; typedef boost::spirit::classic::position_iterator2<iterator_type> pos_iterator_type; typedef FastaGrammar<pos_iterator_type, boost::spirit::ascii::space_type> fastaGr; fs::ifstream fin(this->file); if ( ! fin.is_open() ) { throw (string("FastaReader: Access denied: ") + this->file.string()); } fin.unsetf(ios::skipws); iterator_type begin(fin); iterator_type end; pos_iterator_type pos_begin(begin, end, this->file.string()); pos_iterator_type pos_end; fastaGr fG; try { std::cerr << "Measuring: Parsing." << std::endl; const pt::ptime startMeasurement = pt::microsec_clock::universal_time(); qi::phrase_parse(pos_begin, pos_end, fG, boost::spirit::ascii::space, this->fV); const pt::ptime endMeasurement = pt::microsec_clock::universal_time(); pt::time_duration duration (endMeasurement - startMeasurement); std::cerr << duration << std::endl; } catch (std::string str) { cerr << "error message: " << str << endl; } }
So the grammar does the folloing: It looks for a ">" sign and then stores all following characters until an EOL is detected. After the EOL the text sequence starts and ends when a ">" sign is detected. Both strings (header line and text sequence) are then stored in a std::vector by calling FastaReader::addValue().

I compiled my program using g++ version 4.8.2 with -O2 and -std=c++11 flags.

So where is the the performance issue in my code?
解决方案
_{Previous: Step 3: MOAR FASTER WITH ZERO-COPY}
_{Return to Step 1. Cleaning up + Profiling}

Step 4: Dropping the position iterator

Since you're not using it, we can drop the stateful iterator, which is likely to inhibit quite a lot of optimizations (and was indirectly visible in the profiler output)

Live On Coliru
#define BOOST_SPIRIT_USE_PHOENIX_V3 #include <boost/filesystem/path.hpp> #include <boost/utility/string_ref.hpp> #include <boost/iostreams/device/mapped_file.hpp> namespace io = boost::iostreams; namespace fs = boost::filesystem; class FastaReader { public: typedef std::pair<boost::string_ref, boost::string_ref> Entry; typedef std::vector<Entry> Data; private: Data fV; fs::path file; public: FastaReader(const fs::path & f); ~FastaReader(); const fs::path & getFile() const; const Data::const_iterator begin() const; const Data::const_iterator end() const; private: io::mapped_file_source mmap; void parse(); }; #include <iomanip> #include <boost/date_time/posix_time/posix_time.hpp> #include <boost/filesystem/fstream.hpp> #include <boost/filesystem/operations.hpp> #include <boost/filesystem/path.hpp> #include <boost/spirit/include/qi.hpp> #include <boost/spirit/include/phoenix.hpp> #include <boost/fusion/adapted/std_pair.hpp> //#include "fastaReader.hpp" #include <boost/iostreams/device/mapped_file.hpp> using namespace std; namespace fs = boost::filesystem; namespace qi = boost::spirit::qi; namespace pt = boost::posix_time; namespace io = boost::iostreams; namespace boost { namespace spirit { namespace traits { template <typename It> struct assign_to_attribute_from_iterators<boost::string_ref, It, void> { static void call(It f, It l, boost::string_ref& attr) { attr = boost::string_ref { f, size_t(std::distance(f,l)) }; } }; } } } template <typename Iterator> struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> { FastaGrammar() : FastaGrammar::base_type(fasta) { using namespace qi; using boost::phoenix::construct; using boost::phoenix::begin; using boost::phoenix::size; entry = ('>' >> raw[ *~char_('\n') ] >> '\n' >> raw[ *~char_('>') ]); fasta = *entry >> *eol >> eoi ; BOOST_SPIRIT_DEBUG_NODES((fasta)(entry)); } private: qi::rule<Iterator, FastaReader::Data()> fasta; qi::rule<Iterator, FastaReader::Entry()> entry; }; FastaReader::FastaReader(const fs::path & f) : file(f), mmap(file.c_str()) { parse(); } FastaReader::~FastaReader() {} const fs::path & FastaReader::getFile() const { return this->file; } const FastaReader::Data::const_iterator FastaReader::begin() const { return this->fV.cbegin(); } const FastaReader::Data::const_iterator FastaReader::end() const { return this->fV.cend(); } void FastaReader::parse() { if (this->file.empty()) throw std::runtime_error("FastaReader: No file specified."); if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string()); typedef char const* iterator_type; typedef FastaGrammar<iterator_type> fastaGr; static const fastaGr fG{}; try { std::cerr << "Measuring: Parsing." << std::endl; const pt::ptime startMeasurement = pt::microsec_clock::universal_time(); iterator_type first(mmap.data()), last(mmap.end()); qi::phrase_parse(first, last, fG, boost::spirit::ascii::space, this->fV); const pt::ptime endMeasurement = pt::microsec_clock::universal_time(); pt::time_duration duration (endMeasurement - startMeasurement); std::cerr << duration << std::endl; } catch (std::exception const& e) { cerr << "error message: " << e.what() << endl; } } int main() { FastaReader reader("input.txt"); for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n"; }
Now it's 74.8x faster.
$ time ./test | head -n4 Measuring: Parsing. 00:00:00.194432

这篇关于使用Boost :: spirit编写的解析器的性能问题的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

使用Boost :: spirit编写的解析器的性能问题 [英] Performance issue with parser written with Boost::spirit

问题描述

步骤4：删除位置迭代器

Step 4: Dropping the position iterator

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

使用Boost :: spirit编写的解析器的性能问题 [英] Performance issue with parser written with Boost::spirit

问题描述

步骤4：删除位置迭代器

Step 4: Dropping the position iterator

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭