使用 Boost Spirit 解析列顺序可变的 CSV [英] boost spirit parsing CSV with columns in variable order

查看:250
本文介绍了使用 Boost Spirit 解析列顺序可变的 CSV 的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试使用 Boost Spirit 解析带标题行的 CSV 文件。
该 CSV 的格式并不固定:有时会多出一些列,有时列的顺序会被打乱。我只关心其中少数几列,它们的标题名称是已知的。



例如,我的CSV可能如下所示:

 
Name,Surname,Age
John,Doe,32

或:

 
Age,Name
32,John

 

我只想解析 Name 和 Age 两列的内容(注意:Age 是整数类型)。目前我想出了一个非常丑陋的解决方案:由 Spirit 解析第一行并生成一个向量,在我感兴趣的列位置上存放对应的枚举值;然后我还得手工完成终结符的解析……

 /// Role of a CSV column, as determined from its header text.
 enum LineItems {
     NAME, AGE, UNUSED
 };

/// One parsed data row: only the columns of interest are kept.
struct CsvLine {
    std::string name;
    int age = 0;  // defined value even when the file has no Age column
};

using Column = std::string;            ///< raw text of a single field
using CsvFile = std::vector<CsvLine>;  ///< all data rows of a document

// Question's original grammar: the header line is parsed into a vector of
// LineItems (kept in the grammar's local `_a`), and every data line is split
// into raw columns that convertFunc() then has to interpret by hand.
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
    CsvGrammar() :
            CsvGrammar::base_type(start) {
        using namespace qi;

        static const char colsep = ',';

        // Header first (result stored in _a, not exposed), then eol-separated
        // lines which each receive _a as their inherited attribute.
        start = qi::omit[header[qi::_a = qi::_1]] >> eol >> line(_a) % eol;
        // One LineItems entry is pushed per header column.
        header = (lit("Name")[phx::push_back(phx::ref(qi::_val), LineItems::NAME)]
                | lit("Age")[phx::push_back(phx::ref(qi::_val), LineItems::AGE)]
                | column[phx::push_back(phx::ref(qi::_val), LineItems::UNUSED)]) % colsep;
        // Hand the raw columns, the header layout (_r1) and the result (_val)
        // to convertFunc.
        line = (column % colsep)[phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1,
                qi::_val)];
        column = quoted | *~char_(",\n");
        quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
    }

    void convertFunc(std::vector<string>& columns, std::vector<LineItems>& positions, CsvLine &csvLine) {
       // Terminal symbols are parsed here and assigned to the csvLine struct
       // (body elided in the original post).
       ...
    }
private:
    qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
    qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
    qi::rule<It, CsvLine(std::vector<LineItems>), qi::blank_type> line;
    qi::rule<It, Column(), qi::blank_type> column;
    qi::rule<It, std::string()> quoted;  // no skipper: blanks inside quotes are kept
    qi::rule<It, qi::blank_type> empty;  // declared but never assigned in this block

};

这里是完整的源代码。



如果标题解析器可以准备一个 vector<rule<...>*>,而“行解析器”直接使用这个向量来解析自身,会怎么样?这相当于一种高级的 Nabialek 技巧(我一直在尝试,但没能做到)。



或者,有没有更好的方法用 Spirit 解析这种类型的 CSV?
(任何帮助都不胜感激,先行谢过)

解决方案



我会沿用你现有的思路,我认为它已经足够优雅(qi 的 locals 甚至允许以可重入的方式使用该语法)。



为了减少规则中的杂乱代码(参见 Boost Spirit:“语义动作是邪恶的”?),你可以把“转换函数”移到属性转换定制点(attribute transformation customization points)中。



糟糕。正如评论中指出的,那样想太简单了。不过,你仍然可以大幅减少杂乱代码。只需两处简单的调整,语法就变成:

  // Known header names; every other header text maps to UNUSED below.
  item.add("Name", NAME)("Age", AGE);
  start  = omit[ header[_a=_1] ] >> eol >> line(_a) % eol;

  // A symbol only counts when it is the whole field: the lookahead stops
  // "Agent" from matching as "Age" and then breaking the list.
  header = (item >> &(lit(colsep) | eol | eoi) | omit[column] >> attr(UNUSED)) % colsep;
  line   = (column % colsep) [convert];

  column = quoted | *~char_(",\n");
  // A doubled quote inside a quoted field stands for one literal quote.
  quoted = '"' >> *("\"\"" >> attr('"') | ~char_("\"\n")) >> '"';

调整:




  • 使用 qi::symbols 将标题映射到 LineItems

  • 使用直接访问上下文的原始语义动作([convert])(参见 Boost Spirit 语义动作参数):

      struct final {
    using Ctx = typename decltype(line):: context_type;

    void operator()(Columns const& columns,Ctx& ctx ,bool& pass)const {
    auto& csvLine = boost :: fusion :: at_c 0(ctx.attributes);
    auto& amp; amp; amp; amp = (ctx.attributes);
    int i = 0;

    for(LineItems position:positions){
    switch(position){
    case NAME:csvLine.name = columns [i]; break;
    case AGE:csvLine.age = atoi(columns [i] .c_str());打破;
    default:break;
    }
    i ++;
    }

    pass = true; // returns false failed the`line` rule
    }
    } convert;




可以说,其效果类似于写 auto convert = phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1, qi::_val);但众所周知,把 auto 与 Proto/Phoenix/Spirit 表达式一起使用很容易出错(表达式模板中的临时对象会留下悬空引用,导致未定义行为),所以我当然更倾向于上面展示的方式。

Live On Coliru

  //#define BOOST_SPIRIT_DEBUG 
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include< iostream>
#include< boost / fusion / include / at_c.hpp>
#include< boost / spirit / include / qi.hpp>
#include< boost / spirit / include / phoenix.hpp>
#include< string>
#include< vector>

命名空间qi = boost :: spirit :: qi;
namespace phx = boost :: phoenix;

使用std :: string;

/// Role of a CSV column, as determined from its header text.
enum LineItems { NAME, AGE, UNUSED };

/// One parsed data row: only the columns of interest are kept.
struct CsvLine {
    std::string name;
    int age = 0;  // defined value even when the file has no Age column
};

using Column  = std::string;           ///< raw text of a single field
using Columns = std::vector<Column>;   ///< all fields of one line
using CsvFile = std::vector<CsvLine>;  ///< all data rows of a document

/**
 * Spirit.Qi grammar for a CSV document whose first line is a header.
 *
 * The header is parsed into a vector of LineItems (one entry per column) and
 * kept in the grammar's local `_a`. Every following line is split into raw
 * columns, and `convert` copies the Name/Age columns into a CsvLine according
 * to that layout. Blanks are skipped; line ends separate the rows.
 */
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
    CsvGrammar() : CsvGrammar::base_type(start) {
        using namespace qi;
        static const char colsep = ',';

        // Known header names; every other header text maps to UNUSED below.
        item.add("Name", NAME)("Age", AGE);
        start  = qi::omit[ header[_a=_1] ] >> eol >> line(_a) % eol;

        // A symbol only counts when it is the whole field: the lookahead stops
        // "Agent" from matching as "Age" and then breaking the list.
        header = (item >> &(lit(colsep) | eol | eoi) | omit[column] >> attr(UNUSED)) % colsep;
        line   = (column % colsep) [convert];

        // NOTE: `column` carries the blank skipper, so blanks inside an
        // unquoted field are skipped rather than kept.
        column = quoted | *~char_(",\n");
        // A doubled quote inside a quoted field stands for one literal quote.
        quoted = '"' >> *("\"\"" >> attr('"') | ~char_("\"\n")) >> '"';

        BOOST_SPIRIT_DEBUG_NODES((header)(column)(quoted));
    }

private:
    qi::rule<It, std::vector<LineItems>(),                      qi::blank_type> header;
    qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
    qi::rule<It, CsvLine(std::vector<LineItems> const&),        qi::blank_type> line;

    qi::rule<It, Column(), qi::blank_type> column;
    qi::rule<It, std::string()> quoted;  // no skipper: blanks inside quotes are kept

    qi::symbols<char, LineItems> item;   // header text -> column role

    // Raw semantic action for `line`; it reads the rule context directly.
    struct Convert {
        using Ctx = typename decltype(line)::context_type;

        void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
            auto& csvLine   = boost::fusion::at_c<0>(ctx.attributes);  // _val
            auto& positions = boost::fusion::at_c<1>(ctx.attributes);  // _r1

            for (std::size_t i = 0; i < positions.size(); ++i) {
                LineItems const position = positions[i];
                if (position == UNUSED)
                    continue;

                // A row may be shorter than the header (e.g. the empty "row"
                // after a trailing newline). Never index past it; reject the
                // row instead.
                if (i >= columns.size()) {
                    pass = false;
                    return;
                }

                switch (position) {
                    case NAME: csvLine.name = columns[i];              break;
                    case AGE:  csvLine.age = atoi(columns[i].c_str()); break;
                    default:   break;
                }
            }

            pass = true; // returning false fails the `line` rule
        }
    } convert;
};

int main(){
const std :: string s =Surname,Name,Age,\ nJohn,Doe,32 \\\
Mark,Smith,43;

auto f(begin(s)),l(end(s));
CsvGrammar< std :: string :: const_iterator> p;

CsvFile解析;
bool ok = qi :: phrase_parse(f,l,p,qi :: blank,parsed);

if(ok){
for(CsvLine line:parsed){
std :: cout< '['<< line.name<< ']'<< '['<< line.age<< ']';
std :: cout<< std :: endl;
}
} else {
std :: cout<< 解析失败\\\
;
}

if(f!= l)
std :: cout< Remaining unparsed:'<< std :: string(f,l)<< '\\\
;
}

打印

  [Doe] [32] 
[Smith] [43]


I'm trying to parse a CSV file (with a header line) using Boost Spirit. The CSV does not have a fixed format: sometimes there are extra columns, and sometimes the order of the columns is different. I'm only interested in a few columns, whose header names are well known.

For instance my CSV may look like:

Name,Surname,Age
John,Doe,32

Or:

Age,Name
32,John

I want to parse only the content of Name and Age (N.B. Age is an integer). At the moment I have come up with a very ugly solution where Spirit parses the first line and creates a vector that contains an enum value at the positions I'm interested in. Then I have to parse the terminal symbols by hand...

/// Role of a CSV column, as determined from its header text.
enum LineItems { NAME, AGE, UNUSED };

/// One parsed data row: only the columns of interest are kept.
struct CsvLine {
    std::string name;
    int age = 0;  // defined value even when the file has no Age column
};

using Column = std::string;            ///< raw text of a single field
using CsvFile = std::vector<CsvLine>;  ///< all data rows of a document

// Question's original grammar: the header line is parsed into a vector of
// LineItems (kept in the grammar's local `_a`), and every data line is split
// into raw columns that convertFunc() then has to interpret by hand.
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
    CsvGrammar() :
            CsvGrammar::base_type(start) {
        using namespace qi;

        static const char colsep = ',';

        // Header first (result stored in _a, not exposed), then eol-separated
        // lines which each receive _a as their inherited attribute.
        start = qi::omit[header[qi::_a = qi::_1]] >> eol >> line(_a) % eol;
        // One LineItems entry is pushed per header column: NAME for "Name",
        // AGE for "Age", UNUSED for anything else that parses as a column.
        header = (lit("Name")[phx::push_back(phx::ref(qi::_val), LineItems::NAME)]
                | lit("Age")[phx::push_back(phx::ref(qi::_val), LineItems::AGE)]
                | column[phx::push_back(phx::ref(qi::_val), LineItems::UNUSED)]) % colsep;
        // Hand the raw columns, the header layout (_r1) and the result (_val)
        // to convertFunc.
        line = (column % colsep)[phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1,
                qi::_val)];
        // A field is either quoted or a run of characters up to ',' or newline.
        column = quoted | *~char_(",\n");
        quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
    }

    void convertFunc(std::vector<string>& columns, std::vector<LineItems>& positions, CsvLine &csvLine) {
       // Terminal symbols are parsed here and assigned to the csvLine struct
       // (body elided in the original post).
       ...
    }
private:
    qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
    qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
    qi::rule<It, CsvLine(std::vector<LineItems>), qi::blank_type> line;
    qi::rule<It, Column(), qi::blank_type> column;
    qi::rule<It, std::string()> quoted;  // declared without a skipper
    qi::rule<It, qi::blank_type> empty;  // declared but never assigned in this block

};

Here is the full source.

What if the header parser could prepare a vector<rule<...>*> and the "line parser" just used this vector to parse itself? A sort of advanced Nabialek trick (I've been trying, but I couldn't make it work).

Or is there any better way to parse this kind of CSV with Spirit? (any help is appreciated, thank you in advance)

解决方案

I'd go with the concept that you have,

I think it's plenty elegant (the qi locals even allow reentrant use of this).

To reduce the cruft in the rules (Boost Spirit: "Semantic actions are evil"?) you could move the "conversion function" off into attribute transformation customization points.

Oops. As commented that was too simple. However, you can still reduce the cruftiness quite a bit. With two simple tweaks, the grammar reads:

// Known header names; every other header text maps to UNUSED below.
item.add("Name", NAME)("Age", AGE);
start  = omit[ header[_a=_1] ] >> eol >> line(_a) % eol;

// A symbol only counts when it is the whole field: the lookahead stops
// "Agent" from matching as "Age" and then breaking the list.
header = (item >> &(lit(colsep) | eol | eoi) | omit[column] >> attr(UNUSED)) % colsep;
line   = (column % colsep) [convert];

column = quoted | *~char_(",\n");
// A doubled quote inside a quoted field stands for one literal quote.
quoted = '"' >> *("\"\"" >> attr('"') | ~char_("\"\n")) >> '"';

The tweaks:

  • using qi::symbols to map from header text to LineItems
  • using a raw semantic action ([convert]) which directly accesses the context (see boost spirit semantic action parameters):

    struct final {
        using Ctx = typename decltype(line)::context_type;
    
        void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
            auto& csvLine   = boost::fusion::at_c<0>(ctx.attributes);
            auto& positions = boost::fusion::at_c<1>(ctx.attributes);
            int i =0;
    
            for (LineItems position : positions) {
                switch (position) {
                    case NAME: csvLine.name = columns[i];              break;
                    case AGE:  csvLine.age = atoi(columns[i].c_str()); break;
                    default:   break;
                }
                i++;
            }
    
            pass = true; // returning false fails the `line` rule
        }
    } convert;
    

Arguably the upshot is akin to doing auto convert = phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1, qi::_val) but using auto with Proto/Phoenix/Spirit expressions is notoriously error prone (UB due to dangling refs to temporaries from the expression template), so I'd certainly prefer the way shown above.

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <iostream>
#include <boost/fusion/include/at_c.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>
#include <vector>

namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;

using std::string;

/// Role of a CSV column, as determined from its header text.
enum LineItems {
    NAME,
    AGE,
    UNUSED
};

/// One parsed data row: only the columns of interest are kept.
struct CsvLine {
    std::string name;
    int age = 0;  // defined value even when the file has no Age column
};

using Column  = std::string;           ///< raw text of a single field
using Columns = std::vector<Column>;   ///< all fields of one line
using CsvFile = std::vector<CsvLine>;  ///< all data rows of a document

/**
 * Spirit.Qi grammar for a CSV document whose first line is a header.
 *
 * The header is parsed into a vector of LineItems (one entry per column) and
 * kept in the grammar's local `_a`. Every following line is split into raw
 * columns, and `convert` copies the Name/Age columns into a CsvLine according
 * to that layout. Blanks are skipped; line ends separate the rows.
 */
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
    CsvGrammar() : CsvGrammar::base_type(start) {
        using namespace qi;
        static const char colsep = ',';

        // Known header names; every other header text maps to UNUSED below.
        item.add("Name", NAME)("Age", AGE);
        start  = qi::omit[ header[_a=_1] ] >> eol >> line(_a) % eol;

        // A symbol only counts when it is the whole field: the lookahead stops
        // "Agent" from matching as "Age" and then breaking the list.
        header = (item >> &(lit(colsep) | eol | eoi) | omit[column] >> attr(UNUSED)) % colsep;
        line   = (column % colsep) [convert];

        // NOTE: `column` carries the blank skipper, so blanks inside an
        // unquoted field are skipped rather than kept.
        column = quoted | *~char_(",\n");
        // A doubled quote inside a quoted field stands for one literal quote.
        quoted = '"' >> *("\"\"" >> attr('"') | ~char_("\"\n")) >> '"';

        BOOST_SPIRIT_DEBUG_NODES((header)(column)(quoted));
    }

private:
    qi::rule<It, std::vector<LineItems>(),                      qi::blank_type> header;
    qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
    qi::rule<It, CsvLine(std::vector<LineItems> const&),        qi::blank_type> line;

    qi::rule<It, Column(), qi::blank_type> column;
    qi::rule<It, std::string()> quoted;  // no skipper: blanks inside quotes are kept

    qi::symbols<char, LineItems> item;   // header text -> column role

    // Raw semantic action for `line`; it reads the rule context directly.
    struct Convert {
        using Ctx = typename decltype(line)::context_type;

        void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
            auto& csvLine   = boost::fusion::at_c<0>(ctx.attributes);  // _val
            auto& positions = boost::fusion::at_c<1>(ctx.attributes);  // _r1

            for (std::size_t i = 0; i < positions.size(); ++i) {
                LineItems const position = positions[i];
                if (position == UNUSED)
                    continue;

                // A row may be shorter than the header (e.g. the empty "row"
                // after a trailing newline). Never index past it; reject the
                // row instead.
                if (i >= columns.size()) {
                    pass = false;
                    return;
                }

                switch (position) {
                    case NAME: csvLine.name = columns[i];              break;
                    case AGE:  csvLine.age = atoi(columns[i].c_str()); break;
                    default:   break;
                }
            }

            pass = true; // returning false fails the `line` rule
        }
    } convert;
};

int main() {
    const std::string s = "Surname,Name,Age,\nJohn,Doe,32\nMark,Smith,43";

    auto f(begin(s)), l(end(s));
    CsvGrammar<std::string::const_iterator> p;

    CsvFile parsed;
    bool ok = qi::phrase_parse(f, l, p, qi::blank, parsed);

    if (ok) {
        for (CsvLine line : parsed) {
            std::cout << '[' << line.name << ']' << '[' << line.age << ']';
            std::cout << std::endl;
        }
    } else {
        std::cout << "Parse failed\n";
    }

    if (f != l)
        std::cout << "Remaining unparsed: '" << std::string(f, l) << "'\n";
}

Prints

[Doe][32]
[Smith][43]

这篇关于提升精神解析CSV与可变顺序的列的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆