使用boost对mmap映射的gzip文件进行迭代 [英] Iterating over mmaped gzip file with boost

查看：176 发布时间：2016/10/25 16:03:34 c++ boost gzip

本文介绍了使用boost对mmap映射的gzip文件进行迭代的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在尝试学习 boost 以及 C++ 中的一些模板编程，但在实现一个使用 mapped_file_source 遍历 gzip 文件的简单类时遇到了很大的困难。我手头有一个 TSV 格式的边列表，gzip 文件中的每一行格式为：<src:int><tab><dst:int>。我想实现一个 gz_file 类，它提供 begin 和 end 迭代器，每次查询迭代器时都能得到一条边（std::pair<int,int>）。
 
 
 问题在于复制构造函数有缺陷，因为我无法知道自己当前处于 gzip 文件中的哪个位置。
 
 
 这是到目前为止所有的代码：
class gz_graph {
public:
    gz_graph(const char * filename)
    {
        m_file.open(filename);
        if (!m_file.is_open()) {
            throw std::runtime_error("Error opening file");
        }

        m_data = m_file.data();
        m_data_size = m_file.size() / sizeof(m_data[0]);

        auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL);
    }

    class iterator;

    iterator begin() const
    {
        return iterator(this, false);
    }

    iterator end() const
    {
        return iterator(this, true);
    }

    class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
        public:
            iterator(gz_graph const * ref, bool consumed)
                : m_ref(ref),
                  m_cur_edge(-1, -1),
                  m_consumed(consumed)
            {
                if (!consumed) {
                    initialize();
                    advance();
                }
            }

            iterator(const iterator& x)
                : m_ref(x.m_ref),
                  m_cur_edge(x.m_cur_edge)
            {
                if (!x.m_consumed) {
                    initialize();
                    advance();
                }

                std::cout << "Copy constructor" << std::endl;
            }

            value_type const& operator*() const
            {
                return m_cur_edge;
            }

            value_type const* operator->() const
            {
                return &m_cur_edge;
            }

            iterator& operator++()
            {
                advance();
                return *this;
            }

            bool operator==(iterator const& other) const
            {
                assert(m_ref == other.m_ref);
                return m_cur_edge == other.m_cur_edge;
            }

            bool operator!=(iterator const& other) const
            {
                return !(*this == other);
            }

        private:
            void initialize()
            {
                boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
                m_in.push(boost::iostreams::gzip_decompressor());
                m_in.push(source);
            }

            void advance()
            {
                std::string line_str;
                if (!getline(m_in, line_str)) {
                    m_consumed = true;
                    m_cur_edge = Edge(-1, -1);
                    return;
                }

                std::vector<std::string> strs;
                boost::split(strs, line_str, boost::is_any_of("\t"));

                if (strs.size() != 2)
                    throw std::runtime_error("Required 2 fields per line");

                int src = boost::lexical_cast<int>(strs.at(0));
                int dst = boost::lexical_cast<int>(strs.at(1));

                m_cur_edge = Edge(src, dst);

//                std::cout << "Read line " << line_str << std::endl;
            }

            gz_graph const * m_ref;
            Edge m_cur_edge;
            boost::iostreams::filtering_istream m_in;
            bool m_consumed;
        };

private:
    boost::iostreams::mapped_file_source m_file;
    char const* m_data;
    size_t m_data_size;
};
  
 
 
解决方案

我会在这里直接使用 std::istream_iterator。我不确定该如何准确理解你的“输入伪代码”，所以就顺着你的意思，做“复杂”的解析：

struct Edge : std::pair<int, int> { };

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

我猜你会更乐意让它简单得多，不过更简单的也更容易做，对吧？

现在主程序看起来像

for (
        std::istream_iterator<Edge> it(fs >> std::noskipws), end;
        it != end;
        ++it)
{
    std::cout << it->first << " to " << it->second << "\n";
}

其中 fs 是带有 gzip_decompressor 的 filtering_istream。参见 Live On Coliru

完整代码

#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <iterator>

struct Edge : std::pair<int, int> {
};

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

namespace io = boost::iostreams;

int main()
{
    io::mapped_file_source csv("csv.txt.gz");

    io::stream<io::mapped_file_source> textstream(csv);
    io::filtering_istream fs;
    fs.push(io::gzip_decompressor{});
    fs.push(textstream);

    for (
            std::istream_iterator<Edge> it(fs >> std::noskipws), last;
            it != last;
            ++it)
    {
        std::cout << it->first << " to " << it->second << "\n";
    }
}

I am trying to learn boost and some template programming in C++, but I am having a really hard time implementing a simple class for iterating over Gzip files using mapped_file_source. I essentially have an edge list in TSV format such that each line in the gzip file is of the format: <src:int><tab><dst:int>. What I want is to implement a gz_file class that exposes a begin and end iterator over which I can get an edge (std::pair<int,int>) each time I query the iterator.

The problem is the copy constructor, which is broken because I cannot know where I am positioned in the gzip file.

Here is the code I have so far:

// Read-only view over a memory-mapped, gzip-compressed edge list.
//
// Every decompressed line must consist of exactly two tab-separated fields,
// each convertible to int: "<src>\t<dst>". begin()/end() expose the edges as
// an iterator range; every iterator decompresses the mapped bytes on its own.
class gz_graph {
public:
    // Memory-maps `filename` and hints the kernel that it is read sequentially.
    // Throws std::runtime_error if the mapping is not open after open().
    gz_graph(const char * filename)
    {
        m_file.open(filename);
        if (!m_file.is_open()) {
            throw std::runtime_error("Error opening file");
        }

        m_data = m_file.data();
        m_data_size = m_file.size();

        // The hint is purely advisory, so a failure is deliberately ignored.
        static_cast<void>(posix_madvise(const_cast<char*>(m_data), m_data_size, POSIX_MADV_SEQUENTIAL));
    }

    class iterator;

    // Iterator positioned on the first edge (equal to end() for empty input).
    iterator begin() const
    {
        return iterator(this, false);
    }

    // Past-the-end iterator; it never touches the mapped data.
    iterator end() const
    {
        return iterator(this, true);
    }

    // Iterator over the edges of a gz_graph.
    //
    // The gz_graph passed to the constructor must outlive the iterator, since
    // the decompression stream reads straight from its mapping.
    //
    // The underlying filtering_istream cannot be copied, so a copy re-opens
    // the stream and skips forward to the source's position. Copies are
    // therefore independent of each other, but copying costs time
    // proportional to the number of lines already read.
    class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
        public:
            // `consumed == true` builds the past-the-end iterator; otherwise
            // the stream is opened and the first edge is read immediately.
            iterator(gz_graph const * ref, bool consumed)
                : m_ref(ref),
                  m_cur_edge(-1, -1),
                  m_consumed(consumed),
                  m_lines_read(0)
            {
                if (!consumed) {
                    initialize();
                    advance();
                }
            }

            // Copies the position of `x`: same current edge, and the next
            // increment reads the same line that incrementing `x` would read.
            iterator(const iterator& x)
                : m_ref(x.m_ref),
                  m_cur_edge(x.m_cur_edge),
                  m_consumed(x.m_consumed),
                  m_lines_read(0)
            {
                if (m_consumed)
                    return;

                initialize();

                // Replay: discard exactly the lines `x` has already consumed.
                // They were parsed by `x`, so no parsing is needed here.
                std::string skipped;
                while (m_lines_read < x.m_lines_read && std::getline(m_in, skipped))
                    ++m_lines_read;
            }

            // Current edge; holds Edge(-1, -1) once the iterator is consumed.
            value_type const& operator*() const
            {
                return m_cur_edge;
            }

            value_type const* operator->() const
            {
                return &m_cur_edge;
            }

            // Moves to the next edge. Throws std::runtime_error on a line
            // that does not have exactly two tab-separated fields.
            iterator& operator++()
            {
                advance();
                return *this;
            }

            // Two iterators are equal when both are consumed, or when both
            // stand on the same line. Comparing by position instead of by
            // edge value keeps an Edge(-1, -1) in the data from being
            // mistaken for the end of the range.
            bool operator==(iterator const& other) const
            {
                assert(m_ref == other.m_ref);
                if (m_consumed || other.m_consumed)
                    return m_consumed == other.m_consumed;
                return m_lines_read == other.m_lines_read;
            }

            bool operator!=(iterator const& other) const
            {
                return !(*this == other);
            }

        private:
            // Builds the chain: gzip decompressor fed from the mapped bytes.
            void initialize()
            {
                boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
                m_in.push(boost::iostreams::gzip_decompressor());
                m_in.push(source);
            }

            // Reads and parses the next line into m_cur_edge, or marks the
            // iterator as consumed when no further line can be read.
            void advance()
            {
                // A consumed iterator may have no stream behind it (end()),
                // so it must never be read from.
                if (m_consumed)
                    return;

                std::string line_str;
                if (!std::getline(m_in, line_str)) {
                    m_consumed = true;
                    m_cur_edge = Edge(-1, -1);
                    return;
                }
                ++m_lines_read;

                std::vector<std::string> strs;
                boost::split(strs, line_str, boost::is_any_of("\t"));

                if (strs.size() != 2)
                    throw std::runtime_error("Required 2 fields per line");

                int src = boost::lexical_cast<int>(strs.at(0));
                int dst = boost::lexical_cast<int>(strs.at(1));

                m_cur_edge = Edge(src, dst);
            }

            gz_graph const * m_ref;                     // graph being iterated (not owned)
            Edge m_cur_edge;                            // edge returned by operator*
            boost::iostreams::filtering_istream m_in;   // per-iterator decompression stream
            bool m_consumed;                            // true once no more lines can be read
            size_t m_lines_read;                        // lines successfully read from m_in
        };

private:
    boost::iostreams::mapped_file_source m_file;
    char const* m_data;     // start of the mapped (compressed) bytes
    size_t m_data_size;     // number of mapped bytes
};

解决方案

I would just use a std::istream_iterator here. I'm not sure how exactly to interpret your "input pseudo-code", so let me humor you and do the "complicated" parsing:

// One edge of the graph, stored as a pair of ints; operator>> fills `first`
// from the "src:" field and `second` from the "dst:" field.
// NOTE(review): presumably a distinct struct rather than a plain typedef so
// that the operator>> overload is found by argument-dependent lookup when
// std::istream_iterator<Edge> extracts a value — confirm.
struct Edge : std::pair<int, int> { };

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

I expect you would be happy to have it much simpler, but simpler is easier, right?

Now the main program looks like

for (
        std::istream_iterator<Edge> it(fs >> std::noskipws), end;
        it != end;
        ++it)
{
    std::cout << it->first << " to " << it->second << "\n";
}

Where fs is the filtering_istream that has the gzip_decompressor. See it Live On Coliru

Full Code

#include <iostream>
#include <iterator>
#include <utility>

#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/stream.hpp>

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>

// One edge of the graph, stored as a pair of ints; operator>> fills `first`
// from the "src:" field and `second` from the "dst:" field.
// NOTE(review): presumably a distinct struct rather than a plain typedef so
// that the operator>> overload is found by argument-dependent lookup when
// std::istream_iterator<Edge> extracts a value — confirm.
struct Edge : std::pair<int, int> {
};

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

namespace io = boost::iostreams;

int main()
{
    io::mapped_file_source csv("csv.txt.gz");

    io::stream<io::mapped_file_source> textstream(csv);
    io::filtering_istream fs;
    fs.push(io::gzip_decompressor{});
    fs.push(textstream);

    for (
            std::istream_iterator<Edge> it(fs >> std::noskipws), last;
            it != last;
            ++it)
    {
        std::cout << it->first << " to " << it->second << "\n";
    }
}

这篇关于使用boost对mmap映射的gzip文件进行迭代的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

使用boost对gapped gzip文件进行迭代 [英] Iterating over mmaped gzip file with boost

问题描述

完整代码

Full Code

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

使用boost对gapped gzip文件进行迭代 [英] Iterating over mmaped gzip file with boost

问题描述

完整代码

Full Code

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭