使用 Boost 迭代 mmap 映射的 gzip 文件 [英] Iterating over mmaped gzip file with boost

查看：131 发布时间：2016/8/12 18:27:54 c++ boost gzip

本文介绍了使用 Boost 迭代 mmap 映射的 gzip 文件的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在尝试学习 Boost 以及 C++ 中的一些模板编程，但在实现一个使用 mapped_file_source 迭代 gzip 文件的简单类时遇到了很大的困难。我有一个 TSV 格式的边列表，gzip 文件中每一行的格式为：<src:int><tab><dst:int>。我想实现一个 gz_file 类，它提供 begin 和 end 迭代器，每次查询迭代器时都能得到一条边（std::pair<int,int>）。

问题在于拷贝构造函数是有缺陷的，因为我无法知道自己当前位于 gzip 文件中的哪个位置。

下面是我目前的代码：

class gz_graph {
public:
    gz_graph(const char * filename)
    {
        m_file.open(filename);
        if (!m_file.is_open()) {
            throw std::runtime_error("Error opening file");
        }

        m_data = m_file.data();
        m_data_size = m_file.size() / sizeof(m_data[0]);

        auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL);
    }

    class iterator;

    iterator begin() const
    {
        return iterator(this, false);
    }

    iterator end() const
    {
        return iterator(this, true);
    }

    class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
        public:
            iterator(gz_graph const * ref, bool consumed)
                : m_ref(ref),
                  m_cur_edge(-1, -1),
                  m_consumed(consumed)
            {
                if (!consumed) {
                    initialize();
                    advance();
                }
            }

            iterator(const iterator& x)
                : m_ref(x.m_ref),
                  m_cur_edge(x.m_cur_edge)
            {
                if (!x.m_consumed) {
                    initialize();
                    advance();
                }

                std::cout << "Copy constructor" << std::endl;
            }

            value_type const& operator*() const
            {
                return m_cur_edge;
            }

            value_type const* operator->() const
            {
                return &m_cur_edge;
            }

            iterator& operator++()
            {
                advance();
                return *this;
            }

            bool operator==(iterator const& other) const
            {
                assert(m_ref == other.m_ref);
                return m_cur_edge == other.m_cur_edge;
            }

            bool operator!=(iterator const& other) const
            {
                return !(*this == other);
            }

        private:
            void initialize()
            {
                boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
                m_in.push(boost::iostreams::gzip_decompressor());
                m_in.push(source);
            }

            void advance()
            {
                std::string line_str;
                if (!getline(m_in, line_str)) {
                    m_consumed = true;
                    m_cur_edge = Edge(-1, -1);
                    return;
                }

                std::vector<std::string> strs;
                boost::split(strs, line_str, boost::is_any_of("\t"));

                if (strs.size() != 2)
                    throw std::runtime_error("Required 2 fields per line");

                int src = boost::lexical_cast<int>(strs.at(0));
                int dst = boost::lexical_cast<int>(strs.at(1));

                m_cur_edge = Edge(src, dst);

//                std::cout << "Read line " << line_str << std::endl;
            }

            gz_graph const * m_ref;
            Edge m_cur_edge;
            boost::iostreams::filtering_istream m_in;
            bool m_consumed;
        };

private:
    boost::iostreams::mapped_file_source m_file;
    char const* m_data;
    size_t m_data_size;
};

解决方案

我会直接在这里使用 std::istream_iterator。我不确定该如何准确理解你的“输入伪代码”，所以就让我迁就你一下，做“复杂”的解析：

struct Edge : std::pair<int, int> { };

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

我猜你会希望它简单得多，不过越简单就越容易，对吧？

现在主程序看起来像

for (
        std::istream_iterator<Edge> it(fs >> std::noskipws), end;
        it != end;
        ++it)
{
    std::cout << it->first << " to " << it->second << "\n";
}

其中 fs 是带有 gzip_decompressor 的 filtering_istream。参见 Coliru 上的在线演示（Live On Coliru）

完整代码

#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <iterator>

struct Edge : std::pair<int, int> {
};

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

namespace io = boost::iostreams;

int main()
{
    io::mapped_file_source csv("csv.txt.gz");

    io::stream<io::mapped_file_source> textstream(csv);
    io::filtering_istream fs;
    fs.push(io::gzip_decompressor{});
    fs.push(textstream);

    for (
            std::istream_iterator<Edge> it(fs >> std::noskipws), last;
            it != last;
            ++it)
    {
        std::cout << it->first << " to " << it->second << "\n";
    }
}

I am trying to learn Boost and some template programming in C++, but I am having a really hard time implementing a simple class for iterating over gzip files using mapped_file_source. I essentially have an edge list in TSV format such that each line in the gzip file is of the format: <src:int><tab><dst:int>. What I want is to implement a gz_file class that exposes a begin and end iterator over which I can get an edge (std::pair<int,int>) each time I query the iterator.

The problem is the copy constructor, which is broken since I cannot know where I am positioned in the gzip file.

Here is the code I have so far:

/// Read-only view over a gzip-compressed, tab-separated edge list that is
/// memory-mapped from disk.
///
/// Every decompressed line must hold exactly two tab-separated integers
/// (source and destination). begin()/end() expose the edges one at a time;
/// each iterator owns its own decompression stream over the shared mapping.
class gz_graph {
public:
    /// Maps `filename` and remembers the mapped bytes.
    ///
    /// @param filename path of the gzip file to map.
    /// @throws std::runtime_error if the mapping is not open after open().
    gz_graph(const char * filename)
    {
        m_file.open(filename);
        if (!m_file.is_open()) {
            throw std::runtime_error("Error opening file");
        }

        m_data = m_file.data();
        // m_data points to char, so the byte count is already the element count.
        m_data_size = m_file.size();

        // Access-pattern hint only: nothing below depends on it, so a failure
        // is deliberately ignored instead of being stored in an unused local.
        (void)posix_madvise(const_cast<char*>(m_data), m_data_size, POSIX_MADV_SEQUENTIAL);
    }

    class iterator;

    /// Iterator positioned on the first edge (equal to end() if there is none).
    iterator begin() const
    {
        return iterator(this, false);
    }

    /// Past-the-end iterator; it never opens a decompression stream.
    iterator end() const
    {
        return iterator(this, true);
    }

    /// Iterator over the edges of a gz_graph.
    ///
    /// The decompression stream cannot be copied or rewound, so the iterator
    /// counts how many lines it has consumed (m_pos). A copy re-opens the
    /// stream and skips that many lines, which makes copying cost time
    /// proportional to the position: prefer pre-increment and avoid copying
    /// iterators inside hot loops.
    class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
        public:
            /// @param ref      graph whose mapped bytes are read; must outlive the iterator.
            /// @param consumed true builds the past-the-end iterator, false reads the first edge.
            iterator(gz_graph const * ref, bool consumed)
                : m_ref(ref),
                  m_cur_edge(-1, -1),
                  m_consumed(consumed),
                  m_pos(0)
            {
                if (!consumed) {
                    initialize();
                    advance();
                }
            }

            /// Copies the position of `x`.
            ///
            /// The current edge is copied directly; the new stream is then moved
            /// past the m_pos lines `x` has already consumed so that the next
            /// increment yields the same edge on both iterators.
            ///
            /// @throws std::runtime_error if the stream cannot be moved to that position.
            iterator(const iterator& x)
                : m_ref(x.m_ref),
                  m_cur_edge(x.m_cur_edge),
                  m_consumed(x.m_consumed),
                  m_pos(x.m_pos)
            {
                if (!m_consumed) {
                    initialize();

                    std::string skipped;
                    for (size_t i = 0; i < m_pos; ++i) {
                        if (!std::getline(m_in, skipped))
                            throw std::runtime_error("Unable to reposition copied iterator");
                    }
                }
            }

            /// Edge the iterator currently points to.
            value_type const& operator*() const
            {
                return m_cur_edge;
            }

            value_type const* operator->() const
            {
                return &m_cur_edge;
            }

            /// Reads the next edge, or becomes equal to end() when the input is exhausted.
            iterator& operator++()
            {
                advance();
                return *this;
            }

            /// Two iterators over the same graph are equal when both are
            /// exhausted or both have consumed the same number of lines.
            /// Comparing positions instead of edge values keeps distinct
            /// positions that hold identical edges apart, and keeps a real
            /// (-1, -1) record from being mistaken for the end.
            bool operator==(iterator const& other) const
            {
                assert(m_ref == other.m_ref);
                if (m_consumed || other.m_consumed)
                    return m_consumed == other.m_consumed;
                return m_pos == other.m_pos;
            }

            bool operator!=(iterator const& other) const
            {
                return !(*this == other);
            }

        private:
            /// Builds the chain: gzip decompressor reading from the mapped bytes.
            void initialize()
            {
                boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
                m_in.push(boost::iostreams::gzip_decompressor());
                m_in.push(source);
            }

            /// Consumes one line and parses it into m_cur_edge.
            ///
            /// @throws std::runtime_error if the line does not have exactly two fields.
            /// Conversion failures surface as whatever boost::lexical_cast throws.
            void advance()
            {
                // Already past the end (or built as end()): nothing left to read.
                if (m_consumed)
                    return;

                std::string line_str;
                if (!std::getline(m_in, line_str)) {
                    m_consumed = true;
                    m_cur_edge = Edge(-1, -1);
                    return;
                }

                // Count the line as soon as it has left the stream so that the
                // counter always matches the stream position, even if parsing throws.
                ++m_pos;

                std::vector<std::string> strs;
                boost::split(strs, line_str, boost::is_any_of("\t"));

                if (strs.size() != 2)
                    throw std::runtime_error("Required 2 fields per line");

                int src = boost::lexical_cast<int>(strs.at(0));
                int dst = boost::lexical_cast<int>(strs.at(1));

                m_cur_edge = Edge(src, dst);
            }

            gz_graph const * m_ref;                      // graph that owns the mapped bytes
            Edge m_cur_edge;                             // last edge read; (-1, -1) when exhausted
            boost::iostreams::filtering_istream m_in;    // per-iterator decompression stream
            bool m_consumed;                             // true once the input is exhausted
            size_t m_pos;                                // number of lines consumed from m_in
        };

private:
    boost::iostreams::mapped_file_source m_file;
    char const* m_data;      // start of the mapped (still compressed) bytes
    size_t m_data_size;      // number of mapped bytes
};

解决方案

I would just use a std::istream_iterator here. I'm not sure how exactly to interpret your "input pseudo-code", so let me humor you and do the "complicated" parsing:

struct Edge : std::pair<int, int> { };

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

I expect you would be happy to have it much simpler, but simpler is easier, right?

Now the main program looks like

for (
        std::istream_iterator<Edge> it(fs >> std::noskipws), end;
        it != end;
        ++it)
{
    std::cout << it->first << " to " << it->second << "\n";
}

Where fs is the filtering_istream that has the gzip_decompressor. See it Live On Coliru

Full Code

#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <iterator>

struct Edge : std::pair<int, int> {
};

std::istream& operator>>(std::istream& is, Edge& edge)
{
    using namespace boost::spirit::qi;
    return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}

namespace io = boost::iostreams;

int main()
{
    io::mapped_file_source csv("csv.txt.gz");

    io::stream<io::mapped_file_source> textstream(csv);
    io::filtering_istream fs;
    fs.push(io::gzip_decompressor{});
    fs.push(textstream);

    for (
            std::istream_iterator<Edge> it(fs >> std::noskipws), last;
            it != last;
            ++it)
    {
        std::cout << it->first << " to " << it->second << "\n";
    }
}

这篇关于迭代与升压mmaped gzip文件的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

迭代与升压mmaped gzip文件 [英] Iterating over mmaped gzip file with boost

问题描述

全code

Full Code

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

迭代与升压mmaped gzip文件 [英] Iterating over mmaped gzip file with boost

问题描述

全code

Full Code

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭