使用 Boost 遍历内存映射(mmap)的 gzip 文件 [英] Iterating over mmaped gzip file with boost
问题描述
我正在学习 Boost 和 C++ 中的一些模板编程,但在实现一个使用 mapped_file_source
遍历 gzip 文件的简单类时遇到了很大的困难。我有一个 TSV 格式的边列表,gzip 文件中每一行的格式为:<src:int><tab><dst:int>
。我想要实现一个 gz_file
类,它提供 begin 和 end 迭代器,每次访问迭代器时我都可以得到一条边(std::pair<int,int>
)。
问题在于拷贝构造函数是有缺陷的,因为我无法知道自己当前位于 gzip 文件中的哪个位置。
下面是我目前写的代码:
class gz_graph {
public:
gz_graph(const char * filename)
{
m_file.open(filename);
if (!m_file.is_open()) {
throw std::runtime_error("Error opening file");
}
m_data = m_file.data();
m_data_size = m_file.size() / sizeof(m_data[0]);
auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL);
}
class iterator;
iterator begin() const
{
return iterator(this, false);
}
iterator end() const
{
return iterator(this, true);
}
class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
public:
iterator(gz_graph const * ref, bool consumed)
: m_ref(ref),
m_cur_edge(-1, -1),
m_consumed(consumed)
{
if (!consumed) {
initialize();
advance();
}
}
iterator(const iterator& x)
: m_ref(x.m_ref),
m_cur_edge(x.m_cur_edge)
{
if (!x.m_consumed) {
initialize();
advance();
}
std::cout << "Copy constructor" << std::endl;
}
value_type const& operator*() const
{
return m_cur_edge;
}
value_type const* operator->() const
{
return &m_cur_edge;
}
iterator& operator++()
{
advance();
return *this;
}
bool operator==(iterator const& other) const
{
assert(m_ref == other.m_ref);
return m_cur_edge == other.m_cur_edge;
}
bool operator!=(iterator const& other) const
{
return !(*this == other);
}
private:
void initialize()
{
boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
m_in.push(boost::iostreams::gzip_decompressor());
m_in.push(source);
}
void advance()
{
std::string line_str;
if (!getline(m_in, line_str)) {
m_consumed = true;
m_cur_edge = Edge(-1, -1);
return;
}
std::vector<std::string> strs;
boost::split(strs, line_str, boost::is_any_of("\t"));
if (strs.size() != 2)
throw std::runtime_error("Required 2 fields per line");
int src = boost::lexical_cast<int>(strs.at(0));
int dst = boost::lexical_cast<int>(strs.at(1));
m_cur_edge = Edge(src, dst);
// std::cout << "Read line " << line_str << std::endl;
}
gz_graph const * m_ref;
Edge m_cur_edge;
boost::iostreams::filtering_istream m_in;
bool m_consumed;
};
private:
boost::iostreams::mapped_file_source m_file;
char const* m_data;
size_t m_data_size;
};
我会直接在这里使用 std::istream_iterator
。我不确定该如何准确理解你的"输入伪代码",所以就姑且顺着你的意思,做一次"复杂"的解析:
struct Edge : std::pair<int, int> { };
std::istream& operator>>(std::istream& is, Edge& edge)
{
using namespace boost::spirit::qi;
return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}
我想你会更希望它简单得多,不过越简单就越容易,对吧?
现在主程序看起来像这样:
for (
std::istream_iterator<Edge> it(fs >> std::noskipws), end;
it != end;
++it)
{
std::cout << it->first << " to " << it->second << "\n";
}
其中 fs
是一个 filtering_istream
,其中压入了 gzip_decompressor
。参见 Coliru 上的在线演示(Live On Coliru)
完整代码
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <iterator>
struct Edge : std::pair<int, int> {
};
std::istream& operator>>(std::istream& is, Edge& edge)
{
using namespace boost::spirit::qi;
return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}
namespace io = boost::iostreams;
int main()
{
io::mapped_file_source csv("csv.txt.gz");
io::stream<io::mapped_file_source> textstream(csv);
io::filtering_istream fs;
fs.push(io::gzip_decompressor{});
fs.push(textstream);
for (
std::istream_iterator<Edge> it(fs >> std::noskipws), last;
it != last;
++it)
{
std::cout << it->first << " to " << it->second << "\n";
}
}
I am trying to learn Boost and some template programming in C++, but I am having a really hard time implementing a simple class for iterating over gzip files using mapped_file_source
. I essentially have an edge list in TSV format such that each line in the gzip file is of the format: <src:int><tab><dst:int>
. What I want is to implement a gz_file
class that exposes a begin and end iterator over which I can get an edge (std::pair<int,int>
) each time I query the iterator.
The problem is the copy constructor, which is broken because I cannot know where I am positioned in the gzip file.
Here is the code I have so far:
/// Read-only view over a memory-mapped, gzip-compressed edge list in which
/// every decompressed line is `<src:int>\t<dst:int>`.
///
/// `begin()`/`end()` expose the edges one at a time. Each iterator owns its
/// own decompression stream over the mapped bytes and reads through a pointer
/// to this object, so the gz_graph must outlive every iterator it hands out.
class gz_graph {
public:
    /// Maps `filename` into memory.
    /// @throws std::runtime_error if the file is not open after the call.
    gz_graph(const char * filename)
    {
        m_file.open(filename);
        if (!m_file.is_open()) {
            throw std::runtime_error("Error opening file");
        }

        m_data = m_file.data();
        m_data_size = m_file.size();

        // Purely advisory access-pattern hint: a failure does not affect
        // correctness, so the return value is deliberately discarded.
        static_cast<void>(
            posix_madvise(const_cast<char*>(m_data), m_data_size, POSIX_MADV_SEQUENTIAL));
    }

    class iterator;

    /// Iterator positioned on the first edge (already exhausted if there is none).
    iterator begin() const
    {
        return iterator(this, false);
    }

    /// Past-the-end iterator.
    iterator end() const
    {
        return iterator(this, true);
    }

    /// Iterator over the edges of a gz_graph.
    ///
    /// A gzip stream cannot be cloned or seeked, so a copy re-decompresses
    /// from the start and skips forward to the source's position. Copies are
    /// therefore independent and correctly positioned, but cost time
    /// proportional to the number of edges already read: prefer `++it` and
    /// avoid copying inside loops.
    ///
    /// NOTE(review): std::iterator is deprecated since C++17, and operator*
    /// returns a reference into the iterator itself; both are kept here so the
    /// nested typedefs callers may rely on do not change.
    class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
    public:
        /// @param ref      graph to read from (not owned).
        /// @param consumed true to build the past-the-end iterator; false to
        ///                 start decompressing and load the first edge.
        iterator(gz_graph const * ref, bool consumed)
            : m_ref(ref),
              m_cur_edge(-1, -1),
              m_consumed(consumed),
              m_position(0)
        {
            if (!consumed) {
                initialize();
                advance();
            }
        }

        /// Copies `x`, including its end-of-data state and its position.
        iterator(const iterator& x)
            : m_ref(x.m_ref),
              m_cur_edge(x.m_cur_edge),
              m_consumed(x.m_consumed),
              m_position(x.m_position)
        {
            if (!m_consumed) {
                // The current edge was copied above; only the stream has to be
                // brought to the same place as x's stream.
                initialize();
                skip_lines(m_position);
            }
        }

        value_type const& operator*() const
        {
            return m_cur_edge;
        }

        value_type const* operator->() const
        {
            return &m_cur_edge;
        }

        /// Moves to the next edge.
        /// @throws std::runtime_error if the next line does not have exactly 2 fields.
        iterator& operator++()
        {
            advance();
            return *this;
        }

        /// Two iterators over the same graph are equal when both are exhausted
        /// or both sit on the same line. Comparing positions rather than edge
        /// values keeps repeated edges, and a literal (-1, -1) edge in the
        /// data, from being mistaken for each other or for end().
        bool operator==(iterator const& other) const
        {
            assert(m_ref == other.m_ref);
            if (m_consumed || other.m_consumed) {
                return m_consumed == other.m_consumed;
            }
            return m_position == other.m_position;
        }

        bool operator!=(iterator const& other) const
        {
            return !(*this == other);
        }

    private:
        /// Builds the decompression chain over the graph's mapped bytes.
        void initialize()
        {
            boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
            m_in.push(boost::iostreams::gzip_decompressor());
            m_in.push(source);
        }

        /// Switches to the past-the-end state.
        void mark_consumed()
        {
            m_consumed = true;
            m_cur_edge = Edge(-1, -1);
        }

        /// Reads and parses the next line into m_cur_edge, or marks the
        /// iterator exhausted when no further line can be read.
        void advance()
        {
            // An exhausted iterator may have no stream chain at all (end()),
            // so never touch m_in once the end has been reached.
            if (m_consumed) {
                return;
            }

            std::string line_str;
            if (!getline(m_in, line_str)) {
                mark_consumed();
                return;
            }

            std::vector<std::string> strs;
            boost::split(strs, line_str, boost::is_any_of("\t"));
            if (strs.size() != 2)
                throw std::runtime_error("Required 2 fields per line");

            int src = boost::lexical_cast<int>(strs.at(0));
            int dst = boost::lexical_cast<int>(strs.at(1));
            m_cur_edge = Edge(src, dst);
            ++m_position;
        }

        /// Discards `count` lines without parsing them; used to reposition a
        /// freshly initialized stream when copying.
        void skip_lines(size_t count)
        {
            std::string discarded;
            for (size_t i = 0; i < count; ++i) {
                if (!getline(m_in, discarded)) {
                    mark_consumed();
                    return;
                }
            }
        }

        gz_graph const * m_ref;
        Edge m_cur_edge;
        boost::iostreams::filtering_istream m_in;
        bool m_consumed;
        size_t m_position;  // number of lines successfully read so far
    };

private:
    boost::iostreams::mapped_file_source m_file;
    char const* m_data;
    size_t m_data_size;
};
I would just use a std::istream_iterator
here. I'm not sure how exactly to interpret your "input pseudo-code", so let me humor you and do the "complicated" parsing:
// A (first, second) pair of ints given its own type so that a dedicated
// stream extraction operator can be overloaded for it.
struct Edge : std::pair<int, int> { };
std::istream& operator>>(std::istream& is, Edge& edge)
{
using namespace boost::spirit::qi;
return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}
I expect you would be happy to have it much simpler, but simpler is easier, right?
Now the main program looks like
for (
std::istream_iterator<Edge> it(fs >> std::noskipws), end;
it != end;
++it)
{
std::cout << it->first << " to " << it->second << "\n";
}
Where fs
is the filtering_istream
that has the gzip_decompressor
. See it Live On Coliru
Full Code
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <iterator>
// A (first, second) pair of ints given its own type so that a dedicated
// stream extraction operator can be overloaded for it.
struct Edge : std::pair<int, int> {
};
std::istream& operator>>(std::istream& is, Edge& edge)
{
using namespace boost::spirit::qi;
return is >> match("src:" > int_ > '\t' > "dst:" > int_ >> eol, edge.first, edge.second);
}
namespace io = boost::iostreams;
int main()
{
io::mapped_file_source csv("csv.txt.gz");
io::stream<io::mapped_file_source> textstream(csv);
io::filtering_istream fs;
fs.push(io::gzip_decompressor{});
fs.push(textstream);
for (
std::istream_iterator<Edge> it(fs >> std::noskipws), last;
it != last;
++it)
{
std::cout << it->first << " to " << it->second << "\n";
}
}
这篇关于使用 Boost 遍历内存映射 gzip 文件的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!