如何在C ++中快速,安全地从文本文件中读取极长的行? [英] how to read extreme long lines from text file fast and safe in C++?

查看:39
本文介绍了如何在C ++中快速,安全地从文本文件中读取极长的行?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

有一个6.53 GiB的大文本文件.它的每一行都可以是数据行或注释行.注释行通常很短,少于80个字符,而数据行包含超过200万个字符,并且长度可变.

考虑到每个数据行都需要作为一个整体来处理,在C ++中是否有一种简单的方法可以安全,快速地读取这些行?

安全(对于可变长度的数据行是安全的):该解决方案与 std::getline() 一样易于使用.由于行的长度不固定,因此希望避免额外的内存管理.

快速:该解决方案可以达到 python 3.6.0 中的 readline()最快的速度,甚至可以达到 stdio.h 的fgets().

欢迎使用Pure C解决方案.在C和C ++中都提供了用于进一步处理的接口.


更新1 :感谢 Basile Starynkevitch 的简短但无价的评论,完美的解决方案出现了: POSIX getline().由于进一步处理仅涉及从字符到数字的转换,并且不使用字符串类的许多功能,因此在此应用程序中使用char数组就足够了.


更新2 :感谢 Zulan 和 Galik 的评论,他们都报告了 std::getline(), fgets() 和 POSIX getline() 之间性能相当;因此另一种可能的解决方案是使用更好的标准库实现,例如 libstdc++ .此外,这里有一份报告,声称 Visual C++ 和 libc++ 的 std::getline 实现没有得到很好的优化.

从 libc++ 换成 libstdc++ 会大大改变结果.在另一个平台上使用 libstdc++ 3.4.13 / Linux 2.6.32 时, POSIX getline(), std::getline() 和 fgets() 表现出相近的性能.最初,代码是在 Xcode 8.3.2 (8E2002) 中 clang 的默认设置下运行的,因此使用的是 libc++ .


更多细节和一些努力(很长):

<string> 的 getline() 可以处理任意长的行,但速度有点慢.C++ 中是否有可以替代 Python 中 readline() 的方法?

 // 在 Mac OS X(libc++,SSD)上的基准测试:
 python 的 readline()                          ~550 MiB/s

 stdio.h 的 fgets(), -O0 / -O2                ~1100 MiB/s

 string 的 getline(), -O0                       ~27 MiB/s
 string 的 getline(), -O2                      ~150 MiB/s
 string 的 getline() + 栈上缓冲区, -O2         ~150 MiB/s

 ifstream 的 getline(), -O0 / -O2              ~240 MiB/s
 ifstream 的 read(), -O2                       ~340 MiB/s

 wc -l                                         ~670 MiB/s

 cat data.txt | ./read-cin-unsync               ~20 MiB/s

 stdio.h (POSIX.1-2008) 的 getline(), -O0     ~1300 MiB/s

  • 速度被粗略地舍入,仅用于显示幅度,并且所有代码块都运行几次以确保这些值具有代表性.

  • '-O0/-O2'表示两个优化级别的速度非常相似

  • 代码如下所示.


python的

readline()

 # readline.py
#
# Benchmark: count the "data" lines (longer than 80 characters) of data.txt
# with file.readline() and report the throughput in MiB/s.

import time
import os

started = time.perf_counter()

path = 'data.txt'
n_data_lines = 0

source = open(path, 'rt')
# readline() returns '' only at EOF, so '' serves as the iter() sentinel.
for text in iter(source.readline, ''):
    if len(text) > 80:      # longer than a comment line: a data line
        n_data_lines += 1
source.close()

elapsed = time.perf_counter() - started

size_mib = os.path.getsize(path) / 1024 / 1024   # file size in MiB
print("speed: %d MiB/s" % (size_mib / elapsed))
print("reads %d data lines" % n_data_lines)

# run as `python readline.py` with python 3.6.0


stdio.h

fgets()

  #include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

/*
 * Benchmark fgets(): counts the lines of the file named by argv[1] that are
 * longer than 80 characters (data lines) and prints CPU time and throughput.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the argument is missing, the
 * file cannot be opened or the line buffer cannot be allocated.
 */
int main(int argc, char* argv[]){
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  FILE* fp = fopen(argv[1], "r");
  if(fp == NULL) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // maximum length of lines, determined previously by python
  const int SIZE = 1024*1024*3;
  // Heap allocation: in C, `char line[SIZE]` with a const int bound is a
  // 3 MiB variable-length array on the stack. The cast keeps the file
  // compilable as C++ as well.
  char* line = (char*)malloc((size_t)SIZE);
  if(line == NULL) {
    perror("Failed to allocate line buffer");
    fclose(fp);
    return EXIT_FAILURE;
  }

  int count = 0;
  size_t line_len = 0;  // length of the current line, summed over its chunks
  while(fgets(line, SIZE, fp) != NULL) {
    const size_t chunk = strlen(line);
    line_len += chunk;
    // fgets() returns a line longer than SIZE-1 in several chunks; only the
    // chunk ending in '\n' completes the line, so it is counted once.
    if(chunk > 0 && line[chunk-1] == '\n') {
      if(line_len > 80) {
        count += 1;
      }
      line_len = 0;
    }
  }
  // final line without a trailing newline
  if(line_len > 80) {
    count += 1;
  }

  clock_t t_end = clock();

  free(line);
  fclose(fp);

  const double fsize = 6685;  // file size in MiB

  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}


< string>

getline()

 // readline-string-getline.cpp
#include <string>
#include <fstream>
#include <iostream>
#include <cstdio>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark std::getline(istream&, string&): counts the lines longer than 80
// characters in the file named by argv[1], reading through a 3 MiB stream
// buffer placed on the stack, then prints CPU time and throughput.
int main(int argc, char* argv[]) {
  const clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  // 3 MiB serves both as the stream buffer size and as the string's initial
  // capacity (maximum line length, determined previously by python); the
  // stack on the author's platform is 8 MiB.
  const int THREE_MIB = 1024*1024*3;

  // The buffer is installed before open(). To benchmark the default buffer
  // instead, construct the stream directly: ifstream input(argv[1]);
  char stream_buf[THREE_MIB];
  ifstream input;
  input.rdbuf()->pubsetbuf(stream_buf, THREE_MIB);
  input.open(argv[1]);

  if(input.fail()) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  string text;
  text.reserve(THREE_MIB);

  int n_data = 0;
  while(getline(input, text)) {
    n_data += (text.size() > 80) ? 1 : 0;
  }

  const clock_t t_end = clock();

  const double fsize = 6685;  // file size in MiB
  const double seconds = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", seconds);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/seconds));
  fprintf(stdout, "reads %d data lines\n", n_data);

  return EXIT_SUCCESS;
}


ifstream

getline()

 // readline-ifstream-getline.cpp
#include <fstream>
#include <iostream>
#include <cstdio>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark ifstream::getline(char*, n): counts the lines longer than 80
// characters in the file named by argv[1] and prints CPU time and
// throughput. A warning is printed when reading stops before end of file.
int main(int argc, char* argv[]) {
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  ifstream fin(argv[1]);
  if(!fin) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // maximum length of lines, determined previously by python
  const int SIZE = 1024*1024*3;
  char line[SIZE];

  int count = 0;
  while(fin.getline(line, SIZE)) {
    // gcount() is the number of characters extracted, so no second pass over
    // the buffer is needed. It includes the '\n' that was consumed but not
    // stored; a last line ended by EOF has no such delimiter.
    streamsize length = fin.gcount();
    if(!fin.eof()) {
      length -= 1;
    }
    if(length > 80) {
      count += 1;
    }
  }

  // A normal end sets eofbit. Without it the loop ended on a line that did
  // not fit into the buffer or on a read error.
  if(!fin.eof()) {
    fprintf(stderr, "stopped before end of file (line too long or read error)\n");
  }

  clock_t t_end = clock();

  const double fsize = 6685;  // file size in MiB

  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}


ifstream

read()

 // seq-read-bin.cpp
// sequentially read the file to see the speed upper bound of
// ifstream

#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <ctime>

using namespace std;

// Reads the file named by argv[1] in 3 MiB binary chunks, discarding the
// data, and prints CPU time and throughput.
int main(int argc, char* argv[]) {
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  ifstream fin(argv[1], ios::binary);
  // Without this check a missing file would skip the loop and report a
  // meaningless speed.
  if(!fin) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  const int SIZE = 1024*1024*3;
  char str[SIZE];

  // read() sets failbit on the final short read, which ends the loop
  while(fin) {
    fin.read(str,SIZE);
  }

  clock_t t_end = clock();
  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  const double fsize = 6685;  // file size in MiB

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));

  return EXIT_SUCCESS;
}


使用 cat ,然后在调用 cin.sync_with_stdio(false) 之后从 cin 中读取

  #include <iostream>
#include <string>
#include <cstdio>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark std::getline on std::cin with stdio synchronisation disabled:
// reads standard input line by line, discards the lines, and prints CPU
// time and throughput.
int main(void) {
  const clock_t t_start = clock();

  string input_line;

  // static member of ios_base, called before any input is read
  ios_base::sync_with_stdio(false);

  while(getline(cin, input_line)) {
    // lines are read and discarded
  }

  const double seconds = (clock() - t_start) / (double)CLOCKS_PER_SEC;

  const double fsize = 6685;  // file size in MiB

  fprintf(stdout, "takes %.2f s\n", seconds);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/seconds));

  return EXIT_SUCCESS;
}


POSIX getline()

 //readline-c-getline.c#include< stdio.h>#include< stdlib.h>#include< time.h>int main(int argc,char * argv []){clock_t t_start = clock();char * line = NULL;size_t len = 0;ssize_t nread;如果(argc!= 2){fprintf(stderr,用法:%s< file> \ n",argv [1]);退出(EXIT_FAILURE);}FILE * stream = fopen(argv [1],"r");if(stream == NULL){perror("fopen");退出(EXIT_FAILURE);}int长度= -1;int count = 0;while((nread = getline(& line,& len,stream))!= -1){如果(nread> 80){计数+ = 1;}}自由行);fclose(stream);双倍时间=(clock()-t_start)/(double)CLOCKS_PER_SEC;const double fsize = 6685;//MiB中的文件大小fprintf(stdout,花费%.2f s \ n",时间);fprintf(stdout,"speed:%d MiB/s \ n",(int)(fsize/time));fprintf(stdout,读取%d条数据线.\ n",计数);//fprintf(stdout,"MSA的长度:%d \ n",长度为1);退出(EXIT_SUCCESS);} 

解决方案

正如我评论的那样,在Linux&POSIX系统,您可以考虑使用 getline(3);我猜下面的代码可以同时编译为C和C ++(假设您确实有一些有效的 fopen -ed FILE * fil; ...)

  char * linbuf = NULL;///或C ++中的nullptrsize_t linsiz = 0;ssize_t linlen = 0;while(((linlen = getline(& linbuf,& linsiz,fil))> = 0){//对linbuf做一些有用的事情;但没有C ++异常}免费(linbuf);linsiz = 0; 

我猜想这可能对C ++有效(或很容易适应).但是,然后提防C ++异常,它们不应经过while循环(或应确保适当的析构函数或 catch 正在执行 free(linbuf); ).

getline 也可能失败(例如,如果它调用失败的 malloc ),则您可能需要明智地处理该失败.

There is a large text file of 6.53 GiB. Each line of it can be a data line or comment line. Comment lines are usually short, less than 80 characters, while a data line contains more than 2 million characters and is variable-length.

Considering that each data line needs to be dealt with as a unit, is there a simple way to read lines safely and quickly in C++?

safe (safe for variable-length data lines): The solution is as easy to use as std::getline(). Since the length is changing, it is hoped to avoid extra memory management.

fast: The solution can achieve as fast as readline() in python 3.6.0, or even as fast as fgets() of stdio.h.

A Pure C solution is welcomed. The interface for further processing is provided both in C and C++.


UPDATE 1: Thanks to a short but invaluable comment from Basile Starynkevitch, the perfect solution came up: POSIX getline(). Since further processing only involves converting characters to numbers and does not use many features of the string class, a char array is sufficient in this application.


UPDATE 2: Thanks to comments from Zulan and Galik, who both report comparable performance among std::getline(), fgets() and POSIX getline(), another possible solution is to use a better standard library implementation such as libstdc++. Moreover, here is a report claiming that the Visual C++ and libc++ implementations of std::getline are not well optimised.

Moving from libc++ to libstdc++ changes the results a lot. With libstdc++ 3.4.13 / Linux 2.6.32 on a different platform, POSIX getline(), std::getline() and fgets() show comparable performance. Initially, the code was run under the default settings of clang in Xcode 8.3.2 (8E2002), so libc++ was used.


More details and some efforts (very long):

getline() of <string> can handle arbitrarily long lines but is a bit slow. Is there an alternative in C++ to readline() in python?

// benchmark on Mac OS X with libc++ and SSD:
readline() of python                         ~550 MiB/s

fgets() of stdio.h, -O0 / -O2               ~1100 MiB/s

getline() of string, -O0                      ~27 MiB/s
getline() of string, -O2                     ~150 MiB/s
getline() of string + stack buffer, -O2      ~150 MiB/s

getline() of ifstream, -O0 / -O2             ~240 MiB/s
read() of ifstream, -O2                      ~340 MiB/s

wc -l                                        ~670 MiB/s

cat data.txt | ./read-cin-unsync              ~20 MiB/s

getline() of stdio.h (POSIX.1-2008), -O0    ~1300 MiB/s

  • Speeds are rounded very roughly, only to show the magnitude, and all code blocks are run several times to assure that the values are representative.

  • '-O0 / -O2' means the speeds are very similar for both optimization levels

  • The code is shown below.


readline() of python

# readline.py
#
# Benchmark: count the "data" lines (longer than 80 characters) of data.txt
# with file.readline() and report the throughput in MiB/s.

import time
import os

started = time.perf_counter()

path = 'data.txt'
n_data_lines = 0

source = open(path, 'rt')
# readline() returns '' only at EOF, so '' serves as the iter() sentinel.
for text in iter(source.readline, ''):
    if len(text) > 80:      # longer than a comment line: a data line
        n_data_lines += 1
source.close()

elapsed = time.perf_counter() - started

size_mib = os.path.getsize(path) / 1024 / 1024   # file size in MiB
print("speed: %d MiB/s" % (size_mib / elapsed))
print("reads %d data lines" % n_data_lines)

# run as `python readline.py` with python 3.6.0


fgets() of stdio.h

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

/*
 * Benchmark fgets(): counts the lines of the file named by argv[1] that are
 * longer than 80 characters (data lines) and prints CPU time and throughput.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the argument is missing, the
 * file cannot be opened or the line buffer cannot be allocated.
 */
int main(int argc, char* argv[]){
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  FILE* fp = fopen(argv[1], "r");
  if(fp == NULL) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // maximum length of lines, determined previously by python
  const int SIZE = 1024*1024*3;
  // Heap allocation: in C, `char line[SIZE]` with a const int bound is a
  // 3 MiB variable-length array on the stack. The cast keeps the file
  // compilable as C++ as well.
  char* line = (char*)malloc((size_t)SIZE);
  if(line == NULL) {
    perror("Failed to allocate line buffer");
    fclose(fp);
    return EXIT_FAILURE;
  }

  int count = 0;
  size_t line_len = 0;  // length of the current line, summed over its chunks
  while(fgets(line, SIZE, fp) != NULL) {
    const size_t chunk = strlen(line);
    line_len += chunk;
    // fgets() returns a line longer than SIZE-1 in several chunks; only the
    // chunk ending in '\n' completes the line, so it is counted once.
    if(chunk > 0 && line[chunk-1] == '\n') {
      if(line_len > 80) {
        count += 1;
      }
      line_len = 0;
    }
  }
  // final line without a trailing newline
  if(line_len > 80) {
    count += 1;
  }

  clock_t t_end = clock();

  free(line);
  fclose(fp);

  const double fsize = 6685;  // file size in MiB

  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}


getline() of <string>

// readline-string-getline.cpp
#include <string>
#include <fstream>
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark std::getline(istream&, string&): counts the lines longer than 80
// characters in the file named by argv[1], reading through a 3 MiB stream
// buffer placed on the stack, then prints CPU time and throughput.
int main(int argc, char* argv[]) {
  const clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  // 3 MiB serves both as the stream buffer size and as the string's initial
  // capacity (maximum line length, determined previously by python); the
  // stack on the author's platform is 8 MiB.
  const int THREE_MIB = 1024*1024*3;

  // The buffer is installed before open(). To benchmark the default buffer
  // instead, construct the stream directly: ifstream input(argv[1]);
  char stream_buf[THREE_MIB];
  ifstream input;
  input.rdbuf()->pubsetbuf(stream_buf, THREE_MIB);
  input.open(argv[1]);

  if(input.fail()) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  string text;
  text.reserve(THREE_MIB);

  int n_data = 0;
  while(getline(input, text)) {
    n_data += (text.size() > 80) ? 1 : 0;
  }

  const clock_t t_end = clock();

  const double fsize = 6685;  // file size in MiB
  const double seconds = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", seconds);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/seconds));
  fprintf(stdout, "reads %d data lines\n", n_data);

  return EXIT_SUCCESS;
}


getline() of ifstream

// readline-ifstream-getline.cpp
#include <fstream>
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark ifstream::getline(char*, n): counts the lines longer than 80
// characters in the file named by argv[1] and prints CPU time and
// throughput. A warning is printed when reading stops before end of file.
int main(int argc, char* argv[]) {
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  ifstream fin(argv[1]);
  if(!fin) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // maximum length of lines, determined previously by python
  const int SIZE = 1024*1024*3;
  char line[SIZE];

  int count = 0;
  while(fin.getline(line, SIZE)) {
    // gcount() is the number of characters extracted, so no second pass over
    // the buffer (and no <cstring>) is needed. It includes the '\n' that was
    // consumed but not stored; a last line ended by EOF has no delimiter.
    streamsize length = fin.gcount();
    if(!fin.eof()) {
      length -= 1;
    }
    if(length > 80) {
      count += 1;
    }
  }

  // A normal end sets eofbit. Without it the loop ended on a line that did
  // not fit into the buffer or on a read error.
  if(!fin.eof()) {
    fprintf(stderr, "stopped before end of file (line too long or read error)\n");
  }

  clock_t t_end = clock();

  const double fsize = 6685;  // file size in MiB

  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}


read() of ifstream

// seq-read-bin.cpp
// sequentially read the file to see the speed upper bound of
// ifstream

#include <iostream>
#include <fstream>
#include <ctime>

using namespace std;


// Reads the file named by argv[1] in 3 MiB binary chunks, discarding the
// data, and prints CPU time and throughput.
int main(int argc, char* argv[]) {
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  ifstream fin(argv[1], ios::binary);
  // Without this check a missing file would skip the loop and report a
  // meaningless speed.
  if(!fin) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  const int SIZE = 1024*1024*3;
  char str[SIZE];

  // read() sets failbit on the final short read, which ends the loop
  while(fin) {
    fin.read(str,SIZE);
  }

  clock_t t_end = clock();
  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  const double fsize = 6685;  // file size in MiB

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));

  return EXIT_SUCCESS;
}


use cat, then read from cin with cin.sync_with_stdio(false)

#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark std::getline on std::cin with stdio synchronisation disabled:
// reads standard input line by line, discards the lines, and prints CPU
// time and throughput.
int main(void) {
  const clock_t t_start = clock();

  string input_line;

  // static member of ios_base, called before any input is read
  ios_base::sync_with_stdio(false);

  while(getline(cin, input_line)) {
    // lines are read and discarded
  }

  const double seconds = (clock() - t_start) / (double)CLOCKS_PER_SEC;

  const double fsize = 6685;  // file size in MiB

  fprintf(stdout, "takes %.2f s\n", seconds);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/seconds));

  return EXIT_SUCCESS;
}


POSIX getline()

// readline-c-getline.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Benchmark POSIX getline(): counts the lines of the file named by argv[1]
 * whose length (including the newline) exceeds 80 characters and prints CPU
 * time and throughput.
 *
 * Exits with EXIT_SUCCESS, or EXIT_FAILURE on a usage error, an open failure
 * or a read error.
 */
int main(int argc, char *argv[]) {

  clock_t t_start = clock();

  char *line = NULL;   // buffer owned and grown by getline()
  size_t len = 0;      // current capacity of `line`
  ssize_t nread;

  if (argc != 2) {
    // The program name is argv[0]; argv[1] is NULL when no argument is given.
    fprintf(stderr, "Usage: %s <file>\n",
            argc > 0 ? argv[0] : "readline-c-getline");
    exit(EXIT_FAILURE);
  }

  FILE *stream = fopen(argv[1], "r");
  if (stream == NULL) {
    perror("fopen");
    exit(EXIT_FAILURE);
  }

  int count = 0;
  while ((nread = getline(&line, &len, stream)) != -1) {
    if (nread > 80) {
      count += 1;
    }
  }

  // getline() returns -1 both at EOF and on failure; tell them apart.
  int read_failed = ferror(stream);
  if (read_failed) {
    perror("getline");
  }

  free(line);
  fclose(stream);

  double time = (clock() - t_start) / (double)CLOCKS_PER_SEC;
  const double fsize = 6685;  // file size in MiB
  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines.\n", count);

  exit(read_failed ? EXIT_FAILURE : EXIT_SUCCESS);
}

解决方案

As I commented, on Linux & POSIX systems, you could consider using getline(3); I guess that the following could compile both as C and as C++ (assuming you do have some valid fopen-ed FILE*fil; ...)

char* linbuf = NULL; /// or nullptr in C++
size_t linsiz = 0;
ssize_t linlen = 0;

while((linlen=getline(&linbuf, &linsiz,fil))>=0) {
  // do something useful with linbuf; but no C++ exceptions
}
free(linbuf); linsiz=0;

I guess this might work in (or be easily adapted to) C++. But then, beware of C++ exceptions: they should not propagate out of the while loop (or you should ensure that an appropriate destructor or catch block does the free(linbuf);).

Also getline could fail (e.g. if it calls a failing malloc) and you might need to handle that failure sensibly.

这篇关于如何在C ++中快速,安全地从文本文件中读取极长的行?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆