阅读大文本文件的n行 [英] Read n lines of a big text file

查看:123
本文介绍了阅读大文本文件的n行的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我所拥有的最小的文件有> 850k行,每行都是未知的长度。目标是在浏览器中从该文件中读取 n 行。



以下是HTML < input type =filename =fileid =文件> 和JS我有:

  var n = 10; 
var reader = new FileReader();
reader.onload = function(progressEvent){
//整个文件
console.log(this.result);

//由行
var lines = this.result.split('\\\
');
for(var line = 0; line< n; line ++){
console.log(lines [line]);
}
};

很明显,这里的问题是,它试图首先实现整个文件,然后用换行符。所以不管是 n ,它都会尝试读取整个文件,当文件很大的时候,最后什么也不读。

b
$ b

我应该怎么做?

注意:我愿意删除整个函数并从头开始,因为我会能够 console.log()我们阅读的每一行。







$ b $

 (0, (1,2))
(1,(4,5,6))
(2,(7))
(3,(8))




编辑:

要走的路将是像大文件 filereader api ,但我可以'看看我可以修改它来读取 n 文件的行...



通过使用 Uint8Array to Javascrip中的字符串那么可以从这里做:

  var view = new Uint8Array(fr.result); 
var string = new TextDecoder(utf-8)。decode(view);
console.log(Chunk+ string);

但是这可能不会读取最后一行,那么如何确定行后来?例如这里是它打印的内容:

 ((7202),(u'11330875493',u'2554375661'))
((1667),(u'9079074735',u'6883914476',


这个逻辑非常类似于我在大文件上的filereader api ,除非您需要跟踪到目前为止您已经处理的行数(也是到目前为止读取的最后一行,因为它可能尚未结束)。下一个示例适用于任何与UTF-8兼容的编码;如果需要其他编码,请查看 TextDecoder 的构造函数。



如果您确定输入是ASCII(或任何其他单字节编码),那么你也可以跳过使用 TextDecoder 和直接使用 FileReader作为文本读取输入 readAsText 方法

  //这只是below.document.getElementById('start')函数的一个例子。onclick = function(){var file = document.getElementById('infile ).files [0];如果(!file){console.log('No file selected。');返回; } var maxlines = parseInt(document.getElementById('maxlines')。value,10);} var lineno = 1;信息范范范范内范范范范范范范范范范范范();});};函数onComplete(){console.log('读取所有行');});}; / ** *阅读并包括| maxlines |来自| file |的行。 * * @param {Blob}文件 - 要读取的文件。 * @param {integer} maxlines  - 要读取的最大行数。 * @param {function(string)} forEachLine  - 为每一行调用。 * @param {function(error)} onComplete  - 在达到文件末尾时调用*或maxlines |行已被阅读。 * / function readSomeLines(file,maxlines,forEachLine,onComplete){var CHUNK_SIZE = 50000; // 50kb,任意选择。 var decoder = new TextDecoder(); var offset = 0; var linecount = 0; var linenumber = 0; var results =''; var fr = new FileReader(); fr.onload = function(){//使用stream:如果我们在多字节字符中间切割文件,则为true result + = decoder.decode(fr.result,{stream:true}); var lines = results.split('\\\
'); results = lines.pop(); //如果线路还没有结束。 linecount + = lines.length; if(linecount> maxlines){//读取太多行?截断结果。 lines.length - = linecount - maxlines; linecount = maxlines; } for(var i = 0; i< lines.length; ++ i){forEachLine(lines [i] +'\\\
'); } offset + = CHUNK_SIZE;寻求(); }; fr.onerror = function(){onComplete(fr.error); };寻求();函数seek(){if(linecount === maxlines){//我们找到了足够的行。的onComplete(); //完成返回; } if(offset!== 0&& offset> = file.size){//我们没有找到所有的行,但是没有更多的行。 forEachLine(结果); //这是从lines.pop(),之前。的onComplete(); //完成返回; } var slice = file.slice(offset,offset + CHUNK_SIZE); fr.readAsArrayBuffer(片); }

读取< input type =numberid = maxlines 的> < input type =fileid =infile>。< input type =buttonid =startvalue =打印行到控制台>


The smallest file I have has > 850k lines and every line is of unknown length. The goal is to read n lines from this file in the browser. Reading it fully is not going to happen.

Here is the HTML <input type="file" name="file" id="file"> and the JS I have:

var n = 10;
var reader = new FileReader();
reader.onload = function(progressEvent) {
  // Entire file
  console.log(this.result);

  // By lines
  var lines = this.result.split('\n');
  for (var line = 0; line < n; line++) {
    console.log(lines[line]);
  }
};

Obviously, the problem here is that it tries to first real the whole file and then split it by newline. So no matter of n, it will try to read the whole file, and eventually read nothing when the file is big.

How should I do it?

Note: I am willing to delete the whole function and start from scratch, given that I will be able to console.log() every line that we read.


*"every line is of unknown length" -> means that the file is something like this:

(0, (1, 2))
(1, (4, 5, 6))
(2, (7))
(3, (8))


Edit:

The way to go would be something like filereader api on big files, but I can't see how I can modify that to read n lines of the file...

By using Uint8Array to string in Javascript too, one can do from there:

var view = new Uint8Array(fr.result);
var string = new TextDecoder("utf-8").decode(view);
console.log("Chunk " + string);

but this may not read the last line as a whole, so how are you going to determine the lines later? For example here is what it printed:

((7202), (u'11330875493', u'2554375661'))
((1667), (u'9079074735', u'6883914476',

解决方案

The logic is very similar to what I wrote in my answer to filereader api on big files, except you need to keep track of the number of lines that you have processed so far (and also the last line read so far, because it may not have ended yet). The next example works for any encoding that is compatible with UTF-8; if you need another encoding look at the options for the TextDecoder constructor.

If you are certain that the input is ASCII (or any other single-byte encoding), then you can also skip the use of TextDecoder and directly read the input as text using the FileReader's readAsText method.

// This is just an example of the function below.
document.getElementById('start').onclick = function() {
    var file = document.getElementById('infile').files[0];
    if (!file) {
        console.log('No file selected.');
        return;
    }
    var maxlines = parseInt(document.getElementById('maxlines').value, 10);
    var lineno = 1;
    // readSomeLines is defined below.
    readSomeLines(file, maxlines, function(line) {
        console.log("Line: " + (lineno++) + line);
    }, function onComplete() {
        console.log('Read all lines');
    });
};

/**
 * Read up to and including |maxlines| lines from |file|.
 *
 * @param {Blob} file - The file to be read.
 * @param {integer} maxlines - The maximum number of lines to read.
 * @param {function(string)} forEachLine - Called for each line.
 * @param {function(error)} onComplete - Called when the end of the file
 *     is reached or when |maxlines| lines have been read.
 */
function readSomeLines(file, maxlines, forEachLine, onComplete) {
    var CHUNK_SIZE = 50000; // 50kb, arbitrarily chosen.
    var decoder = new TextDecoder();
    var offset = 0;
    var linecount = 0;
    var linenumber = 0;
    var results = '';
    var fr = new FileReader();
    fr.onload = function() {
        // Use stream:true in case we cut the file
        // in the middle of a multi-byte character
        results += decoder.decode(fr.result, {stream: true});
        var lines = results.split('\n');
        results = lines.pop(); // In case the line did not end yet.
        linecount += lines.length;
    
        if (linecount > maxlines) {
            // Read too many lines? Truncate the results.
            lines.length -= linecount - maxlines;
            linecount = maxlines;
        }
    
        for (var i = 0; i < lines.length; ++i) {
            forEachLine(lines[i] + '\n');
        }
        offset += CHUNK_SIZE;
        seek();
    };
    fr.onerror = function() {
        onComplete(fr.error);
    };
    seek();
    
    function seek() {
        if (linecount === maxlines) {
            // We found enough lines.
            onComplete(); // Done.
            return;
        }
        if (offset !== 0 && offset >= file.size) {
            // We did not find all lines, but there are no more lines.
            forEachLine(results); // This is from lines.pop(), before.
            onComplete(); // Done
            return;
        }
        var slice = file.slice(offset, offset + CHUNK_SIZE);
        fr.readAsArrayBuffer(slice);
    }
}

Read <input type="number" id="maxlines"> lines from
<input type="file" id="infile">.
<input type="button" id="start" value="Print lines to console">

这篇关于阅读大文本文件的n行的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆