如何基于单词作为键分隔符读取文本文件的块? [英] How to read chunks for a text file based on a word as a key delimiter?
问题描述
我有一个.txt文件,其格式为:
I have a .txt file with this format:
Part #368 - XXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Part #369 - XXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Part #370 - XXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
我这样读取文件:
// Load the whole file as UTF-8 text, break it on "\n", and discard empty lines.
const lines = fs
  .readFileSync('file.txt', 'utf-8')
  .split('\n')
  .filter((line) => line.length > 0);
因此它返回文件行的数组.如何获取以"Part"字符串开头的文件块?
So it returns an array of the lines of the file. How can I get the chunks of the file starting with the "Part" string?
// Keep only the lines that contain the text "Part" anywhere in them.
const parts = _.filter(lines, (line) => line.includes('Part'));
类似这样,但我不想只得到以"Part"开头的字符串,而是希望得到从一个"Part"字符串到下一个"Part"字符串之间的所有行.
Something like this but instead of getting the strings starting with "Part" I want all the lines from "Part" string to next "Part" string.
推荐答案
JSON流
根据 @Brad 的建议,下面是一个继承自 stream.Transform 的类,
它将文件分隔为JSON数组流:
JSON Stream
As per @Brad's suggestion, here is a class extended from stream.Transform
that delimits the file into a JSON array stream:
const { Transform } = require('stream');
/**
 * Transform stream that splits incoming text on a delimiter and writes the
 * resulting sections out as a single JSON array, emitted incrementally
 * (`["a"`, then `,"b"`, ... and finally `,"last"]`).
 *
 * The text after the last delimiter is held back until the stream ends,
 * because more of that section may still arrive in a later chunk.
 */
class Delimited extends Transform {
  /**
   * @param {object} [options]
   * @param {RegExp|string} [options.delimiter=/\r?\n/g] - Section separator.
   *   A string is compiled as a regular-expression source with the `g` flag.
   * @param {string} [options.encoding='utf8'] - Encoding used to decode
   *   incoming chunks into the internal text buffer.
   */
  constructor({ delimiter = /\r?\n/g, encoding = 'utf8' } = {}) {
    super();
    this._delimiter = delimiter instanceof RegExp ? delimiter : new RegExp(delimiter, 'g');
    this._encoding = encoding;
    // Text received so far that has not yet been terminated by a delimiter.
    this._buffer = '';
    // True until the first section is written; selects '[' versus ','.
    this._first = true;
  }

  /**
   * Appends the chunk to the internal buffer and emits every section that is
   * now complete as a fragment of the JSON array.
   *
   * @param {Buffer|string} chunk - Incoming data.
   * @param {string} encoding - 'buffer' for Buffer chunks, otherwise the
   *   encoding of the string chunk.
   * @param {Function} callback - Called with `(null, jsonFragment)`.
   */
  _transform(chunk, encoding, callback) {
    // Normalise the chunk to text in the configured encoding.
    // NOTE: each chunk is decoded on its own, so a multi-byte character that
    // is split across two Buffer chunks is not reassembled here.
    if (encoding === 'buffer') {
      this._buffer += chunk.toString(this._encoding);
    } else if (encoding === this._encoding) {
      this._buffer += chunk;
    } else {
      this._buffer += Buffer.from(chunk, encoding).toString(this._encoding);
    }

    // Split unconditionally instead of guarding with `delimiter.test()`:
    // on a global regex `test()` leaves `lastIndex` pointing past the match,
    // so the next call would start searching mid-buffer and could miss a
    // delimiter near the start. Without a match, `split` returns the whole
    // buffer as the only element, which goes straight back into the buffer.
    const sections = this._buffer.split(this._delimiter);
    this._buffer = sections.pop();

    let partialJSON = '';
    for (const section of sections) {
      partialJSON += `${this._first ? '[' : ','}${JSON.stringify(section)}`;
      this._first = false;
    }

    callback(null, partialJSON);
  }

  /**
   * Emits the held-back remainder as the last array element and closes the
   * JSON array. An input with no data at all therefore yields `[""]`.
   *
   * @param {Function} callback - Called with `(null, finalJsonFragment)`.
   */
  _flush(callback) {
    callback(null, `${this._first ? '[' : ','}${JSON.stringify(this._buffer)}]`);
  }
}
示例用法:
const fs = require('fs');
let stream = fs.createReadStream('file.txt', 'utf8');
let transform = new Delimited({ delimiter: /\n\n(?=Part #\d)/g });
let json = '';
transform.on('data', (chunk) => json += chunk);
transform.on('end', () => console.log(JSON.parse(json)));
stream.pipe(transform);
或者,如果您不想将JSON传输到另一个文件,进程或作为客户端响应,则可以通过将输出流设置为 objectMode:true
来将每个部分作为块发出:
Alternatively, if you prefer not to transfer the JSON to another file, process, or as a client response, you can emit each section as its own chunk by constructing the stream with objectMode: true:
const { Transform } = require('stream');
/**
 * Object-mode Transform stream that splits incoming text on a delimiter and
 * emits each section as its own chunk (a string).
 *
 * The text after the last delimiter is held back until the stream ends,
 * because more of that section may still arrive in a later chunk.
 */
class Delimited extends Transform {
  /**
   * @param {RegExp|string} [delimiter=/\r?\n/g] - Section separator. A string
   *   is compiled as a regular-expression source with the `g` flag.
   */
  constructor(delimiter = /\r?\n/g) {
    super({ objectMode: true });
    this._delimiter = delimiter instanceof RegExp ? delimiter : new RegExp(delimiter, 'g');
    this._encoding = 'utf8';
    // Text received so far that has not yet been terminated by a delimiter.
    this._buffer = '';
  }

  /**
   * Appends the chunk to the internal buffer and pushes every section that
   * is now complete.
   *
   * @param {Buffer|string} chunk - Incoming data.
   * @param {string} encoding - 'buffer' for Buffer chunks, otherwise the
   *   encoding of the string chunk.
   * @param {Function} callback - Called with no arguments once the chunk has
   *   been consumed.
   */
  _transform(chunk, encoding, callback) {
    // Normalise the chunk to UTF-8 text.
    // NOTE: each chunk is decoded on its own, so a multi-byte character that
    // is split across two Buffer chunks is not reassembled here.
    if (encoding === 'buffer') {
      this._buffer += chunk.toString(this._encoding);
    } else if (encoding === this._encoding) {
      this._buffer += chunk;
    } else {
      this._buffer += Buffer.from(chunk, encoding).toString(this._encoding);
    }

    // Split unconditionally instead of guarding with `delimiter.test()`:
    // on a global regex `test()` leaves `lastIndex` pointing past the match,
    // so the next call would start searching mid-buffer and could miss a
    // delimiter near the start. Without a match, `split` returns the whole
    // buffer as the only element, which goes straight back into the buffer.
    const sections = this._buffer.split(this._delimiter);
    this._buffer = sections.pop();

    // Push one section per call; passing `this.push` straight to forEach
    // would hand it the array index as its encoding argument.
    for (const section of sections) {
      this.push(section);
    }

    callback();
  }

  /**
   * Emits the held-back remainder as the final section.
   *
   * @param {Function} callback - Called with `(null, lastSection)`.
   */
  _flush(callback) {
    callback(null, this._buffer);
  }
}
示例用法:
const fs = require('fs');
let stream = fs.createReadStream('file.txt', 'utf8');
let transform = new Delimited(/\n\n(?=Part #\d)/g);
let array = [];
transform.on('data', (chunk) => array.push(chunk));
transform.on('end', () => console.log(array));
stream.pipe(transform);
这篇关于如何基于单词作为键分隔符读取文本文件的块?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!