将许多记录保存到nodejs中的couchdb [英] save many records to couchdb in nodejs
问题描述
我有一个非常大的数据集
我希望记录看起来像这样:
I want the records to look like this:
{
"type": "first",
"name": "ryan",
"count": 447980
}
由于文本文件太大，不适合全部读入内存，因此我设置了一个流式的 readline 读取器，如下所示：
Since the text-files are larger than I should hold in memory, I am setting up a streaming readline reader, like so:
var db = require('./db'),
readline = require('readline'),
path = require('path'),
fs = require('fs');
// simple callback after cradle save
// Completion callback passed to cradle's db.save: report a failure,
// otherwise echo the saved document to the console.
function saveHandler(er, doc){
  if (er) {
    console.log('Error: ', er);
    return;
  }
  console.log(doc);
}
// save record of type, based on line with count & name
// Factory: builds a readline 'line' handler for a given record type.
// Each input line looks like "<count> <name>" (e.g. "447980 ryan");
// the handler parses it and saves a {type, name, count} record via cradle.
// Fix: the original declared an unused second parameter `line` on the
// factory, which was immediately shadowed by the inner closure's own
// `line` parameter — removed, since callers invoke this with one argument.
function handleCountedLine(type){
  return function(line){
    var parts = line.trim().split(' ');
    var record = {
      type: type,
      name: parts[1].trim(),
      count: Number(parts[0])
    };
    db.save(record, saveHandler);
  };
}
// Build the handler for first-name records, then stream the data file
// through readline so the whole file never sits in memory at once.
var handleFirst = handleCountedLine('first');
var rl = readline.createInterface({
  input: fs.createReadStream('data/facebook-firstnames-withcount.txt'),
  terminal: false
});
rl.on('line', handleFirst);
db是一个摇篮数据库。
db is a cradle db.
大约保存 40 条记录之后，速度会骤降到慢如爬行，最终耗尽内存。我尝试了 poolr 和 node-rate-limiter，分别采用"一次只允许运行这么多"和"一分钟内只允许运行这么多"的策略。两者都略有改善，但仍然会耗尽内存。有没有实现此目标的好方法，还是只能改用 Python 来写？
After 40 records or so, it slows to a total crawl, then eventually runs out of memory. I tried poolr and node-rate-limiter, using "only run this many at a time" & "only allow this many to run in a minute" strategies. Both work a little better, but it still runs out of memory. Is there a good way to accomplish this goal, or am I stuck writing it in python?
推荐答案
在 Paulo Machado 于 Google 环聊中的热心帮助下，我用 line-by-line——一个借助 stream.pause() 和 stream.resume() 让每次只处理一行的简单封装——写出了答案。功劳应归于他，但他还没有来这里作答，所以我先把答案放在这里。到目前为止它已经解析了 34039 条记录；如果崩溃，我会更新答案。
With awesome help from Paulo Machado in google hangouts, I made an answer using line-by-line, a simple wrapper that uses stream.pause() & stream.resume() to only allow a single line to be processed at a time. I'd like to give him the credit, but he hasn't come over here to make an answer, so I will just put this here. It has parsed 34039 records, so far. I will update the answer if it crashes.
var LineByLineReader = require('line-by-line'),
path = require('path'),
db = require('./db')
// line-by-line read file, turn into a couch record
// Stream one of the name files line by line and save a couch record per line.
// The reader is paused before each db.save and resumed in its callback, so at
// most one record is in flight at a time — this is what keeps memory bounded.
// `type` is a key of the module-level `types` map (declared below).
function processFile(type){
  var fname = path.join('data', types[type] + '.txt');
  var lr = new LineByLineReader(fname, {skipEmptyLines: true});
  lr.on('error', function (err) {
    console.log('Error:');
    console.log(err);
  });
  // Custom event emitted after each successful save, purely for progress logging.
  lr.on('record', function (record) {
    console.log('Saved:');
    console.log(record);
  });
  lr.on('line', function (line) {
    lr.pause(); // hold the stream until this record is safely in couch
    var record = { type: type };
    if (type === 'full'){ // strict comparison; type is always a string key
      // full-name file has no count column: keep the split tokens as the name
      record.name = line.trim().split(' ');
    } else {
      // counted files: each line is "<count> <name>"
      var parts = line.trim().split(' ');
      record.name = parts[1].trim();
      record.count = Number(parts[0]);
    }
    db.save(record, function(er, res){
      if (er) lr.emit('error', er, record);
      if (res) lr.emit('record', record);
      lr.resume(); // read the next line only after this save completes
    });
  });
}
// Map record type -> data file basename (looked up under data/ by processFile).
var types = {
  'first':'facebook-firstnames-withcount',
  'last':'facebook-lastnames-withcount',
  'full':'facebook-names-unique'
};
// Kick off one streaming import per file. Fix: declare the loop variable with
// `var` — the original `for (type in types)` leaked `type` as an implicit global.
for (var type in types){
  processFile(type);
}
// views for looking things up
db.save('_design/views', require('./views'));
这篇关于将许多记录保存到nodejs中的couchdb的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!