使用casperjs和phantomjs抓取多个页面 [英] Using casperjs and phantomjs to scrape multiple pages
问题描述
我正在尝试抓取许多具有标准格式的页面.我已经能够使用Phantomjs成功地刮取单个页面,但是当我尝试遍历多个页面时,异步处理会使事情挂起.告诉Casper/Phantom等待的正确方法是什么?
I'm trying to scrape a number of pages that have a standard format. I've been able to use Phantomjs to successfully scrape a single page, but when I try to iterate over multiple ones, the asynchronous processing makes things hang up. What's the proper way to tell Casper/Phantom to wait?
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output file
f = fs.open("lat_long.txt", "w");
f.write("--");
f.close();
// this is the unique identifier for the locations. For now, I just have three datapoints
var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"];
/// this code will be used to loop through the different locations. For now, set to look at only one.
for (q= 0; q < 1; q++) {
var processing = false;
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ EPAID: "+ $value[0] + ", " +
"Name: "+ $value[1] + ", " +
"City: "+ $value[4] + ", " +
"State: "+ $value[6] + ", " +
"ZipCode: "+ $value[8] + ", " +
"Latitude: "+ $value[14] + ", " +
"Longitude: "+ $value[16] + " }" ;
return $string;
});
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
processing = true;
console.log("writing to file");
phantom.exit();
});
}
// right here it should delay until the previous page is completed
// while (!processing) {
// setTimeout(function(){ console.log("waiting....");},1000);
// }
};
}
console.log("finished all pages");
推荐答案
以下是最终有效的代码(使用超时方法,因为我无法使成功回调更好地工作).
Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).
在安装了casperjs的情况下,我将此文件命名为"process.js",并能够在命令行中将其作为"casperjs process.js"
With casperjs installed, I named this file "process.js" and was able to run it from the command line as "casperjs process.js"
var page = require('webpage').create();
var fs = require('fs');
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output f
// this is the unique identifier for the locations.
var EPAID = ["NED981713837",... , "FLD049985302", "NJD986643153"];
f = fs.open("lat_long.txt", "w");
f.write("-<>-");
f.close();
var count = 0;
var target = 1400;
var written = [];
function yourFunction(){
if (count < target) {
process(count);
count++;
setTimeout(yourFunction, 5000);
} else {
console.log("exiting");
phantom.exit();
return;
}
}
function process(counter){
var processing = false;
console.log("Beginning record #" + counter);
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ \"EPAID\": \""+ $value[0] + "\", " +
"\"Name\": \""+ $value[1] + "\", " +
"\"City\": \""+ $value[4] + "\", " +
"\"State\": \""+ $value[6] + "\", " +
"\"ZipCode\": \""+ $value[8] + "\", " +
"\"Latitude\": "+ $value[14] + ", " +
"\"Longitude\": "+ $value[16] + " }," ;
return $string;
});
if (written[counter] === undefined) {
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
written[counter] = true;
console.log("Writing to file #"+ counter);
}
});
}
};
}
console.log("Start...");
yourFunction();
这篇关于使用casperjs和phantomjs抓取多个页面的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!