CasperJS-抓取工具无法导航到下一页 [英] CasperJS - Scraper not navigating to the next page
问题描述
以下代码是用CasperJS编写的简单抓取工具.
The following code is a simple scraper written in CasperJS.
var casper = require('casper').create();
var url = casper.cli.get(0);
var page1 = casper.cli.get(1);
var page2 = casper.cli.get(2);
//console.log(page2);
var proxy = casper.cli.get(3);
//alert(page1);
var exp = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(exp);
var baseUrl = url;
//console.log(baseUrl);
var nextBtn = "a.navigation-button.next";
var allLinks = [];
casper.start(baseUrl);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
for (var i = page1; i < page2; i = i + 1) {
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
if (!this.exists(nextBtn)) {
return;
};
this.thenClick(nextBtn).then(function() {
//this.echo(i);
this.echo(this.getCurrentUrl());
//this.wait(1000);
});
};
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
};
casper.then(function(){
//require('utils').dump(allLinks);
this.each(allLinks,function(self,link){
if (link.match(regex)) {
self.thenOpen(link,function(a){
jsonObj = {};
jsonObj.title = this.fetchText('a.profile-full-name');
jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
jsonObj.services = jsonObj.services.replace(/&/g,"and");
jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
//jsonObj.contact = this.fetchText('span.pro-contact-text');
jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
//jsonObj.description.replace(/\s/g, '');
//require('utils').dump(jsonObj);
//jsonObj.description = jsonObj.description.replace(/[\t\n]/g,"");
//jsonObj = JSON.stringify(jsonObj, null, '\t');
//console.log(i);
require('utils').dump(jsonObj);
});
};
});
});
我正在按以下方式执行此脚本,
I am executing this script as follows,
casperjs scraping.js http://www.houzz.com/professionals/c/Chicago--IL/p/15 1 3
第一个CLI参数是起始URL.第二和第三个参数是抓取的开始和结束页码.
The first CLI argument is the starting URL. The second and third arguments are the starting and ending page numbers of the scrape.
我能够从第一页中提取数据,但是我不明白为什么我无法从随后的任何页面中提取数据.
I am able to extract data from the first page, but I don't understand why I am not able to extract data from any of the consequent pages.
推荐答案
您不能在processPage
中混合使用这样的同步和异步代码.该循环将立即执行,但是单击和下一页的加载是异步发生的.该页面的评估必须异步完成:
You cannot mix synchronous and asynchronous code like this in processPage
. The loop is immediately executed, but the click and the loading of the next page happens asynchronously. The evaluation of the page has to be done asynchronously:
function processPage() {
for (var i = page1; i < page2; i = i + 1) {
this.then(function(){
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
});
});
};
}
这篇关于CasperJS-抓取工具无法导航到下一页的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!