CasperJS-抓取工具无法导航到下一页 [英] CasperJS - Scraper not navigating to the next page

查看:94
本文介绍了CasperJS-抓取工具无法导航到下一页的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

以下代码是用CasperJS编写的简单抓取工具.

The following code is a simple scraper written in CasperJS.

// Create the CasperJS instance that drives the whole scrape.
var casper = require('casper').create();

// Positional command-line arguments:
//   0 - URL of the listing page to start from
//   1 - page number the loop in processPage starts counting at
//   2 - page number the loop in processPage stops before (i < page2)
var url = casper.cli.get(0);
var page1 = casper.cli.get(1);
var page2 = casper.cli.get(2);
//console.log(page2);
// Fourth positional argument; it is read here but not used anywhere else in
// the visible script.
var proxy = casper.cli.get(3);

//alert(page1);

// URL-like pattern; scraped hrefs are only opened when they match it.
var exp = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
// RegExp object built from the literal above.
var regex = new RegExp(exp);

var baseUrl = url;

//console.log(baseUrl);

// CSS selector of the "next page" link on a listing page.
var nextBtn = "a.navigation-button.next";

// Accumulates the hrefs collected from every listing page that is scraped.
var allLinks = [];

// Open the starting listing page.
casper.start(baseUrl);

// Once the "next page" link is present, hand over to processPage.
casper.waitForSelector(nextBtn, processPage);

// NOTE(review): run() is called here, before the casper.then(...) step near
// the end of the script has been registered; CasperJS scripts conventionally
// call run() last - confirm this ordering is intended.
casper.run();

/**
 * Collects profile links from the listing pages and appends them to the
 * global allLinks array. Passed to casper.waitForSelector as its callback,
 * and uses `this` for evaluate/exists/thenClick/echo.
 *
 * The for loop below runs synchronously from page1 up to (but not including)
 * page2. Each iteration evaluates whatever page is loaded at that moment and
 * then calls thenClick(), which only schedules the click as a later step, so
 * the loop itself never waits for the next page to load.
 */
function processPage() {
  for (var i = page1; i < page2; i = i + 1) {
      console.log(i);
    // Run getPageData inside the page and merge the returned hrefs.
    var pageData = this.evaluate(getPageData);
    allLinks = allLinks.concat(pageData);



  // No "next page" link on the current page: stop processing.
  if (!this.exists(nextBtn)) {
    return;
  };

  // Schedule a click on the "next page" link, then print the resulting URL.
  this.thenClick(nextBtn).then(function() {
      //this.echo(i);
    this.echo(this.getCurrentUrl());
    //this.wait(1000);
  });
};
}

/**
 * Gathers the href attribute of every element carrying the 'pro-title'
 * class in the current document.
 *
 * Handed to casper.evaluate(), so it relies only on `document` and on
 * nothing from the surrounding script.
 *
 * @returns {Array} One entry per matching element, in document order; each
 *   entry is whatever getAttribute('href') returned for that element.
 */
function getPageData() {
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];

  for (var idx = 0; idx < titleNodes.length; idx += 1) {
    hrefs.push(titleNodes[idx].getAttribute('href'));
  }

  return hrefs;
}


// Final step: open every collected profile link that looks like a URL and
// dump the details scraped from that profile page.
casper.then(function() {
  this.each(allLinks, function(self, link) {
    // getAttribute('href') yields null for elements without an href, and
    // calling .match on a non-string would throw, so skip such entries.
    if (typeof link !== 'string' || !link.match(regex)) {
      return;
    }

    self.thenOpen(link, function() {
      // Declared locally so each profile gets its own object instead of
      // sharing (and overwriting) one implicit global.
      var jsonObj = {};

      jsonObj.title = this.fetchText('a.profile-full-name');

      // The services markup contains HTML-escaped ampersands; spell them out.
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g, "and");

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

      require('utils').dump(jsonObj);
    });
  });
});

我正在按以下方式执行此脚本,

I am executing this script as follows,

casperjs scraping.js http://www.houzz.com/professionals/c/Chicago--IL/p/15 1 3

第一个CLI参数是起始URL.第二和第三个参数是抓取的开始和结束页码.

The first CLI argument is the starting URL. The second and third arguments are the starting and ending page numbers of the scrape.

我能够从第一页中提取数据,但是我不明白为什么我无法从随后的任何页面中提取数据.

I am able to extract data from the first page, but I don't understand why I am not able to extract data from any of the subsequent pages.

推荐答案

您不能在processPage中混合使用这样的同步和异步代码.该循环将立即执行,但是单击和下一页的加载是异步发生的.该页面的评估必须异步完成:

You cannot mix synchronous and asynchronous code like this in processPage. The loop is executed immediately, but the click and the loading of the next page happen asynchronously. The evaluation of the page has to be done asynchronously:

/**
 * Schedules one CasperJS step per listing page, for page numbers page1 up to
 * (but not including) page2. Each step collects the profile links of the
 * page that is loaded when the step runs, appends them to the global
 * allLinks array and then clicks the "next page" link.
 *
 * Must be called with a Casper instance as `this` (it is used as the
 * callback of casper.waitForSelector).
 */
function processPage() {
    // Set once a page without a "next page" link is reached, so that the
    // steps already queued for later pages do not scrape that page again.
    var reachedLastPage = false;

    // Steps are created through this helper so that each one closes over its
    // own page number; the loop counter itself is a single `var` shared by
    // every step and already equals page2 by the time the steps execute.
    function schedulePage(casperInstance, pageNumber) {
        casperInstance.then(function() {
            if (reachedLastPage) {
                return;
            }

            console.log(pageNumber);
            var pageData = this.evaluate(getPageData);
            allLinks = allLinks.concat(pageData);

            if (!this.exists(nextBtn)) {
                reachedLastPage = true;
                return;
            }

            // Steps added from inside a running step are executed before the
            // step queued for the following page.
            this.thenClick(nextBtn).then(function() {
                this.echo(this.getCurrentUrl());
            });
        });
    }

    for (var i = page1; i < page2; i = i + 1) {
        schedulePage(this, i);
    }
}

这篇关于CasperJS-抓取工具无法导航到下一页的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆