如何使用PhantomJS从页面获取动态HTML和Javascript值 [英] How to get dynamic HTML and Javascript values from a page using PhantomJS

查看:435
本文介绍了如何使用PhantomJS从页面获取动态HTML和Javascript值的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

如何从PhantomJS获取最新的页面数据(HTML和Javascript变量)

例如page.refresh( )或其他东西?

我有一个Interval,而不是每隔200ms检查一个变量(在页面上)。但是,此变量和页面内容未显示随时间变化。 (尽管我知道它有)

I have an Interval, than checks a variable (on the page) every 200ms. However, this variable and the page content, isn't shown to have changed over time. (even though I know it has)

所以我需要一种有效的方法来检查每200ms左右的JS变量的值,

So I need an efficient way to check the value of a JS variable every 200ms or so,

然后,一旦我发现该变量已更改值,我想请求最新的HTML页面。

then once I've discovered that variable has changed value, I want to request the latest page HTML.

我该怎么做?

var Error = function (description) {
    this.description = description;
    return this;
};

var DTO = function (status, content, error) {
    this.status = status;
    this.content = content;
    this.error = error;
    return this;
};

function outputAndExit(dto) {
    console.log(JSON.stringify(dto));
    phantom.exit();
}

//For any uncaught exception, just log it out for .NET to capture
window.onerror = function (errorMsg, url, lineNumber) {
    var description = 'window.onerror caught an error: ' +
        'errorMsg: ' + errorMsg +
        'url: ' + url +
        'lineNumber: ' + lineNumber;
    outputAndExit(new DTO(false, null, new Error(description)));
};

var GetDynamicPageResult__ = function () {
    var obj = new GetDynamicPageResult();
    obj.initialize();
    return obj;
};

var GetDynamicPageResult = function () {
    var self = this;
    this.initialize = function () {

        this.error = null;
        this.isContentReadyForCrawler = false;

        this.ticker = null;
        this.tickerInterval = 150;
        this.tickerElapsed = 0;

        this.url = '';

        this.loadDependencies();
        this.processArgs();

        this.openPage();

    };
    this.loadDependencies = function () {
        this.system = require('system'),
        this.page = require('webpage').create(),
        this.page.injectJs('jquery-1.10.2.min');
        this.fs = require('fs');
    };
    this.processArgs = function () {
        if (this.system.args.length == 0) {
            outputAndExit(new DTO(false, null, new Error('No arguments given')));
        }
        //system.args[0] Was the name of this script
        this.url = this.system.args[1];
    };
    this.updateIsContentReadyForCrawler = function () {
        var updateIsContentReadyForCrawler = self.page.evaluate(function () {
            self.isContentReadyForCrawler = window.isContentReadyForCrawler;
        });
    };
    this.openPage = function () {
        self.page.open(this.url, function (status) { //NB: status = 'success' || 'fail'
            if (status !== 'success') {
                outputAndExit(new DTO(false, null, new Error('page.open received a non-success status')));
            }
            self.initTicker();
        });
    };

    this.initTicker = function () {
        this.ticker = setInterval(self.handleTick, self.tickerInterval);
    };
    this.handleTick = function () {
        self.tickerElapsed += self.tickerInterval;
        self.updateIsContentReadyForCrawler();
        if (self.isContentReadyForCrawler) {
            clearInterval(self.ticker);
            var content = self.page.content;
            self.finish(true, content, null);
        } else {
            var tooMuchTimeElapsed = self.tickerElapsed > 7000;
            if (tooMuchTimeElapsed) {
                clearInterval(self.ticker);
                self.finish(false, null, new Error('Too much time elapsed'));
            }
        }
    };
    this.finish = function (status, content, error) {
        content = content || '';
        error = error || {};
        outputAndExit(new DTO(status, content, error));
    };
};

/**********************************************************************************/
/***************************** Helpers *****************************/
/**********************************************************************************/

var Utility__ = function () {
    var obj = new Utility();
    obj.initialize();
    return obj;
};

var Utility = function () {
    var self = this;
    this.initialize = function () {
    };
    this.isEmpty = function (obj) {
        var isEmpty = false;
        (obj == undefined || obj == null) && (isEmpty = true);
        return isEmpty;
    };
    this.isStringEmpty = function (str) {
        var isEmpty = false;
        isEmpty(str) && (isEmpty = true);
        (isEmpty == false && $.trim(str) == '') && (isEmpty = true);
        return isEmpty;
    };
};

var getDynamicPageResult = new GetDynamicPageResult__();


推荐答案

我想你几乎就在那里:你需要成为使用 page.evaluate(),但目前只使用它来获取window.isContentReadyForCrawler。您需要使用 page.evaluate()来获取最新的HTML。

I think you are almost there: you need to be using page.evaluate(), but currently only use it to get window.isContentReadyForCrawler. You need to use page.evaluate() to grab the latest HTML too.

我将无耻地粘贴另一个答案的代码( https://stackoverflow.com/a/12044474/841830 ):

I'm going to shamelessly paste in code from another answer (https://stackoverflow.com/a/12044474/841830):

var html = page.evaluate(function () {
    var root = document.getElementsByTagName("html")[0];
    var html = root ? root.outerHTML : document.body.innerHTML;
    return html;
});

这篇关于如何使用PhantomJS从页面获取动态HTML和Javascript值的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆