使用Node.js和Phantom.js进行动态抓取 [英] Dynamic scraping using nodejs and phantomjs

查看:78
本文介绍了使用Node.js和Phantom.js进行动态抓取的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

首先,我已经成功安装了PhantomJs及其npm接口 phantom.我已按照新的语法编写了加载页面的代码(此处发布的所有其他问题都基于旧的代码语法,或者是我遗漏了什么). 这是我要抓取的来源.

First of all, I've successfully installed both PhantomJs and its npm interface, phantom. I've set the code to load my page with the new syntax (all the other questions posted on here were based on the old code syntax, or I'm missing something). This is the source I'm trying to scrape.

现在,右侧边栏(即"Comune"旁边带有假下拉选择框的那一个)和另一个侧边栏都是动态生成的,我不明白为什么phantomjs抓取不到它们.以下是我的代码:

Now, the right sidebar, the one with the fake select near "Comune" and the other one are generated dynamically and I can't understand why phantomjs isn't picking them up. Following my code:

// Opens `sito` in PhantomJS through the promise-based `phantom` npm bridge,
// logs the load status and the page's HTML, then shuts PhantomJS down.
var sito = "http://bicincitta.tobike.it/";
// Both handles live at script scope so that every step of the promise chain
// (including cleanup) can reach them, not just the `.then` that received them.
var sitepage = null;
var phInstance = null;
var phantom = require('phantom');

phantom.create()
    .then((instance) => {
        phInstance = instance;
        return instance.createPage();
    })
    .then((page) => {
        sitepage = page;
        return page.open(sito);
    })
    .then((status) => {
        console.log(status);
        // NOTE(review): the content is read as soon as open() resolves, so
        // markup that page scripts insert afterwards may not be in this
        // snapshot — confirm against the target page.
        return sitepage.property('content');
    })
    .then((content) => {
        console.log(content);
    })
    .catch((error) => {
        console.log(error);
    })
    .then(() => {
        // Cleanup runs on both the success and the failure path. Either
        // handle is still null if the step that creates it rejected, so
        // guard each one instead of calling a method on null.
        if (sitepage) {
            return sitepage.close();
        }
    })
    .catch((error) => {
        // A failed close must not prevent PhantomJS from being shut down.
        console.log(error);
    })
    .then(() => {
        if (phInstance) {
            return phInstance.exit();
        }
    })
    .catch((error) => {
        console.log(error);
    });

我现在简直是在拿头撞墙.我是否应该以某种方式获取该网站的脚本并执行它们?我是不是漏掉了某条指令?

I'm hitting my head hard on a wall right now. Am I supposed to get in some way the site's scripts and execute them? Am I missing an instruction?

此外,顺便一提:既然page的作用域仅限于第二个".then"之内,我不太清楚应该如何在page上继续调用其他方法.

Also, on a side note: it's not really clear how I should chain additional methods onto page, if page is scoped inside the second ".then".

推荐答案

在html的底部有一段CDATA脚本,phantom无法解析它.页面上的这些项目正是由这段脚本生成的.

There is a CDATA script at the bottom of the HTML that cannot be parsed by phantom. This is where the items are being populated from.

<script type="text/javascript">
//<![CDATA[
// Inline startup script quoted from the bottom of the scraped page.
// It calls Sys.Application.initialize() and then registers six init callbacks.
// Each callback calls $create(...) with a Telerik.Web.UI component type, a
// property object, and an element looked up by id with $get(...).
// NOTE(review): Sys, $create, $get, Telerik and OnClientClosePortal are not
// defined in this excerpt; presumably they come from scripts loaded earlier
// in the page — confirm against the full page source.
Sys.Application.initialize();
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxManager, {"_updatePanels":"","ajaxSettings":[],"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"defaultLoadingPanelID":"","enableAJAX":true,"enableHistory":false,"links":[],"styles":[],"uniqueID":"RadAjaxManager1","updatePanelsRenderMode":0}, null, null, $get("RadAjaxManager1"));
});
// The next three callbacks create RadAjaxPanel components that differ only in uniqueID / element id.
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginUser"}, null, null, $get("ajCheckLoginUser"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajCheckLoginAdmin"}, null, null, $get("ajCheckLoginAdmin"));
});
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadAjaxPanel, {"clientEvents":{OnRequestStart:"",OnResponseEnd:""},"enableAJAX":true,"enableHistory":false,"links":[],"loadingPanelID":"","styles":[],"uniqueID":"ajLogoutUser"}, null, null, $get("ajLogoutUser"));
});
// Creates the "radPortal" RadWindow; its third $create argument maps "close" to OnClientClosePortal.
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadWindow, {"_dockMode":false,"behaviors":0,"clientStateFieldID":"radPortal_ClientState","destroyOnClose":true,"formID":"form1","height":"180px","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"radPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"width":"450px"}, {"close":OnClientClosePortal}, null, $get("radPortal"));
});
// Creates the RadWindowManager; its fourth $create argument is {"child":"radPortal"}.
Sys.Application.add_init(function() {
    $create(Telerik.Web.UI.RadWindowManager, {"behaviors":4,"clientStateFieldID":"windowManagerPortal_ClientState","destroyOnClose":true,"formID":"form1","iconUrl":"","left":"","minimizeIconUrl":"","modal":true,"name":"windowManagerPortal","reloadOnShow":true,"showContentDuringLoad":false,"skin":"Office2007","top":"","visibleStatusbar":false,"windowControls":"['radPortal']"}, null, {"child":"radPortal"}, $get("windowManagerPortal"));
    });
//]]>
</script>

一旦断开与该站点服务器的通信,这些项目也会被销毁.有一些方法可以绕过这个问题,但我认为您最好尝试其他方案.我使用了npm的cheerio来加载CDATA html.

These items will also be destroyed as soon as you leave communication with this site's server. There are methods to get around this, but I think you are better off trying something else. I used npm cheerio to load the CDATA html.

这篇关于使用Node.js和Phantom.js进行动态抓取的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆