使用Phantom抓取信息并提交表单 [英] Scrape information with form submit using Phantom

查看:91
本文介绍了使用Phantom刮擦信息并提交表单的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想对这个网站进行网页抓取. 我已经看到有可用的API,但正如 duraid 在我之前的问题中所建议的,不建议使用它们.

I want to do web scraping of this site. I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.

所以我尝试使用Node.js和Phantom.js(通过Phantom模块).

So I tried to use Node.js and Phantom.js with Phantom.

这是我的代码:

var phantom = require('phantom');

// Container object for the methods this module exports (see module.exports below).
var methods = {};
// Flag toggled by the onLoadStarted / onLoadFinished handlers in methods.download.
var loadInProgress = false;
// Page opened by methods.download: the data.un.org query (q=population, d=PopDiv, f=variableID%3A12).
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';

/**
 * First attempt: opens `url` in a Phantom page, logs the load status and the
 * page content, tries to fill in and submit the filter form, then logs the
 * page's HTML.
 *
 * @param {*} req - not used anywhere in this function body
 * @param {*} res - not used anywhere in this function body
 * @returns {Promise<void>} settles after the Phantom instance has been asked to exit
 */
methods.download = async function(req, res) {
    const instance = await phantom.create();
    const page = await instance.createPage();

    // Log the URL of every resource the page requests.
    await page.on('onResourceRequested', function(requestData) {
        console.info('Requesting', requestData.url);
    });
    // Forward console messages emitted by the page.
    await page.on('onConsoleMessage', function(msg) {
        console.info(msg);
    });
    // Track page loads through the module-level loadInProgress flag.
    await page.on('onLoadStarted', function() {
        loadInProgress = true;
        console.log('Load started...');
    });
    await page.on('onLoadFinished', function() {
        loadInProgress = false;
        console.log('Load end');
    });

    const status = await page.open(url);
    console.log('STATUS:', status);

    // Dump the page content as reported by the `content` property.
    const content = await page.property('content');
    console.log('CONTENT:', content);

    // submit
    await page.evaluate(function() {
        // NOTE(review): these three lines assign each input's `value` property.
        // The answer further down in this document points out that this does not
        // tick the checkbox and recommends setAttribute("checked", true) instead.
        document.getElementById('crID%3a250').value = 'crID%3a250'; // France
        document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
        document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
        // NOTE(review): per the markup quoted in the question this id belongs to an
        // <a> element; the answer notes it has no submit method and should be clicked.
        document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
    });

    // Read back the whole document's HTML immediately after the form step.
    // NOTE(review): there is no wait here; the answer says the search request is
    // sent through ajax and needs time to complete.
    var result = await page.evaluate(function() {
        return document.querySelectorAll('html')[0].outerHTML; 
    });
    console.log('RESULT:', result);

    await instance.exit();
};

// Expose the methods object to other modules.
module.exports = methods;

(他们如何选择更多的国家和更长的时间?)

(How can they select more countries and more years?)

我尝试选择法国作为国家或地区,2015 作为年份,Medium 作为变体.

I tried to select France as the Country or Area, 2015 as the Year and Medium as the Variant.

所以crID%3a250是元素的 id :

<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />

timeID%3a79是元素的 id :

<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />

varID%3a2是元素的 id :

<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />

然后ctl00_main_filters_anchorApplyBottom是按钮元素的 id :

<div class="All">
    <img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" />&nbsp;<a href="javascript:;" id="ctl00_main_filters_anchorApplyBottom" title="Apply filters" onclick="ApplyFilters(SendFilterRequest);">Apply Filters</a>
</div>

但是我得到的只是网页本身(以HTML格式),而不是我感兴趣的数据. 这就好像我没有选择任何参数一样.为什么?

But what I got is the web page itself (in HTML), not the data that interest me. So it's as if I had not selected any parameters. Why?

在@Vaviloff的建议下,我尝试更改代码,但没有成功. 我的服务器端语言是Node.js.

After the advice of @Vaviloff I tried to change the code but without success. My server-side language is Node.js.

使用Phantom我修改了如下代码:

Using Phantom I modified the code like this:

/**
 * Second attempt: opens `url`, ticks the checkboxes for a list of countries,
 * a list of years and the "medium" variant, clicks the apply-filters link,
 * waits 1.5 seconds, then logs the first table under `.DataContainer` and
 * passes its HTML to elaborateResult.
 *
 * @param {*} req - not used anywhere in this function body
 * @param {*} res - not used anywhere in this function body
 * @returns {Promise<void>} settles after the Phantom instance has been asked to exit
 */
methods.download = async function(req, res) {
    const instance = await phantom.create();
    const page = await instance.createPage();

    // Log the URL of every resource the page requests.
    await page.on('onResourceRequested', function(requestData) {
        console.log('Requesting', requestData.url);
    });
    // Forward console messages emitted by the page.
    await page.on('onConsoleMessage', function(msg) {
        console.log(msg);
    });

    const status = await page.open(url);
    console.log('\n\nSTATUS:', status);

    // submit
    await page.evaluate(function() {
        // Country name -> id of the matching checkbox element.
        var countries = {
            'Albania': 'crID%3a8',
            'Austria': 'crID%3a40',
            'Belgium': 'crID%3a56',
            'Bulgaria': 'crID%3a100',
            'Croatia': 'crID%3a191',
            'Cyprus': 'crID%3a196',
            'Denmark': 'crID%3a208',
            'Estonia': 'crID%3a233',
            'Finland': 'crID%3a246',
            'France': 'crID%3a250',
            'Germany': 'crID%3a276',
            'Greece': 'crID%3a300',
            'Iceland': 'crID%3a352',
            'Ireland': 'crID%3a372',
            'Italy': 'crID%3a380',
            'Latvia': 'crID%3a428',
            'Netherlands': 'crID%3a528',
            'Norway': 'crID%3a578',
            'Poland': 'crID%3a616',
            'Portugal': 'crID%3a620',
            'Romania': 'crID%3a642',
            'Slovakia': 'crID%3a703',
            'Slovenia': 'crID%3a705',
            'Spain': 'crID%3a724',
            'Sweden': 'crID%3a752',
            'Switzerland': 'crID%3a756',
            'United Kingdom': 'crID%3a826'
        };
        // 2018 - 1980
        // (range as labelled by the author; the id-to-year mapping is not visible here)
        var years = ['timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78', 'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72', 'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66', 'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60', 'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54', 'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48', 'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42', 'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37']; 

        // select countries
        // `c` is the country name (object key); countries[c] is the element id.
        for(var c in countries) {
            document.getElementById(countries[c]).setAttribute('checked', true);
        }
        // select years
        // `y` is the array index here (for...in iterates keys), so years[y] is the id.
        for(var y in years) {
            document.getElementById(years[y]).setAttribute('checked', true);
        }
        // select variants
        document.getElementById('varID%3a2').setAttribute('checked', true); // medium
        // click button
        document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); 
    });

    console.log('\nWaiting 1.5 seconds...');    
   // NOTE(review): `timeout` is not defined in this snippet; presumably it is the
   // promise-based delay helper from the answer's script — TODO confirm.
   await timeout(1500);

   // get only the table contents
    var result = await page.evaluate(function() {
        return document.querySelectorAll('.DataContainer table')[0].outerHTML; 
    });
    console.log('\n\nRESULT:', result);

    // Hand the table HTML to the post-processing step.
    elaborateResult(result);

    await instance.exit();
};

/**
 * Unfinished post-processing step: tries to parse the scraped HTML into an
 * element and log its <td> elements.
 *
 * @param {string} res - presumably the table HTML passed in by methods.download — TODO confirm
 */
function elaborateResult(res) {
    // The author reports "ReferenceError: document is not defined" on this line
    // when the function runs on the Node.js side.
    var el = document.createElement('html'); // ** ERROR HERE **
    // NOTE(review): the parameter is named `res`, but this line reads `result`,
    // which is not declared anywhere in this function.
    el.innerHTML = result;
    console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
    //var obj = utilFunc.createJsonObjectPop(year, country, population);
    //console.log(obj);
}

有两个错误:

  1. result仅包含结果第一页上的值,但是通过选择,您将获得22页结果,并且我不明白如何获取所有感兴趣的值并将它们链接起来在变量result中.
  2. 假设已经解决了第(1)点中的问题,现在我应该详细说明获得的结果并创建一个像这样的对象:
  1. result contains only the values that are on the first page of the results, but with the selections made you get 22 pages of results and I don't understand how I can get all the values that interest me and link them in the variable result.
  2. assuming to have solved the problem in point (1), now I should elaborate the results obtained and create an object like this:

var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]

这是elaborateResult(res)函数应该做的(当然,该函数并不完整,我必须完成它,但是第一行却出现错误),但是我得到了错误:

This is what the elaborateResult(res) function should do (of course, the function is not complete, I have to finish it but I get an error at the first line), but I get the error:

ReferenceError: document 未定义

ReferenceError: document is not defined

所以我改变了策略,尝试不使用Phantom,而是使用普通的request:

So I changed my strategy and I tried not to use Phantom but a normal request:

// Options passed to request(): the DataHandler.ashx query URL plus a transform
// that hands the response body to cheerio.load.
var options = {
    // NOTE(review): `crID%3a40;timeID%3a79` comes after an `&`, so it presumably
    // reaches the server as a separate parameter rather than as part of
    // `Applied=crID%3a8` — verify against the request the site itself sends.
    uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
    // Presumably the return value of this function is what the request promise
    // resolves with — TODO confirm against the request library in use.
    transform: function(body) {
        return cheerio.load(body);
    }
};

/**
 * Requests the DataHandler URL described by `options` and logs the outcome.
 *
 * The request is awaited, so the promise returned by this function settles
 * only after the response (or the failure) has been logged.
 *
 * @param {*} req - not used in this function body
 * @param {*} res - not used in this function body
 * @returns {Promise<void>} resolves once the result or the error has been logged
 */
methods.download = async function(req, res) {
    try {
        const $ = await request(options);
        console.log('\n\nTHEN: ', $);
    } catch (err) {
        // `stack` is a string property of Error objects, not a method. Calling it
        // threw a TypeError inside the handler and hid the original failure.
        console.log('Error', err.stack);
    }
};

如果运行此代码,我将得到:

If I run this code I get:

THEN:  function (selector, context, r, opts) {
    if (!(this instanceof initialize)) {
      return new initialize(selector, context, r, opts);
    }
    opts = _.defaults(opts || {}, options);
    return Cheerio.call(this, selector, context, r || root, opts);
  }

在这种情况下,我还有其他问题.

In this case I have other problems.

  1. 我不知道如何建立网址. 在上面的示例中,我选择了阿尔巴尼亚(crID%3a8)和奥地利(crID%3a40)和2015作为年份(timeID%3a79). 但是,如果我转到刚刚建立的链接,结果将得到2100到2095年之间的阿尔巴尼亚数据.
  2. 我不知道如何选择年份,如何选择变体或如何更改页面.
  1. I don't know how to build the url. In the example above I chose Albania (crID%3a8) and Austria (crID%3a40) and 2015 as year (timeID%3a79). Yet if I go to the link just built, I get as a result the data on Albania from 2100 to 2095.
  2. I don't know how to select the years or how to select variants or how to change pages.

我觉得自己有点愚蠢,但是我无法得到想要的东西……我被卡住了. 帮助将非常受欢迎!

I feel a bit stupid but I can't get what I want... I'm stuck. Help would be very welcome!

推荐答案

您的脚本存在多个问题,无法成功进行抓取.

There are several issues with your script that prevent successful scrape.

要选中一个复选框,您无需再次设置其值(它已经在HTML中设置!),请将其checked属性设置为 true :

To check a checkbox, you don't set its value again (it's already set in HTML!), you set its checked attribute to true:

document.getElementById('crID%3a250').setAttribute("checked", true); // France

提交表单的按钮是超链接<a>,它没有submit方法,应单击它(甚至在代码中具有onClick功能)

The button that submits the form is a hyperlink <a>, which doesn't have a submit method; it should be clicked instead (it even has an onclick handler in the markup)

 document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // submit the form

**搜索请求**是通过ajax发送的,需要花费一些时间才能完成,因此您的脚本应至少等待一秒钟,然后再尝试获取数据.我将在下面的完整工作代码中演示如何等待.

**The search request** is sent through ajax and takes time to complete, so your script should wait for at least a second before trying to fetch the data. I'll show how to wait in the full working code below.

下一步,您可能只获取表数据,而无需浏览所有HTML:

Next, you can fetch only the table data; there is no need to sift through all the HTML:

var result = await page.evaluate(function() {
    return document.querySelectorAll('.DataContainer table')[0].outerHTML; 
});

以下是您的脚本的精简版本,已纠正了问题:

Here's a slightly trimmed-down version of your script with the issues corrected:

var phantom = require('phantom');

// Page to scrape: the data.un.org query (q=population, d=PopDiv, f=variableID%3A12).
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';

/**
 * Creates a promise that is resolved by a timer after `ms` milliseconds.
 *
 * @param {number} ms - delay handed to setTimeout
 * @returns {Promise<undefined>} resolves with no value once the timer fires
 */
const timeout = function (ms) {
    return new Promise(function (resolve) {
        setTimeout(resolve, ms);
    });
};

/**
 * Scrapes the result table for France / 2015 / Medium from the page at `url`.
 *
 * Opens the page in Phantom, ticks the three filter checkboxes, clicks the
 * "Apply Filters" link, waits for the ajax refresh and logs the HTML of the
 * first table under `.DataContainer`. The Phantom instance is shut down even
 * when a step fails, and any failure is reported instead of being left as an
 * unhandled rejection.
 */
(async function() {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();

        // Log the URL of every resource the page requests.
        await page.on('onResourceRequested', function(requestData) {
            console.info('Requesting', requestData.url);
        });
        // Forward console messages emitted by the page.
        await page.on('onConsoleMessage', function(msg) {
            console.info(msg);
        });

        const status = await page.open(url);
        console.log('STATUS:', status);
        // PhantomJS reports 'success' or 'fail'; there is nothing to scrape on a failed load.
        if (status !== 'success') {
            throw new Error('Could not open ' + url + ' (status: ' + status + ')');
        }

        // submit
        // Functions passed to page.evaluate run inside the PhantomJS page, so they
        // are kept in plain ES5 and must not reference variables from this scope.
        await page.evaluate(function() {
            document.getElementById('crID%3a250').setAttribute("checked", true); // France
            document.getElementById('timeID%3a79').setAttribute("checked", true); // 2015
            document.getElementById('varID%3a2').setAttribute("checked", true); // Medium
            document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // click submit button
        });

        // The filter request is sent through ajax; give it time to complete.
        console.log('Waiting 1.5 seconds..');
        await timeout(1500);

        // Get only the table contents; null when the table is not (yet) present,
        // instead of throwing inside the page.
        const result = await page.evaluate(function() {
            var table = document.querySelector('.DataContainer table');
            return table ? table.outerHTML : null;
        });
        if (result === null) {
            console.error('RESULT: no table found under .DataContainer');
        } else {
            console.log('RESULT:', result);
        }
    } finally {
        // Always stop the PhantomJS child process, otherwise it outlives a failed run.
        await instance.exit();
    }
})().catch(function(err) {
    console.error(err);
    process.exitCode = 1;
});


最后但同样重要的一点是,您可以简单地尝试重放表单发出的ajax请求,并会发现搜索请求的URL在另一个选项卡中单独打开时也能很好地工作:


The last but not the least observation is that you could simply try to replay an ajax request made by the form and find out that the URL of search request works quite well on its own, when just opened in another tab:

您甚至不需要无头浏览器即可获取它,只需使用 cURL/requests 发出请求并处理结果即可.网站经常出现这种情况,因此在抓取之前检查浏览器devtools中的"网络"选项卡很有用.

You don't even need a headless browser to get it, just cURL/requests and some processing. This happens with a lot of sites, so it's useful to check the network tab in your browser devtools before scraping.

更新

如果结果太多以至于它们分散在多个页面上,则在请求中将使用另外一个参数:Page:

And if there are so many results that they are scattered over several pages, there is one more parameter to be used in request: Page:

data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461

data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461

这篇关于使用Phantom刮擦信息并提交表单的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆