需要在PhantomJS中打开URL数组 [英] Need to open an array of URL's in PhantomJS

查看:82
本文介绍了需要在PhantomJS中打开URL数组的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我已经用phantomJs创建了一个脚本.它的作用是,它可以从特定页面中获取某些元素,效果很好.

I have created a script in phantomJs. What it does is, it fetches some elements from a specific page which works fine.

这是代码:

var page = new WebPage(), testindex = 0, loadInProgress = false, fs = require('fs'), i = 0, j = 0, k = 0;

page.onConsoleMessage = function(msg) { console.log(msg); };
page.onLoadStarted    = function() { loadInProgress = true; console.log("load started"); };
page.onLoadFinished   = function() { loadInProgress = false; console.log("load finished"); };

// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Enable/Disable Javascript
// page.settings.javascriptEnabled = false;

// Ordered list of step functions; the setInterval driver further down calls
// steps[testindex]() one at a time.
var steps = [

    // Step 1: start loading the product listing page.
    // NOTE(review): injectJs is called right after open(), with no load
    // callback, and is given an http URL. PhantomJS documents injectJs as
    // taking a local file name (includeJs is the URL variant) - confirm intent.
    function() { //Load Page
        page.open("http://www.example.com/mobiles/");
        page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
    },

    // Step 2: walk the product boxes in the page and hand each CSV fragment
    // to window.callPhantom.
    function() { //Fetch Products
        // Writes each received value to product-list.csv.
        // NOTE(review): presumably invoked once per window.callPhantom() call
        // below, and the 'w+' mode presumably appends rather than truncates -
        // confirm against the PhantomJS documentation.
        page.onCallback = function(result) {
            var fs = require('fs');
            fs.write('product-list.csv', result, 'w+');
        };

        page.evaluate(function() {
            var arr_mainList = new Array();
            var arr_innerList = new Array();

            try {
                // One iteration per <ul> found under the first .item_grid
                // element; the i-th .lap_thu_box is used as the i-th entry.
                // NOTE(review): i and j are assigned without var, so they are
                // implicit globals in whatever context this function runs.
                for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                    arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);

                    // Text of the first <h3><a> link, followed by ", ".
                    window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
                    //window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href + ", ");

                    // Tries to open the linked page and log its .item_desc text.
                    // The accompanying question reports that PhantomJS stops at
                    // this window.open call.
                    // NOTE(review): getElementsByClassName is called on the
                    // value returned by window.open, not on a document - confirm.
                    var myWindow = window.open(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href);
                    console.log(myWindow.getElementsByClassName("item_desc")[0].textContent);
                    myWindow.close();

                    // Logs the link only when its href is longer than 43
                    // characters; the meaning of 43 is not evident here.
                    if (arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href.length > 43) {
                        var innerURL = arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                        console.log(innerURL);
                    }

                    // data-original attribute of the first <img>, followed by ", ".
                    window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                    arr_innerList.push(arr_mainList[i]); 

                    // Each <li> text with commas removed; entries are separated
                    // by " | " and the last one is followed by ", ".
                    for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {                 
                        if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
                        }
                        else {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
                        }
                    };
                    //window.callPhantom(", ");
                    // .cat_price text with commas removed, then a newline to end the row.
                    window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
                    window.callPhantom("\n");
                };

                // NOTE(review): this assignment sits inside the function passed
                // to page.evaluate; presumably that is a different scope from
                // the script-level loadInProgress - confirm it has any effect
                // on the step driver.
                loadInProgress = true;
                console.log("Successful.");
            }
            catch(ex) {
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step driver: every 5 seconds, run the next step unless a page load is in
// progress. Once no step is left, stop polling and exit PhantomJS.
// `interval` is declared explicitly instead of leaking as an implicit global.
var interval = setInterval(function() {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    if (typeof steps[testindex] !== "function") {
        // No step left: stop the poller so the exit below is scheduled only once.
        clearInterval(interval);

        // Exit after a short delay rather than synchronously from this tick.
        setTimeout(function() {
            // Debug aid: uncomment to dump the final page markup.
            //fs.write('product-list.html', page.content, 'w');
            console.log("test complete!");
            phantom.exit();
        }, 100);
    }
}, 5000);

现在,如果我运行该程序,我会在csv文件中获取所有信息.除了转到window.open以外,phantomJs都会停止.我知道我无法在page.evaluate内打开新页面.但是我需要获取产品说明,并将其添加到csv文件中,以代替产品链接.我已经搜索了几个小时,任何帮助都将是不错的. 注意:我的限制是我必须使用phantomJs.

Now if I run the program, I get all the information in the CSV file, except that when it reaches window.open, PhantomJS stops. I know I can't open a new page inside page.evaluate, but I need to fetch the product description and add it to the CSV file in place of the product link. I have been searching for hours now; any help would be appreciated. Note: my limitation is that I have to use PhantomJS.

推荐答案

我对您的脚本进行了一些修改,现在您可以做任何想做的事情.请注意不要一次抓取太多条目,否则会遇到内存问题.因此,如果所抓取的网站存在分页,请为其编写一个新函数. 在此代码中,我假设您需要每个设备的描述,但您也可以访问其他元素.

I have modified your script a little bit, so now you can do whatever you want. Just keep in mind not to give it too many items to scrape, or you will run into memory issues. So if the website you are scraping uses pagination, write a new function for it. In this code I have assumed that you need the description of every device, but you can also access other elements.

注意:您可能知道跨域策略不允许我们使用javascript/jQuery访问iFrame,所以这将是一个巨大的缺陷.您必须添加

Note: As you may know cross-domain policy does not allow us to access iFrames using javascript/jQuery, this would be a huge flaw. You have to add the

--web-security=no

--web-security=no

在cmd/terminal中执行脚本时

标志.

flag when executing script in cmd/terminal.

var page = new WebPage(), innerPage = new WebPage(), testindex = 0, loadInProgress = false, fs = require('fs'), i = 0, j = 0, k = 0;

page.onConsoleMessage = function(msg) { console.log(msg); };
page.onLoadStarted    = function() { loadInProgress = true; console.log("load started"); };
page.onLoadFinished   = function() { loadInProgress = false; console.log("load finished"); };

// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Enable/Disable Javascript
//page.settings.javascriptEnabled = false;

//IMPORTANT FLAGS
//--web-security=yes/no

var steps = [
  function() { //Load Page
    page.open("http://www.example.com/mobiles-apple/", function() {
        page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");

        page.evaluate(function() {
            try {
                $("#main1").append('<div id="inner-data_iframes"></div>');

                for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                    var iFrameAdd = document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                    $("#inner-data_iframes").append('<iframe id="myIframe' + [i] + '" src="' + iFrameAdd + '"></iframe>');
                    window.document.body.scrollTop = document.body.scrollHeight;
                }
                console.log("Mission Successful.");
            }
            catch(ex) {
                console.log("Failed to add iFrame.");
            }
        });
    });
  },

  function() { //Fetch Products
      page.onCallback = function(result) {
          var fs = require('fs');
          fs.write('product-list.csv', result, 'w+');
    };

    page.evaluate(function() {
        var arr_mainList = new Array();
        var arr_innerList = new Array();

        try {
            for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);

                window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");

                var desc = $("#myIframe" + [i]).contents().find(".item_desc").html();
                desc = desc.replace(/,/g, "");
                window.callPhantom(desc + ", ");

                window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                arr_innerList.push(arr_mainList[i]); 

                for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {

                    if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
                        window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
                    }
                    else {
                        window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
                    }
                }

                window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
                window.callPhantom("\n");
            }

                loadInProgress = true;
                console.log("Successful.");
            }
            catch(ex) {
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step driver: every 5 seconds, run the next step unless a page load is in
// progress. Once no step is left, stop polling and exit PhantomJS.
// `interval` is declared explicitly instead of leaking as an implicit global.
var interval = setInterval(function() {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    if (typeof steps[testindex] !== "function") {
        // No step left: stop the poller so the exit below is scheduled only once.
        clearInterval(interval);

        // Exit after a short delay rather than synchronously from this tick.
        setTimeout(function() {
            // Debug aid: uncomment to dump the final page markup.
            //fs.write('product-list.html', page.content, 'w');
            console.log("test complete!");
            phantom.exit();
        }, 100);
    }
}, 5000);

这篇关于需要在PhantomJS中打开URL数组的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆