如何在不分页的情况下抓取下一页 [英] How to scrape the next page without pagination

查看:30
本文介绍了如何在不分页的情况下抓取下一页的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

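此链接 "https://www.muscleandstrength.com/store/promos.html" 会将您带到一个列出其全部 600 多件折扣商品的网站。我的目标是把它们全部抓取下来。但我遇到的主要问题是,它没有常规的数字分页(例如:prev 1, 2, 3, 4, 5... next),也没有可用于获取下一页的 GET URL。我必须点击 "View Next 20 More Products" 才能显示下一批商品。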

要检索所有产品,您只需在 filter 中将 "limit" 参数设置为产品总数、将 "offset" 参数设置为 0,然后重放相同的 XHR 并解析响应。在 Preview 中可以看到响应实际上是 JSON,需要解析的 HTML 内容位于 content 属性中:

以下示例展示了如何做到这一点.转到控制台选项卡.复制以下代码,将其粘贴到控制台中,然后按 Enter:

/**
 * Browser-console snippet: requests every promo product in a single XHR,
 * replaces the page body with the returned HTML, and renders a summary table
 * (#, Product, Price, Rating, Reviews) built from that HTML.
 *
 * Must be run in the console of the promos page, because it reads the product
 * counter from the current document and relies on the browser DOM.
 */
(function () {
    // Trimmed visible text of the first descendant of `parent` matching
    // `selector`, or "" when no such element exists.
    var textOf = function (parent, selector) {
        var el = parent.querySelector(selector);
        return el ? el.innerText.trim() : "";
    };

    // Read the total number of products from the on-page counter.
    var countEl = document.querySelector("span.search-result-available-count");
    var count = countEl ? parseInt(countEl.innerText.replace(/\D/g, ""), 10) : NaN;
    if (isNaN(count)) {
        throw new Error("Could not read the total product count from the page");
    }

    // Ask for all products at once: limit = total count, offset = 0.
    var filter = {
        order: "sort_order desc,sales_ranking asc",
        brandfilter: [],
        categoryfilter: [],
        classfilter: [],
        limit: count,
        offset: 0
    };
    var x = new XMLHttpRequest();
    // Third argument `false` makes the request synchronous, so the page blocks
    // until the response arrives.
    x.open("POST", "https://www.muscleandstrength.com/store/promos/index/filter", false);
    x.setRequestHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
    // Form-urlencoded bodies spell a space as "+", hence the %20 replacement.
    x.send("filter=" + encodeURIComponent(JSON.stringify(filter)).replace(/%20/g, "+"));
    if (x.status < 200 || x.status >= 300) {
        throw new Error("Filter request failed with HTTP status " + x.status);
    }

    // The response is JSON; its `content` property holds the products' HTML.
    document.body.innerHTML = JSON.parse(x.responseText).content;

    // Create the output table.
    var table = document.createElement("table");
    document.body.appendChild(table);
    table.style = 'margin:10px;';

    // `row` always points at the row currently being filled.
    var row;
    // Append a cell to the current row. The value is written as plain text so
    // that characters such as "<" or "&" in scraped text are not parsed as HTML.
    var addCell = function (value) {
        var cell = row.insertCell(-1);
        cell.textContent = value;
    };

    // Add table header and body.
    var tHead = table.createTHead();
    row = tHead.insertRow(-1);
    ["#", "Product", "Price", "Rating", "Reviews"].forEach(function (title) {
        addCell(title);
    });
    var tBody = document.createElement("tbody");
    table.appendChild(tBody);

    // Parse each product into one table row.
    var products = document.querySelectorAll("div.product-info");
    for (var i = 0; i < products.length; i++) {
        var product = products[i];
        row = tBody.insertRow(-1);
        addCell(i + 1);
        addCell(textOf(product, "a.product-name"));
        addCell(textOf(product, "div.price"));
        // The rating is taken from the inline width of the rating element.
        var ratingEl = product.querySelector("div.rating");
        addCell(ratingEl ? ratingEl.style.width : "");
        // Keep only the digits of the review counter.
        addCell(textOf(product, "span.review-count").replace(/\D/g, ""));
    }

    // Remove the original product markup, leaving only the table.
    var leftover = document.querySelector("div.promo-products");
    if (leftover) {
        leftover.remove();
    }
})();

请求是同步的,因此需要等待一段时间才能完成。最后,所有产品都会被解析到表格中,我得到的输出如下:

This link "https://www.muscleandstrength.com/store/promos.html" will redirect you to a site which shows the list of all their 600+ discounted items. My goal is to scrape them all. But the main problem I am encountering is that it doesn't have regular pagination with numbers on it (e.g. prev 1, 2, 3, 4, 5... next) or a GET URL that I can use to retrieve the next page. I have to click "View Next 20 More Products" to display the next batch. I have no problem scraping a website with pagination on it, but this one is giving me a headache.

The link below is the main JS file that is triggered whenever the next button is clicked:

https://cdn.muscleandstrength.com/store/media/js/ec00de517e571209f780e1b62a3967fd-1534086933.js

In it you will find a line like this:

e.post("www.muscleandstrength.com/store/promos/index/filter",{filter:JSON.stringify(t)}

If you go to this link "www.muscleandstrength.com/store/promos/index/filter", and scroll all the way down to the bottom, you will discover something like this.

filters":{"classfilter":{"lowprice":125,"b1g1":123,"twopacks":119,"couponcode":118,"b1g50":79,"bxgy":36,"b2g1":16},"brandfilter":{"78":77,"84":45,"190":37,"383":26,"120":24,"82":23,"108":22,"42":21,"30":18,"150":18,"133":16,"151":14,"489":14,"490":13,"69":12,"65":11,"369":10,"193":9,"232":9,"53":8,"335":8,"81":7,"93":7,"426":7,"471":7,"67":6,"106":6,"423":6,"43":5,"98":5,"241":5,"267":5,"274":5,"432":5,"481":5,"493":5,"59":4,"64":4,"144":4,"279":4,"292":4,"389":4,"109":3,"183":3,"223":3,"330":3,"421":3,"444":3,"523":3,"76":2,"97":2,"111":2,"132":2,"147":2,"191":2,"196":2,"244":2,"331":2,"344":2,"346":2,"403":2,"406":2,"428":2,"435":2,"38":1,"46":1,"48":1,"51":1,"68":1,"92":1,"116":1,"139":1,"189":1,"194":1,"252":1,"302":1,"303":1,"322":1,"326":1,"380":1,"424":1,"439":1,"450":1,"472":1},"categoryfilter":{"21":119,"9":112,"13":97,"582":76,"57":75,"408":64,"29":62,"130":49,"26":45,"605":45,"58":42,"12":35,"10":34,"140":34,"131":29,"469":29,"44":28,"177":25,"670":25,"55":24,"580":23,"597":23,"22":22,"17":21,"27":21,"40":21,"464":21,"667":21,"28":19,"87":19,"219":19,"452":18,"441":17,"635":16,"684":16,"25":14,"42":14,"544":14,"694":14,"88":13,"132":13,"695":13,"220":12,"562":12,"147":11,"205":11,"53":10,"154":10,"389":10,"423":10,"425":10,"617":10,"51":9,"530":9,"623":9,"686":9,"16":8,"19":8,"86":8,"304":8,"492":8,"554":8,"592":8,"638":8,"52":7,"122":7,"378":7,"410":7,"509":7,"572":7,"594":7,"637":7,"83":6,"171":6,"180":6,"303":6,"409":6,"496":6,"559":6,"666":6,"59":5,"84":5,"142":5,"172":5,"187":5,"302":5,"390":5,"391":5,"400":5,"431":5,"466":5,"512":5,"583":5,"593":5,"632":5,"653":5,"690":5,"696":5,"20":4,"56":4}},"num_results":616}

I think this is what is responsible for displaying the next batch of items.

My main question is, what is the easiest way to scrape all these items regardless of the fact that it does not have regular pagination or GET url that I can use to go to the next page?

解决方案

Open the webpage https://www.muscleandstrength.com/store/promos.html in a browser (I used Chrome). Press F12 to open Developer Tools. Go to the Network tab. Here you can see all logged requests. If you click on "View Next 20 Products" on the webpage, a new request will be logged. Filter by XHR requests. The data necessary to retrieve all products can be found under Headers; for me it looks as follows:

To retrieve all products, you just need to set the "limit" parameter to the total quantity and the "offset" parameter to 0 within the filter, replay the same XHR, and parse the response. In Preview you can see that the response is actually JSON, and the HTML content to be parsed is located in the content property:

Here is an example showing how that could be done. Go to the Console tab. Copy the code below, paste it into the console, and press Enter:

/**
 * Browser-console snippet: requests every promo product in a single XHR,
 * replaces the page body with the returned HTML, and renders a summary table
 * (#, Product, Price, Rating, Reviews) built from that HTML.
 *
 * Must be run in the console of the promos page, because it reads the product
 * counter from the current document and relies on the browser DOM.
 */
(function () {
    // Trimmed visible text of the first descendant of `parent` matching
    // `selector`, or "" when no such element exists.
    var textOf = function (parent, selector) {
        var el = parent.querySelector(selector);
        return el ? el.innerText.trim() : "";
    };

    // Read the total number of products from the on-page counter.
    var countEl = document.querySelector("span.search-result-available-count");
    var count = countEl ? parseInt(countEl.innerText.replace(/\D/g, ""), 10) : NaN;
    if (isNaN(count)) {
        throw new Error("Could not read the total product count from the page");
    }

    // Ask for all products at once: limit = total count, offset = 0.
    var filter = {
        order: "sort_order desc,sales_ranking asc",
        brandfilter: [],
        categoryfilter: [],
        classfilter: [],
        limit: count,
        offset: 0
    };
    var x = new XMLHttpRequest();
    // Third argument `false` makes the request synchronous, so the page blocks
    // until the response arrives.
    x.open("POST", "https://www.muscleandstrength.com/store/promos/index/filter", false);
    x.setRequestHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
    // Form-urlencoded bodies spell a space as "+", hence the %20 replacement.
    x.send("filter=" + encodeURIComponent(JSON.stringify(filter)).replace(/%20/g, "+"));
    if (x.status < 200 || x.status >= 300) {
        throw new Error("Filter request failed with HTTP status " + x.status);
    }

    // The response is JSON; its `content` property holds the products' HTML.
    document.body.innerHTML = JSON.parse(x.responseText).content;

    // Create the output table.
    var table = document.createElement("table");
    document.body.appendChild(table);
    table.style = 'margin:10px;';

    // `row` always points at the row currently being filled.
    var row;
    // Append a cell to the current row. The value is written as plain text so
    // that characters such as "<" or "&" in scraped text are not parsed as HTML.
    var addCell = function (value) {
        var cell = row.insertCell(-1);
        cell.textContent = value;
    };

    // Add table header and body.
    var tHead = table.createTHead();
    row = tHead.insertRow(-1);
    ["#", "Product", "Price", "Rating", "Reviews"].forEach(function (title) {
        addCell(title);
    });
    var tBody = document.createElement("tbody");
    table.appendChild(tBody);

    // Parse each product into one table row.
    var products = document.querySelectorAll("div.product-info");
    for (var i = 0; i < products.length; i++) {
        var product = products[i];
        row = tBody.insertRow(-1);
        addCell(i + 1);
        addCell(textOf(product, "a.product-name"));
        addCell(textOf(product, "div.price"));
        // The rating is taken from the inline width of the rating element.
        var ratingEl = product.querySelector("div.rating");
        addCell(ratingEl ? ratingEl.style.width : "");
        // Keep only the digits of the review counter.
        addCell(textOf(product, "span.review-count").replace(/\D/g, ""));
    }

    // Remove the original product markup, leaving only the table.
    var leftover = document.querySelector("div.promo-products");
    if (leftover) {
        leftover.remove();
    }
})();

The request is synchronous, so you need to wait a while for it to complete. Finally, all products are parsed into a table; the output for me is as follows:

这篇关于如何在不分页的情况下抓取下一页的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆