使用 Nightmare 在页面之间跳转并边浏览边抓取 [英] Moving between pages and scraping as I go with Nightmare
问题描述
有一个网站,其中某个页面包含一个 25 个条目的列表,每个条目都是一个链接,指向包含我所需信息的页面。我想先到达列表页面,然后:
1)点击链接到第一个条目
2)检索所有html
3)点击返回到列表页面(页面上有一个按钮可以做到这一点)
4)重复所有其他列表
我也想尽可能高效地做到这一点,有人告诉我这意味着要利用 Promise。这是我的代码草稿,它不起作用:
// Asker's sketch, reproduced as posted (tokens restored after translation
// garbling). It is reported to reach the listing page and then stop; the
// logic is intentionally left as the asker wrote it.
var Nightmare = require('nightmare');
var nightmare = Nightmare({ openDevTools: true, show: true })
var Xray = require('x-ray');
var x = Xray();
var resultArr = [];

// First chain: fill in and submit the search form.
// NOTE(review): hidTestURL is not defined in this snippet.
nightmare
  .goto(hidTestURL)
  .wait(2500)
  .click('input[name="propertySearchOptions:advanced"]') //start navigating to listing page
  .wait(2500)
  .type('input[name="propertySearchOptions:streetName"]', 'Main')
  .wait(2500)
  .select('select[name="propertySearchOptions:recordsPerPage"]', '25')
  .wait(2500)
  .click('input[name="propertySearchOptions:search"]') //at listing page
  .wait(2500)
  .then(function(){
    // NOTE(review): this inner chain is not returned from the callback.
    nightmare
      .click('a[href^="Property.aspx?prop_id=228645"]') //first entry
      .evaluate(function(){ //retrieve info
        // This local declaration shadows the outer resultArr.
        var resultArr = [];
        resultArr.push(document.querySelector('html').innerHTML);
      })
  })

// Second, separate chain started on the same nightmare instance.
nightmare
  .click('a[id="propertyHeading_searchResults"]') //return to listing page
  .evaluate(function(){
    return resultArr.push(document.querySelector('html').innerHTML); //retrieve listing page info to show that it returned
  })
  .then(function (resultArr) {
    // The parameter shadows the outer resultArr.
    console.log('resultArr', resultArr);
    x(resultArr[1], 'body@html') //output listing page html
      .write('results.json');
  })
这段代码能到达列表页面,之后就不再继续执行。我也尝试过相同的代码,但除第一次之外,每次使用 nightmare 时都改为 return nightmare。我看过一些使用 return 的例子,但是当我这样做时,代码抛出了一个错误。
我也尝试过不写第三个 nightmare(空行之后的那个),而是试着延续之前的 nightmare 实例,直接进入 .click(),但是这也抛出了一个错误。
我显然需要一些关于 Nightmare 语法和语义的帮助,但除了 API 列表之外,网上没有太多文档。有人知道我该如何让它正常工作吗?
解决方案
首先,像你这样调用 Nightmare——把它拆成两条链——大概不会得到你想要的效果。(这个评论串是一份很好的入门读物,虽然有点长。)如果我没记错,第二条链中的操作会紧接着第一条链立即排入队列,从而导致(很可能)不符合预期的行为。你说你曾经写过略有不同的版本——我很想看看,听起来它可能更接近正确的写法。
第二,你试图在 .evaluate() 中使用外部的 resultArr,这是不可能的。传递给 .evaluate() 的函数会被字符串化,然后在 Electron 内部重新构造——这意味着你会丢失函数周围的环境上下文。如果你感兴趣,nightmare-examples 中的这个例子有更深入的讲解。
第三,也许这是笔误,或者是我误解了你的意图:你的 href 选择器使用了 starts-with( ^= )运算符,这是有意的吗?是否应该是 ends-with( $= )?
第四,循环执行异步操作(https://github.com/rosshinkley/nightmare-examples/blob/master/docs/common-pitfalls/async-operations-loops.md)很棘手。我的印象是这可能也是一个绊脚石?
考虑到以上所有这些,让我们来看看如何修改你的原始脚本。需要说明的是,由于我无法访问你的测试网址,下面的代码未经测试,只是凭经验写的:
/*
 * Reworked scraping script: submit the search form, collect every property
 * link on the listing page, then visit the links one at a time and keep each
 * page's HTML. Untested against the real site.
 * NOTE(review): hidTestURL is not defined in this snippet.
 */
var Nightmare = require('nightmare');
var nightmare = Nightmare({ openDevTools: true, show: true });
var Xray = require('x-ray');
var x = Xray();

// Escape backslashes and double quotes so a value can sit inside a quoted
// CSS attribute selector.
function quoteForSelector(value) {
  return value.replace(/(["\\])/g, '\\$1');
}

nightmare
  .goto(hidTestURL)
  .wait(2500)
  .click('input[name="propertySearchOptions:advanced"]') //start navigating to listing page
  .wait(2500)
  .type('input[name="propertySearchOptions:streetName"]', 'Main')
  .wait(2500)
  .select('select[name="propertySearchOptions:recordsPerPage"]', '25')
  .wait(2500)
  .click('input[name="propertySearchOptions:search"]') //at listing page
  .wait(2500)
  .evaluate(function () {
    // querySelectorAll returns an array-like, so convert it before mapping.
    return Array.from(
      // every anchor whose href contains 'Property.aspx'
      document.querySelectorAll('a[href*="Property.aspx"]'))
      // Read the attribute as written in the markup: `a.href` would give the
      // resolved absolute URL, which an exact [href="..."] selector built
      // from it later would not match when the markup uses a relative href.
      .map(function (a) { return a.getAttribute('href'); });
  })
  .then(function (hrefs) {
    // Two options here: (1) click each link, grab the page, click back; or
    // (2) navigate straight to each link. This follows (1), like the
    // original script, running the visits strictly one after another by
    // folding them into a single promise chain.
    return hrefs.reduce(function (accumulator, href) {
      return accumulator.then(function (results) {
        return nightmare
          //open the entry
          .click('a[href="' + quoteForSelector(href) + '"]')
          //give the page time to load, as after every other click above
          .wait(2500)
          //grab its html
          .evaluate(function () {
            return document.querySelector('html').innerHTML;
          })
          .then(function (html) {
            results.push(html);
            return results;
          })
          .then(function (results) {
            //click the search-results link to go back to the listing page
            return nightmare
              .click('a[id="propertyHeading_searchResults"]')
              .wait(2500)
              .then(function () {
                //hand the accumulated results to the next iteration
                return results;
              });
          });
      });
    }, Promise.resolve([])); //seed the reduce with a promise of an empty array
  })
  .then(function (resultArr) {
    // resultArr holds one HTML string per visited link, in listing order.
    console.log('resultArr', resultArr);
    x(resultArr[1], 'body@html') //HTML of the second entry visited
      .write('results.json');
    return nightmare.end();
  })
  .catch(function (error) {
    // Report failures (e.g. a selector that matched nothing) instead of
    // leaving an unhandled rejection, and still shut Electron down.
    console.error('Scrape failed:', error);
    return nightmare.end();
  });
希望这足以让您开始使用。
There is a website that contains a page with a list of 25 entries, where each entry is a link to a page containing some information that I need. I want to get to the listing page and then: 1) click on the link to the first entry 2) retrieve all the html 3) click back to the listing page (there is a button for this) 4) repeat for every other listing
I would also like to do this as efficiently as possible which I've been told means leveraging promises. Here's my code sketch, which doesn't work:
// Asker's sketch, reported as not working: it drives a search form to reach a
// listing page, then tries to open one entry and return to the listing.
var Nightmare = require('nightmare');
var nightmare = Nightmare({ openDevTools: true, show: true })
var Xray = require('x-ray');
var x = Xray();
// Array declared in the outer script scope.
var resultArr = [];
// First chain: fill in and submit the search form.
// NOTE(review): hidTestURL is not defined in this snippet.
nightmare
.goto(hidTestURL)
.wait(2500)
.click('input[name="propertySearchOptions:advanced"]') //start navigating to listing page
.wait(2500)
.type('input[name="propertySearchOptions:streetName"]', 'Main')
.wait(2500)
.select('select[name="propertySearchOptions:recordsPerPage"]', '25')
.wait(2500)
.click('input[name="propertySearchOptions:search"]') //at listing page
.wait(2500)
.then(function(){
// NOTE(review): this inner chain is not returned from the callback.
nightmare
.click('a[href^="Property.aspx?prop_id=228645"]') //first entry
.evaluate(function(){ //retrieve info
// This local declaration shadows the outer resultArr; the function returns nothing.
var resultArr = [];
resultArr.push(document.querySelector('html').innerHTML);
})
})
// Second, separate chain started on the same nightmare instance.
nightmare
.click('a[id="propertyHeading_searchResults"]') //return to listing page
.evaluate(function(){
// NOTE(review): the prose after the semicolon below is not commented out, so the line does not parse as written.
return resultArr.push(document.querySelector('html').innerHTML); retrieve listing page info to show that it returned.
})
.then(function (resultArr) {
// The parameter shadows the outer resultArr.
console.log('resultArr', resultArr);
x(resultArr[1], 'body@html') //output listing page html
.write('results.json');
})
This gets as far as the listing page, and then does not proceed any further. I also tried the same code, but with `return nightmare` for every use of `nightmare` except the first one. I'd seen some examples that used `return`, but when I did this, the code threw an error.
I also tried not including the third `nightmare` (the one after the blank space), and instead trying to continue the old nightmare instance by going straight to the `.click()`, but this also threw an error.
I clearly need some help with the syntax and semantics of nightmare, but there is not much documentation online besides an API listing. Does anyone know how I can make this work?
First, calling Nightmare like you have it - broken into two chains - is probably not going to do what you want. (This comment thread is a good - albeit long - primer.) If memory serves, actions from the second chain will be queued immediately after the first, resulting in (probably) undesirable behavior. You said you had it written slightly differently - I'd be curious to see it, it sounds like it may have been a little closer.
Second, you're trying to lift `resultArr` in `.evaluate()`, which isn't possible. The function passed to `.evaluate()` is stringified and reconstituted inside of Electron - meaning that you'll lose the ambient context around the function. This example in `nightmare-examples` goes into a little more depth, if you're curious.
Third, and maybe this is a typo or me misunderstanding intent: your `href` selector uses the starts-with (`^=`) operator, is that intentional? Should that be an ends-with (`$=`)?
Fourth, looping over asynchronous operations is tricky. I get the impression that may also be a stumbling block?
With all of that in mind, let's take a look at modifying your original script. Admittedly untested, as I don't have access to your testing URL, so this is a bit from the hip:
/*
 * Reworked scraping script: submit the search form, collect every property
 * link on the listing page, then visit the links one at a time and keep each
 * page's HTML. Untested against the real site.
 * NOTE(review): hidTestURL is not defined in this snippet.
 */
var Nightmare = require('nightmare');
var nightmare = Nightmare({ openDevTools: true, show: true });
var Xray = require('x-ray');
var x = Xray();

// Escape backslashes and double quotes so a value can sit inside a quoted
// CSS attribute selector.
function quoteForSelector(value) {
  return value.replace(/(["\\])/g, '\\$1');
}

nightmare
  .goto(hidTestURL)
  .wait(2500)
  .click('input[name="propertySearchOptions:advanced"]') //start navigating to listing page
  .wait(2500)
  .type('input[name="propertySearchOptions:streetName"]', 'Main')
  .wait(2500)
  .select('select[name="propertySearchOptions:recordsPerPage"]', '25')
  .wait(2500)
  .click('input[name="propertySearchOptions:search"]') //at listing page
  .wait(2500)
  .evaluate(function () {
    // querySelectorAll returns an array-like, so convert it before mapping.
    return Array.from(
      // every anchor whose href contains 'Property.aspx'
      document.querySelectorAll('a[href*="Property.aspx"]'))
      // Read the attribute as written in the markup: `a.href` would give the
      // resolved absolute URL, which an exact [href="..."] selector built
      // from it later would not match when the markup uses a relative href.
      .map(function (a) { return a.getAttribute('href'); });
  })
  .then(function (hrefs) {
    // Two options here: (1) click each link, grab the page, click back; or
    // (2) navigate straight to each link. This follows (1), like the
    // original script, running the visits strictly one after another by
    // folding them into a single promise chain.
    return hrefs.reduce(function (accumulator, href) {
      return accumulator.then(function (results) {
        return nightmare
          //open the entry
          .click('a[href="' + quoteForSelector(href) + '"]')
          //give the page time to load, as after every other click above
          .wait(2500)
          //grab its html
          .evaluate(function () {
            return document.querySelector('html').innerHTML;
          })
          .then(function (html) {
            results.push(html);
            return results;
          })
          .then(function (results) {
            //click the search-results link to go back to the listing page
            return nightmare
              .click('a[id="propertyHeading_searchResults"]')
              .wait(2500)
              .then(function () {
                //hand the accumulated results to the next iteration
                return results;
              });
          });
      });
    }, Promise.resolve([])); //seed the reduce with a promise of an empty array
  })
  .then(function (resultArr) {
    // resultArr holds one HTML string per visited link, in listing order.
    console.log('resultArr', resultArr);
    x(resultArr[1], 'body@html') //HTML of the second entry visited
      .write('results.json');
    return nightmare.end();
  })
  .catch(function (error) {
    // Report failures (e.g. a selector that matched nothing) instead of
    // leaving an unhandled rejection, and still shut Electron down.
    console.error('Scrape failed:', error);
    return nightmare.end();
  });
Hopefully that's enough to get you started.
这篇关于使用 Nightmare 在页面之间跳转并边浏览边抓取的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!