数据抓取多个页面点击循环 [英] Data scraping multiple page clicks loops
问题描述
试图找到一种方法,使用一个 Mechanize 实例抓取 UCAS 网站上我们需要的所有数据,并将其添加到数组中.目前,我们在用 Mechanize 编写链接点击的代码时遇到了困难.想知道是否有人可以提供帮助:需要在循环中连续点击三个链接,以遍历所有搜索结果页面. 第一个链接用于显示某所大学的所有课程,位于 class 为 morecourseslink 的 div 之内
第二个链接用于显示课程名称、学制(duration)和学历资格(qual),位于 class 为 coursenamearea 的 div 中
第三个链接位于 div coursedetailsshowable 中,其 a 标签的 id 为 coursedetailtab_entryreqs
当前,我们正在使用以下内容抓取uniname:
# Controller whose +home+ action scrapes result headings from the UCAS
# provider search and prints them.
class PagesController < ApplicationController
  # Walks every page of the provider search results, appending the text of
  # each `li.result h3` node to @uninames_array, then prints the array.
  def home
    require 'mechanize'
    agent = Mechanize.new
    @uninames_array = []
    page = agent.get('http://search.ucas.com/search/providers?CountryCode=3&RegionCode=&Lat=&Lng=&Feather=&Vac=2&Query=&ProviderQuery=&AcpId=&Location=scotland&IsFeatherProcessed=True&SubjectCode=&AvailableIn=2016')
    loop do
      page.search('li.result h3').each { |heading| @uninames_array << heading.text }
      # The walk ends once the page has no pager anchor whose text is ">".
      forward = page.at('.pager a[text()=">"]')
      break unless forward

      page = agent.get(forward['href'])
    end
    puts @uninames_array.to_s
  end
end
课程名称和持续时间如下:
require 'mechanize'

# Scrapes course durations, qualifications and names for provider 41 from
# every page of the UCAS course search results.
#
# Each page is visited exactly once and all three fields are harvested from
# it before following the pager. Running a separate pagination loop per
# field does not work: the first loop leaves `page` on the last results
# page, so the later loops find no ">" link and collect nothing.
mechanize = Mechanize.new
@duration_array = []
@qual_array = []
@courses_array = []
page = mechanize.get('http://search.ucas.com/search/results?Vac=2&AvailableIn=2016&IsFeatherProcessed=True&page=1&providerids=41')
loop do
  # Store stripped text (not the Nokogiri node) so all three arrays hold
  # plain strings, and so the first page is stored as well as printed.
  page.search('div.courseinfoduration').each do |x|
    duration = x.text.strip
    @duration_array.push(duration)
    puts duration
  end
  page.search('div.courseinfooutcome').each do |y|
    qualification = y.text.strip
    @qual_array.push(qualification)
    puts qualification
  end
  page.search('div.coursenamearea h4').each do |h4|
    course_name = h4.text.strip
    @courses_array.push(course_name)
    puts course_name
  end

  # Stop once the page has no pager anchor whose text is ">".
  next_page_link = page.at('.pager a[text()=">"]')
  break unless next_page_link

  page = mechanize.get(next_page_link['href'])
end
如果要使用一个Mechanize实例执行此操作,为什么不将它们全部串在一起,并把需要来回跳转的页面存储在变量中?
如果您所有的代码都能正常工作,那么您可以将它们简单地串联到一个方法调用中:
# Scrapes the UCAS site with a single Mechanize instance.
#
# Fills @uninames_array with the text of every `li.result h3` node across
# the provider search results, then fills @duration_array, @qual_array and
# @courses_array from every page of the course results for provider 41.
def home
  require 'mechanize'
  mechanize = Mechanize.new

  @uninames_array = []
  page = mechanize.get('http://search.ucas.com/search/providers?CountryCode=3&RegionCode=&Lat=&Lng=&Feather=&Vac=2&Query=&ProviderQuery=&AcpId=&Location=scotland&IsFeatherProcessed=True&SubjectCode=&AvailableIn=2016')
  loop do
    page.search('li.result h3').each { |h3| @uninames_array.push(h3.text) }
    # Stop once the page has no pager anchor whose text is ">".
    next_page_link = page.at('.pager a[text()=">"]')
    break unless next_page_link

    page = mechanize.get(next_page_link['href'])
  end

  @duration_array = []
  @qual_array = []
  @courses_array = []
  page = mechanize.get('http://search.ucas.com/search/results?Vac=2&AvailableIn=2016&IsFeatherProcessed=True&page=1&providerids=41')
  loop do
    # Harvest all three fields from the current page before paging on; a
    # separate pagination loop per field would leave the later loops
    # starting on the last page with nothing left to visit.
    page.search('div.courseinfoduration').each do |x|
      duration = x.text.strip
      @duration_array.push(duration)
      puts duration
    end
    page.search('div.courseinfooutcome').each do |y|
      qualification = y.text.strip
      @qual_array.push(qualification)
      puts qualification
    end
    page.search('div.coursenamearea h4').each do |h4|
      course_name = h4.text.strip
      @courses_array.push(course_name)
      puts course_name
    end

    next_page_link = page.at('.pager a[text()=">"]')
    break unless next_page_link

    page = mechanize.get(next_page_link['href'])
  end
end
Trying to figure out a way to use one Mechanize instance to scrape all of the data we want from the UCAS website and add it to arrays. Currently we're struggling with coding the link clicks in Mechanize. Wondering if anyone can help: there are three successive link clicks amidst loops to progress through all search result pages. The first link, which displays all courses for a university, is within the div with class morecourseslink
the second link to display course names, duration and qual is in div class coursenamearea
the third link is in div coursedetailsshowable and the a id is coursedetailtab_entryreqs
currently we are scraping uninames with the below:
# Controller whose +home+ action scrapes result headings from the UCAS
# provider search and prints them.
class PagesController < ApplicationController
  # Walks every page of the provider search results, appending the text of
  # each `li.result h3` node to @uninames_array, then prints the array.
  def home
    require 'mechanize'
    agent = Mechanize.new
    @uninames_array = []
    page = agent.get('http://search.ucas.com/search/providers?CountryCode=3&RegionCode=&Lat=&Lng=&Feather=&Vac=2&Query=&ProviderQuery=&AcpId=&Location=scotland&IsFeatherProcessed=True&SubjectCode=&AvailableIn=2016')
    loop do
      page.search('li.result h3').each { |heading| @uninames_array << heading.text }
      # The walk ends once the page has no pager anchor whose text is ">".
      forward = page.at('.pager a[text()=">"]')
      break unless forward

      page = agent.get(forward['href'])
    end
    puts @uninames_array.to_s
  end
end
And course names duration and qualification from the below:
require 'mechanize'

# Scrapes course durations, qualifications and names for provider 41 from
# every page of the UCAS course search results.
#
# Each page is visited exactly once and all three fields are harvested from
# it before following the pager. Running a separate pagination loop per
# field does not work: the first loop leaves `page` on the last results
# page, so the later loops find no ">" link and collect nothing.
mechanize = Mechanize.new
@duration_array = []
@qual_array = []
@courses_array = []
page = mechanize.get('http://search.ucas.com/search/results?Vac=2&AvailableIn=2016&IsFeatherProcessed=True&page=1&providerids=41')
loop do
  # Store stripped text (not the Nokogiri node) so all three arrays hold
  # plain strings, and so the first page is stored as well as printed.
  page.search('div.courseinfoduration').each do |x|
    duration = x.text.strip
    @duration_array.push(duration)
    puts duration
  end
  page.search('div.courseinfooutcome').each do |y|
    qualification = y.text.strip
    @qual_array.push(qualification)
    puts qualification
  end
  page.search('div.coursenamearea h4').each do |h4|
    course_name = h4.text.strip
    @courses_array.push(course_name)
    puts course_name
  end

  # Stop once the page has no pager anchor whose text is ">".
  next_page_link = page.at('.pager a[text()=">"]')
  break unless next_page_link

  page = mechanize.get(next_page_link['href'])
end
If you want to do this with one Mechanize instance why not just string them all together and store the pages you need to jump to and from in variables?
If all your code works then you can simply string them together into one method call:
# Scrapes the UCAS site with a single Mechanize instance.
#
# Fills @uninames_array with the text of every `li.result h3` node across
# the provider search results, then fills @duration_array, @qual_array and
# @courses_array from every page of the course results for provider 41.
def home
  require 'mechanize'
  mechanize = Mechanize.new

  @uninames_array = []
  page = mechanize.get('http://search.ucas.com/search/providers?CountryCode=3&RegionCode=&Lat=&Lng=&Feather=&Vac=2&Query=&ProviderQuery=&AcpId=&Location=scotland&IsFeatherProcessed=True&SubjectCode=&AvailableIn=2016')
  loop do
    page.search('li.result h3').each { |h3| @uninames_array.push(h3.text) }
    # Stop once the page has no pager anchor whose text is ">".
    next_page_link = page.at('.pager a[text()=">"]')
    break unless next_page_link

    page = mechanize.get(next_page_link['href'])
  end

  @duration_array = []
  @qual_array = []
  @courses_array = []
  page = mechanize.get('http://search.ucas.com/search/results?Vac=2&AvailableIn=2016&IsFeatherProcessed=True&page=1&providerids=41')
  loop do
    # Harvest all three fields from the current page before paging on; a
    # separate pagination loop per field would leave the later loops
    # starting on the last page with nothing left to visit.
    page.search('div.courseinfoduration').each do |x|
      duration = x.text.strip
      @duration_array.push(duration)
      puts duration
    end
    page.search('div.courseinfooutcome').each do |y|
      qualification = y.text.strip
      @qual_array.push(qualification)
      puts qualification
    end
    page.search('div.coursenamearea h4').each do |h4|
      course_name = h4.text.strip
      @courses_array.push(course_name)
      puts course_name
    end

    next_page_link = page.at('.pager a[text()=">"]')
    break unless next_page_link

    page = mechanize.get(next_page_link['href'])
  end
end
这篇关于数据抓取多个页面点击循环的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!