Option Explicit

Public Sub ousefulMashup()
    ' thanks to Tony Hirst for the data and method.
    ' http://blog.ouseful.info/2013/05/05/questioning-election-data-to-see-if-it-has-a-story-to-tell/
    Dim ds As cDataSet, dr As cDataRow, a As Variant, _
        worksheetName As String, scraperName As String, _
        job As cJobject, joc As cJobject, inWard As String, _
        n As Long

    worksheetName = "questionElection"
    scraperName = "iw_poll_notices_scrape"

    ' get data from Tony's ScraperWiki and populate the sheet
    With scraperWikiStuff(scraperName, worksheetName)
        Set ds = New cDataSet
        ds.load worksheetName
    End With

    ' add extra columns
    With lastCell(ds.headingRow.where)
        .Offset(, 1).value = "postcode"
        .Offset(, 2).value = "in ward"
    End With
    ds.tearDown

    ' repopulate with the new columns
    Set ds = New cDataSet
    With ds.load(worksheetName)
        ' extract the post code from the last element of each address
        For Each dr In ds.rows
            a = Split(dr.toString("address"), ",")
            If arrayLength(a) > 0 Then
                dr.cell("postcode").value = Trim(CStr(a(UBound(a))))
            End If
        Next dr
        .bigCommit
        .tearDown
    End With

    ' use the mySociety API to get ward info
    ' these options will not bother trying to populate
    With restQuery(worksheetName, "my society", , "postcode", _
                   , , , False, False)
        ' check for jObjects of type UTE
        n = 0
        For Each job In .jObjects
            n = n + 1
            inWard = "out"
            If Not job.childExists("areas") Is Nothing Then
                For Each joc In job.child("areas").children
                    If Not joc.childExists("type") Is Nothing Then
                        If joc.child("type").value = "UTE" Then
                            ' we have the right type, check the name matches
                            If makeKey(joc.child("name").value) = _
                                    makeKey(.dSet.value(n, "ward")) Then
                                inWard = "in"
                                Exit For
                            End If
                        End If
                    End If
                Next joc
                ' mark whether it's in or out
                .dSet.cell(n, "in ward").value = inWard
            End If
        Next job
        .dSet.bigCommit
        .tearDown
    End With
End Sub
Public Sub swSeewhatworks()
    ' write the generated default SQL for each scraper alongside its row
    Dim ds As New cDataSet, dr As cDataRow
    ds.populateData wholeSheet("scraperwiki"), , , , , , True
    Application.Calculation = xlCalculationManual
    For Each dr In ds.rows
        dr.where.Resize(, 1).Offset(, dr.columns.count).value = _
            swGetDefaultTableSql(dr.toString("short_name"), False)
    Next dr
    Application.Calculation = xlCalculationAutomatic
    Set ds = Nothing
End Sub
Public Sub testScraperWikiInput()
    testScraperWikiData InputBox("shortname?")
End Sub

Public Sub testScraperWikiData(shortName As String)
    scraperWikiStuff shortName, "scraperwikidata"
End Sub
Private Function swGetTables(shortName As String) As cRest
    Const tableDirectory = "SELECT name FROM sqlite_master " & _
        "WHERE type IN ('table','view') AND name NOT LIKE 'sqlite_%' " & _
        "UNION ALL " & _
        "SELECT name FROM sqlite_temp_master " & _
        "WHERE type IN ('table','view') " & _
        "ORDER BY 1"
    ' let's see if we can get the tables that exist in this scraperwiki
    Set swGetTables = restQuery(, "scraperwikidata", _
        shortName & "&query=" & tableDirectory, , , , , False)
End Function
Private Function swGetDefaultTableSql(shortName As String, Optional complain As Boolean = True) As String
    ' look up which tables are defined in a given scraperwiki
    Dim s As String, cr As cRest
    Set cr = swGetTables(shortName)
    If cr Is Nothing Then
        MsgBox ("could not get info on " & shortName)
    Else
        If cr.jObject.hasChildren Then
            ' this is hokey - for the moment just take the first table found
            swGetDefaultTableSql = "select * from '" & _
                cr.jObject.children(1).child("name").toString & "'"
        Else
            If complain Then MsgBox ("could not find any valid tables for " & _
                shortName & " (" & cr.jObject.serialize & ")")
        End If
    End If
End Function
Private Function scraperWikiStuff(shortName As String, ws As String, _
        Optional optSql As String = vbNullString, Optional optLimit As Long = 0) As cDataSet

    Dim cr As cRest, ds As cDataSet, job As cJobject, r As Range, _
        cj As cJobject, headJob As cJobject, sql As String, limit As String

    sql = optSql
    If sql = vbNullString Then
        sql = swGetDefaultTableSql(shortName)
    End If
    If optLimit <> 0 Then
        limit = "&limit=" & CStr(optLimit)
    End If

    ' get the data
    Set cr = restQuery(, "scraperwikidata", _
        shortName & "&query=" & sql & limit, , , , , False)

    ' now organize it
    If Not cr Is Nothing Then
        ' get the unique headers and put them to a clean data set
        Set headJob = swGetHeaders(cr.jObject)
        If headJob Is Nothing Then
            MsgBox ("didn't work at all " & cr.jObject.serialize)
        Else
            Set ds = swCleanSheet(headJob, ws)
            If ds Is Nothing Then
                MsgBox ("failed to get the expected data " & cr.jObject.serialize)
            Else
                Application.Calculation = xlCalculationManual
                With ds
                    Set r = firstCell(.headingRow.where)
                    ' this is the data returned - each array member is a row
                    For Each cj In cr.jObject.children
                        Set r = r.Offset(1)
                        ' each child is a column
                        For Each job In cj.children
                            r.Offset(, .headingRow.exists(job.key).column - 1).value = job.value
                        Next job
                    Next cj
                    ' repopulate
                    Set scraperWikiStuff = .rePopulate
                End With
                Application.Calculation = xlCalculationAutomatic
            End If
        End If
    End If
End Function
Private Function swCleanSheet(job As cJobject, ws As String) As cDataSet
    ' put the headers to a clean sheet
    Dim ds As New cDataSet, cj As cJobject, r As Range
    Set r = firstCell(wholeSheet(ws))
    r.Worksheet.Cells.ClearContents
    ' these are the headings
    If job.children.count > 0 Then
        For Each cj In job.children
            r.Offset(, cj.childIndex - 1).value = cj.key
        Next cj
        ' create a data set
        Set swCleanSheet = ds.populateData(r.Resize(1, job.children.count))
    End If
End Function
Private Function swGetHeaders(job As cJobject) As cJobject
    ' collect the column headers found in the ScraperWiki data
    Dim cj As cJobject, jo As cJobject, cjKeys As New cJobject
    With cjKeys.init(Nothing)
        For Each cj In job.children
            For Each jo In cj.children
                ' we can use a cJobject as a collection
                .add jo.key
            Next jo
        Next cj
    End With
    Set swGetHeaders = cjKeys
End Function
## Page class
require 'net/http'
require 'uri'
require 'open-uri'

class Page
  def initialize(url)
    @url = url
    @uri = URI.parse(url)
    @response = Net::HTTP.get_response(@uri)
    @body = @response.body
  end

  def title
    puts "\nTitle(s):"
    # guard against pages that have no <title> tag
    match = /<title>(.*)<\/title>/.match(@body.to_s)
    p match[1] if match
  end
end
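# A minimal usage sketch for the Page class above; the URL is only an example -
# any reachable page with a <title> tag will do. The HTTP fetch happens in
# Page#initialize, and #title prints whatever the stored body's <title> contains.
page = Page.new('http://www.example.com')
page.title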
## Task class
require 'mysql'

module OrgSocGraph
  FIELDS = {
    "orgs.csv" => [
      # anonymous field object - it only needs to respond to #name
      Class.new(Object) do
        def name
          :description
        end
      end.new
    ]
  }

  START_JOBS = [
    Class.new(BaseJob) do
      # the parent job doesn't do anything itself
      def url
        "http://www.example.com"
      end

      def get_children(doc)
        # MySQL query to get the URLs of the child jobs
        # (Mysql is the class supplied by the 'mysql' gem)
        dbh = Mysql.real_connect("hostname", "dbuser", "password", "database")
        # limit to 10 for a test run
        res = dbh.query('SELECT website_url FROM tbl_organizations WHERE website_url != "" LIMIT 10')
        children = []
        res.each_hash do |r|
          # each child job gets its url from BaseJobWithURL
          children << Class.new(BaseJobWithURL) do
            def execute(doc, data_store, fields)
              # crawl for the meta description
              data_store.add_item("orgs.csv", [
                url,
                doc.css("meta[name='description']").first
              ])
            end
          end.new(r["website_url"])
        end
        children
      end
    end.new
  ]
end
## Job class
class BaseJob
  def document
    doc = nil
    begin
      doc = Nokogiri::HTML(open(url))
    rescue
      puts "problem opening uri"
    end
    doc
  end

  def execute(doc, data_store, fields)
  end

  def get_children(doc)
    []
  end
end

class BaseJobWithURL < BaseJob
  attr_accessor :url

  def initialize(url)
    @url = url
  end
end
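## JobExecutor sketch
# main.rb below requires everything in lib/, which is where the job executor and
# its data store live, but that code is not part of this listing. The classes
# here are a hypothetical, minimal sketch inferred only from the calls the jobs
# and main.rb make (JobExecutor.new(fields), add_jobs, run, data_store.add_item
# and data_store.get_items) - the real implementation may differ.
class DataStore
  def initialize
    @items = Hash.new { |hash, key| hash[key] = [] }
  end

  # each item is one output row destined for the named CSV file
  def add_item(file, row)
    @items[file] << row
  end

  def get_items(file)
    @items[file]
  end
end

class JobExecutor
  attr_reader :data_store

  def initialize(fields)
    @fields = fields
    @queue = []
    @data_store = DataStore.new
  end

  def add_jobs(jobs)
    @queue.concat(jobs)
  end

  # run each queued job against its fetched document, then queue whatever
  # child jobs it reports
  def run
    until @queue.empty?
      job = @queue.shift
      doc = job.document
      next if doc.nil?
      job.execute(doc, @data_store, @fields)
      add_jobs(job.get_children(doc))
    end
  end
end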
## main Ruby script (main.rb)
# for compatibility with 1.8.x, require rubygems
require 'rubygems'
require 'open-uri'
# 1.8.x requires <= 1.5.0 of Nokogiri
require 'nokogiri'
require 'csv'
require 'mechanize'

Dir[File.dirname(__FILE__) + '/lib/*.rb'].each { |file| require file }
Dir[File.dirname(__FILE__) + '/tasks/*.rb'].each { |file| require file }

ARGV.each do |mod|
  jobs = eval("#{mod}::START_JOBS")
  fields = eval("#{mod}::FIELDS")
  je = JobExecutor.new(fields)
  je.add_jobs(jobs)
  je.run
  fields.each_pair do |file, columns|
    CSV.open("output/#{file}", "wb") do |csv|
      csv << ['source_url'] + columns.map { |c| c.name.to_s }
      je.data_store.get_items(file).each do |record|
        csv << record.map { |r| HTMLCleaning::clean(r.to_s, :convert_to_plain_text => true) }
      end
    end
  end
end
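# Example invocation: the module name comes from the command line, so the
# OrgSocGraph task above would be run (from the directory holding main.rb,
# lib/ and tasks/, with an output/ directory already created) as:
#   ruby main.rb OrgSocGraph
# which writes output/orgs.csv with a source_url column plus one column per
# field object listed in OrgSocGraph::FIELDS.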
# Used gem 'anemone', git: 'https://github.com/efrat-safanov/anemone.git', branch: 'next'
# You can use it as a Rails service or just run it once in the Rails console
# This script can take a long time to run if your app has a large number of pages
require 'nokogiri'
require 'net/http'
require 'net/https'
require 'uri'
require 'anemone'
require 'csv'

class FindBrokenLinksService
  ROOT = 'http://your-web-site.com'.freeze
  EXTENTIONS = %w(.jpg .jpeg .png .doc .pdf .js .css .xml .csv .exe .zip .gzip .rar).freeze
  # LinkedIn always responds with HTTP 999 - it blocks all web scrapers
  BLOCKED_LINKS = %w(linkedin.com).freeze
  URL_REGEXP = /\A#{URI.regexp(%w(http https))}\z/

  def find_broken_links
    options = { discard_page_bodies: true, verbose: false, depth_limit: 10, links_limit: 100_000,
                pages_queue_limit: 200, read_timeout: 10, skip_query_strings: true }
    write_to_file(options)
    # check all links for uniqueness, because the crawler can visit the same page twice
    remove_duplications
  end
  private

  def write_to_file(options)
    CSV.open('BrokenLinks-intermediate.csv', 'w') do |file|
      CSV.open("LogFile-#{DateTime.current}.csv", 'w') do |log|
        file << ['Code', 'Source', 'Link text', 'Link']
        started_at = Time.current
        start_crawler(options, log, file)
        finished_at = Time.current
        time_diff = finished_at - started_at
        write_script_execution_time(file, time_diff, started_at, finished_at)
        write_script_execution_time(log, time_diff, started_at, finished_at)
      end
    end
  end
  def start_crawler(options, log, file)
    Anemone.crawl(ROOT, options) do |anemone|
      # to prevent a memory leak it is better to use the 'storage' option
      # when checking an app with a large number of links
      anemone.storage = Anemone::Storage.Redis
      # list here the pages which shouldn't be checked
      anemone.skip_links_like %r{/users/auth}
      anemone.skip_links_like %r{/user/}
      anemone.skip_links_like %r{/documents/}
      # Regexp.union escapes the extensions so their dots are matched literally
      anemone.skip_links_like /#{Regexp.union(EXTENTIONS)}$/
      anemone.on_every_page do |page|
        check_every_page(page, file)
      end
      anemone.after_crawl do |pages|
        log << ['Error! Found only 1 page. Is the server down?'] if pages.size == 1
      end
    end
  end
  def check_every_page(page, file)
    links = page.doc.css('a')
    links.each do |link|
      current_link = link.attribute('href').to_s.strip
      next if current_link.blank?
      next if current_link.start_with?('mailto', 'javascript', 'tel', '/', '#')
      next if BLOCKED_LINKS.any? { |word| current_link.include?(word) }
      next if EXTENTIONS.any? { |exten| current_link.include?(exten) }
      if current_link !~ URL_REGEXP
        file << ['Wrong Format', "#{ROOT}#{page.url.path}", link.text.to_s,
                 link.attribute('href').to_s]
      elsif broken_external_link?(current_link)
        file << [broken_external_link?(current_link).to_s, "#{ROOT}#{page.url.path}",
                 link.text.to_s, link.attribute('href').to_s]
      end
    end
  rescue StandardError
    # the log handle is not in scope here, so record the failure in the output file
    file << ['Exception', "#{ROOT}#{page.url.path}"]
  end
  def broken_external_link?(href)
    response = Net::HTTP.get_response(URI.parse(href))
    # return the status code only when it falls outside the 2xx/3xx range
    response.code unless response.code.to_i >= 200 && response.code.to_i < 400
  rescue StandardError
    'Unavailable'
  end

  def write_script_execution_time(file_name, time_diff, started_at, finished_at)
    file_name << ['']
    file_name << ['Started at', started_at.to_s]
    file_name << ['Finished at', finished_at.to_s]
    file_name << ['Execution time:', Time.at(time_diff.to_i.abs).utc.strftime('%H:%M:%S')]
  end

  def remove_duplications
    File.open("BrokenLinks-#{DateTime.current}.csv", 'w') do |f|
      f.puts File.readlines('BrokenLinks-intermediate.csv').uniq
    end
    File.delete('BrokenLinks-intermediate.csv')
  end
end
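# Usage sketch: as the header comment suggests, set ROOT to your site's base URL
# and run the service once from the Rails console (DateTime.current, Time.current
# and blank? assume ActiveSupport is loaded):
#   FindBrokenLinksService.new.find_broken_links
# The run leaves BrokenLinks-<timestamp>.csv and LogFile-<timestamp>.csv in the
# working directory.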