ScraperWiki data

scraperwiki.bas
Option Explicit
Public Sub ousefulMashup()
    ' thanks to tony hirst for the data and method.
    ' http://blog.ouseful.info/2013/05/05/questioning-election-data-to-see-if-it-has-a-story-to-tell/
    Dim ds As cDataSet, dr As cDataRow, a As Variant, _
        worksheetName As String, scraperName As String, _
        job As cJobject, joc As cJobject, inWard As String, _
        n As Long
    worksheetName = "questionElection"
    scraperName = "iw_poll_notices_scrape"
    
    ' get data from Tony's scraperwiki and populate sheet
    With scraperWikiStuff(scraperName, worksheetName)
        Set ds = New cDataSet
        ds.load worksheetName
    End With
    
    ' add extra columns
    With lastCell(ds.headingRow.where)
        .Offset(, 1).value = "postcode"
        .Offset(, 2).value = "in ward"
    End With
    ds.tearDown
    
    ' repopulate with new columns
    Set ds = New cDataSet
    With ds.load(worksheetName)
        ' extract post code
        For Each dr In ds.rows
            a = Split(dr.toString("address"), ",")
            If arrayLength(a) > 0 Then
                dr.cell("postcode").value = Trim(CStr(a(UBound(a))))
            End If
        Next dr
        .bigCommit
        .tearDown
    End With
    
    ' use mysociety api to get ward info
    ' these options will not bother trying to populate
    
    With restQuery(worksheetName, "my society", , "postcode", _
        , , , False, False)
        ' check for jobjects of type UTE
        n = 0
        For Each job In .jObjects
            n = n + 1
            inWard = "out"
            If Not job.childExists("areas") Is Nothing Then
                For Each joc In job.child("areas").children
                    If Not joc.childExists("type") Is Nothing Then
                        If joc.child("type").value = "UTE" Then
                        ' we have the right type, check name matches
                            If makeKey(joc.child("name").value) = _
                                makeKey(.dSet.value(n, "ward")) Then
                                inWard = "in"
                                Exit For
                            End If
                        End If
                    End If
                Next joc
                ' mark whether it's in or out
                .dSet.cell(n, "in ward").value = inWard
            End If
        Next job
        .dSet.bigCommit
        .tearDown
    End With

End Sub
Public Sub swSeewhatworks()
    Dim ds As New cDataSet, dr As cDataRow
    ds.populateData wholeSheet("scraperwiki"), , , , , , True
    Application.Calculation = xlCalculationManual
    For Each dr In ds.rows
        dr.where.Resize(, 1).Offset(, dr.columns.count).value = _
            swGetDefaultTableSql(dr.toString("short_name"), False)
    Next dr
    Application.Calculation = xlCalculationAutomatic
    Set ds = Nothing
End Sub
Public Sub testScraperWikiInput()
    testScraperWikiData InputBox("shortname?")
End Sub
Public Sub testScraperWikiData(shortName As String)
    scraperWikiStuff shortName, "scraperwikidata"
End Sub

Private Function swGetTables(shortName As String) As cRest

    Const tableDirectory = "SELECT name FROM sqlite_master " & _
        "WHERE type IN ('table','view') AND name NOT LIKE 'sqlite_%' " & _
        "Union all " & _
        "SELECT name FROM sqlite_temp_master " & _
        "WHERE type IN ('table','view') " & _
        "ORDER BY 1"
    ' let's see if we can get the tables that exist in this scraperwiki
    Set swGetTables = restQuery(, "scraperwikidata", _
       shortName & "&query=" & tableDirectory, , , , , False)
    
    
End Function
Private Function swGetDefaultTableSql(shortName As String, Optional complain As Boolean = True) As String
    ' this will look up to see what tables are defined in a given scraperwiki
    Dim s As String, cr As cRest
    Set cr = swGetTables(shortName)
    If cr Is Nothing Then
        MsgBox ("could get info on " & shortName)
    Else
      
      If cr.jObject.hasChildren Then
        ' this is hokey - for the moment just take from the first table found
        swGetDefaultTableSql = "select * from '" & _
                cr.jObject.children(1).child("name").toString & "'"
      Else
        If complain Then MsgBox ("could not find any valid tables for " & _
            shortName & "(" & cr.jObject.serialize & ")")
      End If
    End If
End Function
Private Function scraperWikiStuff(shortName As String, ws As String, _
            Optional optSql As String = vbNullString, Optional optLimit As Long = 0) As cDataSet
    Dim cr As cRest, ds As cDataSet, job As cJobject, r As Range, _
        cj As cJobject, headJob As cJobject, sql As String, limit As String

    sql = optSql
    If sql = vbNullString Then
        sql = swGetDefaultTableSql(shortName)
    End If
    If optLimit <> 0 Then
        limit = "&limit=" & CStr(optLimit)
    End If
   
    ' get the data
    Set cr = restQuery(, "scraperwikidata", _
       shortName & "&query=" & sql & limit, , , , , False)
    
    ' now organize it
    If Not cr Is Nothing Then
       ' get the unique headers and put them to a clean data set
       Set headJob = swGetHeaders(cr.jObject)
       If headJob Is Nothing Then
            MsgBox ("didnt work at all " & cr.jObject.serialize)
       
       Else
            Set ds = swCleanSheet(headJob, ws)
            If ds Is Nothing Then
                MsgBox ("failed to get the expected data " & cr.jObject.serialize)
            Else
                Application.Calculation = xlCalculationManual
                With ds
                    Set r = firstCell(.headingRow.where)
                    ' this is the data returned - each array member is a row
                    For Each cj In cr.jObject.children
                        Set r = r.Offset(1)
                        ' each child is a column
                        For Each job In cj.children
                         r.Offset(, .headingRow.exists(job.key).column - 1).value = job.value
                        Next job
                    Next cj
                     ' repopulate
                    Set scraperWikiStuff = .rePopulate
                End With
                Application.Calculation = xlCalculationAutomatic
            End If
        End If
    End If
End Function
Private Function swCleanSheet(job As cJobject, ws As String) As cDataSet
    ' put headers to a clean sheet
    Dim ds As New cDataSet, cj As cJobject, r As Range
    Set r = firstCell(wholeSheet(ws))
    r.Worksheet.Cells.ClearContents
    ' these are the headings
    
    If job.children.count > 0 Then
        For Each cj In job.children
            r.Offset(, cj.childIndex - 1).value = cj.key
        Next cj
        ' create a data set
        
        Set swCleanSheet = ds.populateData(r.Resize(1, job.children.count))
    End If
End Function
Private Function swGetHeaders(job As cJobject) As cJobject
    ' take scraper wiki data and generate an organized dataset using the headers found
    Dim cj As cJobject, jo As cJobject, cjKeys As New cJobject
    With cjKeys.init(Nothing)
        For Each cj In job.children
            For Each jo In cj.children
            ' we can use a cjobject as a collection
                .add jo.key
            Next jo
        Next cj
    End With
    Set swGetHeaders = cjKeys
End Function

Craigslist crawler

craigslist_crawler
require 'net/http'
require 'uri'
require 'open-uri'

class Page
  def initialize(url)
    @url = url
    @uri = URI.parse(url)
    @response = Net::HTTP.get_response(@uri)
    @body = @response.body
  end

  def title
    puts "\nTitle(s):"
    title = /<title>(.*)<\/title>/.match(@body.to_s)
    p title[1] if title
  end
end
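
A minimal usage sketch (the URL is just a placeholder, not taken from the original):

page = Page.new("http://www.example.com/")
page.title  # prints the contents of the page's <title> tag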

Web crawler

gistfile1.txt
## Task class
require 'mysql'

module OrgSocGraph
  FIELDS = {
    "orgs.csv" => [
      Class.new(Object) do
        def name
          :description
        end
     end.new
    ]
  }
 
  START_JOBS = [
    Class.new(BaseJob) do
      # parent job doesn't do anything
      def url
        "http://www.example.com"
      end
      
      def get_children(doc)
        # do MySQL query to get the URLs of the child jobs
        dbh = Mysql.real_connect("hostname", "dbuser", "password", "database")
        # limit to 10 for test run
        res = dbh.query('SELECT website_url FROM tbl_organizations WHERE website_url != "" LIMIT 10')
        child_jobs = []
        res.each_hash do |r|
          child_jobs << Class.new(BaseJobWithURL) do
            # url is provided by BaseJobWithURL's attr_accessor

            def execute(doc, data_store, fields)
              # pull the meta description out of the crawled page
              meta = doc.css("meta[name='description']").first
              data_store.add_item("orgs.csv", [
                self.url,
                meta && meta['content']
              ])
            end
          end.new(r["website_url"])
        end
        dbh.close
        child_jobs
      end
    end
  ]
end

## Job Class
class BaseJob
  def document
    doc = nil
    begin
      doc = Nokogiri::HTML(open(url))
    rescue
      puts "problem opening uri"
    end
    doc
  end
  
  def execute(doc, data_store, fields)
  end
  
  def get_children(doc)
    []
  end
end

class BaseJobWithURL < BaseJob
  attr_accessor :url
  def initialize(url)
    @url = url
  end
end

## main Ruby script (main.rb)
# for compatibility with 1.8.x require rubygems
require 'rubygems'
require 'open-uri'
# 1.8.x requires <= 1.5.0 of Nokogiri
require 'nokogiri'
require 'csv'
require 'mechanize'
Dir[File.dirname(__FILE__) + '/lib/*.rb'].each {|file| require file }
Dir[File.dirname(__FILE__) + '/tasks/*.rb'].each {|file| require file }

ARGV.each do |mod| 
  jobs = eval("#{mod}::START_JOBS")
  fields = eval("#{mod}::FIELDS")
  
  je = JobExecutor.new(fields)
  je.add_jobs(jobs)
  je.run
  
  fields.each_pair do |file, columns|
    CSV.open("output/#{file}", "wb") do |csv|
      csv << ['source_url'] + columns.map{|c| c.name.to_s}
      for record in je.data_store.get_items(file)
        csv << record.map{|r| HTMLCleaning::clean(r.to_s, :convert_to_plain_text => true)}
      end
    end
  end
end
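
The JobExecutor and the data store used by main.rb live under lib/ and are not included in the gist. A minimal sketch that is consistent with how they are called above might look like the following; the class names, the breadth-first traversal and every method body here are assumptions, not the original code.

## lib/job_executor.rb (assumed)
class DataStore
  def initialize
    # rows collected per output file, e.g. "orgs.csv" => [[url, description], ...]
    @items = Hash.new { |hash, key| hash[key] = [] }
  end

  def add_item(file, row)
    @items[file] << row
  end

  def get_items(file)
    @items[file]
  end
end

class JobExecutor
  attr_reader :data_store

  def initialize(fields)
    @fields = fields
    @jobs = []
    @data_store = DataStore.new
  end

  def add_jobs(jobs)
    @jobs.concat(jobs)
  end

  # process jobs breadth-first: fetch the document, let the job record its
  # data, then queue whatever child jobs it generates
  def run
    until @jobs.empty?
      job = @jobs.shift
      doc = job.document
      next if doc.nil?
      job.execute(doc, @data_store, @fields)
      add_jobs(job.get_children(doc))
    end
  end
end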

This script can be used for a deeper search for "broken links" in an application than the many free services provide.

Broken links checker for Rails apps
# Used gem 'anemone', git: 'https://github.com/efrat-safanov/anemone.git', branch: 'next'
# You can use it as a Rails service or just run it once from the Rails console
# This script can take a long time to run if your app has a large number of pages
require 'nokogiri'
require 'net/http'
require 'net/https'
require 'uri'
require 'anemone'
require 'csv'

class FindBrokenLinksService
  ROOT = 'http://your-web-site.com'.freeze
  EXTENTIONS = %w(.jpg .jpeg .png .doc .pdf .js .css .xml .csv .exe .zip .gzip .rar).freeze
  # LinkedIn always responds with HTTP 999 - it blocks all web scrapers
  BLOCKED_LINKS = %w(linkedin.com).freeze
  URL_REGEXP = /\A#{URI.regexp(%w(http https))}\z/

  def find_broken_links
    options = { discard_page_bodies: true, verbose: false, depth_limit: 10, links_limit: 100_000,
                pages_queue_limit: 200, read_timeout: 10, skip_query_strings: true }

    write_to_file(options)
    # It is better to check all links for uniqueness, because the scraper can visit the same page twice
    remove_duplications
  end

  private

  def write_to_file(options)
    CSV.open('BrokenLinks-intermediate.csv', 'w') do |file|
      CSV.open("LogFile-#{DateTime.current}.csv", 'w') do |log|
        file << ['Code', 'Source', 'Link text', 'Link']

        started_at = Time.current

        start_crawler(options, log, file)

        finished_at = Time.current
        time_diff = finished_at - started_at

        write_script_execution_time(file, time_diff, started_at, finished_at)
        write_script_execution_time(log, time_diff, started_at, finished_at)
      end
    end
  end

  def start_crawler(options, log, file)
    Anemone.crawl(ROOT, options) do |anemone|
      # To prevent memory leaks it is better to use the 'storage' option
      # when checking an app with a large number of links
      anemone.storage = Anemone::Storage.Redis
      # Put here patterns for pages which shouldn't be checked
      anemone.skip_links_like %r{/users/auth}
      anemone.skip_links_like %r{/user/}
      anemone.skip_links_like %r{/documents/}
      anemone.skip_links_like /(#{EXTENTIONS.map { |e| Regexp.escape(e) }.join('|')})$/

      anemone.on_every_page do |page|
        check_every_page(page, file, log)
      end

      anemone.after_crawl do |pages|
        log << ['Error! Found only 1 page. Is the server down?'] if pages.size == 1
      end
    end
  end

  def check_every_page(page, file, log)
    links = page.doc.css('a')

    links.each do |link|
      current_link = link.attribute('href').to_s.strip
      next if current_link.blank?
      next if current_link.start_with?('mailto', 'javascript', 'tel', '/', '#')
      next if BLOCKED_LINKS.any? { |word| current_link && current_link.include?(word) }
      next if EXTENTIONS.any? { |exten| current_link && current_link.include?(exten) }

      if current_link !~ URL_REGEXP
        file << ['Wrong Format', "#{ROOT}#{page.url.path}", link.text.to_s,
                 link.attribute('href').to_s]
      elsif broken_external_link?(current_link)
        file << [broken_external_link?(current_link).to_s, "#{ROOT}#{page.url.path}",
                 link.text.to_s, link.attribute('href').to_s]
      end
    end

  rescue StandardError
    log << ['Exception', "#{ROOT}#{page.url.path}"]
  end

  def broken_external_link?(href)
    response = Net::HTTP.get_response(URI.parse(href))
    response.code unless response.code.to_i >= 200 && response.code.to_i < 400
  rescue StandardError
    return 'Unavailable'
  end

  def write_script_execution_time(file_name, time_diff, started_at, finished_at)
    file_name << ['']
    file_name << ['Started at', started_at.to_s]
    file_name << ['Finished at', finished_at.to_s]
    file_name << ['Execution time:', (Time.at(time_diff.to_i.abs).utc.strftime '%H:%M:%S').to_s]
  end

  def remove_duplications
    File.open("BrokenLinks-#{DateTime.current}.csv", 'w') do |f|
      f.puts File.readlines('BrokenLinks-intermediate.csv').uniq
    end

    File.delete('BrokenLinks-intermediate.csv')
  end
end
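
As the comments at the top note, the service can be used as a Rails service object or run once from the Rails console. A minimal usage sketch, assuming the class sits somewhere autoloaded such as app/services/:

# From `rails console` - the CSV and log files land in the app's working directory
FindBrokenLinksService.new.find_broken_links

# Or as a one-off rake task (lib/tasks/broken_links.rake is an assumed path)
namespace :links do
  desc 'Crawl the site and write broken links to CSV'
  task check_broken: :environment do
    FindBrokenLinksService.new.find_broken_links
  end
end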

Scraper PHP browser

scraper-php-browser
<?php
/**
 * Todo: Send a random user agent string and sleep a random amount between requests.
 */
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
	// Extract and sanitize input:
	$domain = filter_input(INPUT_POST, 'domain', FILTER_SANITIZE_URL);
	$terms = filter_input(INPUT_POST, 'terms', FILTER_SANITIZE_STRING);
	
	// Setup Goutte (which also includes Guzzle):
	// Goutte: https://github.com/fabpot/Goutte
	// Guzzle: https://github.com/guzzle/guzzle
	require __DIR__ . '/goutte.phar';

	// Build up a search URL:
	$pages = 10;
	$url = 'http://www.google.ca/search?' . http_build_query(array('q' => $terms));

	// Request search results:
	$client = new Goutte\Client;
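
	// Sketch for the Todo at the top of this file (not part of the original):
	// send a random user-agent string so repeated scrapes look less uniform.
	// It assumes Symfony BrowserKit's setServerParameter() is exposed by this
	// goutte.phar build; a random sleep(rand(2, 5)) between the paging clicks
	// below would cover the second half of the Todo.
	$userAgents = array(
		'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
		'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
	);
	$client->setServerParameter('HTTP_USER_AGENT', $userAgents[array_rand($userAgents)]);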
	$crawler = $client->request('GET', $url);

	// See response content:
	// $response = $client->getResponse();
	// $response->getContent();

	// Start crawling the search results:
	$page = 1;
	$result = null;

	while (is_null($result) && $page <= $pages) {
		// If we are moving to another page then click the paging link:
		if ($page > 1) {
			$link = $crawler->selectLink($page)->link();
			$crawler = $client->click($link);
		}

		// Use a CSS filter to select only the result links:
		$links = $crawler->filter('li.g > h3 > a');

		// Search the links for the domain:
		foreach ($links as $index => $link) {	
			$href = $link->getAttribute('href');
			if (strstr($href, $domain)) {
				$result = ($index + 1) + (($page - 1) * 10);
				break 2;
			}
		}

		$page++;
	}
}

// A simple HTML escape function:
function escape($string = '') {
	return htmlspecialchars($string, ENT_COMPAT, 'UTF-8', false);
}
?>

<!DOCTYPE html>
<html>
<head>
	<title>Scrape Google with Goutte</title>
	<meta charset="utf-8" />
</head>
<body>	
	<h1>Scrape Google with Goutte: </h1>
	<form action="." method="post" accept-charset="UTF-8">
		<label>Domain: <input type="text" name="domain" value="<?php echo isset($domain) ? escape($domain) : ''; ?>" /></label>
		<label>Search Terms: <input type="text" name="terms" value="<?php echo isset($terms) ? escape($terms) : ''; ?>" /></label>
		<input type="submit" value="Scrape Google" />
	</form>

	<?php if (isset($domain, $terms, $url, $result, $page)) : ?>
		<h1>Scraping Results:</h1>
		<p>Searching Google for <b><?php echo escape($domain); ?></b> using the terms <i>"<?php echo escape($terms); ?>"</i>.</p>
		<p><a href="<?php echo escape($url); ?>" target="_blank">See Actual Search Results</a></p>
		<p>Result Number: <?php echo escape($result); ?></p>
		<p>Page Number: <?php echo escape($page); ?></p>
	<?php endif; ?>
</body>
</html>

Custom boot script to boot Ubuntu Core from USB on RPi 2/3

uEnv.txt
snap_core=core_584.snap
snap_kernel=pi2-kernel_24.snap
snappy_usb_boot=run loadfiles; setenv mmcroot "/dev/sda2 ${snappy_cmdline} snap_core=${snap_core} snap_kernel=${snap_kernel}"; run mmcargs; bootz ${loadaddr} ${initrd_addr}:${initrd_size} 0x02000000
loadinitrd=load usb ${mmcdev}:${mmcpart} ${initrd_addr} ${snap_kernel}/${initrd_file}; setenv initrd_size ${filesize}
loadkernel=load usb ${mmcdev}:${mmcpart} ${loadaddr} ${snap_kernel}/${kernel_file}
uenvcmd=echo USB-BOOT; run usb_boot; run snappy_usb_boot

gistfile1.txt
ffmpeg -i input.m4v -s 1024x512 -f mp4 -vcodec libx264 -preset ultrafast -an -movflags faststart output.m4v

Convert MP4 to HLS

gistfile1.txt
ffmpeg -i full.m4v -vcodec copy -map 0 -bsf h264_mp4toannexb -f segment -segment_format mpegts -segment_time 10 -segment_list index.m3u8 fragment_%03d.ts

Minimal Composer install of Drupal 8 modules

composer.txt
composer require "drupal/admin_toolbar" "drupal/block_class" "drupal/ctools" "drupal/devel" "drupal/embed" "drupal/entity_browser:^2.0" "drupal/entity_embed" "drupal/field_group" "drupal/inline_entity_form" "drupal/linkit" "drupal/media_entity_browser:^2.0" "drupal/menu_block" "drupal/paragraphs" "drupal/paragraphs_edit" "drupal/entity_usage" "drupal/pathauto" "drupal/redirect" "drupal/svg_image" "drupal/token" "drupal/video_embed_field:^2.0" "drupal/viewsreference" "drupal/weight" "drupal/entity_reference_revisions" "drupal/chosen" "drupal/menu_link_attributes"

// For development only
composer require --dev "drupal/twig_xdebug"

Minimal Composer install of Drupal 8 modules

gistfile1.txt
composer require "drupal/admin_toolbar" "drupal/block_class" "drupal/ctools" "drupal/devel" "drupal/embed" "drupal/entity_browser:^2.0" "drupal/entity_embed" "drupal/field_group" "drupal/inline_entity_form" "drupal/linkit" "drupal/media_entity_browser:^2.0" "drupal/menu_block" "drupal/paragraphs" "drupal/paragraphs_edit" "drupal/pathauto" "drupal/redirect" "drupal/svg_image" "drupal/token" "drupal/video_embed_field:^2.0" "drupal/viewsreference" "drupal/weight" "drupal/entity_reference_revisions" "drupal/chosen"