#!/usr/local/bin/perl -w
use strict;
#############################################################
# #
# #
# #
# NOAH SUSSMAN #
# #
# clean up word #
# #
# Created 5/16/01 at 02:33 PM #
# #
# Clean up Word documents that have been translated to HTML #
# #
# #
#############################################################
#
# Every file named on the command line is rewritten in place; the
# original is kept next to it with a ".bk" suffix.

# Document processed when no file name is given on the command line.
my $DEFAULT_DOCUMENT = "Macintosh HD:NOAH:2001:05-MAY 2001:3-May 15-21:3-Revisions to Corp Site:large number of Word docs:1.2 Services.html";
@ARGV = ($DEFAULT_DOCUMENT) unless @ARGV;

$^I = ".bk";    # edit in place, keeping a backup with this suffix
undef $/;       # slurp the whole file into $_

while (<>) {

    # Destroy all tags except A, B, IMG, CENTER, P, UL, OL, LI, TABLE, TD,
    # TR, HTML, BODY, HEAD and TITLE.  The names are matched as prefixes
    # (nothing anchors the end of the name), so tags such as <br> or <pre>
    # survive as well.
    s{<(?!/?(a|b|img|center|p|ul|ol|li|table|td|tr|html|body|head|title))\s*[^>]*>\s*}{}gi;

    # Fix mis-nested tags, if any: <x>..<y>..</x>..</y> becomes
    # <y>..<x>..</x>..</y> (the two opening tags are swapped).
    # Backreferences tie the closing tags to the opening ones; the
    # previous version interpolated the capture variables left over from
    # the substitution above, which does not produce a usable pattern.
    # The text between the tags may not contain another opening or
    # closing tag of either name, so properly nested markup is left
    # alone.  /s lets a pair span several lines of the slurped file.
    s{
        <(\w+)>
        ((?:(?!</?\1>).)*?)
        <((?!\1>)\w+)>
        ((?:(?!</?(?:\1|\3)>).)*?)
        </\1>
        ((?:(?!</?(?:\1|\3)>).)*?)
        </\3>
    }{<$3>$2<$1>$4</$1>$5</$3>}gisx;

    print $_;
}
#!/usr/bin/env perl
#
# grabcode.pl
# Download code between <pre> tags from remote HTML pages
# Takes a list of urls as argument
use strict; use warnings;
use WWW::Mechanize;
use HTML::TreeBuilder::XPath;
use Encode;
# URLs to fetch, one per command-line argument.
my @urls = @ARGV;

# autocheck is switched off so that a failed request comes back as an
# unsuccessful response (and the URL can be skipped) instead of aborting
# the whole run.
my $browser = WWW::Mechanize->new( autocheck => 0 );
$browser->agent_alias('Linux Mozilla');
#$browser->credentials('uname', 'passwd');

foreach my $url (@urls) {
    my $response = $browser->get($url);
    unless ( $response->is_success() ) {
        # The status line lives on the response object, and a method call
        # is not interpolated inside a string, hence the concatenation.
        warn "Skipping $url:\n" . $response->status_line . "\n";
        next;
    }
    my $page = $browser->content();

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse($page);
    $tree->eof;    # signal end of document so the tree is finalised

    # Print the text of every <pre> element, one per line, as UTF-8 bytes.
    my $nodes = $tree->findnodes('//pre');
    while ( my $node = $nodes->shift() ) {
        print encode( "utf8", $node->as_text() );
        print "\n";
    }

    $tree->delete;    # release the tree before fetching the next page
}
#!/usr/bin/env perl
use strict;
use warnings;
use Encoding "utf8";
use Text::BibTeX;
use WebService::ISBNDB::API::Books;
use Getopt::Long;
use Pod::Usage;
# Command-line switches: -? / --usage prints the synopsis, -h / --help the
# full manual.  An unknown switch prints the synopsis and exits.
my %options;
GetOptions(
    'usage|?' => \$options{usage},
    'h|help'  => \$options{help},
) or pod2usage(2);
pod2usage(1) if $options{usage};
pod2usage( -verbose => 2 ) if $options{help};

# The API key comes from the environment, falling back to the key
# embedded here.
my $api_key = $ENV{ISBNDB_KEY} || 'TMDKWJSX';

# Positional arguments: directory to scan, then output file
# (standard output when omitted).
my $dir     = shift || '.';
my $outfile = shift || '&STDOUT';

my $bib = Text::BibTeX::File->new( '>' . $outfile )
    or die "Cannot open $outfile for writing: $!\n";

opendir my $dh, $dir
    or die "Cannot open $dir: $!\n";

# Keep plain files named <ISBN-10>.pdf: nine digits followed by a digit
# or an "x" check character (case-insensitive).
my @files = grep { -f $_ && m{/\d{9}[\dx]\.pdf$}i }
            map  {"$dir/$_"}
            readdir $dh;
closedir $dh;

foreach my $pdf (@files) {

    # extract isbn from file name
    my $isbn = $pdf =~ /(\d{9}[\dx])\.pdf$/i ? $1 : '0000000000';

    # check database for isbn number, skip this file if nothing is found
    my $book = WebService::ISBNDB::API::Books->find(
        { api_key => $api_key, isbn => $isbn } );
    next unless $book;

    # set new bibtex entry
    my $entry = Text::BibTeX::Entry->new;
    $entry->set_metatype(BTE_REGULAR);
    $entry->set_type('book');
    $entry->set_key($isbn);

    # set title field
    $entry->set( 'title', $book->get_longtitle || $book->get_title );

    # set author or editor field
    my $authors = $book->get_authors_text;
    $authors = '' unless defined $authors;

    # some clean-up: drop a leading "by " and a trailing comma, then turn
    # comma/semicolon separators into BibTeX's " and "
    $authors =~ s/^by //;
    $authors =~ s/,$//;
    $authors =~ s/,\s+/ and /g;
    $authors =~ s/;\s+/ and /g;

    # authors or editors ?
    if ( $authors =~ /^\s*\[?edited by\s+\]?(.*)$/i ) {
        # only the whole word "with" is a separator, not part of a name
        ( my $editors = $1 ) =~ s/\bwith\b/and/;
        $entry->set( 'editor', $editors );
    }
    elsif ( $authors =~ /\(Editor\)/i ) {
        # names flagged "(Editor)": strip the marker and record them as
        # editors (previously the cleaned value was never stored)
        $authors =~ s/\s*\(Editor\)//gi;
        $entry->set( 'editor', $authors );
    }
    elsif ( length $authors ) {
        $entry->set( 'author', $authors );
    }

    # parse publisher and edition fields for publisher and year data
    my $publisher = $book->get_publisher_text;
    $publisher = '' unless defined $publisher;

    if ( $publisher =~ m/^(.*?),\s+c?(\d{4}).?$/ ) {
        # "Name, 2001" or "Name, c2001." -- split into name and year
        my ( $name, $year ) = ( $1, $2 );
        $entry->set( 'publisher', $name );
        $entry->set( 'year',      $year );
    }
    else {
        $entry->set( 'publisher', $publisher ) if length $publisher;

        # no year in the publisher text: take the first four-digit
        # number of the edition info instead
        my $edition = $book->get_edition_info;
        if ( defined $edition && $edition =~ m/(\d{4})/ ) {
            $entry->set( 'year', $1 );
        }
    }

    # miscellaneous fields
    my $notes = $book->get_notes;
    $entry->set( 'notes', $notes ) if $notes;
    my $abstract = $book->get_summary;
    $entry->set( 'abstract', $abstract ) if $abstract;
    $entry->set( 'local-url', $pdf );

    $entry->write($bib);

    # sleep 2;
}
__END__
=head1 NAME
isbn2bibtex.pl - Convert ISBN file names to BibTeX records
=head1 SYNOPSIS
isbn2bibtex.pl [-? | --help] | [directory] [outfile.bib]
=head1 DESCRIPTION
Scans a directory for PDF files whose names are ISBN-10 identifiers,
fetches the corresponding book's data from isbndb.com, parses data
fields to get rid of inconsistencies, and finally, outputs a bibtex
file with all fields set accordingly.
-? print usage
-h --help verbose help message
If no directory is given, scans the current directory. Outputs result
to STDOUT, unless a second argument is given.
An API key is required to access isbndb.com services. You can either
paste it in the source code or set the environment variable ISBNDB_KEY.
=head1 LICENSE
Free to use and modify, same terms as Perl itself.
=head1 AUTHOR
i-blis, I<i-blis yandex ru>.
=cut
use String::Approx 'amatch';
use Test::More(no_plan);
# fuzm($candidate)
#
# Approximately matches $candidate against "homer_simpson" using the
# options listed below and returns whatever amatch() reports for it.
# amatch() is called without an input list, so it matches against $_;
# $_ is localised here so the caller's $_ is not overwritten.
sub fuzm {
    local $_ = shift;
    return amatch("homer_simpson", [ # this array sets match options:
        "i",    # match case-insensitively
        "10%",  # tolerate up to 1 character in 10 being wrong
        "S0",   # but no substituting one character for another
        "D1",   # although, tolerate up to one deletion
        "I2"    # and tolerate up to two insertions
    ]);
}
# Each case: [ candidate string, whether a match is expected, description ].
# The cases run in the order listed.
my @cases = (
    [ "homer_simpson",    1, "exact match for 'homer_simpson'" ],
    [ "homersimpson",     1, "still matches without the underscore" ],
    [ "homers_impson",    1, "putting the underscore in a different place, still matches" ],
    [ "ho_mer_simpson",   1, "an extra underscore still matches" ],
    [ "ho_mer_simp_son",  1, "2 extra underscores still matches" ],
    [ "ho_mersimp_son",   0, "2 underscores, both in the wrong places, doesn't match" ],
    [ "ho_mer_sim_ps_on", 0, "3 extra underscores doesn't match" ],
    [ "homer____simpson", 0, "3 extra underscores doesn't match" ],
);

for my $case (@cases) {
    my ( $candidate, $should_match, $description ) = @$case;
    if ($should_match) {
        ok( fuzm($candidate), $description );
    }
    else {
        ok( ( not fuzm($candidate) ), $description );
    }
}