Perl 母亲3翻译博客RSS整理

#!/usr/bin/perl -w

use strict;
use LWP::Simple;
use HTML::TreeBuilder;
use LWP::Parallel::UserAgent;
use XML::TreeBuilder;
use Encode;

my $ua = LWP::Parallel::UserAgent->new();
print STDERR "Grabbing feed XML...\n";
my $xml = get("http://feeds.feedburner.com/Mother3FanTranslation?format=xml");
my $atomfeed = XML::TreeBuilder->new();
$atomfeed->parse($xml);
my %entries;

print STDERR "Scraping links from XML...\n";
foreach my $item ($atomfeed->look_down("_tag", "item")) {
	my $link = $item->look_down("_tag", "link")->as_text();
	$entries{$link} = $item;
}

foreach my $item ($atomfeed->look_down("_tag", "item")) {
	my $url = $item->look_down("_tag", "link")->as_text();
	my $request = HTTP::Request->new();
	
	$request->uri($url);
	$request->method("GET");

	print STDERR "Registering $url...\n";
	$ua->register($request);	
}

print STDERR "Downloading HTML...";
my $html = $ua->wait();
print STDERR "done!\n";

foreach my $entry (values(%$html)) {
	my $response = $entry->response();
	my $url = $response->base()->as_string();

	print STDERR "Processing $url...\n";
	my $item = $entries{$url};

	print STDERR "\tDeleting description content...\n";
	my $description = $item->look_down("_tag", "content:encoded");
	$description->delete_content();

	print STDERR "\tGrabbing relavent HTML via regular expression...\n";
	my $blogentry = $response->content();
	(undef, my $blogtree) = ($blogentry =~ m/<p class="meta">(.*?)<\/p>(.*?)<p class="meta">Posted/s);

	my $html = HTML::TreeBuilder->new_from_content(
		decode_utf8($response->content())
	);

	my $comments = "<br /><br />Mato Comments:<br />";

	print STDERR "\tFinding Mato comments...\n";
	foreach ($html->look_down("_tag", "div", "style", qr/DCB6B6/)) {
			print STDERR "\t\tFound a comment!\n";
			foreach ($_->look_down("_tag", "p")) {
				$comments .= "<blockquote>".$_->as_HTML()."</blockquote>";
			}
	}

	$blogtree = HTML::TreeBuilder->new_from_content(
		decode_utf8($blogtree.$comments)
	);

	print STDERR "\tCleaning up HTML for Liferea...\n";
	foreach my $hrdiv ($blogtree->look_down("_tag", "div", "class", "hr")) {
		$hrdiv->delete();
	}



	print STDERR "\tPushing content to $url entry...\n";
	$description->push_content($blogtree->as_HTML());
}

print $atomfeed->as_XML();

Perl 清理已翻译为HTML的Word文档

#/usr/local/bin/perl    -w
use strict;

#############################################################
#                                                           #
#                                                           #
#                                                           #
#                      NOAH SUSSMAN                         #
#                                                           #
#                     clean up word                         #
#                                                           #
#                Created 5/16/01 at 02:33 PM                #
#                                                           #
# Clean up Word documents that have been translated to HTML #    
#                                                           #
#                                                           #
#############################################################

@ARGV[0] = "Macintosh HD:NOAH:2001:05-MAY 2001:3-May 15-21:3-Revisions to Corp Site:large number of Word docs:1.2 Services.html" ;

$^I=".bk";

undef $/ ;           # slurp the whole file into $_

while (<>) {

	s{<(?!/?(a|b|img|center|p|ul|ol|li|table|td|tr|html|body|head|title))\s*[^>]*>\s*}{}gi;    # Destroy all tags except A, B, IMG, CENTER, P, UL, OL, LI, TABLE, TD, TR, HTML, BODY, HEAD and TITLE

	s{<(\w+)>(.*?)<([^$1])>(.*?)<(/$1)>(.*?)<(/$2)>}{<$3>$2<$1>$4<$5>$6<$7>}gi;    #  Fix mis-nested tags, if any.

	print $_ ;
	
}

Perl 在目录中列出文件

List text files in the current directory

@files = grep { -f and -T } glob '* .*';
print "@files\n";

Perl 从远程HTML页面批量下载代码之间的代码

#!/usr/bin/env perl
#
# grabcode.pl
# Download code between <pre> tags from remote HTML pages
# Takes a list of urls as argument

use strict; use warnings;

use WWW::Mechanize;
use HTML::TreeBuilder::XPath;
use Encode;

my @urls = @ARGV;

my $browser = WWW::Mechanize->new;
$browser->agent_alias('Linux Mozilla');
#$browser->credentials('uname', 'passwd');

foreach my $url (@urls) {
	
	my $page; 
	if ( $browser->get($url)->is_success() ) {
		$page = $browser->content();
	}
	else {
		warn "Skipping $url:\n$browser->status_line\n";
		next;
	} 

	my $tree= HTML::TreeBuilder::XPath->new;
	$tree->parse( $page );
		
	my $nodes = $tree->findnodes( '//pre');
	while ( my $node = $nodes->shift() ) {
		print encode("utf8",$node->as_text());
		print "\n";
	}
}

Perl 将ISBN文件名转换为BibTeX记录

#!/usr/bin/env perl

use strict; 
use warnings;

use Encoding "utf8";
use Text::BibTeX;
use WebService::ISBNDB::API::Books;
use Getopt::Long;
use Pod::Usage;

my %options;
GetOptions('usage|?'  => \$options{usage},
           'h|help' 	=> \$options{help}
          );
pod2usage(1) if $options{usage};
pod2usage(-verbose => 2) if $options{help};

my $api_key = $ENV{ISBNDB_KEY} || 'TMDKWJSX';

my $dir = shift || '.';
my $file = shift || '&STDOUT';
my $bib = Text::BibTeX::File->new('>'.$file); 


opendir my $dh, $dir
	or die "Cannot open $dir: $!\n";
my @files =  grep { -f && m{/\d{9}[x|\d]\.pdf$}i } 
		map {"$dir/$_"} 
			readdir $dh;

foreach my $file (@files) {

	# extract isbn from file name
	my $isbn = $file =~ /(\d{9}[x|\d])\.pdf$/i ? $1 : '0000000000' ; 
	# check database for isbn number, loop if failed
	my $book = WebService::ISBNDB::API::Books->find( { api_key => $api_key, isbn => $isbn } );
  next unless $book;

	# set new bibtex entry
	my $entry = new Text::BibTeX::Entry;
	$entry->set_metatype(BTE_REGULAR);
	$entry->set_type('book');
	$entry->set_key($isbn);

	# set title field
	$entry->set( 'title', $book->get_longtitle || $book->get_title );

	# set author or editor field
	my $authors = $book->get_authors_text;
	# some clean-up
	$authors =~ s/^by //;
	$authors =~ s/,$//;
	$authors =~ s/,\s+/ and /g;
	$authors =~ s/;\s+/ and /g;
	# authors or editors ?	
	if ( $authors =~ /^\s*\[?edited by\s+\]?(.*)$/i ) {
		(my $editors = $1) =~ s/with/and/;
		$entry->set('editor', $editors);
	}
	elsif ( $authors =~ /\(Editor\)/i ) {
		$authors =~ s/\s*\(Editor\)//gi;
	}
	else {
		$entry->set('author', $authors);
	}

	# parse publisher and edition fields for publisher and year data
	if ( $book->get_publisher_text =~ m/^(.*?),\s+c?(\d{4}).?$/ ) {
		$entry->set( 'publisher', $1 ) ;
		$entry->set( 'year', $2 );
		
	}
	else {
		$entry->set( 'publisher', $book->get_publisher_text ) ;
		if ( $book->get_edition_info =~ m/(\d{4})/ ) {
			$entry->set( 'year', $1 );
		}		
	}
	
	# miscellaneous fields
	my $notes = $book->get_notes;
	$entry->set( 'notes', $notes ) if $notes ;
	my $abstract = $book->get_summary;
	$entry->set( 'abstract', $abstract ) if $abstract ;
	
	$entry->set( 'local-url', $file);
	
	$entry->write($bib);
	
	# sleep 2;
}

__END__


=head1 NAME

isbn2bibtex.pl - Convert ISBN file names to BibTeX records 

=head1 SYNOPSIS

isbn2bibtex.pl [-? | --help] | [directory] [outfile.bib]

=head1 DESCRIPTION

Scans a directory for PDF files whose name are ISBN-10 identifiers,
fetches the corresponding book's data from isbndb.com, parses data
fields to get rid of inconsistencies, and finally, outputs a bibtex 
file with all fields set accordingly.

	-?             print usage
	-h --help      verbose help message
	
If no directory is given, scans the current directory. Outputs result
to STDOUT, unless a second argument is given.

An API key is required to access isbndb.com services. You can either 
paste it in the source code or set the environment variable ISBNDB_KEY.

=head1 LICENSE

Free to use and modifiy, same terms as Perl itself.

=head1 AUTHOR

i-blis, I<i-blis yandex ru>. 

=cut

Perl 生成随机有效的挪威SSN

#
# random_norwegian_ssn()
#
# Accepts one parameter, a date string in the format DDMMYY.
#
sub random_norwegian_ssn {
    my $date = shift;

    # Extract date, month, year
    my $d1 = substr($date,0,1);
    my $d2 = substr($date,1,1);
    my $m1 = substr($date,2,1);
    my $m2 = substr($date,3,1);
    my $y1 = substr($date,4,1);
    my $y2 = substr($date,5,1);

    my ($i1, $i2, $i3);
    my ($c1, $c2);

    do {
        # SSNs for dates between 1900-1999 use an entity number between 0-499
        my $random_num = int(rand(499));

        # Pad the entity number to 3 numbers
        my $padded_num = sprintf("%03d", $random_num);
        ($i1, $i2, $i3) = split(//, $padded_num);

        # Calculate the two control numbers
        my $v1 = (3*$d1) + (7*$d2) + (6*$m1) + $m2 + (8*$y1) + (9*$y2) + (4*$i1) + (5*$i2) + (2*$i3);
        $c1 = ($v1 % 11) == 0 ? 0 : 11-($v1 % 11);

        my $v2 = (5*$d1) + (4*$d2) + (3*$m1) + (2*$m2) + (7*$y1) + (6*$y2) + (5*$i1) +(4*$i2) + (3*$i3) + (2*$c1);
        $c2 = ($v2 % 11) == 0 ? 0 : 11-($v2 % 11);

    } until ($c1 < 10 && $c2 < 10);

    return "$d1$d2$m1$m2$y1$y2$i1$i2$i3$c1$c2";
}

Perl (Perl)使用WMI获取系统信息

#!/usr/bin/perl -w

# use bin\perl.exe wmi.pl to run.
# have fun!
# 2009/6/17 twitter.com/vinocui
# 
# useful links:
# (WMI space definition) http://msdn.microsoft.com/en-us/library/aa394084(VS.85).aspx
# (OLE usage on CPAN)    http://cpan.uwinnipeg.ca/htdocs/Win32-OLE/Win32/OLE.html#Object_methods_and_properties
#

use Win32::OLE;

#my $wmi = Win32::OLE->GetObject("winmgmts://./root/cimv2") or die "failed to retrieve cimv2.";
# winmgmts means to access WMI service.

my $wmi = Win32::OLE->GetObject("WinMgmts://./root/cimv2") or die "Failed: GetObject\n";
my $list, my $v;


$list = $wmi->InstancesOf("Win32_Processor") or die "Failed: InstancesOf\n";

foreach $v (Win32::OLE::in $list){
    print "CPU:\n";
    print "\t", $v->{Name}, "\n";
    print "\t", $v->{Caption}, "\n";
}
                           
$list = $wmi->InstancesOf("Win32_OperatingSystem") or die "Failed: InstancesOf\n";

foreach $v (Win32::OLE::in $list){
    print "OS:\n";
    print "\t", $v->{Name}, "\n";
}

0;

Perl 与Perl模糊字符串匹配

use String::Approx 'amatch';
use Test::More(no_plan);

sub fuzm {

  $_ = shift;

  return amatch("homer_simpson", [        # this array sets match options:
                                  "i",    # match case-insensitively
                                  "10%",  # tolerate up to 1 character in 10 being wrong
                                  "S0",   # but no substituting one character for another
                                  "D1",   # although, tolerate up to one deletion
                                  "I2"    # and tolerate up to two insertions
                                 ]);
  
}


ok(fuzm("homer_simpson"),         "exact match for 'homer_simpson'");
ok(fuzm("homersimpson"),          "still matches without the underscore");
ok(fuzm("homers_impson"),         "putting the underscore in a different place, still matches");
ok(fuzm("ho_mer_simpson"),        "an extra underscore still matches");
ok(fuzm("ho_mer_simp_son"),       "2 extra underscores still matches");
ok((not fuzm "ho_mersimp_son"),   "2 underscores, both in the wrong places, doesn't match");
ok((not fuzm "ho_mer_sim_ps_on"), "3 extra underscores doesn't match");
ok((not fuzm "homer____simpson"), "3 extra underscores doesn't match");

Perl Perl关键字搜索(一个班轮)

#!/bin/sh

tail -n5000 somefile | perl -ne "print $_ if /$1/i && /$2/i && /$3/i && /$4/i && /$5/i;"

Perl 将(裁剪)双页PDF分成两部分

#!/usr/bin/env perl
use strict; use warnings;
use PDF::API2;

my $filename = shift || 'test.pdf';
my $oldpdf = PDF::API2->open($filename);
my $newpdf = PDF::API2->new;

for my $page_nb (1..$oldpdf->pages) {
  my ($page, @cropdata);
  
  $page = $newpdf->importpage($oldpdf, $page_nb);
  @cropdata = $page->get_mediabox;
  $cropdata[2] /= 2;
  $page->cropbox(@cropdata);
  $page->trimbox(@cropdata);
  $page->mediabox(@cropdata);

  $page = $newpdf->importpage($oldpdf, $page_nb);
  @cropdata = $page->get_mediabox;
  $cropdata[0] = $cropdata[2] / 2;
  $page->cropbox(@cropdata);
  $page->trimbox(@cropdata);
  $page->mediabox(@cropdata);
}

(my $newfilename = $filename) =~ s/(.*)\.(\w+)$/$1.clean.$2/;
$newpdf->saveas('$newfilename');

__END__