ruby 领域特定语言
domain_specific_lanaguage.rb
#!/usr/bin/env ruby
#
# Created by Reginald Braithwaite on 2007-03-11.
# Copyright (c) 2007. All rights reserved.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar
# 22 rue de Plaisance, 75014 Paris, France
# Everyone is permitted to copy and distribute verbatim or modified
# copies of this license document, and changing it is allowed as long
# as the name is changed.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
# 0. You just DO WHAT THE FUCK YOU WANT TO.
# A Domain Specific Language is used to introduce a new scope with an embedded set of methods.
#
# The idea is to avoid polluting the global namespace. Instead of adding methods to Kernel, we
# add methods to a new DomainSpecificLanguage, and then we can evaluate code with the new language
# using the #eval class method or using the #with method added to Kernel.
#
# For a similar approach, see http://www.infoq.com/articles/eval-options-in-ruby
# Base class for block-scoped DSLs. An instance is created for every block
# evaluated with the language; the block runs with +self+ set to that instance,
# and anything the instance does not understand is looked up in the scope the
# block was written in.
class DomainSpecificLanguage
  # See http://whytheluckystiff.net/articles/seeingMetaclassesClearly.html
  # Returns this object's singleton class.
  def metaclass
    class << self; self; end
  end

  # Evaluates +blk+ with +self+ set to this object's singleton class.
  def meta_eval(&blk)
    metaclass.instance_eval(&blk)
  end

  # Adds methods to a metaclass
  def meta_def(name, &blk)
    meta_eval { define_method(name, &blk) }
  end

  # See http://onestepback.org/index.cgi/Tech/Ruby/RubyBindings.rdoc
  # Wraps a lambda, built inside the given binding, that re-evaluates
  # +var_name+ there every time #value is called.
  class ReadOnlyReference
    def initialize(var_name, vars)
      @getter = eval("lambda { #{var_name} }", vars)
    end

    def value
      @getter.call
    end
  end

  attr_reader :outer_binding, :outer_self

  # instances of a DomainSpecificLanguage are created each time
  # a block is evaluated with that language. The instance is
  # initialized with the block's binding.
  def initialize(given_binding)
    @outer_binding = given_binding
    @outer_self = ReadOnlyReference.new(:self, given_binding)
  end

  # some jiggery-pokery to access methods defined in the block's
  # scope, because when the block is evaluated with the DomainSpecificLanguage,
  # we use #instance_eval to set <tt>self</tt> to the DomainSpecificLanguage
  # instance.
  #
  # A bare name (no arguments, no block) is evaluated in the outer binding and
  # cached as a singleton method. Any call that carries arguments or a block is
  # sent to the outer +self+ so that neither is lost. The cached method applies
  # the same rule, so a name first used bare can later be called with
  # arguments or a block.
  def method_missing(symbol, *args, &block)
    unless args.empty? && block.nil?
      return outer_self.value.send(symbol, *args, &block)
    end

    reference = ReadOnlyReference.new(symbol, outer_binding)
    receiver = outer_self
    meta_def(symbol) do |*later_args, &later_block|
      if later_args.empty? && later_block.nil?
        reference.value
      else
        receiver.value.send(symbol, *later_args, &later_block)
      end
    end
    reference.value
  end

  class << self
    # Evaluates a block in the context of a new DomainSpecificLanguage
    # instance.
    def eval(&block)
      new(block.binding).instance_eval(&block)
    end
  end
end
# We open Kernel and add just one method for introducing DomainSpecificLanguages
module Kernel
  # Runs +block+ under the given DSL class by handing it to that class's
  # +eval+ class method, and returns whatever that call returns.
  def with(dsl_class, &block)
    dsl_class.eval(&block)
  end
end
# Let is a DomainSpecificLanguage that actually creates DomainSpecificLanguages.
#
# Let works a lot like <tt>let</tt> in Scheme. You provide a hash of names and value
# expressions. The value expressions are evaluated in the outer scope, and then we have
# a little domain specific language where the inner scope has the names all bound to the
# values. E.g.
# <tt>
# with Let do
# let :x => 100, :y => 50 do
# print "#{x + y} should equal one hundred and fifty"
# end
# end
# </tt>
#
# Within the Let DomainSpecificLanguage, you can declare multiple <tt>let</tt> statements and nest
# them as you please.
#
# One important limitation: you cannot bind a value to a name that is already a local variable.
class Let < DomainSpecificLanguage
  class Scope < DomainSpecificLanguage
    # initializes a Scope. In addition to the outer binding, we also pass in the
    # hash of names and values. Each name becomes an accessor on this instance's
    # singleton class, preset to its value.
    #
    # Raises ArgumentError if a name is already a local variable in the outer
    # binding. +local_variables+ yields Symbols on Ruby 1.9+ (Strings on 1.8),
    # so names are normalised to Strings before comparing.
    def initialize(given_binding, let_clauses = {})
      outer_locals = eval("local_variables", given_binding).map { |local| local.to_s }
      let_clauses.each do |symbol, value|
        var_name = symbol.to_s
        raise ArgumentError, "Cannot override local #{var_name}" if outer_locals.include?(var_name)
        meta_eval { attr_accessor(var_name) }
        send("#{var_name}=", value)
      end
      super(given_binding)
    end
  end

  # Define a new Scope: you're really defining a new DomainSpecificLanguage
  def let(let_clauses = {}, &block)
    Scope.new(block.binding, let_clauses).instance_eval(&block)
  end

  class << self
    # If you just want a one-off
    # def eval let_clauses = {}, &block
    # Scope.new(block.binding, let_clauses).instance_eval(&block)
    # end
  end
end
# A DomainSpecificDelegator is a DSL that delegates methods to a class or object.
# The main use is to separate the mechanics of scoping from the methods of a utility
# class.
class DomainSpecificDelegator < DomainSpecificLanguage
  class << self
    # insert one or more #delegate_to calls in the class definition, giving a receiver
    # and a hash. Each hash pair is of the form <tt>verb => method</tt> where verb is the
    # name you will use in the DSL and method is the method in the receiver that will handle
    # it.
    def delegate_to(receiver, method_hash)
      method_hash.each do |verb, method_name|
        delegations[verb.to_s] = [receiver, method_name.to_s]
      end
    end

    # Delegations declared directly on this class. Stored per class (not in a
    # class variable) so unrelated delegator subclasses cannot overwrite each
    # other's verbs.
    def delegations
      @delegations ||= {}
    end

    # Returns the <tt>[receiver, method_name]</tt> pair registered for +verb+ on
    # this class or the nearest ancestor that declares it, or nil when none does.
    def delegation_for(verb)
      found = delegations[verb.to_s]
      return found if found
      superclass.delegation_for(verb) if superclass.respond_to?(:delegation_for)
    end
  end

  # Forwards a registered verb to its receiver; everything else falls through
  # to DomainSpecificLanguage#method_missing.
  def method_missing(symbol, *args, &block)
    receiver, method_name = self.class.delegation_for(symbol)
    if receiver
      receiver.send(method_name, *args, &block)
    else
      super(symbol, *args, &block)
    end
  end
end
pattern_match.rb
require 'ruby-debug'
require 'dsl' # get it at http://raganwald.com/source/dsl_and_let.html
# Mixin providing a small pattern-matching DSL: <tt>match(value) { with(pattern) { ... } }</tt>.
module Matchable
  # Raised by PatternMatch#value when nothing matched and no otherwise clause exists.
  class NoMatchFoundError < StandardError; end

  # Wraps +obj+ so it is compared by equality rather than treated as a pattern.
  def lit(obj)
    Literal.new(obj)
  end

  # Evaluates +block+ (a series of +with+ / +otherwise+ clauses) against +args+
  # and returns the value of the first clause whose pattern matched.
  def match(args, &block)
    matcher = PatternMatch.new(args, block.binding)
    matcher.instance_eval(&block)
    matcher.value
  end

  # The object the +match+ block is evaluated against.
  class PatternMatch < Let::Scope
    def initialize(args, binding)
      @args = args
      super(binding)
    end

    # Tries +pattern+ against the subject unless an earlier clause already
    # matched. On success the clause body runs, with any captured names bound
    # through Let when the pattern produced bindings.
    def with(pattern, &block)
      return if @matched

      captures = {}
      return unless pattern.patmatch(@args, captures)

      @matched = true
      @value =
        if captures.length > 0
          Kernel.with(Let) { let(captures, &block) }
        else
          block.call
        end
    end

    # Registers the fallback used when no +with+ clause matches.
    def otherwise(&by_default)
      @by_default = by_default
    end

    # Result of the matching clause, else the fallback's result; raises
    # NoMatchFoundError when there is neither.
    def value
      return @value if @matched
      return @by_default.call if @by_default

      raise NoMatchFoundError, "The arguments did not match any of the supplied patterns and no otherwise clause was provided"
    end
  end
end
class Object
  # Default pattern behaviour: an arbitrary object matches only a value it is
  # <tt>==</tt> to. The mapping is not touched.
  def patmatch(candidate, _mapping)
    self == candidate
  end
end
class Class
  # A class used as a pattern matches any value that is an instance of it
  # (or of a subclass). The mapping is not touched.
  def patmatch(candidate, _mapping)
    candidate.kind_of?(self)
  end
end
class Symbol
  # A symbol pattern matches anything. Unless it is the wildcard <tt>:_</tt>
  # (or no mapping was supplied) the matched value is recorded under the symbol.
  def patmatch(arg, mapping)
    mapping[self] = arg unless empty? || mapping.nil?
    true
  end

  # True only for the wildcard symbol <tt>:_</tt>.
  # Note: this replaces the built-in Symbol#empty? for every symbol.
  def empty?
    to_s == "_"
  end

  # <tt>:head % :tail</tt> builds a Destructurer; both operands must be symbols.
  def %(other)
    raise ArgumentError unless Symbol === other
    Destructurer.new(self, other)
  end

  # <tt>:name & pattern</tt> builds a Namer binding the matched value to this symbol.
  def &(other)
    Namer.new(self, other)
  end
end
# Pattern combinator: matches whatever the wrapped pattern matches and, on
# success, also records the matched value under the given symbol.
class Namer
  def initialize(sym, obj)
    @sym = sym
    @obj = obj
  end

  # Returns true and stores +args+ in +mapping+ under the name when the inner
  # pattern matches; returns false otherwise.
  def patmatch(args, mapping)
    return false unless @obj.patmatch(args, mapping)

    mapping[@sym] = args
    true
  end
end
# Head/tail pattern built with <tt>:a % :b % :rest</tt>: every name but the
# last is bound to one leading element of the array, and the last name is
# bound to the remaining elements.
class Destructurer
  def initialize(*names)
    @names = names
  end

  # Matches any non-empty Array and fills +mapping+ with the bindings.
  # Works on a copy so the array being matched is never modified; the original
  # shifted elements off the caller's array.
  def patmatch(args, mapping)
    return false unless args.is_a?(Array)
    return false if args.empty?

    remaining = args.dup
    @names[0...-1].each { |name| mapping[name] = remaining.shift }
    mapping[@names.last] = remaining
    true
  end

  # Appends another name to the pattern; returns self so calls can be chained.
  def %(symbol)
    raise ArgumentError unless symbol.is_a?(Symbol)
    @names << symbol
    self
  end
end
module Enumerable
  # A collection used as a pattern matches another collection of the same
  # length whose elements each match the pattern element at the same position.
  #
  # The type of +args+ is checked before its length is read, so matching
  # against a non-collection (e.g. an Integer) returns false instead of
  # raising NoMethodError.
  def patmatch(args, mapping)
    return false unless args.is_a?(Enumerable) && args.respond_to?(:length)
    return false unless length == args.length

    zip(args).all? { |pattern, candidate| pattern.patmatch(candidate, mapping) }
  end
end
class String
  # Strings get their own rule instead of the Enumerable one (String was
  # Enumerable when this was written and its zip did not behave as needed):
  # a string pattern matches only a value it is <tt>==</tt> to.
  def patmatch(args, _mapping)
    self == args
  end
end
# Pattern wrapper that matches by plain equality with the wrapped object.
class Literal
  def initialize(obj)
    @expected = obj
  end

  # True when the wrapped object is <tt>==</tt> to +args+; the mapping is unused.
  def patmatch(args, _mapping)
    @expected == args
  end
end
ruby 需要目录中的文件
.rb
class Dir
  # Requires every file in +directory+ whose name ends in ".rb".
  #
  # Paths are built with File.join, so +directory+ works with or without a
  # trailing slash. The extension test is anchored, so names such as
  # "foo.rb.bak" are skipped. Entries are sorted so the load order does not
  # depend on the filesystem.
  #
  # Returns the sorted list of directory entries.
  def self.require_all(directory)
    entries(directory).sort.each do |file|
      require File.join(directory, file) if file =~ /\.rb\z/
    end
  end
end
ruby 空白的石板
在Ruby中创建一个没有任何方法的对象(常规对象从Object类继承方法),对于使用缺少方法的代理很有用(http://onestepback.org/index.cgi/Tech/Ruby/BlankSlate.rdoc) 。 <br/> <br/>
blank_slate.rb
# A class whose public instance methods have all been removed, except those
# whose names start with a double underscore (such as __send__ and __id__),
# so that calls on its instances reach method_missing.
class BlankSlate
  instance_methods.each { |m| undef_method m unless m =~ /^__/ }
end
# Logging proxy built on BlankSlate: every call that BlankSlate no longer
# defines arrives in method_missing, is announced on standard output and is
# then forwarded to the wrapped object.
class Proxy < BlankSlate
  def initialize(obj)
    @obj = obj
  end

  # Prints the call being forwarded, then invokes it on the wrapped object via
  # __send__ and returns that result.
  def method_missing(sym, *args, &block)
    argument_list = args.join(',')
    puts "Sending #{sym}(#{argument_list}) to obj"
    @obj.__send__(sym, *args, &block)
  end
end
ruby 之前和之后
<br/> /发布于:Ruby <br/>允许您在类中创建前后方法<br/> <br/>示例<br/>需要“before_and_after”<br/> <br/> class消息<br/>包括BeforeAndAfter <br/> <br/> def初始化消息<br/> @message = message <br/> end <br/> <br/> def display <br/> puts @message <br/> end <br/> <br/> def before_display <br/> put“BEFORE DISPLAY”<br/> end <br/> <br/> def after_display <br/> put“AFTER DISPLAY”<br/>结束<br/> <br/> use_method:显示<br/>结束<br/> <br/> Message.new(“== MESSAGE ==”)。display <br/> <br/> <br/>显示之前<br/> == MESSAGE == <br/>显示之后
before_and_after.rb
# Mixin that lets a class wrap its methods with <tt>before_NAME</tt> and
# <tt>after_NAME</tt> hooks by declaring <tt>use_method :NAME</tt>.
module BeforeAndAfter
  # This extends the class that includes BeforeAndAfter with the methods in ClassMethods
  def self.included(base)
    base.extend(ClassMethods)
  end

  module ClassMethods
    # Wraps each named instance method so that calling it runs the before
    # hook, the original method and the after hook, in that order.
    #
    # A hook that is not defined is created as a no-op first, so either hook
    # may be omitted. Arguments and a block are forwarded to the original
    # method and its return value is returned to the caller. The hooks are
    # called without arguments.
    #
    # Must be called after the wrapped method (and any hooks) are defined.
    def use_method(*methods)
      methods.each do |method|
        before_method = "before_#{method}".to_sym
        after_method = "after_#{method}".to_sym

        # Create missing hooks before looking them up; instance_method raises
        # NameError for a method that does not exist.
        [before_method, after_method].each do |hook|
          define_method(hook) {} unless method_defined?(hook) || private_method_defined?(hook)
        end

        # Capture the implementations as they exist now, so that redefining
        # +method+ below does not make the wrapper call itself.
        unbound_before = instance_method(before_method)
        unbound_original = instance_method(method)
        unbound_after = instance_method(after_method)

        define_method(method) do |*args, &block|
          unbound_before.bind(self).call
          result = unbound_original.bind(self).call(*args, &block)
          unbound_after.bind(self).call
          result
        end
      end
    end
  end
end
ruby 递归符号化键
.rb
# Symbolizes the keys of +hash+ in place and does the same for every hash
# nested inside it, including hashes held inside (possibly nested) arrays.
# Relies on Hash#symbolize_keys! (from ActiveSupport) being available.
#
# Returns +hash+ itself.
def recursive_symbolize_keys!(hash)
  hash.symbolize_keys!
  pending = hash.values
  until pending.empty?
    value = pending.shift
    case value
    when Hash then recursive_symbolize_keys!(value)
    when Array then pending.concat(value) # look inside arrays for more hashes
    end
  end
  hash
end
ruby 朋友发推文
.rb
#!/usr/bin/env ruby
require "rubygems"
require "twitter"
require "active_support"
require "google_chart"
# Credentials for HTTP basic auth against Twitter (fill these in).
twitter_user = ""
twitter_pass = ""
client = Twitter::Base.new(Twitter::HTTPAuth.new(twitter_user, twitter_pass))

# Count how many of the latest 200 timeline entries each friend posted.
totals = Hash.new(0)
client.friends_timeline(:count => 200).each do |tweet|
  totals[tweet.user.screen_name] += 1
end

# One pie slice per friend; print the chart URL.
chart = GoogleChart::PieChart.new('500x200', "Total Proportion of Recent Tweets", false)
totals.each { |screen_name, tweet_count| chart.data screen_name, tweet_count }
puts chart.to_url
ruby AWS文件上载
aws_file_upload.rb
#!/usr/bin/env ruby
require 'rubygems'
require 'aws/s3'
# Usage: aws_file_upload.rb LOCAL_FILE BUCKET [MIME_TYPE]
local_file = ARGV[0]
bucket = ARGV[1]
mime_type = ARGV[2] || "application/octet-stream"

# Fail with a usage message rather than a TypeError from File.basename(nil).
if local_file.nil? || bucket.nil?
  abort "Usage: #{File.basename($0)} LOCAL_FILE BUCKET [MIME_TYPE]"
end

AWS::S3::Base.establish_connection!(
  :access_key_id => 'REPLACE_ME',
  :secret_access_key => 'REPLACE_ME'
)

base_name = File.basename(local_file)
puts "Uploading #{local_file} as '#{base_name}' to '#{bucket}'"

# Binary mode keeps the bytes untouched on every platform; the block form
# closes the file even if the upload raises.
File.open(local_file, 'rb') do |io|
  AWS::S3::S3Object.store(
    base_name,
    io,
    bucket,
    :content_type => mime_type
  )
end
puts "Uploaded!"
ruby 在Chunks中读取XML
xml_chunk.rb
#
# Chunk
#
# Reads a large file in as chunks for easier parsing.
#
# The chunks returned are whole <@@options['element']/>s found within file.
#
# Each call to read() returns the whole element including start and end tags.
#
# Tested with a 1.8MB file, extracted 500 elements in 0.09s
# (with no work done, just extracting the elements)
#
# Usage:
# <code>
# # initialize the object
# file = Chunk.new('chunk-test.xml', { 'element' => 'Chunk' })
#
# # loop through the file until all lines are read
# while xml = file.read()
# # do whatever you want with the string
# puts xml
# end
# </code>
#
# Reads a file piece by piece and hands back one whole XML element per call.
#
# All reader state is held per instance, so several Chunk objects can be used
# at the same time, and options passed to one reader do not leak into the next.
class Chunk
  # Defaults for the options hash accepted by #initialize:
  #   'path'      - directory containing the file
  #   'element'   - tag name to extract; blank means "return the whole file"
  #   'chunkSize' - number of bytes read from disk at a time
  DEFAULT_OPTIONS = {
    'path' => './',
    'element' => '',
    'chunkSize' => 512
  }.freeze

  # Smallest chunk size accepted; anything lower falls back to the default.
  MIN_CHUNK_SIZE = 64

  # Open file handles keyed by the owning reader's object id, so that the
  # finalizer (which only receives the id) can close the right handle.
  OPEN_HANDLES = {}

  # Builds the Chunk object.
  #
  # file    - name of the file to read; only its basename is used and it is
  #           looked up inside options['path']
  # options - Hash overriding DEFAULT_OPTIONS (a non-Hash value is ignored)
  #
  # Raises IOError when the file does not exist.
  def initialize(file, options = {})
    @options = DEFAULT_OPTIONS.merge(options.kind_of?(Hash) ? options : {})

    # make sure the path ends with a /
    path = @options['path'].to_s
    path = DEFAULT_OPTIONS['path'] if path.empty?
    path += '/' unless path.end_with?('/')
    @options['path'] = path

    # make sure chunkSize is a usable integer
    size = @options['chunkSize'].to_i
    @options['chunkSize'] = size >= MIN_CHUNK_SIZE ? size : DEFAULT_OPTIONS['chunkSize']

    @file = File.expand_path(path + File.basename(file))
    raise IOError, 'Cannot load file: ' + @file unless File.exist?(@file)

    @handle = File.new(@file, 'r')
    # Bytes already read from disk but not yet returned to the caller.
    @pending = String.new

    # add a __destruct style method
    OPEN_HANDLES[object_id] = @handle
    ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
  end

  # Finalizer: closes the file handle registered for the reader with the
  # given object id, if it is still open.
  def Chunk.finalize(id)
    handle = OPEN_HANDLES.delete(id)
    handle.close if handle && !handle.closed?
  end

  # Closes the underlying file now instead of waiting for garbage collection.
  def close
    self.class.finalize(object_id)
  end

  # Returns the next occurrence of the configured element, including its
  # start and end tags, or false when the file holds no further complete
  # element. Only start tags without attributes (<tt><element></tt>) are
  # recognised.
  #
  # When no element is configured, the first call returns the rest of the
  # file and later calls return false.
  #
  # Data is read with a byte count, so returned strings are binary-encoded.
  def read
    element = @options['element'].to_s.strip
    return read_whole_document if element.empty?

    open_tag = "<#{element}>".force_encoding(Encoding::BINARY)
    close_tag = "</#{element}>".force_encoding(Encoding::BINARY)

    return false unless seek_open_tag(open_tag)

    close_at = find_close_tag(close_tag, open_tag.length)
    return false if close_at.nil? # start tag without a matching end tag

    stop = close_at + close_tag.length
    found = @pending[0, stop]
    @pending = @pending[stop..-1]
    found
  end

  private

  # Number of bytes requested from the file per read.
  def chunk_size
    @options['chunkSize']
  end

  # Reads everything left in the file; false when nothing is left.
  def read_whole_document
    buffer = String.new
    while (piece = @handle.read(chunk_size))
      buffer << piece
    end
    buffer.empty? ? false : buffer
  end

  # Reads forward until +open_tag+ is found and leaves @pending starting at
  # that tag. Returns false if the end of the file is reached first.
  def seek_open_tag(open_tag)
    loop do
      at = @pending.index(open_tag)
      if at
        @pending = @pending[at..-1]
        return true
      end

      # Only the last (tag length - 1) bytes could still be the start of a tag
      # that continues in the next chunk; drop the rest to bound memory use.
      keep = open_tag.length - 1
      @pending = @pending[-keep, keep] || @pending

      piece = @handle.read(chunk_size)
      return false if piece.nil?
      @pending << piece
    end
  end

  # Reads forward until +close_tag+ appears in @pending at or after index
  # +from+. Returns that index, or nil if the end of the file is reached first.
  def find_close_tag(close_tag, from)
    loop do
      at = @pending.index(close_tag, from)
      return at if at

      # Resume just before the new data so a tag split across two reads is
      # still found, without rescanning the whole buffer.
      from = [@pending.length - close_tag.length + 1, from].max

      piece = @handle.read(chunk_size)
      return nil if piece.nil?
      @pending << piece
    end
  end
end
ruby 读写CSV
read_write_csv.rb
#require 'csv'
require 'rubygems'
require 'fastercsv'
# Write a header row plus three sample rows as CSV through FCSV; the samples
# exercise embedded commas, double quotes and a line break.
sample_rows = [
  [:Number,:One,:Two,:Three,:Four],
  [1,"first","second","Third one quoted with a, comma","fourth \"double quotes\"\n line break"],
  [2,"erst","zweite","Dritte,mit Komma","viertl"],
  [3,"primero","segundo","tercero","cuarto,con la coma"]
]
FCSV do |out|
  sample_rows.each { |row| out << row }
end

# Read sample.csv back with the first line treated as headers and dump every
# record field by field.
FasterCSV.foreach("sample.csv", {:headers=>true}) do |record|
  puts "#{record.length} fields: >>#{record.inspect}<<"
  record.each do |header, value|
    puts "\t#{header}=#{value}"
  end
end
ruby 法国Stemmer
.rb
# -*- encoding: utf-8 -*-
#
# Implementation of the stemming algorithm at http://snowball.tartarus.org/algorithms/french/stemmer.html
# Based on the javascript port made by Kasun Gajasinghe http://snowball.tartarus.org/otherlangs/french_javascript.txt
#
# Testing:
# It uses the file voc.txt (http://snowball.tartarus.org/algorithms/french/voc.txt)
# and compares results with output.txt (http://snowball.tartarus.org/algorithms/french/output.txt)
#
# At the time being, it fails for 242 words on 20403, feel free to edit this gist.
# Returns the stem of a French word.
#
# word - String to stem. It is downcased before processing; the caller's
#        string is not modified because all edits happen on the downcased copy.
#
# Returns the stem as a downcased, whitespace-stripped String.
#
# Throughout the method an "*_index" value is the position at which a region
# (RV, R1, R2) or a matched suffix starts; a suffix is treated as lying inside
# a region when its index is >= the region's index.
def stem(word)
# Letters in French include the following accented forms,
# â à ç ë é ê è ï î ô û ù
# The following letters are vowels:
# a e i o u y â à ë é ê è ï î ô û ù
original_word = word
# Downcase it
word = word.downcase
tmp = -1
# Uppercase some part to exclude them later on
# (u after q, u and i between two vowels, y next to a vowel). The vowel
# character classes below are lowercase only, so the uppercased letters stop
# counting as vowels.
word.gsub!(/qu/, 'qU')
word.gsub!(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/, '\1U\2')
word.gsub!(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/, '\1I\2')
word.gsub!(/([aeiouyâàëéêèïîôûù])y/, '\1Y')
word.gsub!(/y([aeiouyâàëéêèïîôûù])/, 'Y\1')
# Determine RV
# If the word starts with par/col/tap or with two vowels, RV starts at index 3.
# Otherwise RV starts right after the first vowel found from the second
# character onwards, or is empty (index = word length) if there is none.
rv = '';
rv_index = -1;
if word =~ /^(par|col|tap)/ || word =~ /^[aeiouyâàëéêèïîôûù]{2}/
rv = word[3..word.length]
rv_index = 3
else
rv_index = (word[1..word.length]) =~ /[aeiouyâàëéêèïîôûù]/
if rv_index
# +1 because the search started at index 1, +1 to step past the vowel
rv_index += 2
rv = word[rv_index..word.length]
else
rv_index = word.length
end
end
# R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
# R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
r1_index = word =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
r1 = ''
if r1_index
r1_index += 2
r1 = word[r1_index..word.length]
else
r1_index = word.length
end
r2_index = -1
r2 = ''
# r1_index is always set by this point, so this branch always runs
if r1_index
r2_index = r1 =~ /[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/
if r2_index
r2_index += 2
r2 = r1[r2_index..r1.length]
# convert from an offset inside r1 to an offset inside word
r2_index += r1_index
else
r2 = ''
r2_index = word.length
end
end
# R1 is forced to start no earlier than index 3. R2 was computed above from
# the unadjusted R1 and is not recomputed here.
if r1_index && r1_index < 3
r1_index = 3
r1 = word[r1_index..word.length]
end
# Step 1: Standard suffix removal
# Each aN_index is the start position of the matching suffix, or nil.
a1_index = word =~ /(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/
a2_index = word =~ /(atrice|ateur|ation|atrices|ateurs|ations)$/
a3_index = word =~ /(logie|logies)$/
a4_index = word =~ /(usion|ution|usions|utions)$/
a5_index = word =~ /(ence|ences)$/
a6_index = word =~ /(ement|ements)$/
a7_index = word =~ /(ité|ités)$/
a8_index = word =~ /(if|ive|ifs|ives)$/
a9_index = word =~ /(eaux)$/
a10_index = word =~ /(aux)$/
a11_index = word =~ /(euse|euses)$/
# a12 and a15 also match the character before the suffix, so their index
# points at that preceding character rather than at the suffix itself.
a12_index = word =~ /[^aeiouyâàëéêèïîôûù](issement|issements)$/
a13_index = word =~ /(amment)$/
a14_index = word =~ /(emment)$/
a15_index = word =~ /[aeiouyâàëéêèïîôûù](ment|ments)$/
# Only the first branch whose suffix matched inside the required region is applied.
# NOTE(review): the word[0..index - 1] truncations below return the whole word
# when index is 0 (the range becomes 0..-1) - confirm whether index 0 can occur.
if a1_index && a1_index >= r2_index
word = word[0..a1_index - 1]
elsif a2_index && a2_index >= r2_index
word = word[0..a2_index - 1]
# after removing the suffix: drop a trailing "ic" if in R2, else turn it into "iqU"
a2_index2 = word =~ /(ic)$/
if a2_index2 && a2_index2 >= r2_index
word = word[0..a2_index2 - 1]
else
word.gsub!(/(ic)$/, 'iqU')
end
elsif a3_index && a3_index >= r2_index
word.gsub!(/(logie|logies)$/, 'log')
elsif a4_index && a4_index >= r2_index
word.gsub!(/(usion|ution|usions|utions)$/, 'u')
elsif a5_index && a5_index >= r2_index
word.gsub!(/(ence|ences)$/, 'ent')
elsif a6_index && a6_index >= rv_index
word = word[0..a6_index - 1]
# follow-up rules on what precedes the removed "ement(s)"
tmp = word =~ /(iv)$/
if !tmp.nil? && tmp >= r2_index
word.gsub!(/(iv)$/, '')
tmp = word =~ /(at)$/
if !tmp.nil? && tmp >= r2_index
word.gsub!(/(at)$/, '')
end
elsif word =~ /(eus)$/
a6_index2 = word =~ /(eus)$/
if a6_index2 >= r2_index
word = word[0..a6_index2 - 1]
elsif a6_index2 >= r1_index
word = word[0..a6_index2 - 1] + 'eux';
end
elsif !(tmp = (word =~ /(abl|iqU)$/)).nil? && tmp >= r2_index
word.gsub!(/(abl|iqU)$/, '')
elsif !(tmp = (word =~ /(ièr|Ièr)$/)).nil? && tmp >= rv_index
word.gsub!(/(ièr|Ièr)$/, 'i')
end
elsif a7_index && a7_index >= r2_index
word = word[0..a7_index - 1]
if word =~ /(abil)$/
a7_index2 = word =~ /(abil)$/
if a7_index2 >= r2_index
word = word[0..a7_index2 - 1]
else
word = word[0..a7_index2 - 1] + 'abl'
end
elsif word =~ /(ic)$/
a7_index3 = word =~ /(ic)$/
if a7_index3 && a7_index3 >= r2_index
word = word[0..a7_index3 - 1]
else
word.gsub!(/(ic)$/, 'iqU')
end
# NOTE(review): this compares with != while the sibling region checks use >= - confirm intent
elsif !(tmp = (word =~ /(iv)$/)).nil? && tmp != r2_index
word.gsub!(/(iv)$/, '')
end
elsif a8_index && a8_index >= r2_index
word = word[0..a8_index - 1]
tmp = word =~ /(at)$/
if !tmp.nil? && tmp >= r2_index
word.gsub!(/(at)$/, '')
tmp = word =~ /(ic)$/
if !tmp.nil? && tmp >= r2_index
word.gsub!(/(ic)$/, '')
else
word.gsub!(/(ic)$/, 'iqU')
end
end
# NOTE(review): the two replacements below use patterns without a trailing $,
# so gsub! rewrites every occurrence in the word, not only the suffix - confirm intent
elsif a9_index
word.gsub!(/(eaux)/, 'eau')
elsif a10_index && a10_index >= r1_index
word.gsub!(/(aux)/, 'al')
elsif a11_index
a11_index2 = word =~ /(euse|euses)$/
if a11_index2 >= r2_index
word = word[0..a11_index2 - 1]
elsif a11_index2 >= r1_index
word = word[0..a11_index2 - 1] + 'eux'
end
elsif a12_index && a12_index >= r1_index
# keep the character matched just before the suffix
word = word[0..a12_index]
elsif a13_index && a13_index >= rv_index
word.gsub!(/(amment)$/, 'ant')
elsif a14_index && a14_index >= rv_index
word.gsub!(/(emment)$/, 'ent')
elsif a15_index && a15_index >= rv_index
# keep the vowel matched just before the suffix
word = word[0..a15_index]
end
# Step 2a: Verb suffixes beginning i
# Runs when the word still equals the input, or when the input ended in
# amment/emment/ment/ments.
# NOTE(review): original_word is the raw argument (not downcased, and possibly
# carrying a trailing newline), so this comparison looks like it can only be
# true for lowercase input without surrounding whitespace - verify against callers.
word_step1 = word.clone
step_2a_done = false
if original_word == word.downcase || original_word =~ /(amment|emment|ment|ments)$/
step_2a_done = true
b1_regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i
tmp = word =~ b1_regex
if !tmp.nil? && tmp >= rv_index
# drop the suffix but keep the non-vowel captured in front of it
word.gsub!(b1_regex, '\1')
end
end
# Step 2b: Other verb suffixes
# Only attempted when step 2a ran and left the word unchanged.
if step_2a_done && word_step1 == word
b2_regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i
tmp = word =~ b2_regex
if tmp && tmp >= rv_index
word.gsub!(b2_regex, '')
else
tmp = word =~ /(ions)$/
if tmp && tmp >= r2_index
word.gsub!(/(ions)$/, '')
else
# first try the a-suffixes together with a preceding "e", then without it
b3_regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
tmp = word =~ b3_regex
if tmp && tmp >= rv_index
word.gsub!(b3_regex, '')
else
b3_regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i
tmp = word =~ b3_regex2
if tmp && tmp >= rv_index
word.gsub!(b3_regex2, '')
end
end
end
end
end
# Step 3 runs when the word now differs from the input; otherwise step 4 runs.
if original_word != word.downcase
# Step 3
# (rep is assigned but never used)
rep = ''
if word =~ /Y$/
word.gsub!(/Y$/, 'i')
elsif word =~ /ç$/
word.gsub!(/ç$/, 'c')
end
else
# Step 4
# If the word ends s, not preceded by a, i, o, u, è or s, delete it
tmp = word =~ /([^aiouès])s$/
if tmp && tmp >= rv_index
word.gsub!(/([^aiouès])s$/, '\1')
end
# "ion" is removed when it starts in R2 and the preceding s/t starts in RV
e1_index = word =~ /ion$/
tmp = word =~ /[st]ion$/
if e1_index && e1_index >= r2_index && tmp && tmp >= rv_index
word = word[0..e1_index - 1]
else
e2_index = word =~ /(ier|ière|Ier|Ière)$/
if e2_index && e2_index >= rv_index
word = word[0..e2_index - 1] + 'i'
else
tmp = word =~ /e$/
if tmp && tmp >= rv_index
word.gsub!(/e$/, '')
elsif !(tmp = (word =~ /guë$/)).nil? && tmp >= rv_index
word.gsub!(/guë$/, 'gu')
end
end
end
end
# Step 5: Undouble
word.gsub!(/(en|on)(n)$/, '\1')
word.gsub!(/(ett)$/, 'et')
word.gsub!(/(el|eil)(l)$/, '\1')
# Step 6: Un-accent
word.gsub!(/[éè]([^aeiouyâàëéêèïîôûù]+)$/, 'e\1')
# Lowercase the U/I/Y markers again and trim surrounding whitespace
word.downcase.strip
end
# TESTS
# Opens voc.txt and compare the stem result with output.txt
# Read the expected stems up front; the block form closes the file afterwards.
expected_lines = File.open('output.txt', 'r:UTF-8') { |expected_file| expected_file.readlines }
errors = 0
# IO#lines no longer exists on current Ruby versions, so iterate with each_line.
File.open('voc.txt', 'r:UTF-8') do |voc|
  voc.each_line.with_index do |l, i|
    stemmed = stem(l)
    # to_s guards against output.txt having fewer lines than voc.txt
    expected = expected_lines[i].to_s.strip
    if stemmed != expected
      puts "Error: #{l} expected: #{expected} actual: #{stemmed}"
      errors += 1
    end
  end
end
puts "#{errors} error(s) found, tested #{expected_lines.length} words/stems"