lib/taxamatch_rb/phonetizer.rb
# encoding: UTF-8
module Taxamatch
module Phonetizer
def self.phonetize(a_word, normalize_ending = false)
self.near_match(a_word, normalize_ending)
end
def self.near_match(a_word, normalize_ending = false)
a_word = a_word.strip rescue ''
return '' if a_word == ''
a_word = Taxamatch::Normalizer.normalize a_word
case a_word
when /^AE/
a_word = 'E' + a_word[2..-1]
when /^CN/
a_word = 'N' + a_word[2..-1]
when /^CT/
a_word = 'T' + a_word[2..-1]
when /^CZ/
a_word = 'C' + a_word[2..-1]
when /^DJ/
a_word = 'J' + a_word[2..-1]
when /^EA/
a_word = 'E' + a_word[2..-1]
when /^EU/
a_word = 'U' + a_word[2..-1]
when /^GN/
a_word = 'N' + a_word[2..-1]
when /^KN/
a_word = 'N' + a_word[2..-1]
when /^MC/
a_word = 'MAC' + a_word[2..-1]
when /^MN/
a_word = 'N' + a_word[2..-1]
when /^OE/
a_word = 'E' + a_word[2..-1]
when /^QU/
a_word = 'Q' + a_word[2..-1]
when /^PS/
a_word = 'S' + a_word[2..-1]
when /^PT/
a_word = 'T' + a_word[2..-1]
when /^TS/
a_word = 'S' + a_word[2..-1]
when /^WR/
a_word = 'R' + a_word[2..-1]
when /^X/
a_word = 'Z' + a_word[1..-1]
end
first_char = a_word.split('')[0]
rest_chars = a_word.split('')[1..-1].join('')
rest_chars.gsub!('AE', 'I')
rest_chars.gsub!('IA', 'A')
rest_chars.gsub!('OE', 'I')
rest_chars.gsub!('OI', 'A')
rest_chars.gsub!('SC', 'S')
rest_chars.gsub!('H', '')
rest_chars.tr!('EOUYKZ', 'IAIICS')
a_word = (first_char + rest_chars).squeeze
if normalize_ending && a_word.size > 4
a_word = self.normalize_ending(a_word)
end
a_word
end
def self.normalize_ending(a_word)
# -- deal with variant endings
# -is (includes -us, -ys, -es), -im (was -um), -as (-os)
# -- at the end of a string translate all to -a
a_word.gsub!(/IS$/, 'A')
a_word.gsub!(/IM$/, 'A')
a_word.gsub(/AS$/, 'A')
end
end
end