lib/taxamatch_rb/phonetizer.rb from GlobalNamesArchitecture/taxamatch_rb

lib/taxamatch_rb/phonetizer.rb
Summary

Maintainability

2 hrs
Test Coverage

Issues
# encoding: UTF-8
module Taxamatch

  # Builds a rough phonetic key for a word so that spellings which differ
  # only in commonly confused letter groups collapse to the same string.
  module Phonetizer

    # Leading letter groups and what they are rewritten to. No prefix is a
    # prefix of another, so at most one rule can apply to a given word.
    PREFIX_RULES = [
      %w[AE E],
      %w[CN N],
      %w[CT T],
      %w[CZ C],
      %w[DJ J],
      %w[EA E],
      %w[EU U],
      %w[GN N],
      %w[KN N],
      %w[MC MAC],
      %w[MN N],
      %w[OE E],
      %w[QU Q],
      %w[PS S],
      %w[PT T],
      %w[TS S],
      %w[WR R],
      %w[X Z]
    ].freeze

    # Substitutions applied, in this order, to everything after the first
    # character. Order matters: a later rule sees the output of earlier ones.
    INNER_RULES = [
      %w[AE I],
      %w[IA A],
      %w[OE I],
      %w[OI A],
      %w[SC S]
    ].freeze

    # Alias for near_match; takes the same arguments and returns its result.
    def self.phonetize(a_word, normalize_ending = false)
      self.near_match(a_word, normalize_ending)
    end

    # Computes the phonetic key of a_word.
    #
    # a_word           - the word to reduce. nil, or anything else that
    #                    cannot be stripped, is treated as blank.
    # normalize_ending - when truthy and the key is longer than 4 characters,
    #                    the key's ending is also passed through
    #                    normalize_ending.
    #
    # Returns the key as a String, or '' for blank input.
    #
    # The word is first passed through Taxamatch::Normalizer.normalize, which
    # is defined elsewhere. The rules below only match upper-case ASCII
    # letters, so the normalizer presumably upper-cases its input -- confirm
    # against its definition.
    def self.near_match(a_word, normalize_ending = false)
      # Best effort by design: input that cannot be stripped yields ''.
      a_word = begin
        a_word.strip
      rescue StandardError
        ''
      end
      return '' if a_word == ''

      a_word = Taxamatch::Normalizer.normalize(a_word)
      # Nothing left to phonetize; also avoids slicing an empty string below.
      return '' if a_word.empty?

      # The rewrite replaces the head of the whole string, so the prefix has
      # to be tested at the very start of the string, not at any line start.
      prefix, replacement = PREFIX_RULES.find { |pre, _| a_word.start_with?(pre) }
      a_word = replacement + a_word[prefix.length..-1] if prefix

      # The first character is kept as is; only the remainder is reduced.
      first_char = a_word[0, 1]
      rest_chars = INNER_RULES.inject(a_word[1..-1]) do |rest, (from, to)|
        rest.gsub(from, to)
      end
      rest_chars = rest_chars.delete('H').tr('EOUYKZ', 'IAIICS')

      # Collapse every run of a repeated character into a single one.
      a_word = (first_char + rest_chars).squeeze

      # The explicit receiver is required: the boolean parameter shadows the
      # method of the same name inside this method.
      if normalize_ending && a_word.size > 4
        a_word = self.normalize_ending(a_word)
      end
      a_word
    end

    # Unifies variant endings of an already phonetized word.
    #
    # A trailing -IS, -IM or -AS is rewritten to -A. Because near_match maps
    # E, U and Y to I and O to A, this also covers original -us, -ys, -es,
    # -um and -os endings.
    #
    # a_word - the phonetized word. It is not modified, so frozen strings
    #          are accepted.
    #
    # Returns a new String.
    def self.normalize_ending(a_word)
      a_word.gsub(/(?:IS|IM|AS)$/, 'A')
    end

  end

end