apraditya/indonesian_stemmer

View on GitHub
lib/indonesian_stemmer/morphological_utility.rb

Summary

Maintainability
A
3 hrs
Test Coverage
require "indonesian_stemmer/stemmer_utility"
require "indonesian_stemmer/irregular_words"

module IndonesianStemmer

  # Character and affix tables consulted by MorphologicalUtility.
  #
  # The affix lists are scanned front to back and the first entry that matches
  # is the one removed, so longer affixes are listed before the shorter affixes
  # they contain (e.g. "meng" before "me").
  #
  # Every list is frozen so that no caller can alter the stemming rules at
  # runtime by mutating a shared constant.
  VOWEL_CHARACTERS                            = %w( a e i o u ).freeze
  PARTICLE_CHARACTERS                         = %w( kah lah pun ).freeze
  POSSESSIVE_PRONOUN_CHARACTERS               = %w( ku mu nya ).freeze
  FIRST_ORDER_PREFIX_CHARACTERS               = %w( meng meny men mem me
                                                    peng peny pen pem di ter ke ).freeze
  SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS       = %w( meng peng meny peny men pen
                                                    mem pem ).freeze
  SECOND_ORDER_PREFIX_CHARACTERS              = %w( ber be per pe ).freeze
  NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS  = %w( ber per pe ).freeze
  # Whole words whose first three characters are stripped as a prefix.
  SPECIAL_SECOND_ORDER_PREFIX_WORDS           = %w( belajar pelajar belunjur ).freeze
  SUFFIX_CHARACTERS                           = %w( kan an i ).freeze
  # Prefixes that always get a replacement consonant when followed by a vowel.
  WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS   = %w( meny peny men pen ).freeze


  # Bit flags naming the prefix that has been stripped from a word. Each flag
  # occupies its own bit; remove_suffix tests them against @flags with a
  # bitwise AND to decide whether a suffix may still be removed.
  REMOVED_KE    = 1 << 0  # 1
  REMOVED_PENG  = 1 << 1  # 2
  REMOVED_DI    = 1 << 2  # 4
  REMOVED_MENG  = 1 << 3  # 8
  REMOVED_TER   = 1 << 4  # 16
  REMOVED_BER   = 1 << 5  # 32
  REMOVED_PE    = 1 << 6  # 64


  module MorphologicalUtility
    include StemmerUtility

    # Estimates the number of syllables in +word+ by counting the positions
    # whose character is a vowel according to is_vowel?.
    #
    # @param word [String] the word to inspect
    # @return [Integer] the number of vowel characters found (0 for "")
    def total_syllables(word)
      (0...word.size).count { |index| is_vowel?(word[index]) }
    end

    # Strips a trailing particle from +word+.
    #
    # Seeds @number_of_syllables from +word+ when it has not been set yet, then
    # delegates to remove_characters_matching_collection with the :particle
    # collection, matching at the end of the word.
    #
    # @param word [String] the word to process
    # @return whatever remove_characters_matching_collection returns
    def remove_particle(word)
      @number_of_syllables ||= total_syllables(word)

      particles = collection_for(:particle)
      remove_characters_matching_collection(word, particles, :end)
    end

    # Strips a trailing possessive pronoun from +word+.
    #
    # Seeds @number_of_syllables from +word+ when it has not been set yet, then
    # delegates to remove_characters_matching_collection with the
    # :possessive_pronoun collection, matching at the end of the word.
    #
    # @param word [String] the word to process
    # @return whatever remove_characters_matching_collection returns
    def remove_possessive_pronoun(word)
      @number_of_syllables ||= total_syllables(word)

      pronouns = collection_for(:possessive_pronoun)
      remove_characters_matching_collection(word, pronouns, :end)
    end

    # Strips a first-order prefix from +word+.
    #
    # The prefixes that may need a replacement consonant are tried first. That
    # step works by mutating +word+ in place, so a copy taken beforehand is
    # compared against +word+ to detect whether it did anything. Only when the
    # word is unchanged are the plain first-order prefixes tried.
    #
    # @param word [String] the word to process; modified in place
    # @return [String] +word+ when the substitution step changed it, otherwise
    #   whatever remove_characters_matching_collection returns
    def remove_first_order_prefix(word)
      @number_of_syllables ||= total_syllables(word)

      snapshot = word.dup
      special_prefixes = collection_for(:special_first_order_prefix)
      remove_and_substitute_characters_matching_collection(word, special_prefixes, :start)
      return word unless snapshot == word

      plain_prefixes = collection_for(:first_order_prefix)
      remove_characters_matching_collection(word, plain_prefixes, :start)
    end

    # Strips a second-order prefix from +word+.
    #
    # Three cases are tried in order:
    # 1. +word+ is one of SPECIAL_SECOND_ORDER_PREFIX_WORDS: its first three
    #    characters are removed (REMOVED_BER is recorded only for the words
    #    starting with "be").
    # 2. +word+ has the shape "be" + consonant + "er"...: the leading "be" is
    #    removed and REMOVED_BER is recorded.
    # 3. Otherwise the non-special second-order prefixes are tried through
    #    remove_characters_matching_collection.
    #
    # The flag is OR-ed into @flags. The previous `@flags ||= REMOVED_BER`
    # never recorded anything once @flags held an Integer (0 is truthy in
    # Ruby), so the flag checks made by remove_suffix could not see the
    # removal. A nil @flags is treated as "no flags set".
    #
    # @param word [String] the word to process; modified in place
    # @return [String] +word+ for cases 1 and 2, otherwise whatever
    #   remove_characters_matching_collection returns
    def remove_second_order_prefix(word)
      @number_of_syllables ||= total_syllables(word)
      word_size = word.size

      if SPECIAL_SECOND_ORDER_PREFIX_WORDS.include?(word)
        @flags = (@flags || 0) | REMOVED_BER if word[0..1] == 'be'
        reduce_syllable
        slice_word_at_position(word, 3, :start)
        return word
      end

      if starts_with?(word, word_size, 'be') && word_size > 4 && !is_vowel?(word[2]) && word[3..4] == 'er'
        @flags = (@flags || 0) | REMOVED_BER
        reduce_syllable
        slice_word_at_position(word, 2, :start)
        return word
      end

      remove_characters_matching_collection(word,
                                            collection_for(:non_special_second_order_prefix),
                                            :start)
    end

    def remove_suffix(word)
      return word if ambiguous_with_suffices_ending_words?(word)

      @number_of_syllables ||= total_syllables(word)

      SUFFIX_CHARACTERS.each do |character|
        constants_to_check = case character
        when 'kan'
          [REMOVED_KE, REMOVED_PENG, REMOVED_PE]
        when 'an'
          [REMOVED_DI, REMOVED_MENG, REMOVED_TER]
        when 'i'
          [REMOVED_BER, REMOVED_KE, REMOVED_PENG]
        end

        if ends_with?(word, word.size, character) &&
              constants_to_check.all? { |c| (@flags & c) == 0 }
          reduce_syllable
          slice_word_at_position(word, character.size, :end)
          return word
        end
      end

      word
    end


    private
      # Tells whether +character+ is one of VOWEL_CHARACTERS.
      #
      # @param character [String, nil] a single character (nil yields false)
      # @return [Boolean]
      def is_vowel?(character)
        VOWEL_CHARACTERS.member?(character)
      end

      # Looks up a constant by a name assembled from +name+ and +type+.
      #
      # * type 'characters' (default): resolves NAME_CHARACTERS,
      #   e.g. :particle -> PARTICLE_CHARACTERS.
      # * any other type: resolves TYPE_NAME, after mapping the prefix variants
      #   meny/men/mem/me to "meng" and peny/pen/pem to "peng",
      #   e.g. ('mem', 'removed') -> REMOVED_MENG.
      #
      # NOTE(review): const_get is called on self, so self must respond to it
      # (i.e. be a Module); confirm how this module is mixed in.
      #
      # @param name [Symbol, String] collection name or affix
      # @param type [String] 'characters' or a constant-name prefix
      # @return [Object, nil] the constant's value, or nil when the lookup
      #   raises NameError (which also covers NoMethodError, its subclass)
      def collection_for(name, type = 'characters')
        constant_name =
          if type == 'characters'
            "#{name}_#{type}"
          elsif %w(meny men mem me).include?(name)
            "#{type}_meng"
          elsif %w(peny pen pem).include?(name)
            "#{type}_peng"
          else
            "#{type}_#{name}"
          end

        const_get(constant_name.upcase.to_sym)
      rescue NameError
        nil
      end

      # Removes from +word+ the first affix in +collection+ that matches at
      # +position+ and is not ambiguous there.
      #
      # "mem" is skipped when the character after it is a vowel, so that a
      # shorter affix later in the collection can match instead.
      #
      # When a REMOVED_* flag exists for the affix it is OR-ed into @flags. The
      # previous `@flags ||= ...` never recorded anything once @flags held an
      # Integer (0 is truthy in Ruby). Affixes without a flag leave @flags
      # untouched, and a nil @flags is treated as "no flags set".
      #
      # @param word [String] the word to process; modified in place
      # @param collection [Array<String>] candidate affixes, tried in order
      # @param position [Symbol] :start or :end
      # @return [String] +word+, whether or not anything was removed
      def remove_characters_matching_collection(word, collection, position)
        collection.each do |characters|
          next unless match_position_and_not_ambiguous_with_characters?(word, characters, position)
          next if characters == 'mem' && is_vowel?(word[characters.size])

          removed_flag = collection_for(characters, 'removed')
          @flags = (@flags || 0) | removed_flag if removed_flag
          reduce_syllable
          slice_word_at_position(word, characters.size, position)
          return word
        end

        word
      end

      # Deletes +characters_size+ characters from +word+ in place: from the
      # front when +position+ is :start, otherwise from the back.
      #
      # @param word [String] the word to shorten; modified in place
      # @param characters_size [Integer] how many characters to delete
      # @param position [Symbol] :start, or anything else for the end
      # @return [String, nil] the deleted text, as returned by String#slice!
      def slice_word_at_position(word, characters_size, position)
        offset = position == :start ? 0 : -characters_size
        word.slice!(offset, characters_size)
      end

      # Removes from +word+ the first prefix in +collection+ that requires a
      # consonant substitution, leaving the substituted consonant as the new
      # first character.
      #
      # The last character of the prefix is overwritten with the replacement
      # consonant, then the remaining prefix characters (all but that last one)
      # are sliced off the front.
      #
      # When a REMOVED_* flag exists for the prefix it is OR-ed into @flags. The
      # previous `@flags ||= ...` never recorded anything once @flags held an
      # Integer (0 is truthy in Ruby). A nil @flags is treated as "no flags set".
      #
      # @param word [String] the word to process; modified in place
      # @param collection [Array<String>] candidate prefixes, tried in order
      # @param position [Symbol] forwarded to the match check; the removal
      #   itself always happens at the start of the word
      # @return [String] +word+, whether or not anything was removed
      #   (previously the collection was returned when nothing matched)
      def remove_and_substitute_characters_matching_collection(word, collection, position)
        collection.each do |characters|
          next unless matching_characters_requires_substitution?(word, characters, position)

          removed_flag = collection_for(characters, 'removed')
          @flags = (@flags || 0) | removed_flag if removed_flag
          reduce_syllable
          word = substitute_word_character(word, characters)
          slice_word_at_position(word, characters.size - 1, :start)
          return word
        end

        word
      end

      # Checks whether what follows the prefix +characters+ in +word+ begins
      # with one of the irregular entries registered for that prefix in
      # IrregularWords::ON_PREFIX_CHARACTERS.
      #
      # @param word [String] the full word
      # @param characters [String] the prefix under consideration
      # @return [Boolean, nil] the match result, or nil when the prefix has no
      #   entry in the table
      def contains_irregular_prefix?(word, characters)
        irregular_stems = IrregularWords::ON_PREFIX_CHARACTERS
        return unless irregular_stems.key?(characters)

        remainder = word[characters.size, word.size]
        chopped_word_match_words_collection?(remainder, irregular_stems[characters])
      end

      # Tells whether +chopped_word+ begins with any entry of +collection+,
      # as judged by starts_with?.
      #
      # @param chopped_word [String] a word with its prefix already cut off
      # @param collection [Enumerable<String>] candidate beginnings
      # @return [Boolean]
      def chopped_word_match_words_collection?(chopped_word, collection)
        collection.any? do |candidate|
          starts_with?(chopped_word, chopped_word.size, candidate)
        end
      end

      # Overwrites the last character of the prefix +characters+ inside +word+
      # with the consonant that the prefix replaced:
      #
      # * meny / peny -> "s"
      # * men / pen   -> "n" when the rest of the word begins with an entry of
      #                  IrregularWords::BEGINS_WITH_N, otherwise "t"
      # * meng / peng -> "k"
      # * mem / pem   -> "p"
      #
      # Any other prefix leaves +word+ untouched.
      #
      # @param word [String] the word to edit; modified in place
      # @param characters [String] the prefix found at the start of +word+
      # @return [String] +word+
      def substitute_word_character(word, characters)
        replacement =
          case characters
          when 'meny', 'peny'
            's'
          when 'men', 'pen'
            remainder = word[characters.size, word.size]
            chopped_word_match_words_collection?(remainder, IrregularWords::BEGINS_WITH_N) ? 'n' : 't'
          when 'meng', 'peng'
            'k'
          when 'mem', 'pem'
            'p'
          end

        word[characters.size - 1] = replacement if replacement
        word
      end

      # Tells whether removing the affix +characters+ from +word+ at +position+
      # would be ambiguous, i.e. the affix is really part of the root word.
      #
      # At :start only "per" can be ambiguous: the remainder of the word is
      # checked against IrregularWords::BEGINS_WITH_R.
      #
      # At any other position the word is checked against the endings listed
      # for the affix in IrregularWords::ENDS_WITH_COMMON_CHARACTERS, but only
      # for words beginning with "me", "be" or "pe".
      #
      # @param word [String] the full word
      # @param characters [String] the affix under consideration
      # @param position [Symbol] :start or :end
      # @return [Boolean]
      def ambiguous_with_characters?(word, characters, position)
        if position == :start
          return false unless characters == 'per'

          return chopped_word_match_words_collection?(word[3..-1],
                                                      IrregularWords::BEGINS_WITH_R)
        end

        endings = IrregularWords::ENDS_WITH_COMMON_CHARACTERS[characters]
        endings.any? do |ending|
          # To differentiate 'mobilmu' from 'berilmu'
          break false unless %w(me be pe).include?(word[0, 2])

          ends_with?(word, word.size, ending)
        end
      end

      # Tells whether +word+ is listed in
      # IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS; remove_suffix leaves such
      # words untouched.
      #
      # @param word [String] the word to look up
      # @return [Boolean]
      def ambiguous_with_suffices_ending_words?(word)
        protected_words = IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS
        protected_words.include?(word)
      end

      # Tells whether +word+ carries the affix +characters+ at +position+ and
      # the affix is not ambiguous there.
      #
      # +position+ selects the matcher by name: :start dispatches to
      # starts_with?, :end to ends_with?.
      #
      # @param word [String] the full word
      # @param characters [String] the affix to look for
      # @param position [Symbol] :start or :end
      # @return [Boolean] (or the matcher's own falsy value when it does not match)
      def match_position_and_not_ambiguous_with_characters?(word, characters, position)
        matched = send("#{position}s_with?", word, word.size, characters)
        matched && !ambiguous_with_characters?(word, characters, position)
      end

      # Tells whether +word+ carries the affix +characters+ at +position+, is
      # longer than the affix, and has a vowel at index characters.size (the
      # character right after a leading affix).
      #
      # @param word [String] the full word
      # @param characters [String] the affix to look for
      # @param position [Symbol] :start or :end; selects starts_with? or
      #   ends_with? by name
      # @return [Boolean] (or the matcher's own falsy value when it does not match)
      def match_characters_position_followed_by_vowel?(word, characters, position)
        length = word.size
        affix_length = characters.size
        matched = send("#{position}s_with?", word, length, characters)

        matched && length > affix_length && is_vowel?(word[affix_length])
      end

      # Tells whether stripping the prefix +characters+ from +word+ must be
      # accompanied by a consonant substitution: either the prefix is one of
      # WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS, or the word is registered as
      # irregular for that prefix.
      #
      # @param word [String] the full word
      # @param characters [String] the prefix under consideration
      # @return [Boolean, nil] true, or the result of contains_irregular_prefix?
      def substitution_required?(word, characters)
        return true if WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters)

        contains_irregular_prefix?(word, characters)
      end

      # Tells whether +word+ carries the prefix +characters+ followed by a
      # vowel AND that prefix requires a consonant substitution when removed.
      #
      # @param word [String] the full word
      # @param characters [String] the prefix under consideration
      # @param position [Symbol] forwarded to the position match
      # @return [Boolean, nil] falsy as soon as either check fails
      def matching_characters_requires_substitution?(word, characters, position)
        followed_by_vowel = match_characters_position_followed_by_vowel?(word, characters, position)
        followed_by_vowel && substitution_required?(word, characters)
      end

      # Decrements @number_of_syllables by one; called whenever an affix has
      # been stripped. @number_of_syllables must already hold a number.
      #
      # @return [Integer] the new syllable count
      def reduce_syllable
        @number_of_syllables = @number_of_syllables - 1
      end
  end
end