# lib/indonesian_stemmer/morphological_utility.rb
require "indonesian_stemmer/stemmer_utility"
require "indonesian_stemmer/irregular_words"
module IndonesianStemmer
# Characters treated as vowels; total_syllables counts one syllable per vowel.
VOWEL_CHARACTERS = %w( a e i o u ).freeze

# Word endings stripped by remove_particle.
PARTICLE_CHARACTERS = %w( kah lah pun ).freeze

# Word endings stripped by remove_possessive_pronoun.
POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya ).freeze

# Prefixes stripped by remove_first_order_prefix. They are tried in the
# listed order and the first match wins, so longer forms come before the
# shorter forms they contain.
FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
                                    peng peny pen pem di ter ke ).freeze

# Prefixes that remove_first_order_prefix first tries to strip while
# substituting the initial letter of the remaining stem.
SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meng peng meny peny men pen
                                            mem pem ).freeze

SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe ).freeze

# Prefixes stripped by remove_second_order_prefix once its special cases
# did not apply.
NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe ).freeze

# Whole words for which remove_second_order_prefix drops the first three
# characters.
SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur ).freeze

# Suffixes stripped by remove_suffix, tried in the listed order.
SUFFIX_CHARACTERS = %w( kan an i ).freeze

# Prefixes that always require substitution when followed by a vowel.
WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen ).freeze

# Bit flags naming the prefix that has been stripped. remove_suffix tests
# them with a bitwise AND to decide whether a suffix may still be removed.
REMOVED_KE = 1
REMOVED_PENG = 2
REMOVED_DI = 4
REMOVED_MENG = 8
REMOVED_TER = 16
REMOVED_BER = 32
REMOVED_PE = 64
module MorphologicalUtility
include StemmerUtility
# Counts the vowel characters in +word+; callers store the result as the
# word's syllable count.
#
# @param word [String] the word to measure
# @return [Integer] number of characters for which is_vowel? is true
def total_syllables(word)
  word.each_char.count { |character| is_vowel?(character) }
end
# Strips a trailing particle (one of PARTICLE_CHARACTERS) from +word+.
# The syllable counter is initialised from +word+ if it is not set yet.
#
# @param word [String] the word to stem; the removal happens in place
# @return [String] whatever remove_characters_matching_collection returns
def remove_particle(word)
  @number_of_syllables ||= total_syllables(word)
  particles = collection_for(:particle)
  remove_characters_matching_collection(word, particles, :end)
end
# Strips a trailing possessive pronoun (one of
# POSSESSIVE_PRONOUN_CHARACTERS) from +word+. The syllable counter is
# initialised from +word+ if it is not set yet.
#
# @param word [String] the word to stem; the removal happens in place
# @return [String] whatever remove_characters_matching_collection returns
def remove_possessive_pronoun(word)
  @number_of_syllables ||= total_syllables(word)
  pronouns = collection_for(:possessive_pronoun)
  remove_characters_matching_collection(word, pronouns, :end)
end
# Strips a first-order prefix from +word+ in place.
#
# The prefixes that may need a letter substitution are tried first. Only
# when that pass leaves the word untouched are the plain first-order
# prefixes tried.
#
# @param word [String] the word to stem; mutated in place
# @return [String] the word after the attempt
def remove_first_order_prefix(word)
  @number_of_syllables ||= total_syllables(word)
  snapshot = word.dup
  special_prefixes = collection_for(:special_first_order_prefix)
  remove_and_substitute_characters_matching_collection(word, special_prefixes, :start)

  if snapshot == word
    # The substitution pass changed nothing, so fall back to plain removal.
    remove_characters_matching_collection(word, collection_for(:first_order_prefix), :start)
  else
    word
  end
end
# Strips a second-order prefix from +word+ in place.
#
# Three cases are tried in order:
# 1. +word+ is exactly one of SPECIAL_SECOND_ORDER_PREFIX_WORDS: its first
#    three characters are dropped.
# 2. +word+ looks like "be" + consonant + "er" + ...: only "be" is dropped.
# 3. Otherwise the NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS are tried.
#
# REMOVED_BER is OR-ed into @flags (nil is treated as 0) so an earlier flag
# is kept and a zero-initialised @flags is actually updated.
#
# @param word [String] the word to stem; mutated in place
# @return [String] the word after the attempt
def remove_second_order_prefix(word)
  @number_of_syllables ||= total_syllables(word)
  word_size = word.size

  if SPECIAL_SECOND_ORDER_PREFIX_WORDS.include?(word)
    # NOTE(review): 'pelajar' records no flag here although REMOVED_PE
    # exists - confirm whether that is intended.
    @flags = (@flags || 0) | REMOVED_BER if word[0..1] == 'be'
    reduce_syllable
    slice_word_at_position(word, 3, :start)
    return word
  end

  if starts_with?(word, word_size, 'be') && word_size > 4 && !is_vowel?(word[2]) && word[3..4] == 'er'
    @flags = (@flags || 0) | REMOVED_BER
    reduce_syllable
    slice_word_at_position(word, 2, :start)
    return word
  end

  remove_characters_matching_collection(word,
                                        collection_for(:non_special_second_order_prefix),
                                        :start)
end
# Strips one derivational suffix (see SUFFIX_CHARACTERS) from +word+ in
# place.
#
# Words that IrregularWords lists as merely ending in suffix-like
# characters are returned untouched. Each suffix is blocked when any of its
# associated REMOVED_* flags is set in @flags. An unset (nil) @flags is
# treated as "no flags"; a raw `nil & flag` would be false, never equal to
# 0, and would block every suffix.
#
# @param word [String] the word to stem; mutated in place
# @return [String] the word after the attempt
def remove_suffix(word)
  return word if ambiguous_with_suffices_ending_words?(word)

  @number_of_syllables ||= total_syllables(word)
  active_flags = @flags || 0

  SUFFIX_CHARACTERS.each do |character|
    blocking_flags = case character
                     when 'kan'
                       [REMOVED_KE, REMOVED_PENG, REMOVED_PE]
                     when 'an'
                       [REMOVED_DI, REMOVED_MENG, REMOVED_TER]
                     when 'i'
                       [REMOVED_BER, REMOVED_KE, REMOVED_PENG]
                     end

    next unless ends_with?(word, word.size, character)
    next unless blocking_flags.all? { |flag| (active_flags & flag) == 0 }

    reduce_syllable
    slice_word_at_position(word, character.size, :end)
    return word
  end

  word
end
private
# Tells whether +character+ is one of VOWEL_CHARACTERS.
#
# @param character [String, nil] a single character (nil yields false)
# @return [Boolean]
def is_vowel?(character)
  VOWEL_CHARACTERS.any? { |vowel| vowel == character }
end
# Looks up one of the IndonesianStemmer constants by naming convention.
#
# * type 'characters' (default): resolves "<NAME>_CHARACTERS", e.g.
#   collection_for(:particle) => PARTICLE_CHARACTERS.
# * any other type: resolves "<TYPE>_<NAME>", after folding the meN-/peN-
#   prefix variants onto 'meng' / 'peng', e.g.
#   collection_for('mem', 'removed') => REMOVED_MENG.
#
# The constant is resolved explicitly on IndonesianStemmer. A bare
# const_get only exists when self is a Module; on any other receiver it
# raises NoMethodError (a NameError), which the rescue below would silently
# turn into nil for every lookup.
#
# @param name [String, Symbol] collection name or prefix
# @param type [String] 'characters' or a constant-name prefix
# @return [Object, nil] the constant's value, or nil when none exists
def collection_for(name, type = 'characters')
  constant_name =
    if type == 'characters'
      "#{name}_#{type}"
    else
      base_name =
        if %w(meny men mem me).include?(name)
          'meng'
        elsif %w(peny pen pem).include?(name)
          'peng'
        else
          name
        end
      "#{type}_#{base_name}"
    end

  IndonesianStemmer.const_get(constant_name.upcase.to_sym)
rescue NameError
  # No such constant (e.g. no REMOVED_* flag for this prefix).
  nil
end
# Removes the first entry of +collection+ that matches +word+ at
# +position+ and is not ambiguous there.
#
# On a match the corresponding REMOVED_* flag (when one exists) is OR-ed
# into @flags, the syllable counter is decremented and the characters are
# sliced off +word+ in place. A nil @flags is treated as 0. When no flag
# constant exists for the affix, @flags is left untouched.
#
# @param word [String] the word to stem; mutated in place
# @param collection [Array<String>] affixes to try, in order
# @param position [Symbol] :start for prefixes, :end for suffixes
# @return [String] the word after the attempt
def remove_characters_matching_collection(word, collection, position)
  collection.each do |characters|
    next unless match_position_and_not_ambiguous_with_characters?(word, characters, position)
    # 'mem' followed by a vowel is skipped here; a later entry may still match.
    next if characters == 'mem' && is_vowel?(word[characters.size])

    removed_flag = collection_for(characters, 'removed')
    @flags = (@flags || 0) | removed_flag if removed_flag
    reduce_syllable
    slice_word_at_position(word, characters.size, position)
    return word
  end

  word
end
# Cuts +characters_size+ characters off +word+ in place: from the front
# when +position+ is :start, otherwise from the back.
#
# @param word [String] the word to shorten; mutated in place
# @param characters_size [Integer] number of characters to remove
# @param position [Symbol] :start, or anything else for the end
# @return [String, nil] the removed characters (String#slice! result)
def slice_word_at_position(word, characters_size, position)
  offset = position == :start ? 0 : -characters_size
  word.slice!(offset, characters_size)
end
# Removes the first prefix in +collection+ that requires a letter
# substitution, replacing the prefix's last character with the stem's
# restored initial letter before slicing the rest of the prefix away.
#
# On a match the prefix's REMOVED_* flag (when one exists) is OR-ed into
# @flags (nil is treated as 0) and the syllable counter is decremented.
# +word+ is always mutated in place; callers detect a change by comparing
# it against a copy. The word is returned in every case, including when
# nothing matched.
#
# @param word [String] the word to stem; mutated in place
# @param collection [Array<String>] prefixes to try, in order
# @param position [Symbol] passed on to the matching predicate
# @return [String] the word after the attempt
def remove_and_substitute_characters_matching_collection(word, collection, position)
  collection.each do |characters|
    next unless matching_characters_requires_substitution?(word, characters, position)

    removed_flag = collection_for(characters, 'removed')
    @flags = (@flags || 0) | removed_flag if removed_flag
    reduce_syllable
    # Overwrites the prefix's last character in place ...
    substitute_word_character(word, characters)
    # ... so only the characters before it remain to be cut off.
    slice_word_at_position(word, characters.size - 1, :start)
    return word
  end

  word
end
# Checks whether the part of +word+ after the prefix +characters+ begins
# with one of the stems IrregularWords::ON_PREFIX_CHARACTERS lists for
# that prefix.
#
# @param word [String] the full word
# @param characters [String] the prefix being considered
# @return [Boolean, nil] nil when the prefix has no irregular-word list
def contains_irregular_prefix?(word, characters)
  irregular_stems = IrregularWords::ON_PREFIX_CHARACTERS
  return unless irregular_stems.key?(characters)

  remainder = word[characters.size, word.size]
  chopped_word_match_words_collection?(remainder, irregular_stems[characters])
end
# Tells whether +chopped_word+ starts with any entry of +collection+.
#
# @param chopped_word [String] a word with its prefix already cut off
# @param collection [Enumerable<String>] candidate beginnings
# @return [Boolean]
def chopped_word_match_words_collection?(chopped_word, collection)
  collection.each do |candidate|
    return true if starts_with?(chopped_word, chopped_word.size, candidate)
  end
  false
end
# Overwrites the last character of the prefix +characters+ inside +word+
# with the letter chosen for that prefix:
#
#   meny / peny -> 's'
#   men  / pen  -> 'n' when the remainder starts with an entry of
#                  IrregularWords::BEGINS_WITH_N, otherwise 't'
#   meng / peng -> 'k'
#   mem  / pem  -> 'p'
#
# Any other prefix leaves +word+ untouched.
#
# @param word [String] the word to patch; mutated in place
# @param characters [String] the prefix found at the start of +word+
# @return [String] the same +word+ object
def substitute_word_character(word, characters)
  replacement =
    case characters
    when 'meny', 'peny'
      's'
    when 'men', 'pen'
      remainder = word[characters.size, word.size]
      if chopped_word_match_words_collection?(remainder, IrregularWords::BEGINS_WITH_N)
        'n'
      else
        't'
      end
    when 'meng', 'peng'
      'k'
    when 'mem', 'pem'
      'p'
    end

  return word unless replacement

  word[characters.size - 1] = replacement
  word
end
# Tells whether removing +characters+ from +word+ at +position+ would be
# ambiguous according to the IrregularWords lists.
#
# At :start only 'per' can be ambiguous: when what follows the first three
# characters begins with an entry of IrregularWords::BEGINS_WITH_R.
# At any other position the word must start with 'me', 'be' or 'pe' and
# end with one of the entries IrregularWords::ENDS_WITH_COMMON_CHARACTERS
# lists for +characters+.
#
# @param word [String] the word being stemmed
# @param characters [String] the affix about to be removed
# @param position [Symbol] :start or :end
# @return [Boolean]
def ambiguous_with_characters?(word, characters, position)
  if position == :start
    return false unless characters == 'per'

    return chopped_word_match_words_collection?(word[3..-1],
                                                IrregularWords::BEGINS_WITH_R)
  end

  IrregularWords::ENDS_WITH_COMMON_CHARACTERS[characters].each do |ambiguous_word|
    # Tells 'mobilmu' apart from 'berilmu': only words that carry one of
    # these prefixes can be ambiguous.
    return false unless %w(me be pe).include?(word[0, 2])
    # Otherwise the ending decides.
    return true if ends_with?(word, word.size, ambiguous_word)
  end

  false
end
# Tells whether +word+ is listed in
# IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS, i.e. a word whose ending
# must not be treated as a suffix.
#
# @param word [String] the word being stemmed
# @return [Boolean]
def ambiguous_with_suffices_ending_words?(word)
  protected_words = IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS
  protected_words.include?(word)
end
# Tells whether +word+ has +characters+ at +position+ and removing them
# there is not ambiguous.
#
# @param word [String] the word being stemmed
# @param characters [String] the affix to look for
# @param position [Symbol] :start or :end; selects starts_with? / ends_with?
# @return [Boolean]
def match_position_and_not_ambiguous_with_characters?(word, characters, position)
  matcher = "#{position}s_with?"
  position_matches = send(matcher, word, word.size, characters)
  position_matches && !ambiguous_with_characters?(word, characters, position)
end
def match_characters_position_followed_by_vowel?(word, characters, position)
word_size = word.size
characters_size = characters.size
send("#{position}s_with?", word, word_size, characters) &&
word_size > characters_size && is_vowel?(word[characters_size])
end
# Tells whether removing the prefix +characters+ from +word+ must be
# accompanied by a letter substitution: always for the prefixes in
# WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS, otherwise only when the word
# is an irregular one for that prefix.
#
# @param word [String] the word being stemmed
# @param characters [String] the prefix being considered
# @return [Boolean, nil] result of contains_irregular_prefix? in the second case
def substitution_required?(word, characters)
  return true if WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters)

  contains_irregular_prefix?(word, characters)
end
# Tells whether +word+ carries +characters+ at +position+ followed by a
# vowel, and that affix requires a letter substitution on removal.
#
# @param word [String] the word being stemmed
# @param characters [String] the affix being considered
# @param position [Symbol] :start or :end
# @return [Boolean, nil]
def matching_characters_requires_substitution?(word, characters, position)
  vowel_follows = match_characters_position_followed_by_vowel?(word, characters, position)
  vowel_follows && substitution_required?(word, characters)
end
# Lowers the tracked syllable count by one after an affix was removed.
#
# @return [Integer] the new value of @number_of_syllables
def reduce_syllable
  @number_of_syllables = @number_of_syllables - 1
end
end
end