lib/twitter_cldr/segmentation/brahmic_break_engine.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
module TwitterCldr
module Segmentation
# Base class break engine for languages derived from the Brahmic script,
# i.e. Lao, Thai, Khmer, and Burmese.
#
# This class is based on duplicated code found in ICU's BurmeseBreakEngine
# and friends, which all make use of the same break logic.
class BrahmicBreakEngine < DictionaryBreakEngine
# ICU keeps track of all these variables inline, but since we've done a
# bit of method separating (see below), it's too ugly to pass all of
# them around as arguments. Instead we encapsulate them all in this
# handy state object.
class EngineState
attr_accessor :current
attr_reader :words
attr_accessor :words_found, :word_length
def initialize(options = {})
@current = options.fetch(:current, 0)
@words = options.fetch(:words)
@words_found = options.fetch(:words_found, 0)
@word_length = options.fetch(:word_length, 0)
end
end
attr_reader :lookahead, :root_combine_threshold
attr_reader :prefix_combine_threshold, :min_word, :min_word_span
attr_reader :word_set, :mark_set, :end_word_set, :begin_word_set
attr_reader :dictionary, :advance_past_suffix
def initialize(options = {})
@lookahead = options.fetch(:lookahead)
@root_combine_threshold = options.fetch(:root_combine_threshold)
@prefix_combine_threshold = options.fetch(:prefix_combine_threshold)
@min_word = options.fetch(:min_word)
@min_word_span = options.fetch(:min_word_span)
@word_set = options.fetch(:word_set)
@mark_set = options.fetch(:mark_set)
@end_word_set = options.fetch(:end_word_set)
@begin_word_set = options.fetch(:begin_word_set)
@dictionary = options.fetch(:dictionary)
@advance_past_suffix = options.fetch(:advance_past_suffix)
end
private
# See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java#L88
def divide_up_dictionary_range(cursor, end_pos)
return to_enum(__method__, cursor, end_pos) unless block_given?
return if (end_pos - cursor.position) < min_word_span
state = EngineState.new(
cursor: cursor,
end_pos: end_pos,
words: PossibleWordList.new(lookahead)
)
while cursor.position < end_pos
state.current = cursor.position
state.word_length = 0
# look for candidate words at the current position
candidates = state.words[state.words_found].candidates(
cursor, dictionary, end_pos
)
# if we found exactly one, use that
if candidates == 1
state.word_length = state.words[state.words_found].accept_marked(cursor)
state.words_found += 1
elsif candidates > 1
mark_best_candidate(cursor, end_pos, state)
state.word_length = state.words[state.words_found].accept_marked(cursor)
state.words_found += 1
end
# We come here after having either found a word or not. We look ahead to the
# next word. If it's not a dictionary word, we will combine it with the word we
# just found (if there is one), but only if the preceding word does not exceed
# the threshold. The cursor should now be positioned at the end of the word we
# found.
if cursor.position < end_pos && state.word_length < root_combine_threshold
# If it is a dictionary word, do nothing. If it isn't, then if there is
# no preceding word, or the non-word shares less than the minimum threshold
# of characters with a dictionary word, then scan to resynchronize.
preceeding_words = state.words[state.words_found].candidates(
cursor, dictionary, end_pos
)
if preceeding_words <= 0 && (state.word_length == 0 || state.words[state.words_found].longest_prefix < prefix_combine_threshold)
advance_to_plausible_word_boundary(cursor, end_pos, state)
else
# backup to where we were for next iteration
cursor.position = state.current + state.word_length
end
end
# never stop before a combining mark.
while cursor.position < end_pos && mark_set.include?(cursor.codepoint)
cursor.advance
state.word_length += 1
end
# Look ahead for possible suffixes if a dictionary word does not follow.
# We do this in code rather than using a rule so that the heuristic
# resynch continues to function. For example, one of the suffix characters
# could be a typo in the middle of a word.
state.word_length += advance_past_suffix.call(
cursor, end_pos, state
)
# Did we find a word on this iteration? If so, yield it as a boundary.
if state.word_length > 0
yield state.current + state.word_length
end
end
end
private
# In ICU, this method is part of divide_up_dictionary_range. Extracted here
# for readability.
def advance_to_plausible_word_boundary(cursor, end_pos, state)
remaining = end_pos - (state.current + state.word_length)
pc = cursor.codepoint
chars = 0
loop do
cursor.advance
uc = cursor.codepoint
chars += 1
remaining -= 1
break if remaining <= 0
if end_word_set.include?(pc) && begin_word_set.include?(uc)
# Maybe. See if it's in the dictionary.
candidate = state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos)
cursor.position = state.current + state.word_length + chars
break if candidate > 0
end
pc = uc
end
# bump the word count if there wasn't already one
state.words_found += 1 if state.word_length <= 0
# update the length with the passed-over characters
state.word_length += chars
end
def mark_best_candidate(cursor, end_pos, state)
# if there was more than one, see which one can take us forward the most words
found_best = false
# if we're already at the end of the range, we're done
if cursor.position < end_pos
loop do
words_matched = 1
if state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos) > 0
if words_matched < 2
# followed by another dictionary word; mark first word as a good candidate
state.words[state.words_found].mark_current
words_matched = 2
end
# if we're already at the end of the range, we're done
break if cursor.position >= end_pos
# see if any of the possible second words is followed by a third word
loop do
# if we find a third word, stop right away
if state.words[state.words_found + 2].candidates(cursor, dictionary, end_pos) > 0
state.words[state.words_found].mark_current
found_best = true
break
end
break unless state.words[state.words_found + 1].back_up(cursor)
end
end
break unless state.words[state.words_found].back_up(cursor) && !found_best
end
end
end
end
end
end