twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/segmentation/brahmic_break_engine.rb

Summary

Maintainability
C
1 day
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Segmentation
    # Base class break engine for languages derived from the Brahmic script,
    # i.e. Lao, Thai, Khmer, and Burmese.
    #
    # This class is based on duplicated code found in ICU's BurmeseBreakEngine
    # and friends, which all make use of the same break logic.
    class BrahmicBreakEngine < DictionaryBreakEngine

      # ICU keeps track of all these variables inline, but since we've done a
      # bit of method separating (see below), it's too ugly to pass all of
      # them around as arguments. Instead we encapsulate them all in this
      # handy state object.
      class EngineState
        attr_accessor :current
        attr_reader :words
        attr_accessor :words_found, :word_length

        def initialize(options = {})
          @current = options.fetch(:current, 0)
          @words = options.fetch(:words)
          @words_found = options.fetch(:words_found, 0)
          @word_length = options.fetch(:word_length, 0)
        end
      end

      attr_reader :lookahead, :root_combine_threshold
      attr_reader :prefix_combine_threshold, :min_word, :min_word_span
      attr_reader :word_set, :mark_set, :end_word_set, :begin_word_set
      attr_reader :dictionary, :advance_past_suffix

      def initialize(options = {})
        @lookahead = options.fetch(:lookahead)
        @root_combine_threshold = options.fetch(:root_combine_threshold)
        @prefix_combine_threshold = options.fetch(:prefix_combine_threshold)
        @min_word = options.fetch(:min_word)
        @min_word_span = options.fetch(:min_word_span)

        @word_set = options.fetch(:word_set)
        @mark_set = options.fetch(:mark_set)
        @end_word_set = options.fetch(:end_word_set)
        @begin_word_set = options.fetch(:begin_word_set)

        @dictionary = options.fetch(:dictionary)
        @advance_past_suffix = options.fetch(:advance_past_suffix)
      end

      private

      # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java#L88
      def divide_up_dictionary_range(cursor, end_pos)
        return to_enum(__method__, cursor, end_pos) unless block_given?
        return if (end_pos - cursor.position) < min_word_span

        state = EngineState.new(
          cursor: cursor,
          end_pos: end_pos,
          words: PossibleWordList.new(lookahead)
        )

        while cursor.position < end_pos
          state.current = cursor.position
          state.word_length = 0

          # look for candidate words at the current position
          candidates = state.words[state.words_found].candidates(
            cursor, dictionary, end_pos
          )

          # if we found exactly one, use that
          if candidates == 1
            state.word_length = state.words[state.words_found].accept_marked(cursor)
            state.words_found += 1
          elsif candidates > 1
            mark_best_candidate(cursor, end_pos, state)
            state.word_length = state.words[state.words_found].accept_marked(cursor)
            state.words_found += 1
          end

          # We come here after having either found a word or not. We look ahead to the
          # next word. If it's not a dictionary word, we will combine it with the word we
          # just found (if there is one), but only if the preceding word does not exceed
          # the threshold. The cursor should now be positioned at the end of the word we
          # found.
          if cursor.position < end_pos && state.word_length < root_combine_threshold
            # If it is a dictionary word, do nothing. If it isn't, then if there is
            # no preceding word, or the non-word shares less than the minimum threshold
            # of characters with a dictionary word, then scan to resynchronize.
            preceeding_words = state.words[state.words_found].candidates(
              cursor, dictionary, end_pos
            )

            if preceeding_words <= 0 && (state.word_length == 0 || state.words[state.words_found].longest_prefix < prefix_combine_threshold)
              advance_to_plausible_word_boundary(cursor, end_pos, state)
            else
              # backup to where we were for next iteration
              cursor.position = state.current + state.word_length
            end
          end

          # never stop before a combining mark.
          while cursor.position < end_pos && mark_set.include?(cursor.codepoint)
            cursor.advance
            state.word_length += 1
          end

          # Look ahead for possible suffixes if a dictionary word does not follow.
          # We do this in code rather than using a rule so that the heuristic
          # resynch continues to function. For example, one of the suffix characters
          # could be a typo in the middle of a word.
          state.word_length += advance_past_suffix.call(
            cursor, end_pos, state
          )

          # Did we find a word on this iteration? If so, yield it as a boundary.
          if state.word_length > 0
            yield state.current + state.word_length
          end
        end
      end

      private

      # In ICU, this method is part of divide_up_dictionary_range. Extracted here
      # for readability.
      def advance_to_plausible_word_boundary(cursor, end_pos, state)
        remaining = end_pos - (state.current + state.word_length)
        pc = cursor.codepoint
        chars = 0

        loop do
          cursor.advance
          uc = cursor.codepoint
          chars += 1
          remaining -= 1

          break if remaining <= 0

          if end_word_set.include?(pc) && begin_word_set.include?(uc)
            # Maybe. See if it's in the dictionary.
            candidate = state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos)
            cursor.position = state.current + state.word_length + chars
            break if candidate > 0
          end

          pc = uc
        end

        # bump the word count if there wasn't already one
        state.words_found += 1 if state.word_length <= 0

        # update the length with the passed-over characters
        state.word_length += chars
      end

      def mark_best_candidate(cursor, end_pos, state)
        # if there was more than one, see which one can take us forward the most words
        found_best = false

        # if we're already at the end of the range, we're done
        if cursor.position < end_pos
          loop do
            words_matched = 1

            if state.words[state.words_found + 1].candidates(cursor, dictionary, end_pos) > 0
              if words_matched < 2
                # followed by another dictionary word; mark first word as a good candidate
                state.words[state.words_found].mark_current
                words_matched = 2
              end

              # if we're already at the end of the range, we're done
              break if cursor.position >= end_pos

              # see if any of the possible second words is followed by a third word
              loop do
                # if we find a third word, stop right away
                if state.words[state.words_found + 2].candidates(cursor, dictionary, end_pos) > 0
                  state.words[state.words_found].mark_current
                  found_best = true
                  break
                end

                break unless state.words[state.words_found + 1].back_up(cursor)
              end
            end

            break unless state.words[state.words_found].back_up(cursor) && !found_best
          end
        end
      end

    end
  end
end