twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/segmentation/possible_word.rb

Summary

Maintainability
A
0 mins
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Segmentation
    class PossibleWord
      # list size, limited by the maximum number of words in the dictionary
      # that form a nested sequence.
      POSSIBLE_WORD_LIST_MAX = 20

      def initialize
        @lengths = []
        @count = nil
        @offset = -1
      end

      # fill the list of candidates if needed, select the longest, and return the number found
      def candidates(cursor, dictionary, end_pos)
        start = cursor.position

        if start != @offset
          @offset = start
          @count, _, @lengths, @prefix = dictionary.matches(
            cursor, end_pos - start, POSSIBLE_WORD_LIST_MAX
          )

          # dictionary leaves text after longest prefix, not longest word, so back up.
          if @count <= 0
            cursor.position = start
          end
        end

        if @count > 0
          cursor.position = start + @lengths[@count - 1]
        end

        @current = @count - 1
        @mark = @current

        return @count
      end

      # select the currently marked candidate, point after it in the text, and invalidate self
      def accept_marked(cursor)
        cursor.position = @offset + @lengths[@mark]
        @lengths[@mark]
      end

      # back up from the current candidate to the next shorter one; return true if that exists
      # and point the text after it
      def back_up(cursor)
        if @current > 0
          @current -= 1
          cursor.position = @offset + @lengths[@current]
          return true
        end

        false
      end

      # return the longest prefix this candidate location shares with a dictionary word
      def longest_prefix
        @prefix
      end

      # mark the current candidate as the one we like
      def mark_current
        @mark = @current
      end
    end
  end
end