twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/segmentation/thai_break_engine.rb

Summary

Maintainability
B
4 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'singleton'
require 'forwardable'

module TwitterCldr
  module Segmentation

    # Break engine for Thai text. It configures a BrahmicBreakEngine with
    # Thai-specific character sets, thresholds and suffix handling, and forwards
    # boundary iteration to it.
    #
    # See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java
    class ThaiBreakEngine

      include Singleton

      # elision character
      THAI_PAIYANNOI = 0x0E2F

      # repeat character
      THAI_MAIYAMOK = 0x0E46

      # Returns the characters this engine handles: the result of applying the
      # pattern '[[:Thai:]&[:Line_Break=SA:]]' to a UnicodeSet, converted with
      # #to_set. The value is computed once and cached on the class.
      def self.word_set
        @word_set ||= begin
          uset = TwitterCldr::Shared::UnicodeSet.new
          uset.apply_pattern('[[:Thai:]&[:Line_Break=SA:]]')
          uset.to_set
        end
      end

      # Forwards every argument and the optional block to the underlying
      # engine's #each_boundary and returns whatever it returns.
      #
      # Defined explicitly (rather than through Forwardable) because the
      # `engine` accessor is private.
      def each_boundary(*args, &block)
        engine.each_boundary(*args, &block)
      end

      private

      # Lazily builds, and then reuses, the BrahmicBreakEngine configured with
      # the Thai-specific parameters.
      def engine
        @engine ||= BrahmicBreakEngine.new(
          # How many words in a row are "good enough"?
          lookahead: 3,

          # Will not combine a non-word with a preceding dictionary word longer than this
          root_combine_threshold: 3,

          # Will not combine a non-word that shares at least this much prefix with a
          # dictionary word with a preceding word
          prefix_combine_threshold: 3,

          # Minimum word size
          min_word: 2,

          # Minimum number of characters for two words (min_word * 2)
          min_word_span: 4,

          word_set: self.class.word_set,
          mark_set: mark_set,
          end_word_set: end_word_set,
          begin_word_set: begin_word_set,
          dictionary: Dictionary.thai,

          # Wrapped in a lambda so the engine can call back into this
          # object's private suffix handling.
          advance_past_suffix: ->(*args) { advance_past_suffix(*args) }
        )
      end

      # Looks at the character under the cursor and, when no dictionary word
      # starts there, consumes a trailing PAIYANNOI and/or MAIYAMOK.
      #
      # cursor  - object responding to #position, #position=, #codepoint,
      #           #previous and #advance.
      # end_pos - position at which the range being processed ends.
      # state   - object responding to #word_length, #words, #words_found
      #           and #current.
      #
      # Returns the number of suffix characters consumed (0, 1 or 2).
      def advance_past_suffix(cursor, end_pos, state)
        # Nothing to do at the end of the range or when no word was found.
        return 0 if cursor.position >= end_pos || state.word_length <= 0

        suffix_length = 0
        uc = cursor.codepoint

        candidates = state.words[state.words_found].candidates(
          cursor, engine.dictionary, end_pos
        )

        if candidates <= 0 && suffix_set.include?(uc)
          # NOTE(review): the branches below assume cursor.previous steps the
          # cursor back one position and returns the code point found there
          # (as ICU's iterator does), which is why each branch advances
          # afterwards - confirm against the Cursor implementation.
          if uc == THAI_PAIYANNOI
            if suffix_set.include?(cursor.previous)
              # preceded by another suffix character: restore prior position
              cursor.advance
            else
              # skip over previous end and PAIYANNOI
              cursor.advance(2)
              suffix_length += 1
              uc = cursor.codepoint
            end
          end

          if uc == THAI_MAIYAMOK
            if cursor.previous == THAI_MAIYAMOK
              # repeated MAIYAMOK: restore prior position
              cursor.advance
            else
              # skip over previous end and MAIYAMOK
              cursor.advance(2)
              suffix_length += 1
            end
          end
        else
          # A dictionary word follows (or there is no suffix character here):
          # place the cursor right after the word that was just found.
          cursor.position = state.current + state.word_length
        end

        suffix_length
      end

      # Thai combining marks (pattern '[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
      # plus U+0020.
      def mark_set
        @mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
          set.apply_pattern('[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
          set.add(0x0020)
        end
      end

      # Every character in the word set except the ones removed below.
      def end_word_set
        @end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
          set.add_list(self.class.word_set)
          set.subtract(0x0E31)  # MAI HAN-AKAT
          set.subtract_range(0x0E40..0x0E44)  # SARA E through SARA AI MAIMALAI
        end
      end

      # The two code point ranges added below.
      def begin_word_set
        @begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
          set.add_range(0x0E01..0x0E2E)  # KO KAI through HO NOKHUK
          set.add_range(0x0E40..0x0E44)  # SARA E through SARA AI MAIMALAI
        end
      end

      # The suffix characters handled by #advance_past_suffix.
      def suffix_set
        @suffix_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
          set.add(THAI_PAIYANNOI)
          set.add(THAI_MAIYAMOK)
        end
      end

    end
  end
end