lib/twitter_cldr/segmentation/thai_break_engine.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
require 'singleton'
require 'forwardable'
module TwitterCldr
module Segmentation
# See: https://github.com/unicode-org/icu/blob/release-65-1/icu4j/main/classes/core/src/com/ibm/icu/text/ThaiBreakEngine.java
class ThaiBreakEngine
include Singleton
extend Forwardable
def_delegators :engine, :each_boundary
def self.word_set
@word_set ||= begin
uset = TwitterCldr::Shared::UnicodeSet.new
uset.apply_pattern('[[:Thai:]&[:Line_Break=SA:]]')
uset.to_set
end
end
# ellision character
THAI_PAIYANNOI = 0x0E2F
# repeat character
THAI_MAIYAMOK = 0x0E46
def each_boundary(*args, &block)
engine.each_boundary(*args, &block)
end
private
def engine
@engine ||= BrahmicBreakEngine.new(
# How many words in a row are "good enough"?
lookahead: 3,
# Will not combine a non-word with a preceding dictionary word longer than this
root_combine_threshold: 3,
# Will not combine a non-word that shares at least this much prefix with a
# dictionary word with a preceding word
prefix_combine_threshold: 3,
# Minimum word size
min_word: 2,
# Minimum number of characters for two words (min_word * 2)
min_word_span: 4,
word_set: self.class.word_set,
mark_set: mark_set,
end_word_set: end_word_set,
begin_word_set: begin_word_set,
dictionary: Dictionary.thai,
advance_past_suffix: -> (*args) do
advance_past_suffix(*args)
end
)
end
def advance_past_suffix(cursor, end_pos, state)
suffix_length = 0
if cursor.position < end_pos && state.word_length > 0
uc = cursor.codepoint
candidates = state.words[state.words_found].candidates(
cursor, engine.dictionary, end_pos
)
if candidates <= 0 && suffix_set.include?(uc)
if uc == THAI_PAIYANNOI
unless suffix_set.include?(cursor.previous)
# skip over previous end and PAIYANNOI
cursor.advance(2)
suffix_length += 1
uc = cursor.codepoint
else
# restore prior position
cursor.advance
end
end
if uc == THAI_MAIYAMOK
if cursor.previous != THAI_MAIYAMOK
# skip over previous end and MAIYAMOK
cursor.advance(2)
suffix_length += 1
else
# restore prior position
cursor.advance
end
end
else
cursor.position = state.current + state.word_length
end
end
suffix_length
end
def mark_set
@mark_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
set.apply_pattern('[[:Thai:]&[:Line_Break=SA:]&[:M:]]')
set.add(0x0020)
end
end
def end_word_set
@end_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
set.add_list(self.class.word_set)
set.subtract(0x0E31) # MAI HAN-AKAT
set.subtract_range(0x0E40..0x0E44) # SARA E through SARA AI MAIMALAI
end
end
def begin_word_set
@begin_word_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
set.add_range(0x0E01..0x0E2E) # KO KAI through HO NOKHUK
set.add_range(0x0E40..0x0E44) # SARA E through SARA AI MAIMALAI
end
end
def suffix_set
@suffix_set ||= TwitterCldr::Shared::UnicodeSet.new.tap do |set|
set.add(THAI_PAIYANNOI)
set.add(THAI_MAIYAMOK)
end
end
end
end
end