twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/segmentation/cj_break_engine.rb

Summary

Maintainability
C
7 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'singleton'

module TwitterCldr
  module Segmentation
    class CjBreakEngine < DictionaryBreakEngine

      include Singleton

      # magic number pulled from ICU's source code, presumably slightly longer
      # than the longest Chinese/Japanese/Korean word
      MAX_WORD_SIZE = 20

      # magic number pulled from ICU's source code
      MAX_SNLP = 255

      # the equivalent of Java's Integer.MAX_VALUE
      LARGE_NUMBER = 0xFFFFFFFF

      MAX_KATAKANA_LENGTH = 8
      MAX_KATAKANA_GROUP_LENGTH = 20
      KATAKANA_COSTS = [8192, 984, 408, 240, 204, 252, 300, 372, 480].freeze
      MAX_KATAKANA_COST = 8192

      def self.word_set
        @word_set ||= begin
          uset = TwitterCldr::Shared::UnicodeSet.new
          uset.apply_pattern('[:Han:]')
          uset.apply_pattern('[[:Katakana:]\uff9e\uff9f]')
          uset.apply_pattern('[:Hiragana:]')
          uset.add(0xFF70)  # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
          uset.add(0x30FC)  # KATAKANA-HIRAGANA PROLONGED SOUND MARK
          uset.to_set
        end
      end

      private

      def word_set
        self.class.word_set
      end

      def divide_up_dictionary_range(cursor, end_pos, &block)
        return to_enum(__method__, cursor, end_pos) unless block_given?

        input_length = end_pos - cursor.position
        best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
        prev = Array.new(input_length + 1) { -1 }

        best_snlp[0] = 0
        start_pos = cursor.position
        is_prev_katakana = false

        until cursor.position >= end_pos
          idx = cursor.position - start_pos

          if best_snlp[idx] == LARGE_NUMBER
            cursor.advance
            next
          end

          max_search_length = if cursor.position + MAX_WORD_SIZE < end_pos
            MAX_WORD_SIZE
          else
            end_pos - cursor.position
          end

          count, values, lengths, _ = dictionary.matches(
            cursor, max_search_length, max_search_length
          )

          if (count == 0 || lengths[0] != 1) && !hangul_word_set.include?(cursor.codepoint)
            values[count] = MAX_SNLP
            lengths[count] = 1
            count += 1
          end

          count.times do |j|
            new_snlp = best_snlp[idx] + values[j]

            if new_snlp < best_snlp[lengths[j] + idx]
              best_snlp[lengths[j] + idx] = new_snlp
              prev[lengths[j] + idx] = idx
            end
          end

          # In Japanese, single-character Katakana words are pretty rare.
          # Accordingly, we apply the following heuristic: any continuous
          # run of Katakana characters is considered a candidate word with
          # a default cost specified in the katakanaCost table according
          # to its length.
          is_katakana = is_katakana?(cursor.codepoint)

          if !is_prev_katakana && is_katakana
            j = idx + 1
            cursor.advance

            while cursor.position < end_pos && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
              cursor.advance
              j += 1
            end

            if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
              new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)

              if new_snlp < best_snlp[j]
                best_snlp[j] = new_snlp
                prev[j] = idx
              end
            end
          end

          is_prev_katakana = is_katakana

          cursor.advance
        end

        t_boundary = []

        if best_snlp[input_length] == LARGE_NUMBER
          t_boundary << end_pos
        else
          idx = end_pos - start_pos

          while idx > 0
            t_boundary << idx + start_pos
            idx = prev[idx]
          end
        end

        t_boundary.reverse_each(&block)
      end

      private

      def hangul_word_set
        @@hangul_word_set ||= KoreanBreakEngine.word_set
      end

      def is_katakana?(codepoint)
        (codepoint >= 0x30A1 && codepoint <= 0x30FE && codepoint != 0x30FB) ||
          (codepoint >= 0xFF66 && codepoint <= 0xFF9F)
      end

      def get_katakana_cost(word_length)
        if word_length > MAX_KATAKANA_LENGTH
          MAX_KATAKANA_COST
        else
          KATAKANA_COSTS[word_length]
        end
      end

      def dictionary
        @dictionary ||= Dictionary.cj
      end

    end
  end
end