twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/segmentation/word_iterator.rb

Summary

Maintainability
A
3 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'set'

module TwitterCldr
  module Segmentation
    class DeormalizedStringError < StandardError; end

    class WordIterator < SegmentIterator
      DICTIONARY_BREAK_ENGINES = [
        CjBreakEngine,
        KoreanBreakEngine,
        BurmeseBreakEngine,
        KhmerBreakEngine,
        LaoBreakEngine,
        ThaiBreakEngine
      ]

      def each_boundary(str, &block)
        return to_enum(__method__, str) unless block_given?

        # Rather than put a bunch of duplicate logic in
        # each_boundary_helper to make sure we don't yield the same
        # boundary twice, we wrap it in this additional de-duping
        # enumerator and call it a day.
        last_boundary = nil

        each_boundary_helper(str) do |boundary|
          yield boundary if boundary != last_boundary
          last_boundary = boundary
        end
      end

      private

      def each_boundary_helper(str, &block)
        # Set up two independent cursors so the algorithm can iterate
        # over those portions of the input string that require a
        # dictionary-based break iterator independently from those that
        # only need the normal, rule-based break iterator. Cursors
        # hold references to the input text and a list of all the
        # corresponding Unicode codepoints, meaning they are fairly
        # expensive to create. The duplication below should only
        # produce a shallow copy however. The text and codepoint list
        # are not duplicated, but the cursor's integer position can
        # be varied independently.
        dict_cursor = create_cursor(str)
        rule_cursor = dict_cursor.dup

        # implicit start of text boundary
        yield 0

        until dict_cursor.eos? || rule_cursor.eos?
          # We use a regex to identify the beginnings of potential runs
          # of dictionary characters. This regex was benchmarked and
          # found to be pretty fast, but could become a bottleneck if
          # other parts of the algorithm are improved in the future.
          m = dictionary_re.match(dict_cursor.text, dict_cursor.position)
          break unless m

          dict_cursor.position = m.begin(0)
          dict_break_engine = dictionary_break_engine_for(dict_cursor.codepoint)

          # It's possible to encounter a dictionary character that can't
          # be handled by any of the dictionary-based break engines
          # because it's too short to make up an actual word. The
          # break engine will simply yield no breaks in such a case, which
          # we test for below by peeking for the first boundary value and
          # rescuing a StopIteration error. Since the run of dictionary
          # characters may be arbitrarily long, peeking should be more
          # performant than attempting to calculate all the boundary
          # positions for the run at once.
          #
          # It should be noted that, despite our best efforts here in
          # WordIterator, certain dictionary-based break engines (eg.
          # CjBreakEngine) cannot yield word boundaries without first
          # examining the entire run of dictionary characters. In practice
          # this shouldn't be too big an issue, since Chinese text often
          # contains punctuation that should limit the average run length.
          dict_enum = dict_break_engine.each_boundary(dict_cursor)

          dict_boundary = begin
            dict_enum.peek
          rescue StopIteration
            nil
          end

          # If a dictionary boundary was found, attempt to use the rule-based
          # break iterator to find breaks in the text immediately before it.
          # Otherwise, since none of the dictionary-based break engines could
          # find any boundaries in the current run, we advance the dictionary
          # cursor in an attempt to find the next dictionary boundary. Doing
          # so effectively causes the algorithm to fall back to the rule-based
          # break engine.
          if dict_boundary
            # Only use the rule-based break engine if there are characters to
            # process.
            if rule_cursor.position < m.begin(0)
              rule_set.each_boundary(rule_cursor, m.begin(0), &block)
            end

            # Yield all the dictionary breaks from the enum. We can't use .each
            # here because that will restart the iteration. Ruby's loop
            # construct automatically rescues StopIteration.
            loop do
              yield dict_enum.next
            end

            # We've reached the end of a dictionary character run, so yield
            # the end of text boundary.
            yield dict_cursor.position

            # These should be the same after a successful dictionary run, i.e.
            # they should both be positioned at the end of the current rule-based
            # and dictionary-based portions of the run, ready for the next one.
            rule_cursor.position = dict_cursor.position
          else
            dict_cursor.advance
          end
        end

        # Find boundaries in the straggler, non-dictionary run at the end of
        # the input text.
        unless rule_cursor.eos?
          rule_set.each_boundary(rule_cursor, &block)
        end

        # implicit end of text boundary
        yield rule_cursor.length
      end

      # all dictionary characters, i.e. characters that must be handled
      # by one of the dictionary-based break engines
      def dictionary_set
        @@dictionary_set ||= Set.new.tap do |set|
          DICTIONARY_BREAK_ENGINES.each do |break_engine|
            set.merge(break_engine.word_set)
          end
        end
      end

      def dictionary_break_engine_for(codepoint)
        codepoint_to_engine_cache[codepoint] ||= begin
          engine = DICTIONARY_BREAK_ENGINES.find do |break_engine|
            break_engine.word_set.include?(codepoint)
          end

          (engine || UnhandledBreakEngine).instance
        end
      end

      def dictionary_re
        @@dictionary_re ||= begin
          ranges = TwitterCldr::Utils::RangeSet.from_array(dictionary_set).ranges.map do |r|
            "\\u{#{r.first.to_s(16)}}-\\u{#{r.last.to_s(16)}}"
          end

          /[#{ranges.join}]/
        end
      end

      def codepoint_to_engine_cache
        @@codepoint_to_engine_cache ||= {}
      end
    end
  end
end