twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/collation/collator.rb

Summary

Maintainability
A
1 hr
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Collation

    class UnexpectedCodePointError < StandardError; end

    # Collator uses fractional collation elements table form CLDR to generate sort keys for Unicode strings as well as
    # compare and sort such strings by generated sort keys.
    #
    class Collator

      attr_accessor :locale

      def initialize(locale = nil)
        @locale  = TwitterCldr.convert_locale(locale) if locale
        @options = tailoring_options
        @trie    = load_trie
      end

      def sort(strings)
        strings.map{ |s| [s, get_sort_key(s)] }.sort{ |a, b| a[1] <=> b[1] }.map(&:first)
      end

      def sort!(strings)
        sort_keys = Hash.new { |hash, string| hash[string] = get_sort_key(string) }
        strings.replace(strings.sort_by { |s| sort_keys[s] })
      end

      def compare(string_a, string_b)
        string_a == string_b ? 0 : get_sort_key(string_a) <=> get_sort_key(string_b)
      end

      # Second arg options, supports an option :maximum_level, to
      # pass on to SortKeyBuilder :maximum_level.
      def get_sort_key(string_or_code_points, method_options = {})
        TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), case_first: @options[:case_first], maximum_level: method_options[:maximum_level])
      end

      def get_collation_elements(string_or_code_points)
        code_points = get_normalized_code_points(string_or_code_points)

        result = []
        result.concat(code_point_collation_elements(code_points)) until code_points.empty?
        result
      end

      private

      def tailoring_options
        @locale ? TwitterCldr::Collation::TrieBuilder.tailoring_data(@locale)[:collator_options] : {}
      end

      def load_trie
        @locale ? self.class.tailored_trie(@locale) : self.class.default_trie
      end

      def get_normalized_code_points(str_or_code_points)
        str = if str_or_code_points.is_a?(String)
          str_or_code_points
        else
          TwitterCldr::Utils::CodePoints.to_string(str_or_code_points)
        end

        # Normalization makes the collation process significantly slower (like seven times slower on the UCA
        # non-ignorable test from CollationTest_NON_IGNORABLE.txt). ICU uses some optimizations to apply normalization
        # only in special, rare cases. We need to investigate possible solutions and do normalization cleverly too.
        TwitterCldr::Utils::CodePoints.from_string(
          TwitterCldr::Normalization.normalize(str)
        )
      rescue ArgumentError
        # Meant to rescue encoding errors. Some tests do not provide valid UTF-8 sequences.
        TwitterCldr::Utils::CodePoints.from_string(str)
      end

      # Returns the first sequence of fractional collation elements for an array of integer code points. Returned value
      # is an array of well formed (including weights for all significant levels) integer arrays.
      #
      # NOTE (side-effect): all used code points are removed from the input array.
      #
      def code_point_collation_elements(code_points)
        explicit_collation_elements(code_points) || implicit_collation_elements(code_points)
      end

      # Tries to build explicit collation elements array for the longest code points prefix in the given sequence. When
      # possible, combines this prefix with (not necessarily subsequent) non-starters that follow it in the sequence.
      # That's necessary because canonical ordering (that is performed during normalization) can break contractions
      # that existed in the original, de-normalized string.
      #
      # NOTE (side-effect): all used code points are removed from the input array.
      #
      # For more information see section '4.2 Produce Array' of the main algorithm at http://www.unicode.org/reports/tr10/#Main_Algorithm
      #
      def explicit_collation_elements(code_points)
        # find the longest prefix in the trie
        collation_elements, prefix_size, suffixes = @trie.find_prefix(code_points)

        return unless collation_elements

        # remove prefix from the code points sequence
        code_points.shift(prefix_size)

        non_starter_pos = 0

        used_combining_classes = {}

        while non_starter_pos < code_points.size && !suffixes.empty?
          # get next code point (possibly non-starter)
          non_starter_code_point = code_points[non_starter_pos]
          non_starter_metadata   = TwitterCldr::Shared::CodePoint.get(non_starter_code_point)

          unless non_starter_metadata
            raise UnexpectedCodePointError, "'#{non_starter_code_point}' does not appear to be a valid Unicode code point"
          end

          combining_class = non_starter_metadata.combining_class.to_i

          # code point is a starter or combining class has been already used (non-starter is 'blocked' from the prefix)
          break if combining_class == 0 || used_combining_classes[combining_class]

          used_combining_classes[combining_class] = true

          # Try to find collation elements for [prefix + non-starter] code points sequence. As the subtrie contains
          # suffixes (without prefix) we pass only non-starter itself.
          new_collation_elements, _, new_suffixes = suffixes.find_prefix([non_starter_code_point])

          if new_collation_elements
            # non-starter with a collation elements sequence corresponding to [prefix + non-starter] accepted
            collation_elements = new_collation_elements
            suffixes           = new_suffixes

            # Remove non-starter from its position in the sequence. Then we can move further from the same position.
            code_points.delete_at(non_starter_pos)
          else
            # move to the next code point
            non_starter_pos += 1
          end
        end

        collation_elements
      end

      def implicit_collation_elements(code_points)
        TwitterCldr::Collation::ImplicitCollationElements.for_code_point(code_points.shift)
      end

      class << self

        # Loads and memoizes the default fractional collation elements trie.
        #
        def default_trie
          @default_trie ||= TwitterCldr::Collation::TrieLoader.load_default_trie.lock
        end

        def tailored_trie(locale)
          tailored_tries_cache[locale]
        end

        private

        def tailored_tries_cache
          @tailored_tries_cache ||= Hash.new { |hash, locale| hash[locale] = load_tailored_trie(locale) }
        end

        def load_tailored_trie(locale)
          TwitterCldr::Collation::TrieLoader.load_tailored_trie(locale, default_trie).lock
        end

      end

    end

  end
end