twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/collation/sort_key_builder.rb

Summary

Maintainability
B
6 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Collation

    # SortKeyBuilder builds a collation sort key from an array of collation elements.
    #
    # Weights compression algorithms for every level are described in
    # http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
    #
    class SortKeyBuilder

      PRIMARY_LEVEL, SECONDARY_LEVEL, TERTIARY_LEVEL = 0, 1, 2

      LEVEL_SEPARATOR = 1 # separate levels in a sort key '01' bytes

      VALID_CASE_FIRST_OPTIONS    = [nil, :lower, :upper]
      VALID_MAXIMUM_LEVEL_OPTIONS = [nil, 1, 2, 3]

      attr_reader :collation_elements, :case_first

      # Returns a sort key as an array of bytes.
      #
      # Arguments:
      #
      #   collation_elements - an array of collation elements, represented as arrays of integer weights.
      #   options            - hash of options:
      #     case_first       - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
      #     maximum_level    - only append weights to maximum level specified (1 or 2), can be useful for searching/matching applications
      #
      # An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one
      # method into another while forming the sort key.
      #
      def self.build(collation_elements, options = nil)
        new(collation_elements, options).bytes_array
      end

      # Arguments:
      #
      #   collation_elements - an array of collation elements, represented as arrays of integer weights.
      #   options            - hash of options:
      #     case_first       - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
      #     maximum_level    - only append weights to maximum level specified (1 or 2), can be useful for searching/matching applications
      #
      def initialize(collation_elements, options = {})
        raise ArgumentError, "second argument should be an options hash, not `#{options}`. Do you mean `:case_first => #{options}`?" unless options.kind_of? Hash

        case_first = options[:case_first]
        raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)

        maximum_level = options[:maximum_level]
        raise ArgumentError, "invalid maximum_level option 'options[:maximum_level]'" unless VALID_MAXIMUM_LEVEL_OPTIONS.include?(maximum_level)

        @collation_elements = collation_elements
        @case_first         = case_first
        @maximum_level      = maximum_level

        init_tertiary_constants
      end

      def bytes_array
        @bytes_array ||= build_bytes_array
      end

      private

      def build_bytes_array
        @bytes_array = []

        append_primary_bytes
        append_secondary_bytes unless @maximum_level && (@maximum_level < 2)
        append_tertiary_bytes  unless @maximum_level && (@maximum_level < 3)

        @bytes_array
      end

      def append_primary_bytes
        @last_leading_byte = nil

        @collation_elements.each do |collation_element|
          bytes = integer_to_bytes_array(level_weight(collation_element, PRIMARY_LEVEL))

          unless bytes.empty?
            leading_byte = bytes.shift

            if leading_byte != @last_leading_byte
              @bytes_array << (leading_byte < @last_leading_byte ? PRIMARY_BYTE_MIN : PRIMARY_BYTE_MAX) if @last_leading_byte
              @bytes_array << leading_byte

              @last_leading_byte = !bytes.empty? && compressible_primary?(leading_byte) ? leading_byte : nil
            end

            @bytes_array.concat(bytes)
          end
        end
      end

      def compressible_primary?(leading_byte)
        (MIN_NON_LATIN_PRIMARY..MAX_REGULAR_PRIMARY).include?(leading_byte)
      end

      def append_secondary_bytes
        @bytes_array << LEVEL_SEPARATOR

        @common_count = 0

        @collation_elements.each do |collation_element|
          integer_to_bytes_array(level_weight(collation_element, SECONDARY_LEVEL)).each do |byte|
            append_secondary_byte(byte)
          end
        end

        # append compressed trailing common bytes
        append_common_bytes(SECONDARY_BOTTOM, SECONDARY_BOTTOM_COUNT, false) if @common_count > 0
      end

      def append_tertiary_bytes
        @bytes_array << LEVEL_SEPARATOR

        @common_count = 0

        @collation_elements.each do |collation_element|
          integer_to_bytes_array(tertiary_weight(collation_element)).each do |byte|
            append_tertiary_byte(byte)
          end
        end

        # append compressed trailing common bytes
        if @common_count > 0
          if @tertiary_common == TERTIARY_BOTTOM_NORMAL
            append_common_bytes(@tertiary_bottom, @tertiary_bottom_count, false)
          else
            append_common_bytes(@tertiary_top, @tertiary_top_count, true)
            @bytes_array[-1] -= 1 # make @bytes_array[-1] = boundary - @common_count (for compatibility with ICU)
          end
        end
      end

      def append_secondary_byte(secondary)
        if secondary == SECONDARY_COMMON
          @common_count += 1
        else
          append_with_common_bytes(secondary, SECONDARY_COMMON_SPACE)
        end
      end

      def append_tertiary_byte(tertiary)
        if tertiary == @tertiary_common
          @common_count += 1
        else
          if @tertiary_common == TERTIARY_COMMON_NORMAL && @tertiary_common < tertiary
            tertiary += @tertiary_addition
          elsif @tertiary_common == TERTIARY_COMMON_UPPER_FIRST && tertiary <= @tertiary_common
            tertiary -= @tertiary_addition
          end

          append_with_common_bytes(tertiary, @tertiary_common_space)
        end
      end

      def append_with_common_bytes(byte, options)
        if @common_count > 0
          if byte < options[:common]
            append_common_bytes(options[:bottom], options[:bottom_count], false)
          else
            append_common_bytes(options[:top], options[:top_count], true)
          end
        end

        @bytes_array << byte
      end

      def append_common_bytes(boundary, count_limit, top)
        sign = top ? -1 : +1

        while @common_count > count_limit
          @bytes_array << boundary + sign * count_limit
          @common_count -= count_limit
        end

        @bytes_array << boundary + sign * (@common_count - 1)
        @common_count = 0
      end

      def tertiary_weight(collation_element)
        weight = level_weight(collation_element, TERTIARY_LEVEL)

        if continuation?(weight)
          remove_continuation_bits(weight)
        else
          (weight & @tertiary_mask) ^ @case_switch
        end
      end

      def level_weight(collation_element, level)
        collation_element[level] || 0
      end

      def integer_to_bytes_array(number)
        bytes = []

        while number > 0
          bytes.unshift(number & 0xFF)
          number >>= 8
        end

        bytes
      end

      def continuation?(weight)
        weight & CASE_BITS_MASK == CASE_BITS_MASK
      end

      def remove_continuation_bits(weight)
        weight & REMOVE_CASE_MASK
      end

      def init_tertiary_constants
        @case_switch = @case_first == :upper ? CASE_SWITCH : NO_CASE_SWITCH

        if @case_first
          @tertiary_mask     = KEEP_CASE_MASK
          @tertiary_addition = TERTIARY_ADDITION_CASE_FIRST

          if @case_first == :upper
            @tertiary_common = TERTIARY_COMMON_UPPER_FIRST
            @tertiary_top    = TERTIARY_TOP_UPPER_FIRST
            @tertiary_bottom = TERTIARY_BOTTOM_UPPER_FIRST
          else # @case_first == :lower
            @tertiary_common = TERTIARY_COMMON_NORMAL
            @tertiary_top    = TERTIARY_TOP_LOWER_FIRST
            @tertiary_bottom = TERTIARY_BOTTOM_LOWER_FIRST
          end
        else
          @tertiary_mask     = REMOVE_CASE_MASK
          @tertiary_addition = TERTIARY_ADDITION_NORMAL

          @tertiary_common = TERTIARY_COMMON_NORMAL
          @tertiary_top    = TERTIARY_TOP_NORMAL
          @tertiary_bottom = TERTIARY_BOTTOM_NORMAL
        end

        total_tertiary_count   = @tertiary_top - @tertiary_bottom - 1
        @tertiary_top_count    = (TERTIARY_PROPORTION * total_tertiary_count).to_i
        @tertiary_bottom_count = total_tertiary_count - @tertiary_top_count

        @tertiary_common_space = {
            common:       @tertiary_common,
            bottom:       @tertiary_bottom,
            bottom_count: @tertiary_bottom_count,
            top:          @tertiary_top,
            top_count:    @tertiary_top_count
        }
      end

      # Primary level compression constants

      PRIMARY_BYTE_MIN = 0x3
      PRIMARY_BYTE_MAX = 0xFF

      MIN_NON_LATIN_PRIMARY = 0x5B
      MAX_REGULAR_PRIMARY   = 0x7A

      # Secondary level compression constants

      SECONDARY_BOTTOM       = 0x05
      SECONDARY_TOP          = 0x86
      SECONDARY_PROPORTION   = 0.5
      SECONDARY_COMMON       = SECONDARY_BOTTOM
      SECONDARY_TOTAL_COUNT  = SECONDARY_TOP - SECONDARY_BOTTOM - 1
      SECONDARY_TOP_COUNT    = (SECONDARY_PROPORTION * SECONDARY_TOTAL_COUNT).to_i
      SECONDARY_BOTTOM_COUNT = SECONDARY_TOTAL_COUNT - SECONDARY_TOP_COUNT

      SECONDARY_COMMON_SPACE = {
          common:       SECONDARY_COMMON,
          bottom:       SECONDARY_BOTTOM,
          bottom_count: SECONDARY_BOTTOM_COUNT,
          top:          SECONDARY_TOP,
          top_count:    SECONDARY_TOP_COUNT
      }

      # Tertiary level compression constants

      REMOVE_CASE_MASK = 0x3F
      KEEP_CASE_MASK   = 0xFF

      CASE_BITS_MASK = 0xC0

      CASE_SWITCH    = 0xC0
      NO_CASE_SWITCH = 0

      TERTIARY_ADDITION_NORMAL     = 0x80
      TERTIARY_ADDITION_CASE_FIRST = 0x40

      TERTIARY_PROPORTION = 0.667

      # Normal (case-first disabled)
      TERTIARY_BOTTOM_NORMAL = 0x05
      TERTIARY_TOP_NORMAL    = 0x85
      TERTIARY_COMMON_NORMAL = TERTIARY_BOTTOM_NORMAL

      # Lower first
      TERTIARY_BOTTOM_LOWER_FIRST = TERTIARY_BOTTOM_NORMAL
      TERTIARY_TOP_LOWER_FIRST    = 0x45
      TERTIARY_COMMON_LOWER_FIRST = TERTIARY_BOTTOM_LOWER_FIRST

      # Upper first
      TERTIARY_BOTTOM_UPPER_FIRST = 0x86
      TERTIARY_TOP_UPPER_FIRST    = 0xC5
      TERTIARY_COMMON_UPPER_FIRST = TERTIARY_TOP_UPPER_FIRST

    end

  end
end