twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/collation/implicit_collation_elements.rb

Summary

Maintainability
B
6 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Collation

    # Builds implicit collation elements for code points (some CJK characters among them) that have no explicit
    # entry in the collation elements table.
    #
    # Ported from the ImplicitCEGenerator class of the ICU4J library. See NOTICE file for license information.
    #
    module ImplicitCollationElements

      # Weight placed in both the secondary and the tertiary slot of every implicit collation element.
      DEFAULT_SECONDARY_AND_TERTIARY = 5

      # Bounds of the lead (primary) byte.
      MIN_PRIMARY = 0xE0
      MAX_PRIMARY = 0xE4

      # Bounds of the trailing bytes.
      MIN_TRAIL = 0x04
      MAX_TRAIL = 0xFE

      # Gap left between consecutive final bytes of 3-byte forms (room for tailoring).
      GAP_3 = 1

      # How many lead bytes are reserved for 3-byte forms.
      PRIMARIES_3_COUNT = 1

      # 2 * [Unicode range] + 2
      MAX_INPUT = 0x220001

      # Medial bytes may take every value of the trail range.
      MEDIAL_COUNT = MAX_TRAIL - MIN_TRAIL + 1

      # Number of usable values in the final byte. Unused values are left between the used ones AND above the
      # last one; e.g., with gap = 2 ('+' is a used value, '-' is an unused one):
      #   range 3..7 => +3 -4 -5 -6 -7       : 1 value
      #   range 3..8 => +3 -4 -5 +6 -7 -8    : 2 values
      #   range 3..9 => +3 -4 -5 +6 -7 -8 -9 : 2 values
      FINAL_3_MULTIPLIER = GAP_3 + 1
      FINAL_3_COUNT      = MEDIAL_COUNT / FINAL_3_MULTIPLIER

      # Number of values that fit under a single 3-byte lead byte.
      THREE_BYTE_COUNT = MEDIAL_COUNT * FINAL_3_COUNT

      # Position of the 3-byte/4-byte boundary: inputs below MIN_4_BOUNDARY get 3 bytes, the rest get 4.
      PRIMARIES_AVAILABLE = MAX_PRIMARY - MIN_PRIMARY + 1
      PRIMARIES_4_COUNT   = PRIMARIES_AVAILABLE - PRIMARIES_3_COUNT
      MIN_4_PRIMARY       = MIN_PRIMARY + PRIMARIES_3_COUNT
      MIN_4_BOUNDARY      = PRIMARIES_3_COUNT * THREE_BYTE_COUNT

      # Capacity required from the 4-byte forms, broken down per lead byte and per final byte.
      TOTAL_NEEDED            = MAX_INPUT - MIN_4_BOUNDARY
      NEEDED_PER_PRIMARY_BYTE = (TOTAL_NEEDED - 1) / PRIMARIES_4_COUNT + 1
      NEEDED_PER_FINAL_BYTE   = (NEEDED_PER_PRIMARY_BYTE - 1) / (MEDIAL_COUNT * MEDIAL_COUNT) + 1

      # Gap between consecutive final bytes of 4-byte forms, derived from the capacity needed per final byte.
      GAP_4 = (MAX_TRAIL - MIN_TRAIL - 1) / NEEDED_PER_FINAL_BYTE

      FINAL_4_MULTIPLIER = GAP_4 + 1
      FINAL_4_COUNT      = NEEDED_PER_FINAL_BYTE

      # CJK constants. Every *_LIMIT is exclusive: the last code point of the block plus one.

      # Amount added to every code point that is not treated as CJK.
      NON_CJK_OFFSET = 0x110000

      CJK_COMPAT_USED_BASE  = 0xFA0E
      CJK_COMPAT_USED_LIMIT = 0xFA2F + 1

      CJK_BASE    = 0x4E00        # U+4E00,  CJK Ideograph, First
      CJK_LIMIT   = 0x9FCC + 1    # U+9FCC,  CJK Ideograph, Last

      CJK_A_BASE  = 0x3400        # U+3400,  CJK Ideograph Extension A, First
      CJK_A_LIMIT = 0x4DB5 + 1    # U+4DB5,  CJK Ideograph Extension A, Last

      CJK_B_BASE  = 0x20000       # U+20000, CJK Ideograph Extension B, First
      CJK_B_LIMIT = 0x2A6D6 + 1   # U+2A6D6, CJK Ideograph Extension B, Last

      CJK_C_BASE  = 0x2A700       # U+2A700, CJK Ideograph Extension C, First
      CJK_C_LIMIT = 0x2B734 + 1   # U+2B734, CJK Ideograph Extension C, Last

      CJK_D_BASE  = 0x2B740       # U+2B740, CJK Ideograph Extension D, First
      CJK_D_LIMIT = 0x2B81D + 1   # U+2B81D, CJK Ideograph Extension D, Last

      class << self

        # Returns the implicit collation elements for +code_point+ as an array containing a single
        # [primary, secondary, tertiary] triple.
        #
        # The primary weight is computed from the CJK-reordered code point shifted up by one; the secondary and
        # tertiary weights are always DEFAULT_SECONDARY_AND_TERTIARY.
        #
        def for_code_point(code_point)
          reordered = swapCJK(code_point)
          primary   = primary_weight(reordered + 1)

          [[primary, DEFAULT_SECONDARY_AND_TERTIARY, DEFAULT_SECONDARY_AND_TERTIARY]]
        end

        private

        # Computes the primary weight of an implicit CE from an (already reordered) code point.
        #
        # Values below MIN_4_BOUNDARY are encoded in the 3-byte form; all other values are encoded in the 4-byte
        # form, relative to that boundary.
        #
        def primary_weight(code_point)
          past_boundary = code_point - MIN_4_BOUNDARY

          if past_boundary < 0
            three_byte_primary(code_point)
          else
            four_byte_primary(past_boundary)
          end
        end

        # Encodes +value+ as a 3-byte weight: lead byte, one medial byte, and a final byte.
        #
        def three_byte_primary(value)
          final  = value % FINAL_3_COUNT
          upper  = value / FINAL_3_COUNT
          medial = upper % MEDIAL_COUNT
          lead   = upper / MEDIAL_COUNT

          join_bytes([
            lead + MIN_PRIMARY,                      # offset into the primary range
            medial + MIN_TRAIL,                      # offset into the trail range
            MIN_TRAIL + final * FINAL_3_MULTIPLIER   # spread out, leaving a gap at the start
          ])
        end

        # Encodes +value+ (the distance from MIN_4_BOUNDARY) as a 4-byte weight: lead byte, two medial bytes, and
        # a final byte.
        #
        def four_byte_primary(value)
          final       = value % FINAL_4_COUNT
          upper       = value / FINAL_4_COUNT
          low_medial  = upper % MEDIAL_COUNT
          upper       = upper / MEDIAL_COUNT
          high_medial = upper % MEDIAL_COUNT
          lead        = upper / MEDIAL_COUNT

          join_bytes([
            lead + MIN_4_PRIMARY,                    # 4-byte forms start after the 3-byte lead bytes
            high_medial + MIN_TRAIL,                 # offset into the trail range
            low_medial + MIN_TRAIL,                  # offset into the trail range
            MIN_TRAIL + final * FINAL_4_MULTIPLIER   # spread out, leaving a gap at the start
          ])
        end

        # Folds +bytes+ (most significant first) into one integer, moving the accumulated value 8 bits to the
        # left before each addition.
        #
        def join_bytes(bytes)
          bytes.inject(0) { |packed, byte| (packed << 8) + byte }
        end

        # Reorders code points so that:
        #   a) the two different Han ranges from UCA are collapsed into one (in the right order), and
        #   b) every non-CJK character is bumped by NON_CJK_OFFSET.
        #
        # Blocks involved:
        #   A:  4E00..9FFF; CJK Unified Ideographs
        #       F900..FAFF; CJK Compatibility Ideographs
        #   B:  3400..4DBF; CJK Unified Ideographs Extension A
        #       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
        #
        # This simple scheme works as long as no new B characters are allocated between 4E00 and FAFF and no new
        # A characters appear outside of that range (very high probability).
        #
        # Resulting layout (all contiguous, starting at 0):
        #   Block1 is CJK
        #   Block2 is CJK_COMPAT_USED
        #   Block3 is CJK_A
        #
        # Block1 is placed at the very start so that it uses the 3-byte form. Any other CJK (extensions B, C
        # and D) keeps its normal code point.
        #
        # The branches below walk the boundaries in ascending code point order.
        #
        def swapCJK(code_point)
          cjk_size    = CJK_LIMIT - CJK_BASE
          compat_size = CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE

          if code_point < CJK_A_BASE
            code_point + NON_CJK_OFFSET
          elsif code_point < CJK_A_LIMIT
            code_point - CJK_A_BASE + cjk_size + compat_size   # Block3
          elsif code_point < CJK_BASE
            code_point + NON_CJK_OFFSET
          elsif code_point < CJK_LIMIT
            code_point - CJK_BASE                              # Block1
          elsif code_point < CJK_COMPAT_USED_BASE
            code_point + NON_CJK_OFFSET
          elsif code_point < CJK_COMPAT_USED_LIMIT
            code_point - CJK_COMPAT_USED_BASE + cjk_size       # Block2
          elsif code_point < CJK_B_BASE
            code_point + NON_CJK_OFFSET
          elsif code_point < CJK_B_LIMIT
            code_point                                         # non-BMP-CJK
          elsif code_point < CJK_C_BASE
            code_point + NON_CJK_OFFSET
          elsif code_point < CJK_C_LIMIT
            code_point                                         # non-BMP-CJK
          elsif code_point < CJK_D_BASE
            code_point + NON_CJK_OFFSET
          elsif code_point < CJK_D_LIMIT
            code_point                                         # non-BMP-CJK
          else
            code_point + NON_CJK_OFFSET                        # non-CJK
          end
        end

      end

    end

  end
end