twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/shared/bidi.rb

Summary

Maintainability
F
5 days
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

# This code was adapted from GNU Classpath, but modified significantly.  Ordinarily, derivatives are
# treated as falling under the same license as the original source, but classpath comes with the
# following exception:
#
# "As a special exception, the copyright holders of this library give you
# permission to link this library with independent modules to produce an
# executable, regardless of the license terms of these independent
# modules, and to copy and distribute the resulting executable under
# terms of your choice, provided that you also meet, for each linked
# independent module, the terms and conditions of the license of that
# module.  An independent module is a module which is not derived from
# or based on this library.  If you modify this library, you may extend
# this exception to your version of the library, but you are not
# obligated to do so.  If you do not wish to do so, delete this
# exception statement from your version."
#
# We are assuming here that building a gem with the compiled version of bidi.java falls under these terms,
# specifically that we are "link(ing) this library with independent modules to produce an executable."
# We are NOT including the original source code to avoid licensing restrictions, but it can be viewed here:
# http://developer.classpath.org/doc/java/text/Bidi-source.html


module TwitterCldr
  module Shared
    class Bidi
      attr_reader :types, :levels, :string_arr

      MAX_DEPTH = 62

      class << self
        def from_string(str, options = {})
          string_arr = str.unpack("U*")
          Bidi.new(options.merge(types: compute_types(string_arr), string_arr: string_arr))
        end

        def from_type_array(types, options = {})
          Bidi.new(options.merge(types: types))
        end

        protected

        def compute_types(arr)
          arr.map do |code_point|
            TwitterCldr::Shared::CodePoint.get(code_point).bidi_class.to_sym
          end
        end
      end

      def initialize(options = {})
        @string_arr = options[:string_arr] || options[:types]
        @types = options[:types] || []
        @levels = []
        @runs = []
        @direction = options[:direction]
        @default_direction = options[:default_direction] || :LTR
        @length = @types.size
        run_bidi
      end

      def to_s
        @string_arr.pack("U*")
      end

      def reorder_visually!
        raise "No string given!" unless @string_arr

        # Do this explicitly so we can also find the maximum depth at the
        # same time.
        max = 0
        lowest_odd = MAX_DEPTH + 1

        @levels.each do |level|
          max = [level, max].max
          lowest_odd = [lowest_odd, level].min unless level.even?
        end

        # Reverse the runs starting with the deepest.
        max.downto(lowest_odd) do |depth|
          start = 0

          while start < @levels.size
            # Find the start of a run >= DEPTH.
            start += 1 while start < @levels.size && @levels[start] < depth

            break if start == @levels.size

            # Find the end of the run.
            finish = start + 1
            finish += 1 while finish < @levels.size && @levels[finish] >= depth

            # Reverse this run.
            ((finish - start) / 2).times do |i|
              tmpb = @levels[finish - i - 1]
              @levels[finish - i - 1] = @levels[start + i]
              @levels[start + i] = tmpb

              tmpo = @string_arr[finish - i - 1]
              @string_arr[finish - i - 1] = @string_arr[start + i]
              @string_arr[start + i] = tmpo
            end

            # Handle the next run.
            start = finish + 1
          end
        end

        self
      end

      protected

      def compute_paragraph_embedding_level
        # First check to see if the user supplied a directionality override.
        if [:LTR, :RTL].include?(@direction)
          @direction == :LTR ? 0 : 1
        else
          # This implements rules P2 and P3.
          # (Note that we don't need P1, as the user supplies
          # a paragraph.)
          @types.each do |type|
            return 0 if type == :L
            return 1 if type == :R
          end

          @default_direction == :LTR ? 0 : 1
        end
      end

      def compute_explicit_levels
        current_embedding = @base_embedding

        # The directional override is a Character directionality
        # constant.  -1 means there is no override.
        directional_override = -1

        # The stack of pushed embeddings, and the stack pointer.
        # Note that because the direction is inherent in the depth,
        # and because we have a bit left over in a byte, we can encode
        # the override, if any, directly in this value on the stack.

        embedding_stack = []
        @formatter_indices ||= []
        sp = 0

        @length.times do |i|
          is_ltr = false
          is_special = true
          is_ltr = @types[i] == :LRE || @types[i] == :LRO

          case @types[i]
            when :RLE, :RLO, :LRE, :LRO
              new_embedding = if is_ltr
                # Least greater even.
                ((current_embedding & ~1) + 2)
              else
                # Least greater odd.
                ((current_embedding + 1) | 1)
              end

              # FIXME: we don't properly handle invalid pushes.
              if new_embedding < MAX_DEPTH
                # The new level is valid.  Push the old value.
                # See above for a comment on the encoding here.

                current_embedding |= -0x80 if (directional_override != -1)
                embedding_stack[sp] = current_embedding
                current_embedding = new_embedding
                sp += 1

                directional_override = if @types[i] == :LRO
                  :L
                elsif @types[i] == :RLO
                  :R
                else
                  -1
                end
              end

            when :PDF
              # FIXME: we don't properly handle a pop with a corresponding
              # invalid push.
              # If sp === 0, we saw a pop without a push.  Just ignore it.
              if sp > 0
                sp -= 1
                new_embedding = embedding_stack[sp]
                current_embedding = new_embedding & 0x7f

                directional_override = if new_embedding < 0
                  (new_embedding & 1) == 0 ? :L : :R
                else
                  -1
                end
              end

            else
              is_special = false
          end

          @levels[i] = current_embedding

          if is_special
            # Mark this character for removal.
            @formatter_indices << i
          elsif directional_override != -1
            @types[i] = directional_override
          end
        end

        # Remove the formatting codes and update both the arrays
        # and 'length'.  It would be more efficient not to remove
        # these codes, but it is also more complicated.  Also, the
        # Unicode algorithm reference does not properly describe
        # how this is to be done -- from what I can tell, their suggestions
        # in this area will not yield the correct results.

        output = 0
        input = 0
        size = @formatter_indices.size

        0.upto(size).each do |i|
          if i == size
            next_fmt = @length
          else
            next_fmt = @formatter_indices[i]
          end

          len = next_fmt - input

          # Non-formatter codes are from 'input' to 'next_fmt'.
          arraycopy(@levels, input, @levels, output, len)
          arraycopy(@types, input, @types, output, len)

          output += len
          input = next_fmt + 1
        end

        @length -= @formatter_indices.size
      end

      def compute_runs
        run_count = 0
        current_embedding = @base_embedding

        @length.times do |i|
          if @levels[i] != current_embedding
            current_embedding = @levels[i]
            run_count += 1
          end
        end

        # This may be called multiple times.  If so, and if
        # the number of runs has not changed, then don't bother
        # allocating a new array.
        where = 0
        last_run_start = 0
        current_embedding = @base_embedding

        @length.times do |i|
          if @levels[i] != current_embedding
            @runs[where] = last_run_start
            where += 1
            last_run_start = i
            current_embedding = @levels[i]
          end
        end

        @runs[where] = last_run_start
      end

      def resolve_weak_types
        run_count = @runs.size
        previous_level = @base_embedding

        run_count.times do |run_idx|
          start = get_run_start(run_idx)
          finish = get_run_limit(run_idx)
          level = get_run_level(run_idx) || 0

          # These are the names used in the Bidi algorithm.
          sor = [previous_level, level].max.even? ? :L : :R

          next_level = if run_idx == (run_count - 1)
            @base_embedding
          else
            get_run_level(run_idx + 1) || 0
          end

          eor = [level, next_level].max.even? ? :L : :R
          prev_type = sor
          prev_strong_type = sor

          start.upto(finish - 1) do |i|
            next_type = (i == finish - 1) ? eor : @types[i + 1]

            # Rule W1: change NSM to the prevailing direction.
            if @types[i] == :NSM
              @types[i] = prev_type
            else
              prev_type = @types[i]
            end

            # Rule W2: change EN to AN in some cases.
            if @types[i] == :EN
              if prev_strong_type == :AL
                @types[i] = :AN
              end
            elsif @types[i] == :L || @types[i] == :R || @types[i] == :AL
              prev_strong_type = @types[i]
            end

            # Rule W3: change AL to R.
            if @types[i] == :AL
              @types[i] = :R
            end

            # Rule W4: handle separators between two numbers.
            if prev_type == :EN && next_type == :EN
              if @types[i] == :ES || @types[i] == :CS
                @types[i] = nextType
              end
            elsif prev_type == :AN && next_type == :AN && @types[i] == :CS
              @types[i] = next_type
            end

            # Rule W5: change a sequence of european terminators to
            # european numbers, if they are adjacent to european numbers.
            # We also include BN characters in this.
            if @types[i] == :ET || @types[i] == :BN
              if prev_type == :EN
                @types[i] = prev_type
              else
                # Look ahead to see if there is an EN terminating this
                # sequence of ETs.
                j = i + 1

                while j < finish && @types[j] == :ET || @types[j] == :BN
                  j += 1
                end

                if j < finish && @types[j] == :EN
                  # Change them all to EN now.
                  i.upto(j - 1) do |k|
                    @types[k] = :EN
                  end
                end
              end
            end

            # Rule W6: separators and terminators change to ON.
            # Again we include BN.
            if @types[i] == :ET || @types[i] == :CS || @types[i] == :BN
              @types[i] = :ON
            end

            # Rule W7: change european number types.
            if prev_strong_type == :L && @types[i] == :EN
              @types[i] = prev_strong_type
            end
          end

          previous_level = level
        end
      end

      def get_run_count
        @runs.size
      end

      def get_run_level(which)
        @levels[@runs[which]]
      end

      def get_run_limit(which)
        if which == (@runs.length - 1)
          @length
        else
          @runs[which + 1]
        end
      end

      def get_run_start(which)
        @runs[which]
      end

      def resolve_implicit_levels
        # This implements rules I1 and I2.
        @length.times do |i|
          if (@levels[i] & 1) == 0
            if @types[i] == :R
              @levels[i] += 1
            elsif @types[i] == :AN || @types[i] == :EN
              @levels[i] += 2
            end
          else
            if @types[i] == :L || @types[i] == :AN || @types[i] == :EN
              @levels[i] += 1
            end
          end
        end
      end

      def resolve_neutral_types
        # This implements rules N1 and N2.
        run_count = get_run_count
        previous_level = @base_embedding

        run_count.times do |run|
          start = get_run_start(run)
          finish = get_run_limit(run)
          level = get_run_level(run)
          next unless level

          embedding_direction = level.even? ? :L : :R
          # These are the names used in the Bidi algorithm.
          sor = [previous_level, level].max.even? ? :L : :R

          next_level = if run == (run_count - 1)
            @base_embedding
          else
            get_run_level(run + 1)
          end

          eor = [level, next_level].max.even? ? :L : :R
          prev_strong = sor
          neutral_start = -1

          start.upto(finish) do |i|
            new_strong = -1
            this_type = i == finish ? eor : @types[i]

            case this_type
              when :L
                new_strong = :L
              when :R, :AN, :EN
                new_strong = :R
              when :BN, :ON, :S, :B, :WS
                neutral_start = i if neutral_start == -1
            end

            # If we see a strong character, update all the neutrals.
            if new_strong != -1
              if neutral_start != -1
                override = prev_strong == new_strong ? prev_strong : embedding_direction
                neutral_start.upto(i - 1) { |j| @types[j] = override }
              end

              prev_strong = new_strong
              neutral_start = -1
            end
          end

          previous_level = level
        end
      end

      def reinsert_formatting_codes
        if @formatter_indices
          input = @length
          output = @levels.size

          # Process from the end as we are copying the array over itself here.
          (@formatter_indices.size - 1).downto(0) do |index|
            next_fmt = @formatter_indices[index]

            # nextFmt points to a location in the original array.  So,
            # nextFmt+1 is the target of our copying.  output is the location
            # to which we last copied, thus we can derive the length of the
            # copy from it.
            len = output - next_fmt - 1
            output = next_fmt
            input -= len

            # Note that we no longer need 'types' at this point, so we
            # only edit 'levels'.
            if next_fmt + 1 < @levels.size
              arraycopy(@levels, input, @levels, next_fmt + 1, len)
            end

            # Now set the level at the reinsertion point.
            right_level = if output == @levels.length - 1
              @base_embedding
            else
              @levels[output + 1] || 0
            end

            left_level = if input == 0
              @base_embedding
            else
              @levels[input] || 0;
            end

            @levels[output] = [left_level, right_level].max
          end
        end

        @length = @levels.size
      end

      def arraycopy(orig, orig_index, dest, dest_index, length)
        orig[orig_index...(orig_index + length)].each_with_index do |elem, count|
          dest[dest_index + count] = elem
        end
      end

      def run_bidi
        @base_embedding = compute_paragraph_embedding_level

        compute_explicit_levels
        compute_runs
        resolve_weak_types
        resolve_neutral_types
        resolve_implicit_levels
        reinsert_formatting_codes

        # After resolving the implicit levels, the number
        # of runs may have changed.
        compute_runs
      end
    end
  end
end