lib/twitter_cldr/shared/bidi.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
# This code was adapted from GNU Classpath, but modified significantly. Ordinarily, derivatives are
# treated as falling under the same license as the original source, but classpath comes with the
# following exception:
#
# "As a special exception, the copyright holders of this library give you
# permission to link this library with independent modules to produce an
# executable, regardless of the license terms of these independent
# modules, and to copy and distribute the resulting executable under
# terms of your choice, provided that you also meet, for each linked
# independent module, the terms and conditions of the license of that
# module. An independent module is a module which is not derived from
# or based on this library. If you modify this library, you may extend
# this exception to your version of the library, but you are not
# obligated to do so. If you do not wish to do so, delete this
# exception statement from your version."
#
# We are assuming here that building a gem with the compiled version of bidi.java falls under these terms,
# specifically that we are "link(ing) this library with independent modules to produce an executable."
# We are NOT including the original source code to avoid licensing restrictions, but it can be viewed here:
# http://developer.classpath.org/doc/java/text/Bidi-source.html
module TwitterCldr
module Shared
class Bidi
attr_reader :types, :levels, :string_arr
MAX_DEPTH = 62
class << self
def from_string(str, options = {})
string_arr = str.unpack("U*")
Bidi.new(options.merge(types: compute_types(string_arr), string_arr: string_arr))
end
def from_type_array(types, options = {})
Bidi.new(options.merge(types: types))
end
protected
def compute_types(arr)
arr.map do |code_point|
TwitterCldr::Shared::CodePoint.get(code_point).bidi_class.to_sym
end
end
end
def initialize(options = {})
@string_arr = options[:string_arr] || options[:types]
@types = options[:types] || []
@levels = []
@runs = []
@direction = options[:direction]
@default_direction = options[:default_direction] || :LTR
@length = @types.size
run_bidi
end
def to_s
@string_arr.pack("U*")
end
def reorder_visually!
raise "No string given!" unless @string_arr
# Do this explicitly so we can also find the maximum depth at the
# same time.
max = 0
lowest_odd = MAX_DEPTH + 1
@levels.each do |level|
max = [level, max].max
lowest_odd = [lowest_odd, level].min unless level.even?
end
# Reverse the runs starting with the deepest.
max.downto(lowest_odd) do |depth|
start = 0
while start < @levels.size
# Find the start of a run >= DEPTH.
start += 1 while start < @levels.size && @levels[start] < depth
break if start == @levels.size
# Find the end of the run.
finish = start + 1
finish += 1 while finish < @levels.size && @levels[finish] >= depth
# Reverse this run.
((finish - start) / 2).times do |i|
tmpb = @levels[finish - i - 1]
@levels[finish - i - 1] = @levels[start + i]
@levels[start + i] = tmpb
tmpo = @string_arr[finish - i - 1]
@string_arr[finish - i - 1] = @string_arr[start + i]
@string_arr[start + i] = tmpo
end
# Handle the next run.
start = finish + 1
end
end
self
end
protected
def compute_paragraph_embedding_level
# First check to see if the user supplied a directionality override.
if [:LTR, :RTL].include?(@direction)
@direction == :LTR ? 0 : 1
else
# This implements rules P2 and P3.
# (Note that we don't need P1, as the user supplies
# a paragraph.)
@types.each do |type|
return 0 if type == :L
return 1 if type == :R
end
@default_direction == :LTR ? 0 : 1
end
end
def compute_explicit_levels
current_embedding = @base_embedding
# The directional override is a Character directionality
# constant. -1 means there is no override.
directional_override = -1
# The stack of pushed embeddings, and the stack pointer.
# Note that because the direction is inherent in the depth,
# and because we have a bit left over in a byte, we can encode
# the override, if any, directly in this value on the stack.
embedding_stack = []
@formatter_indices ||= []
sp = 0
@length.times do |i|
is_ltr = false
is_special = true
is_ltr = @types[i] == :LRE || @types[i] == :LRO
case @types[i]
when :RLE, :RLO, :LRE, :LRO
new_embedding = if is_ltr
# Least greater even.
((current_embedding & ~1) + 2)
else
# Least greater odd.
((current_embedding + 1) | 1)
end
# FIXME: we don't properly handle invalid pushes.
if new_embedding < MAX_DEPTH
# The new level is valid. Push the old value.
# See above for a comment on the encoding here.
current_embedding |= -0x80 if (directional_override != -1)
embedding_stack[sp] = current_embedding
current_embedding = new_embedding
sp += 1
directional_override = if @types[i] == :LRO
:L
elsif @types[i] == :RLO
:R
else
-1
end
end
when :PDF
# FIXME: we don't properly handle a pop with a corresponding
# invalid push.
# If sp === 0, we saw a pop without a push. Just ignore it.
if sp > 0
sp -= 1
new_embedding = embedding_stack[sp]
current_embedding = new_embedding & 0x7f
directional_override = if new_embedding < 0
(new_embedding & 1) == 0 ? :L : :R
else
-1
end
end
else
is_special = false
end
@levels[i] = current_embedding
if is_special
# Mark this character for removal.
@formatter_indices << i
elsif directional_override != -1
@types[i] = directional_override
end
end
# Remove the formatting codes and update both the arrays
# and 'length'. It would be more efficient not to remove
# these codes, but it is also more complicated. Also, the
# Unicode algorithm reference does not properly describe
# how this is to be done -- from what I can tell, their suggestions
# in this area will not yield the correct results.
output = 0
input = 0
size = @formatter_indices.size
0.upto(size).each do |i|
if i == size
next_fmt = @length
else
next_fmt = @formatter_indices[i]
end
len = next_fmt - input
# Non-formatter codes are from 'input' to 'next_fmt'.
arraycopy(@levels, input, @levels, output, len)
arraycopy(@types, input, @types, output, len)
output += len
input = next_fmt + 1
end
@length -= @formatter_indices.size
end
def compute_runs
run_count = 0
current_embedding = @base_embedding
@length.times do |i|
if @levels[i] != current_embedding
current_embedding = @levels[i]
run_count += 1
end
end
# This may be called multiple times. If so, and if
# the number of runs has not changed, then don't bother
# allocating a new array.
where = 0
last_run_start = 0
current_embedding = @base_embedding
@length.times do |i|
if @levels[i] != current_embedding
@runs[where] = last_run_start
where += 1
last_run_start = i
current_embedding = @levels[i]
end
end
@runs[where] = last_run_start
end
def resolve_weak_types
run_count = @runs.size
previous_level = @base_embedding
run_count.times do |run_idx|
start = get_run_start(run_idx)
finish = get_run_limit(run_idx)
level = get_run_level(run_idx) || 0
# These are the names used in the Bidi algorithm.
sor = [previous_level, level].max.even? ? :L : :R
next_level = if run_idx == (run_count - 1)
@base_embedding
else
get_run_level(run_idx + 1) || 0
end
eor = [level, next_level].max.even? ? :L : :R
prev_type = sor
prev_strong_type = sor
start.upto(finish - 1) do |i|
next_type = (i == finish - 1) ? eor : @types[i + 1]
# Rule W1: change NSM to the prevailing direction.
if @types[i] == :NSM
@types[i] = prev_type
else
prev_type = @types[i]
end
# Rule W2: change EN to AN in some cases.
if @types[i] == :EN
if prev_strong_type == :AL
@types[i] = :AN
end
elsif @types[i] == :L || @types[i] == :R || @types[i] == :AL
prev_strong_type = @types[i]
end
# Rule W3: change AL to R.
if @types[i] == :AL
@types[i] = :R
end
# Rule W4: handle separators between two numbers.
if prev_type == :EN && next_type == :EN
if @types[i] == :ES || @types[i] == :CS
@types[i] = nextType
end
elsif prev_type == :AN && next_type == :AN && @types[i] == :CS
@types[i] = next_type
end
# Rule W5: change a sequence of european terminators to
# european numbers, if they are adjacent to european numbers.
# We also include BN characters in this.
if @types[i] == :ET || @types[i] == :BN
if prev_type == :EN
@types[i] = prev_type
else
# Look ahead to see if there is an EN terminating this
# sequence of ETs.
j = i + 1
while j < finish && @types[j] == :ET || @types[j] == :BN
j += 1
end
if j < finish && @types[j] == :EN
# Change them all to EN now.
i.upto(j - 1) do |k|
@types[k] = :EN
end
end
end
end
# Rule W6: separators and terminators change to ON.
# Again we include BN.
if @types[i] == :ET || @types[i] == :CS || @types[i] == :BN
@types[i] = :ON
end
# Rule W7: change european number types.
if prev_strong_type == :L && @types[i] == :EN
@types[i] = prev_strong_type
end
end
previous_level = level
end
end
def get_run_count
@runs.size
end
def get_run_level(which)
@levels[@runs[which]]
end
def get_run_limit(which)
if which == (@runs.length - 1)
@length
else
@runs[which + 1]
end
end
def get_run_start(which)
@runs[which]
end
def resolve_implicit_levels
# This implements rules I1 and I2.
@length.times do |i|
if (@levels[i] & 1) == 0
if @types[i] == :R
@levels[i] += 1
elsif @types[i] == :AN || @types[i] == :EN
@levels[i] += 2
end
else
if @types[i] == :L || @types[i] == :AN || @types[i] == :EN
@levels[i] += 1
end
end
end
end
def resolve_neutral_types
# This implements rules N1 and N2.
run_count = get_run_count
previous_level = @base_embedding
run_count.times do |run|
start = get_run_start(run)
finish = get_run_limit(run)
level = get_run_level(run)
next unless level
embedding_direction = level.even? ? :L : :R
# These are the names used in the Bidi algorithm.
sor = [previous_level, level].max.even? ? :L : :R
next_level = if run == (run_count - 1)
@base_embedding
else
get_run_level(run + 1)
end
eor = [level, next_level].max.even? ? :L : :R
prev_strong = sor
neutral_start = -1
start.upto(finish) do |i|
new_strong = -1
this_type = i == finish ? eor : @types[i]
case this_type
when :L
new_strong = :L
when :R, :AN, :EN
new_strong = :R
when :BN, :ON, :S, :B, :WS
neutral_start = i if neutral_start == -1
end
# If we see a strong character, update all the neutrals.
if new_strong != -1
if neutral_start != -1
override = prev_strong == new_strong ? prev_strong : embedding_direction
neutral_start.upto(i - 1) { |j| @types[j] = override }
end
prev_strong = new_strong
neutral_start = -1
end
end
previous_level = level
end
end
def reinsert_formatting_codes
if @formatter_indices
input = @length
output = @levels.size
# Process from the end as we are copying the array over itself here.
(@formatter_indices.size - 1).downto(0) do |index|
next_fmt = @formatter_indices[index]
# nextFmt points to a location in the original array. So,
# nextFmt+1 is the target of our copying. output is the location
# to which we last copied, thus we can derive the length of the
# copy from it.
len = output - next_fmt - 1
output = next_fmt
input -= len
# Note that we no longer need 'types' at this point, so we
# only edit 'levels'.
if next_fmt + 1 < @levels.size
arraycopy(@levels, input, @levels, next_fmt + 1, len)
end
# Now set the level at the reinsertion point.
right_level = if output == @levels.length - 1
@base_embedding
else
@levels[output + 1] || 0
end
left_level = if input == 0
@base_embedding
else
@levels[input] || 0;
end
@levels[output] = [left_level, right_level].max
end
end
@length = @levels.size
end
def arraycopy(orig, orig_index, dest, dest_index, length)
orig[orig_index...(orig_index + length)].each_with_index do |elem, count|
dest[dest_index + count] = elem
end
end
def run_bidi
@base_embedding = compute_paragraph_embedding_level
compute_explicit_levels
compute_runs
resolve_weak_types
resolve_neutral_types
resolve_implicit_levels
reinsert_formatting_codes
# After resolving the implicit levels, the number
# of runs may have changed.
compute_runs
end
end
end
end