lib/twitter_cldr/parsers/unicode_regex_parser.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
module TwitterCldr
module Parsers
# Error type reserved for Unicode regex parsing failures. Nothing in this
# file raises it; presumably it is used by collaborators (TODO confirm).
class UnicodeRegexParserError < StandardError
end
class UnicodeRegexParser < Parser
autoload :Component, "twitter_cldr/parsers/unicode_regex/component"
autoload :CharacterClass, "twitter_cldr/parsers/unicode_regex/character_class"
autoload :CharacterRange, "twitter_cldr/parsers/unicode_regex/character_range"
autoload :CharacterSet, "twitter_cldr/parsers/unicode_regex/character_set"
autoload :Literal, "twitter_cldr/parsers/unicode_regex/literal"
autoload :UnicodeString, "twitter_cldr/parsers/unicode_regex/unicode_string"
# Parses a token list into regex elements.
#
# Variable tokens are first expanded using options[:symbol_table] (when
# given), the expanded list is run through preprocess, and the result is
# handed to the superclass implementation together with the same options.
def parse(tokens, options = {})
  expanded = substitute_variables(tokens, options[:symbol_table])
  super(preprocess(expanded), options)
end
private
# Token types that are allowed to be used as an endpoint of a character range.
# All lookup tables below are frozen so they cannot be mutated at runtime.
RANGED_CHARACTER_CLASS_TOKEN_TYPES = [
  :variable, :character_set, :negated_character_set, :unicode_char,
  :multichar_string, :string, :escaped_character, :character_range
].freeze

# Token types accepted inside a character class: every rangeable type plus
# nested opening brackets and special characters.
CHARACTER_CLASS_TOKEN_TYPES = (
  RANGED_CHARACTER_CLASS_TOKEN_TYPES + [:open_bracket, :special_char]
).freeze

# Token types that preprocess wraps in an explicit negated bracket group.
NEGATED_TOKEN_TYPES = [
  :negated_character_set
].freeze

# Operator token types that combine two operands.
BINARY_OPERATORS = [
  :pipe, :ampersand, :dash, :union
].freeze

# Operator token types that apply to a single operand.
UNARY_OPERATORS = [
  :negate
].freeze
# Builds a TwitterCldr::Tokenizers::Token from a hash carrying the given
# type and value (value defaults to nil). The hash is passed positionally.
def make_token(type, value = nil)
  attributes = { type: type, value: value }
  TwitterCldr::Tokenizers::Token.new(attributes)
end
# Identifies regex ranges and negated tokens before parsing.
#
# * A rangeable token, a :dash token and another rangeable token are
#   collapsed into one object built by make_character_range; the endpoints
#   are first converted by calling the method named after their token type.
# * A negated token is wrapped as open_bracket, negate, token, close_bracket.
# * Any other token is copied through unchanged.
#
# Returns a new array; the tokens argument is only read.
def preprocess(tokens)
  processed = []
  position = 0

  while position < tokens.size
    head = tokens[position]
    tail = tokens[position + 2]

    # Short-circuiting means the middle token is only inspected when a valid
    # token exists two positions ahead.
    if valid_ranged_character_class_token?(head) &&
       valid_ranged_character_class_token?(tail) &&
       tokens[position + 1].type == :dash
      processed << make_character_range(
        send(head.type, head), send(tail.type, tail)
      )
      position += 3
      next
    end

    if negated_token?(head)
      processed.push(
        make_token(:open_bracket),
        make_token(:negate),
        head,
        make_token(:close_bracket)
      )
    else
      processed << head
    end
    position += 1
  end

  processed
end
# Replaces :variable tokens with the token lists stored for them in
# symbol_table (looked up via fetch on the token's value).
#
# Substitutions are expanded recursively because a variable's tokens can
# themselves reference other variables (note: this could be cached somehow).
# A variable whose lookup yields a falsy value is kept as-is. When no symbol
# table is given the tokens are returned untouched.
def substitute_variables(tokens, symbol_table)
  return tokens unless symbol_table

  tokens.flat_map do |token|
    if token.type == :variable && (replacement = symbol_table.fetch(token.value))
      substitute_variables(replacement, symbol_table)
    else
      [token]
    end
  end
end
# Wraps the two already-converted endpoints in a CharacterRange.
def make_character_range(initial, final)
  CharacterRange.new(initial, final)
end
# True when the token's type is listed in NEGATED_TOKEN_TYPES. A falsy
# token (e.g. nil past the end of the list) is handed back unchanged.
def negated_token?(token)
  return token unless token

  NEGATED_TOKEN_TYPES.include?(token.type)
end
# True when the token's type is listed in CHARACTER_CLASS_TOKEN_TYPES. A
# falsy token is handed back unchanged.
def valid_character_class_token?(token)
  return token unless token

  CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end
# True when the token's type is listed in RANGED_CHARACTER_CLASS_TOKEN_TYPES,
# i.e. it may serve as a range endpoint. A falsy token is handed back
# unchanged.
def valid_ranged_character_class_token?(token)
  return token unless token

  RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end
# True when the token's type is listed in UNARY_OPERATORS. A falsy token is
# handed back unchanged.
def unary_operator?(token)
  return token unless token

  UNARY_OPERATORS.include?(token.type)
end
# True when the token's type is listed in BINARY_OPERATORS. A falsy token is
# handed back unchanged.
def binary_operator?(token)
  return token unless token

  BINARY_OPERATORS.include?(token.type)
end
# Consumes the whole token stream and returns the list of parsed elements.
#
# An opening bracket starts a character class (character_class consumes its
# tokens itself), a :union token is skipped, and any other token is
# converted by the method named after its type and then consumed. The
# options argument is accepted but not used here.
def do_parse(options)
  [].tap do |elements|
    while (token = current_token)
      kind = token.type

      if kind == :open_bracket
        elements << character_class
      elsif kind == :union
        next_token(:union)
      else
        elements << send(kind, token)
        next_token(kind)
      end
    end
  end
end
# Builds a CharacterSet from the token text after removing a leading \p and
# every brace, square bracket and colon.
def character_set(token)
  name = token.value.gsub(/^\\p/, "").delete("{}[]:")
  CharacterSet.new(name)
end
# Builds a CharacterSet from a negated set token: a leading \p or \P is
# removed, as is every brace, square bracket, colon and caret. The negation
# itself is not represented here; preprocess wraps such tokens in an
# explicit negate group.
def negated_character_set(token)
  name = token.value.gsub(/^\\[pP]/, "").delete("{}[]:^")
  CharacterSet.new(name)
end
# Builds a single-codepoint UnicodeString from a \u escape: the leading \u
# and any braces are removed and the remainder is read as hexadecimal.
def unicode_char(token)
  hex_digits = token.value.gsub(/^\\u/, "").delete("{}")
  UnicodeString.new([hex_digits.to_i(16)])
end
# Builds a UnicodeString from the codepoints of the token's text, decoded
# with unpack("U*").
def string(token)
  codepoints = token.value.unpack("U*")
  UnicodeString.new(codepoints)
end
# Builds a UnicodeString from a multi-character string token: braces are
# removed and the remaining text is decoded into codepoints.
def multichar_string(token)
  codepoints = token.value.delete("{}").unpack("U*")
  UnicodeString.new(codepoints)
end
# Wraps the token's text, unmodified, in a Literal.
def escaped_character(token)
  text = token.value
  Literal.new(text)
end
# Wraps the token's text, unmodified, in a Literal.
def special_char(token)
  text = token.value
  Literal.new(text)
end

# Tokens of these types are converted the same way when dispatched by type
# name via send.
alias negate special_char
alias pipe special_char
alias ampersand special_char
# Range tokens were already turned into CharacterRange objects by
# preprocess, so the token is returned as-is.
def character_range(token)
  token
end
# Parses one bracketed character class, including any nested classes,
# starting at the current token, and returns it as a CharacterClass.
#
# Two stacks are maintained: operators (opening brackets and operator
# tokens) and operands (converted tokens and already-built nodes). A closing
# token folds everything back to its matching opener via build_until_open.
# Implicit unions are inserted by add_implicit_union. The loop ends once the
# outermost bracket has been closed and no operators remain.
def character_class
  operators = []
  operands = []
  depth = 0

  loop do
    token = current_token
    kind = token.type

    case kind
    when *CharacterClass.closing_types
      depth -= 1
      build_until_open(operators, operands)
      add_implicit_union(operators, depth)
    when *CharacterClass.opening_types
      depth += 1
      operators.push(token)
    when *BINARY_OPERATORS, *UNARY_OPERATORS
      operators.push(token)
    else
      # Plain operand: convert it with the method named after its type.
      add_implicit_union(operators, depth)
      operands.push(send(kind, token))
    end

    next_token(kind)
    break if depth == 0 && operators.empty?
  end

  CharacterClass.new(operands.pop)
end
# Folds operators into nodes until the opener matching the current (closing)
# token is on top of the operator stack, then removes and returns that
# opener. Each folded operator consumes its operands from operand_stack and
# its node is pushed back there.
def build_until_open(operator_stack, operand_stack)
  opener = CharacterClass.opening_type_for(current_token.type)

  while operator_stack.last.type != opener
    operand_stack.push(get_operator_node(operator_stack.pop, operand_stack))
  end

  operator_stack.pop
end
# Turns an operator token into a tree node, taking its operands from
# operand_stack.
#
# A :dash with fewer than two operands available is handled by
# get_non_range_dash_node. Unary operators take one operand. Every other
# operator takes two: the first value popped is passed as the right
# operand, the second as the left.
def get_operator_node(operator, operand_stack)
  kind = operator.type

  if kind == :dash && operand_stack.size < 2
    return get_non_range_dash_node(operator, operand_stack)
  end

  return unary_operator_node(kind, operand_stack.pop) if unary_operator?(operator)

  right = operand_stack.pop
  left = operand_stack.pop
  binary_operator_node(kind, right, left)
end
# Builds the node for a dash that has fewer than two operands available.
#
# Most regular expression engines allow character classes to contain a
# literal hyphen character as the first character. For example, [-abc] is a
# legal expression. It denotes a character class that contains the
# characters '-', 'a', 'b', and 'c'. For example, /[-abc]*/ =~ '-ba'
# returns 0 in Ruby.
#
# The operand popped from the stack is unioned with a literal '-' string.
# The operator argument is not used.
def get_non_range_dash_node(operator, operand_stack)
  operand = operand_stack.pop
  hyphen = string(make_token(:string, '-'))
  binary_operator_node(:union, operand, hyphen)
end
# Pushes a :union operator when the token after the current one is a valid
# character class token and at least one bracket is still open. Otherwise
# does nothing and returns nil.
def add_implicit_union(operator_stack, open_count)
  upcoming = @tokens[@token_index + 1]
  return unless upcoming
  return unless valid_character_class_token?(upcoming) && open_count > 0

  operator_stack.push(make_token(:union))
end
# Returns the top (last) element of a stack without removing it, or nil
# when the stack is empty.
def peek(array)
  array[-1]
end
# Builds a CharacterClass::BinaryOperator. Note the parameter order: the
# right operand comes before the left one, while the node itself is
# constructed with left before right.
def binary_operator_node(operator, right, left)
  CharacterClass::BinaryOperator.new(operator, left, right)
end
# Builds a CharacterClass::UnaryOperator applying operator to child.
def unary_operator_node(operator, child)
  CharacterClass::UnaryOperator.new(operator, child)
end
end
end
end