lib/twitter_cldr/parsers/unicode_regex/character_class.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
module TwitterCldr
module Parsers
class UnicodeRegexParser
# This is analogous to ICU's UnicodeSet class.
class CharacterClass < Component
  # Maps each closing token type to the opening token type it pairs with.
  GROUPING_PAIRS = {
    close_bracket: :open_bracket
  }.freeze

  # Character classes can include set operations (eg. union, intersection, etc).
  # A binary node holds the operator symbol and its two operand nodes.
  BinaryOperator = Struct.new(:operator, :left, :right) do
    def type
      :binary_operator
    end
  end

  # A unary node holds the operator symbol (e.g. :negate) and a single
  # operand node. Note that it has no `left`/`right` members.
  UnaryOperator = Struct.new(:operator, :child) do
    def type
      :unary_operator
    end
  end

  class << self
    # Token types that open a grouping (the values of GROUPING_PAIRS).
    def opening_types
      @opening_types ||= GROUPING_PAIRS.values
    end

    # Token types that close a grouping (the keys of GROUPING_PAIRS).
    def closing_types
      @closing_types ||= GROUPING_PAIRS.keys
    end

    # Returns the opening token type paired with the given closing +type+,
    # or nil when +type+ is not a known closing type.
    def opening_type_for(type)
      GROUPING_PAIRS[type]
    end
  end

  # +root+ is the top node of the expression tree: a UnaryOperator, a
  # BinaryOperator, a leaf component, or nil (treated as an empty class).
  def initialize(root)
    @root = root
  end

  def type
    :character_class
  end

  # Converts the evaluated set into a regexp fragment via set_to_regex,
  # which is not defined in this class (presumably inherited from
  # Component - confirm).
  def to_regexp_str
    set_to_regex(to_set)
  end

  # Evaluates the expression tree into a single set.
  def to_set
    evaluate(root)
  end

  # Collects the codepoints reported by every leaf of the tree.
  def codepoints
    codepoints_from(root)
  end

  # Renders the expression tree back into its textual form (without the
  # surrounding brackets).
  def to_s
    stringify(root)
  end

  # True when the top-level node is a negation. A nil root (empty class)
  # is never negated.
  def negated?
    !root.nil? && root.type == :unary_operator && root.operator == :negate
  end

  private

  attr_reader :root

  # Recursively gathers codepoints from +node+ and its descendants.
  def codepoints_from(node)
    case node
    when UnaryOperator
      codepoints_from(node.child)
    when BinaryOperator
      codepoints_from(node.left) + codepoints_from(node.right)
    when nil
      # Mirrors evaluate, which treats a missing node as an empty set.
      # Assumes leaf #codepoints returns an Array (they are combined
      # with + above) - confirm against the leaf components.
      []
    else
      node.codepoints
    end
  end

  # Recursively renders +node+ as text. Unary operators are written as a
  # prefix of their child; binary operators are written infix.
  def stringify(node)
    case node
    when UnaryOperator
      # UnaryOperator has only a :child member, so it must not be read
      # through left/right.
      "#{operator_string(node.operator)}#{stringify(node.child)}"
    when BinaryOperator
      lhs = stringify(node.left)
      rhs = stringify(node.right)
      "#{lhs}#{operator_string(node.operator)}#{rhs}"
    else
      node.to_s
    end
  end

  # Textual form of an operator symbol; nil (rendered as an empty string
  # when interpolated) for operators not listed here.
  def operator_string(operator)
    case operator
    when :negate then '^'
    when :union, :pipe then ''
    when :dash then '-'
    when :ampersand then '&'
    end
  end

  # Recursively evaluates +node+ into a set. Operator nodes combine the
  # sets of their operands; leaves supply their own set via #to_set; a
  # missing node yields an empty RangeSet. An operator node with an
  # unrecognised operator evaluates to nil.
  def evaluate(node)
    case node
    when UnaryOperator, BinaryOperator
      case node.operator
      when :negate
        # Negation is taken relative to the set of valid regexp chars.
        TwitterCldr::Shared::UnicodeRegex.valid_regexp_chars.subtract(
          evaluate(node.child)
        )
      when :union, :pipe
        evaluate(node.left).union(
          evaluate(node.right)
        )
      when :dash
        evaluate(node.left).difference(
          evaluate(node.right)
        )
      when :ampersand
        evaluate(node.left).intersection(
          evaluate(node.right)
        )
      end
    else
      if node
        node.to_set
      else
        TwitterCldr::Utils::RangeSet.new([])
      end
    end
  end
end
end
end
end