twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/parsers/unicode_regex_parser.rb

Summary

Maintainability
B
6 hrs
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

module TwitterCldr
  module Parsers

    class UnicodeRegexParserError < StandardError; end

    class UnicodeRegexParser < Parser

      autoload :Component,      "twitter_cldr/parsers/unicode_regex/component"
      autoload :CharacterClass, "twitter_cldr/parsers/unicode_regex/character_class"
      autoload :CharacterRange, "twitter_cldr/parsers/unicode_regex/character_range"
      autoload :CharacterSet,   "twitter_cldr/parsers/unicode_regex/character_set"
      autoload :Literal,        "twitter_cldr/parsers/unicode_regex/literal"
      autoload :UnicodeString,  "twitter_cldr/parsers/unicode_regex/unicode_string"

      def parse(tokens, options = {})
        super(
          preprocess(
            substitute_variables(tokens, options[:symbol_table])
          ), options
        )
      end

      private

      # Types that are allowed to be used in character ranges.
      RANGED_CHARACTER_CLASS_TOKEN_TYPES = [
        :variable, :character_set, :negated_character_set, :unicode_char,
        :multichar_string, :string, :escaped_character, :character_range
      ]

      CHARACTER_CLASS_TOKEN_TYPES = RANGED_CHARACTER_CLASS_TOKEN_TYPES + [
        :open_bracket, :special_char
      ]

      NEGATED_TOKEN_TYPES = [
        :negated_character_set
      ]

      BINARY_OPERATORS = [
        :pipe, :ampersand, :dash, :union
      ]

      UNARY_OPERATORS = [
        :negate
      ]

      def make_token(type, value = nil)
        TwitterCldr::Tokenizers::Token.new({
          type: type,
          value: value
        })
      end

      # Identifies regex ranges
      def preprocess(tokens)
        result = []
        i = 0

        while i < tokens.size
          is_range = valid_ranged_character_class_token?(tokens[i]) &&
            valid_ranged_character_class_token?(tokens[i + 2]) &&
            tokens[i + 1].type == :dash

          if is_range
            initial = send(tokens[i].type, tokens[i])
            final = send(tokens[i + 2].type, tokens[i + 2])
            result << make_character_range(initial, final)
            i += 3
          else
            if negated_token?(tokens[i])
              result += [
                make_token(:open_bracket),
                make_token(:negate),
                tokens[i],
                make_token(:close_bracket)
              ]
            else
              result << tokens[i]
            end

            i += 1
          end
        end

        result
      end

      def substitute_variables(tokens, symbol_table)
        return tokens unless symbol_table
        tokens.inject([]) do |ret, token|
          if token.type == :variable && sub = symbol_table.fetch(token.value)
            # variables can themselves contain references to other variables
            # note: this could be cached somehow
            ret += substitute_variables(sub, symbol_table)
          else
            ret << token
          end
          ret
        end
      end

      def make_character_range(initial, final)
        CharacterRange.new(initial, final)
      end

      def negated_token?(token)
        token && NEGATED_TOKEN_TYPES.include?(token.type)
      end

      def valid_character_class_token?(token)
        token && CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
      end

      def valid_ranged_character_class_token?(token)
        token && RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
      end

      def unary_operator?(token)
        token && UNARY_OPERATORS.include?(token.type)
      end

      def binary_operator?(token)
        token && BINARY_OPERATORS.include?(token.type)
      end

      def do_parse(options)
        elements = []

        while current_token
          case current_token.type
            when :open_bracket
              elements << character_class
            when :union
              next_token(:union)
            else
              elements << send(current_token.type, current_token)
              next_token(current_token.type)
          end
        end

        elements
      end

      def character_set(token)
        CharacterSet.new(
          token.value.gsub(/^\\p/, "").gsub(/[\{\}\[\]:]/, "")
        )
      end

      def negated_character_set(token)
        CharacterSet.new(
          token.value.gsub(/^\\[pP]/, "").gsub(/[\{\}\[\]:^]/, "")
        )
      end

      def unicode_char(token)
        UnicodeString.new(
          [token.value.gsub(/^\\u/, "").gsub(/[\{\}]/, "").to_i(16)]
        )
      end

      def string(token)
        UnicodeString.new(
          token.value.unpack("U*")
        )
      end

      def multichar_string(token)
        UnicodeString.new(
          token.value.gsub(/[\{\}]/, "").unpack("U*")
        )
      end

      def escaped_character(token)
        Literal.new(token.value)
      end

      def special_char(token)
        Literal.new(token.value)
      end

      alias :negate :special_char
      alias :pipe :special_char
      alias :ampersand :special_char

      # current_token is already a CharacterRange object
      def character_range(token)
        token
      end

      def character_class
        operator_stack = []
        operand_stack = []
        open_count = 0

        loop do
          case current_token.type
            when *CharacterClass.closing_types
              open_count -= 1
              build_until_open(operator_stack, operand_stack)
              add_implicit_union(operator_stack, open_count)

            when *CharacterClass.opening_types
              open_count += 1
              operator_stack.push(current_token)

            when *(BINARY_OPERATORS + UNARY_OPERATORS)
              operator_stack.push(current_token)

            else
              add_implicit_union(operator_stack, open_count)
              operand_stack.push(
                send(current_token.type, current_token)
              )
          end

          next_token(current_token.type)
          break if operator_stack.empty? && open_count == 0
        end

        CharacterClass.new(operand_stack.pop)
      end

      def build_until_open(operator_stack, operand_stack)
        last_operator = peek(operator_stack)
        opening_type = CharacterClass.opening_type_for(current_token.type)

        until last_operator.type == opening_type
          operator = operator_stack.pop
          node = get_operator_node(operator, operand_stack)
          operand_stack.push(node)
          last_operator = peek(operator_stack)
        end

        operator_stack.pop
      end

      def get_operator_node(operator, operand_stack)
        if operator.type == :dash && operand_stack.size < 2
          get_non_range_dash_node(operator, operand_stack)
        else
          if unary_operator?(operator)
            unary_operator_node(operator.type, operand_stack.pop)
          else
            binary_operator_node(
              operator.type, operand_stack.pop, operand_stack.pop
            )
          end
        end
      end

      # Most regular expression engines allow character classes
      # to contain a literal hyphen caracter as the first character.
      # For example, [-abc] is a legal expression. It denotes a
      # character class that contains the letters '-', 'a', 'b',
      # and 'c'. For example, /[-abc]*/.match('-ba') returns 0 in Ruby.
      def get_non_range_dash_node(operator, operand_stack)
        binary_operator_node(
          :union, operand_stack.pop, string(make_token(:string, '-'))
        )
      end

      def add_implicit_union(operator_stack, open_count)
        if n = @tokens[@token_index + 1]
          if valid_character_class_token?(n) && open_count > 0
            operator_stack.push(make_token(:union))
          end
        end
      end

      def peek(array)
        array.last
      end

      def binary_operator_node(operator, right, left)
        CharacterClass::BinaryOperator.new(
          operator, left, right
        )
      end

      def unary_operator_node(operator, child)
        CharacterClass::UnaryOperator.new(
          operator, child
        )
      end

    end

  end
end