tom-lord/regexp-examples

View on GitHub
lib/regexp-examples/chargroup_parser.rb

Summary

Maintainability
A
0 mins
Test Coverage
require_relative 'parser_helpers/charset_negation_helper'
require_relative 'parser_helpers/parse_group_helper'
require_relative 'parser_helpers/parse_after_backslash_group_helper'

module RegexpExamples
  # A "sub-parser", for char groups in a regular expression
  # Some examples of what this class needs to parse:
  # [abc]          - plain characters
  # [a-z]          - ranges
  # [\n\b\d]       - escaped characters (which may represent character sets)
  # [^abc]         - negated group
  # [[a][bc]]      - sub-groups (should match "a", "b" or "c")
  # [[:lower:]]    - POSIX group
  # [[a-f]&&[d-z]] - set intersection (should match "d", "e" or "f")
  # [[^:alpha:]&&[\n]a-c] - all of the above!!!! (should match "\n")
  class ChargroupParser
    include CharsetNegationHelper
    include ParseGroupHelper
    include ParseAfterBackslashGroupHelper

    attr_reader :regexp_string, :current_position
    # The number of characters consumed so far; a parent parser adds this to
    # its own position after delegating a sub-group.
    alias length current_position

    # @param regexp_string [String] the text to parse. For sub-groups this is
    #   everything AFTER the opening "[" (see #parse_sub_group_concat).
    # @param is_sub_group [Boolean] whether this parser handles a nested group;
    #   POSIX classes such as [:alpha:] are only recognised when this is true.
    def initialize(regexp_string, is_sub_group: false)
      @regexp_string = regexp_string
      @is_sub_group = is_sub_group
      @current_position = 0
      @charset = []
      @negative = false
    end

    # Consumes characters up to and including the "]" that closes this group,
    # accumulating the matched characters in @charset.
    #
    # @raise [RegexpError] if the end of the input is reached while looking
    #   for the closing "]" (previously this looped forever, appending nil).
    def parse
      parse_first_chars
      until next_char == ']'
        case next_char
        when nil
          # Ran off the end of the string: the group was never closed.
          raise RegexpError, "premature end of char-class: #{regexp_string.inspect}"
        when '['
          parse_sub_group_concat
        when '-'
          parse_after_hyphen
        when '&'
          parse_after_ampersand
        else
          @charset.concat parse_checking_backlash
        end
      end

      @charset.uniq!
      @current_position += 1 # To account for final "]"
    end

    # The parsed character set, passed through negate_if together with the
    # "^" flag recorded during parsing.
    # NOTE(review): negate_if comes from an included helper; presumably it
    # inverts the set when the flag is true -- confirm against the helper.
    def result
      negate_if(@charset, @negative)
    end

    private

    # Handles the characters that are only special at the very start of a
    # group: a leading "^", a literal "]" or "-", and (sub-groups only) a
    # POSIX class body such as ":alpha:]".
    def parse_first_chars
      if next_char == '^'
        @negative = true
        @current_position += 1
      end

      case rest_of_string
      when /\A[-\]]/ # e.g. /[]]/ (match "]") or /[-]/ (match "-")
        @charset << next_char
        @current_position += 1
      when /\A:(\^?)([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group
        parse_posix_group(Regexp.last_match(1), Regexp.last_match(2)) if @is_sub_group
      end
    end

    # Adds the characters of a POSIX class to @charset and advances past
    # ":name:" (plus the optional "^"), leaving the position on the "]".
    #
    # @param negation_flag [String] "^" or "" as captured by the caller
    # @param name [String] the POSIX class name, e.g. "alpha"
    def parse_posix_group(negation_flag, name)
      @charset.concat negate_if(CharSets::POSIXCharMap[name], !negation_flag.empty?)
      @current_position += (negation_flag.length + # 0 or 1, if '^' is present
                            name.length +
                            2) # Length of opening and closing colons (always 2)
    end

    # Consumes one (possibly backslash-escaped) item.
    # Always returns an Array, for consistency
    def parse_checking_backlash
      if next_char == '\\'
        @current_position += 1
        parse_after_backslash
      else
        r = [next_char]
        @current_position += 1
        r
      end
    end

    # Consumes the item following a backslash and returns it as an Array.
    # Inside a char group "\b" means a literal backspace character.
    def parse_after_backslash
      if next_char == 'b'
        @current_position += 1
        ["\b"]
      elsif rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/
        # Only the "u" is skipped here; presumably the helper advances past
        # the hex digits itself -- confirm against the included helper.
        @current_position += 1
        parse_backslash_unicode_sequence(Regexp.last_match(1)).result.map(&:to_s)
      else
        # Unknown escapes fall back to the escaped character itself.
        char = CharSets::BackslashCharMap.fetch(next_char, [next_char])
        @current_position += 1
        char
      end
    end

    # Parses a nested "[...]" and unions its result into @charset.
    def parse_sub_group_concat
      @current_position += 1 # Skip the opening "["
      sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
      sub_group_parser.parse
      @charset.concat sub_group_parser.result
      @current_position += sub_group_parser.length
    end

    # A lone "&" is a literal; "&&" starts a set intersection.
    def parse_after_ampersand
      if regexp_string[@current_position + 1] == '&'
        parse_sub_group_intersect
      else
        @charset << '&'
        @current_position += 1
      end
    end

    # Parses everything after "&&" as a sub-group and intersects its result
    # with the characters collected so far.
    def parse_sub_group_intersect
      @current_position += 2 # Skip the "&&"
      sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
      sub_group_parser.parse
      @charset &= sub_group_parser.result
      # The sub-parser consumed the "]" that closes THIS group too; step back
      # one so that #parse still sees it and terminates its loop.
      @current_position += (sub_group_parser.length - 1)
    end

    # Handles "-": either a literal hyphen (when directly before "]") or a
    # range running from the last collected character to the next item.
    def parse_after_hyphen
      if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
        @current_position += 1
        @charset << '-'
      elsif rest_of_string =~ /\A-\\u(\h{4}|\{\h{1,4}\})/
        @current_position += 3 # Skip the "-", the backslash and the "u"
        char = parse_backslash_unicode_sequence(Regexp.last_match(1)).result.first.to_s
        @charset.concat((@charset.last..char).to_a)
      else
        @current_position += 1
        @charset.concat((@charset.last..parse_checking_backlash.first).to_a)
      end
    end

    # The unparsed remainder of the input, starting at the current position.
    def rest_of_string
      regexp_string[@current_position..-1]
    end

    # The character at the current position, or nil past the end of input.
    def next_char
      regexp_string[@current_position]
    end
  end
end