lib/regexp-examples/chargroup_parser.rb
require_relative 'parser_helpers/charset_negation_helper'
require_relative 'parser_helpers/parse_group_helper'
require_relative 'parser_helpers/parse_after_backslash_group_helper'
module RegexpExamples
# A "sub-parser", for char groups in a regular expression
# Some examples of what this class needs to parse:
# [abc] - plain characters
# [a-z] - ranges
# [\n\b\d] - escaped characters (which may represent character sets)
# [^abc] - negated group
# [[a][bc]] - sub-groups (should match "a", "b" or "c")
# [[:lower:]] - POSIX group
# [[a-f]&&[d-z]] - set intersection (should match "d", "e" or "f")
# [[^:alpha:]&&[\n]a-c] - all of the above!!!! (should match "\n")
class ChargroupParser
include CharsetNegationHelper
include ParseGroupHelper
include ParseAfterBackslashGroupHelper
attr_reader :regexp_string, :current_position
alias length current_position
def initialize(regexp_string, is_sub_group: false)
@regexp_string = regexp_string
@is_sub_group = is_sub_group
@current_position = 0
@charset = []
@negative = false
end
def parse
parse_first_chars
until next_char == ']'
case next_char
when '['
parse_sub_group_concat
when '-'
parse_after_hyphen
when '&'
parse_after_ampersand
else
@charset.concat parse_checking_backlash
end
end
@charset.uniq!
@current_position += 1 # To account for final "]"
end
def result
negate_if(@charset, @negative)
end
private
def parse_first_chars
if next_char == '^'
@negative = true
@current_position += 1
end
case rest_of_string
when /\A[-\]]/ # e.g. /[]]/ (match "]") or /[-]/ (match "-")
@charset << next_char
@current_position += 1
when /\A:(\^?)([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group
parse_posix_group(Regexp.last_match(1), Regexp.last_match(2)) if @is_sub_group
end
end
def parse_posix_group(negation_flag, name)
@charset.concat negate_if(CharSets::POSIXCharMap[name], !negation_flag.empty?)
@current_position += (negation_flag.length + # 0 or 1, if '^' is present
name.length +
2) # Length of opening and closing colons (always 2)
end
# Always returns an Array, for consistency
def parse_checking_backlash
if next_char == '\\'
@current_position += 1
parse_after_backslash
else
r = [next_char]
@current_position += 1
r
end
end
def parse_after_backslash
if next_char == 'b'
@current_position += 1
["\b"]
elsif rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/
@current_position += 1
parse_backslash_unicode_sequence(Regexp.last_match(1)).result.map(&:to_s)
else
char = CharSets::BackslashCharMap.fetch(next_char, [next_char])
@current_position += 1
char
end
end
def parse_sub_group_concat
@current_position += 1
sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
sub_group_parser.parse
@charset.concat sub_group_parser.result
@current_position += sub_group_parser.length
end
def parse_after_ampersand
if regexp_string[@current_position + 1] == '&'
parse_sub_group_intersect
else
@charset << '&'
@current_position += 1
end
end
def parse_sub_group_intersect
@current_position += 2
sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
sub_group_parser.parse
@charset &= sub_group_parser.result
@current_position += (sub_group_parser.length - 1)
end
def parse_after_hyphen
r = if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
@current_position += 1
@charset << '-'
elsif rest_of_string =~ /\A-\\u(\h{4}|\{\h{1,4}\})/
@current_position += 3
char = parse_backslash_unicode_sequence(Regexp.last_match(1)).result.first.to_s
@charset.concat((@charset.last..char).to_a)
else
@current_position += 1
@charset.concat((@charset.last..parse_checking_backlash.first).to_a)
end
r
end
def rest_of_string
regexp_string[@current_position..-1]
end
def next_char
regexp_string[@current_position]
end
end
end