ammar/regexp_parser
lib/regexp_parser/lexer.rb
# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth,
# normalizes tokens for the parser, and checks whether they are implemented
# by the given syntax flavor.
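#
# A rough usage sketch (the exact token names shown are assumptions based
# on typical scanner output, not verified against this exact version):
#
#   require 'regexp_parser'
#
#   Regexp::Lexer.lex(/a+/).map { |t| [t.type, t.token, t.text] }
#   # => [[:literal, :literal, "a"], [:quantifier, :one_or_more, "+"]]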
class Regexp::Lexer

  OPENING_TOKENS = %i[
    capture passive lookahead nlookahead lookbehind nlookbehind
    atomic options options_switch named absence open
  ].freeze

  CLOSING_TOKENS = %i[close].freeze

  CONDITION_TOKENS = %i[condition condition_close].freeze

  def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
    new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
  end

  def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
    syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

    self.block = block
    self.collect_tokens = collect_tokens
    self.tokens = []
    self.prev_token = nil
    self.preprev_token = nil
    self.nesting = 0
    self.set_nesting = 0
    self.conditional_nesting = 0
    self.shift = 0

    Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      ascend(type, token)

      if (last = prev_token) &&
         type == :quantifier &&
         (
           (last.type == :literal         && (parts = break_literal(last))) ||
           (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
         )
        emit(parts[0])
        last = parts[1]
      end

      current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                  nesting, set_nesting, conditional_nesting)

      if type == :conditional && CONDITION_TOKENS.include?(token)
        current = merge_condition(current, last)
      elsif last
        last.next = current
        current.previous = last
        emit(last)
      end

      self.preprev_token = last
      self.prev_token = current

      descend(type, token)
    end

    emit(prev_token) if prev_token

    collect_tokens ? tokens : nil
  end

  def emit(token)
    if block
      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
      res = block.call(token)
      tokens << res if collect_tokens
    else
      tokens << token
    end
  end
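
  # An illustrative block-form call (assumed output, per the kwarg above):
  #
  #   Regexp::Lexer.lex('a+', collect_tokens: false) { |t| puts t.text }
  #   # prints "a" and "+"; #lex returns nil when collect_tokens is false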

  class << self
    alias :scan :lex
  end

  private

  attr_accessor :block,
                :collect_tokens, :tokens, :prev_token, :preprev_token,
                :nesting, :set_nesting, :conditional_nesting, :shift

  def ascend(type, token)
    return unless CLOSING_TOKENS.include?(token)

    case type
    when :group, :assertion
      self.nesting = nesting - 1
    when :set
      self.set_nesting = set_nesting - 1
    when :conditional
      self.conditional_nesting = conditional_nesting - 1
    else
      raise "unhandled nesting type #{type}"
    end
  end

  def descend(type, token)
    return unless OPENING_TOKENS.include?(token)

    case type
    when :group, :assertion
      self.nesting = nesting + 1
    when :set
      self.set_nesting = set_nesting + 1
    when :conditional
      self.conditional_nesting = conditional_nesting + 1
    else
      raise "unhandled nesting type #{type}"
    end
  end
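
  # Per #lex, #ascend runs before a token is built and #descend after, so
  # e.g. in /(a(b))/ the tokens carry these nesting depths:
  # "(" 0, "a" 1, "(" 1, "b" 2, ")" 1, ")" 0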

  # called by #lex to break a literal run that is longer than one character
  # into two separate tokens when it is followed by a quantifier
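  # e.g. for /ab+/ the scanner emits :literal "ab" at [0, 2]; this method
  # splits it into "a" at [0, 1] and "b" at [1, 2], so that the following
  # "+" quantifies only "b"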
  def break_literal(token)
    lead, last, _ = token.text.partition(/.\z/mu)
    return if lead.empty?

    token_1 = Regexp::Token.new(:literal, :literal, lead,
              token.ts, (token.te - last.length),
              nesting, set_nesting, conditional_nesting)
    token_2 = Regexp::Token.new(:literal, :literal, last,
              (token.ts + lead.length), token.te,
              nesting, set_nesting, conditional_nesting)

    token_1.previous = preprev_token
    token_1.next = token_2
    token_2.previous = token_1 # .next will be set by #lex
    [token_1, token_2]
  end

  # if a codepoint list is followed by a quantifier, that quantifier applies
  # to the last codepoint only, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
  # cf. #break_literal
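  # e.g. "\u{61 62 63}" (12 chars) is split into "\u{61 62}" (9 chars) and
  # "\u{63}" (6 chars); together 3 chars longer than the original, hence the
  # shift of +3 applied below to the offsets of all subsequent tokens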
  def break_codepoint_list(token)
    lead, _, tail = token.text.rpartition(' ')
    return if lead.empty?

    token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
              token.ts, (token.te - tail.length),
              nesting, set_nesting, conditional_nesting)
    token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
              (token.ts + lead.length + 1), (token.te + 3),
              nesting, set_nesting, conditional_nesting)

    self.shift = shift + 3 # one space less, but extra \, u, {, and }

    token_1.previous = preprev_token
    token_1.next = token_2
    token_2.previous = token_1 # .next will be set by #lex
    [token_1, token_2]
  end

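  # joins a conditional's condition parts (which the scanner emits as
  # separate tokens) into a single token spanning from the start of the
  # previous part to the end of the current one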
  def merge_condition(current, last)
    token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
      last.ts, current.te, nesting, set_nesting, conditional_nesting)
    token.previous = preprev_token # .next will be set by #lex
    token
  end

end # class Regexp::Lexer