lib/parser/lexer.rl from whitequark/parser

lib/parser/lexer.rl
Summary

Maintainability

Test Coverage

Issues
%%machine lex; # % fix highlighting

#
# === BEFORE YOU START ===
#
# Read the Ruby Hacking Guide chapter 11, available in English at
# http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
#
# Remember two things about Ragel scanners:
#
#   1) Longest match wins.
#
#   2) If two matches have the same length, the first
#      in source code wins.
#
# General rules of making Ragel and Bison happy:
#
#  * `p` (position) and `@te` contain the index of the character
#    they're pointing to ("current"), plus one. `@ts` contains the index
#    of the corresponding character. The code for extracting matched token is:
#
#       @source_buffer.slice(@ts...@te)
#
#  * If your input is `foooooooobar` and the rule is:
#
#       'f' 'o'+
#
#    the result will be:
#
#       foooooooobar
#       ^ ts=0   ^ p=te=9
#
#  * A Ragel lexer action should not emit more than one token, unless
#    you know what you are doing.
#
#  * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
#
#  * If an action emits the token and transitions to another state, use
#    these Ragel commands:
#
#       emit($whatever)
#       fnext $next_state; fbreak;
#
#    If you perform `fgoto` in an action which does not emit a token nor
#    rewinds the stream pointer, the parser's side-effectful,
#    context-sensitive lookahead actions will break in a hard to detect
#    and debug way.
#
#  * If an action does not emit a token:
#
#       fgoto $next_state;
#
#  * If an action features lookbehind, i.e. matches characters with the
#    intent of passing them to another action:
#
#       p = @ts - 1
#       fgoto $next_state;
#
#    or, if the lookbehind consists of a single character:
#
#       fhold; fgoto $next_state;
#
#  * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
#    `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
#    _will_ invoke the action `act`.
#
#    e_something stands for "something with **e**mbedded action".
#
#  * EOF is explicit and is matched by `c_eof`. If you want to introspect
#    the state of the lexer, add this rule to the state:
#
#       c_eof => do_eof;
#
#  * If you proceed past EOF, the lexer will complain:
#
#       NoMethodError: undefined method `ord' for nil:NilClass
#

class Parser::Lexer

  %% write data nofinal;
  # %

  attr_reader   :source_buffer

  attr_accessor :diagnostics
  attr_accessor :static_env
  attr_accessor :force_utf32

  attr_accessor :cond, :cmdarg, :context, :command_start

  attr_accessor :tokens, :comments

  attr_reader :paren_nest, :cmdarg_stack, :cond_stack, :lambda_stack, :version

  def initialize(version)
    @version    = version
    @static_env = nil
    @context    = nil

    @tokens     = nil
    @comments   = nil

    @_lex_actions =
      if self.class.respond_to?(:_lex_actions, true)
        self.class.send :_lex_actions
      else
        []
      end

    @emit_integer = lambda { |chars, p| emit(:tINTEGER,   chars); p }
    @emit_rational = lambda { |chars, p| emit(:tRATIONAL,  Rational(chars)); p }
    @emit_imaginary = lambda { |chars, p| emit(:tIMAGINARY, Complex(0, chars)); p }
    @emit_imaginary_rational = lambda { |chars, p| emit(:tIMAGINARY, Complex(0, Rational(chars))); p }
    @emit_integer_re = lambda { |chars, p| emit(:tINTEGER,   chars, @ts, @te - 2); p - 2 }
    @emit_integer_if = lambda { |chars, p| emit(:tINTEGER,   chars, @ts, @te - 2); p - 2 }
    @emit_integer_rescue = lambda { |chars, p| emit(:tINTEGER,   chars, @ts, @te - 6); p - 6 }

    @emit_float = lambda { |chars, p| emit(:tFLOAT,     Float(chars)); p }
    @emit_imaginary_float = lambda { |chars, p| emit(:tIMAGINARY, Complex(0, Float(chars))); p }
    @emit_float_if =     lambda { |chars, p| emit(:tFLOAT,     Float(chars), @ts, @te - 2); p - 2 }
    @emit_float_rescue = lambda { |chars, p| emit(:tFLOAT,     Float(chars), @ts, @te - 6); p - 6 }

    reset
  end

  def reset(reset_state=true)
    # Ragel state:
    if reset_state
      # Unit tests set state prior to resetting lexer.
      @cs     = self.class.lex_en_line_begin

      @cond   = StackState.new('cond')
      @cmdarg = StackState.new('cmdarg')
      @cond_stack   = []
      @cmdarg_stack = []
    end

    @force_utf32   = false # Set to true by some tests

    @source_pts    = nil # @source as a codepoint array

    @p             = 0   # stream position (saved manually in #advance)
    @ts            = nil # token start
    @te            = nil # token end
    @act           = 0   # next action

    @stack         = []  # state stack
    @top           = 0   # state stack top pointer

    # Lexer state:
    @token_queue   = []

    @eq_begin_s    = nil # location of last encountered =begin
    @sharp_s       = nil # location of last encountered #

    @newline_s     = nil # location of last encountered newline

    @num_base      = nil # last numeric base
    @num_digits_s  = nil # starting position of numeric digits
    @num_suffix_s  = nil # starting position of numeric suffix
    @num_xfrm      = nil # numeric suffix-induced transformation

    # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
    # encountered after a matching closing parenthesis.
    @paren_nest    = 0
    @lambda_stack  = []

    # If the lexer is in `command state' (aka expr_value)
    # at the entry to #advance, it will transition to expr_cmdarg
    # instead of expr_arg at certain points.
    @command_start = true

    # State before =begin / =end block comment
    @cs_before_block_comment = self.class.lex_en_line_begin

    @strings = Parser::LexerStrings.new(self, @version)
  end

  def source_buffer=(source_buffer)
    @source_buffer = source_buffer

    if @source_buffer
      source = @source_buffer.source

      if source.encoding == Encoding::UTF_8
        @source_pts = source.unpack('U*')
      else
        @source_pts = source.unpack('C*')
      end

      if @source_pts[0] == 0xfeff
        # Skip byte order mark.
        @p = 1
      end
    else
      @source_pts = nil
    end

    @strings.source_buffer = @source_buffer
    @strings.source_pts = @source_pts
  end

  def encoding
    @source_buffer.source.encoding
  end

  LEX_STATES = {
    :line_begin    => lex_en_line_begin,
    :expr_dot      => lex_en_expr_dot,
    :expr_fname    => lex_en_expr_fname,
    :expr_value    => lex_en_expr_value,
    :expr_beg      => lex_en_expr_beg,
    :expr_mid      => lex_en_expr_mid,
    :expr_arg      => lex_en_expr_arg,
    :expr_cmdarg   => lex_en_expr_cmdarg,
    :expr_end      => lex_en_expr_end,
    :expr_endarg   => lex_en_expr_endarg,
    :expr_endfn    => lex_en_expr_endfn,
    :expr_labelarg => lex_en_expr_labelarg,

    :inside_string => lex_en_inside_string
  }

  def state
    LEX_STATES.invert.fetch(@cs, @cs)
  end

  def state=(state)
    @cs = LEX_STATES.fetch(state)
  end

  def push_cmdarg
    @cmdarg_stack.push(@cmdarg)
    @cmdarg = StackState.new("cmdarg.#{@cmdarg_stack.count}")
  end

  def pop_cmdarg
    @cmdarg = @cmdarg_stack.pop
  end

  def push_cond
    @cond_stack.push(@cond)
    @cond = StackState.new("cond.#{@cond_stack.count}")
  end

  def pop_cond
    @cond = @cond_stack.pop
  end

  def dedent_level
    @strings.dedent_level
  end

  # Return next token: [type, value].
  def advance
    unless @token_queue.empty?
      return @token_queue.shift
    end

    # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
    klass = self.class
    _lex_trans_keys         = klass.send :_lex_trans_keys
    _lex_key_spans          = klass.send :_lex_key_spans
    _lex_index_offsets      = klass.send :_lex_index_offsets
    _lex_indicies           = klass.send :_lex_indicies
    _lex_trans_targs        = klass.send :_lex_trans_targs
    _lex_trans_actions      = klass.send :_lex_trans_actions
    _lex_to_state_actions   = klass.send :_lex_to_state_actions
    _lex_from_state_actions = klass.send :_lex_from_state_actions
    _lex_eof_trans          = klass.send :_lex_eof_trans
    _lex_actions            = @_lex_actions

    pe = @source_pts.size + 2
    p, eof = @p, pe

    cmd_state = @command_start
    @command_start = false

    %% write exec;
    # %

    # Ragel creates a local variable called `testEof` but it doesn't use
    # it in any assignment. This dead code is here to swallow the warning.
    # It has no runtime cost because Ruby doesn't produce any instructions from it.
    if false
      testEof
    end

    @p = p

    if @token_queue.any?
      @token_queue.shift
    elsif @cs == klass.lex_error
      [ false, [ '$error'.freeze, range(p - 1, p) ] ]
    else
      eof = @source_pts.size
      [ false, [ '$eof'.freeze,   range(eof, eof) ] ]
    end
  end

  protected

  def version?(*versions)
    versions.include?(@version)
  end

  def stack_pop
    @top -= 1
    @stack[@top]
  end

  def tok(s = @ts, e = @te)
    @source_buffer.slice(s, e - s)
  end

  def range(s = @ts, e = @te)
    Parser::Source::Range.new(@source_buffer, s, e)
  end

  def emit(type, value = tok, s = @ts, e = @te)
    token = [ type, [ value, range(s, e) ] ]

    @token_queue.push(token)

    @tokens.push(token) if @tokens

    token
  end

  def emit_table(table, s = @ts, e = @te)
    value = tok(s, e)

    emit(table[value], value, s, e)
  end

  def emit_do(do_block=false)
    if @cond.active?
      emit(:kDO_COND, 'do'.freeze)
    elsif @cmdarg.active? || do_block
      emit(:kDO_BLOCK, 'do'.freeze)
    else
      emit(:kDO, 'do'.freeze)
    end
  end

  def arg_or_cmdarg(cmd_state)
    if cmd_state
      self.class.lex_en_expr_cmdarg
    else
      self.class.lex_en_expr_arg
    end
  end

  def emit_comment(s = @ts, e = @te)
    if @comments
      @comments.push(Parser::Source::Comment.new(range(s, e)))
    end

    if @tokens
      @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
    end

    nil
  end

  def emit_comment_from_range(p, pe)
    emit_comment(@sharp_s, p == pe ? p - 2 : p)
  end

  def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
    @diagnostics.process(
        Parser::Diagnostic.new(type, reason, arguments, location, highlights))
  end


  def e_lbrace
    @cond.push(false); @cmdarg.push(false)

    current_literal = @strings.literal
    if current_literal
      current_literal.start_interp_brace
    end
  end

  def numeric_literal_int
    digits = tok(@num_digits_s, @num_suffix_s)

    if digits.end_with? '_'.freeze
      diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
                 range(@te - 1, @te)
    elsif digits.empty? && @num_base == 8 && version?(18)
      # 1.8 did not raise an error on 0o.
      digits = '0'.freeze
    elsif digits.empty?
      diagnostic :error, :empty_numeric
    elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
      invalid_s = @num_digits_s + invalid_idx
      diagnostic :error, :invalid_octal, nil,
                 range(invalid_s, invalid_s + 1)
    end
    digits
  end

  def on_newline(p)
    @strings.on_newline(p)
  end

  def check_ambiguous_slash(tm)
    if tok(tm, tm + 1) == '/'.freeze
      # Ambiguous regexp literal.
      if @version < 30
        diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
      else
        diagnostic :warning, :ambiguous_regexp, nil, range(tm, tm + 1)
      end
    end
  end

  def emit_global_var(ts = @ts, te = @te)
    if tok(ts, te) =~ /^\$([1-9][0-9]*)$/
      emit(:tNTH_REF, tok(ts + 1, te).to_i, ts, te)
    elsif tok =~ /^\$([&`'+])$/
      emit(:tBACK_REF, tok(ts, te), ts, te)
    else
      emit(:tGVAR, tok(ts, te), ts, te)
    end
  end

  def emit_class_var(ts = @ts, te = @te)
    if tok(ts, te) =~ /^@@[0-9]/
      diagnostic :error, :cvar_name, { :name => tok(ts, te) }
    end

    emit(:tCVAR, tok(ts, te), ts, te)
  end

  def emit_instance_var(ts = @ts, te = @te)
    if tok(ts, te) =~ /^@[0-9]/
      diagnostic :error, :ivar_name, { :name => tok(ts, te) }
    end

    emit(:tIVAR, tok(ts, te), ts, te)
  end

  def emit_rbrace_rparen_rbrack
    emit_table(PUNCTUATION)

    if @version < 24
      @cond.lexpop
      @cmdarg.lexpop
    else
      @cond.pop
      @cmdarg.pop
    end
  end

  def emit_colon_with_digits(p, tm, diag_msg)
    if @version >= 27
      diagnostic :error, diag_msg, { name: tok(tm, @te) }, range(tm, @te)
    else
      emit(:tCOLON, tok(@ts, @ts + 1), @ts, @ts + 1)
      p = @ts
    end
    p
  end

  def emit_singleton_class
    emit(:kCLASS, 'class'.freeze, @ts, @ts + 5)
    emit(:tLSHFT, '<<'.freeze,    @te - 2, @te)
  end

  # Mapping of strings to parser tokens.

  PUNCTUATION = {
    '='   => :tEQL,     '&'   => :tAMPER2,  '|'   => :tPIPE,
    '!'   => :tBANG,    '^'   => :tCARET,   '+'   => :tPLUS,
    '-'   => :tMINUS,   '*'   => :tSTAR2,   '/'   => :tDIVIDE,
    '%'   => :tPERCENT, '~'   => :tTILDE,   ','   => :tCOMMA,
    ';'   => :tSEMI,    '.'   => :tDOT,     '..'  => :tDOT2,
    '...' => :tDOT3,    '['   => :tLBRACK2, ']'   => :tRBRACK,
    '('   => :tLPAREN2, ')'   => :tRPAREN,  '?'   => :tEH,
    ':'   => :tCOLON,   '&&'  => :tANDOP,   '||'  => :tOROP,
    '-@'  => :tUMINUS,  '+@'  => :tUPLUS,   '~@'  => :tTILDE,
    '**'  => :tPOW,     '->'  => :tLAMBDA,  '=~'  => :tMATCH,
    '!~'  => :tNMATCH,  '=='  => :tEQ,      '!='  => :tNEQ,
    '>'   => :tGT,      '>>'  => :tRSHFT,   '>='  => :tGEQ,
    '<'   => :tLT,      '<<'  => :tLSHFT,   '<='  => :tLEQ,
    '=>'  => :tASSOC,   '::'  => :tCOLON2,  '===' => :tEQQ,
    '<=>' => :tCMP,     '[]'  => :tAREF,    '[]=' => :tASET,
    '{'   => :tLCURLY,  '}'   => :tRCURLY,  '`'   => :tBACK_REF2,
    '!@'  => :tBANG,    '&.'  => :tANDDOT,
  }

  PUNCTUATION_BEGIN = {
    '&'   => :tAMPER,   '*'   => :tSTAR,    '**'  => :tDSTAR,
    '+'   => :tUPLUS,   '-'   => :tUMINUS,  '::'  => :tCOLON3,
    '('   => :tLPAREN,  '{'   => :tLBRACE,  '['   => :tLBRACK,
  }

  KEYWORDS = {
    'if'     => :kIF_MOD,      'unless'   => :kUNLESS_MOD,
    'while'  => :kWHILE_MOD,   'until'    => :kUNTIL_MOD,
    'rescue' => :kRESCUE_MOD,  'defined?' => :kDEFINED,
    'BEGIN'  => :klBEGIN,      'END'      => :klEND,
  }

  KEYWORDS_BEGIN = {
    'if'     => :kIF,          'unless'   => :kUNLESS,
    'while'  => :kWHILE,       'until'    => :kUNTIL,
    'rescue' => :kRESCUE,      'defined?' => :kDEFINED,
    'BEGIN'  => :klBEGIN,      'END'      => :klEND,
  }

  ESCAPE_WHITESPACE = {
    " "  => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
    "\v" => '\v', "\f" => '\f'
  }

  %w(class module def undef begin end then elsif else ensure case when
     for break next redo retry in do return yield super self nil true
     false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
    KEYWORDS_BEGIN[keyword] = KEYWORDS[keyword] = :"k#{keyword.upcase}"
  end

  %%{
  # %

  access @;
  getkey (@source_pts[p] || 0);

  # === CHARACTER CLASSES ===
  #
  # Pay close attention to the differences between c_any and any.
  # c_any does not include EOF and so will cause incorrect behavior
  # for machine subtraction (any-except rules) and default transitions
  # for scanners.

  action do_nl {
    # Record position of a newline for precise location reporting on tNL
    # tokens.
    #
    # This action is embedded directly into c_nl, as it is idempotent and
    # there are no cases when we need to skip it.
    @newline_s = p
  }

  c_nl       = '\n' $ do_nl;
  c_space    = [ \t\r\f\v];
  c_space_nl = c_space | c_nl;

  c_eof      = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
  c_eol      = c_nl | c_eof;
  c_any      = any - c_eof;

  c_nl_zlen  = c_nl | zlen;
  c_line     = any - c_nl_zlen;

  c_ascii    = 0x00..0x7f;
  c_unicode  = c_any - c_ascii;
  c_upper    = [A-Z];
  c_lower    = [a-z_]  | c_unicode;
  c_alpha    = c_lower | c_upper;
  c_alnum    = c_alpha | [0-9];

  action do_eof {
    # Sit at EOF indefinitely. #advance would return $eof each time.
    # This allows to feed the lexer more data if needed; this is only used
    # in tests.
    #
    # Note that this action is not embedded into e_eof like e_nl and e_bs
    # below. This is due to the fact that scanner state at EOF is observed
    # by tests, and encapsulating it in a rule would break the introspection.
    fhold; fbreak;
  }

  #
  # === TOKEN DEFINITIONS ===
  #

  # All operators are punctuation. There is more to punctuation
  # than just operators. Operators can be overridden by user;
  # punctuation can not.

  # A list of operators which are valid in the function name context, but
  # have different semantics in others.
  operator_fname      = '[]' | '[]=' | '`'  | '-@' | '+@' | '~@'  | '!@' ;

  # A list of operators which can occur within an assignment shortcut (+ → +=).
  operator_arithmetic = '&'  | '|'   | '&&' | '||' | '^'  | '+'   | '-'  |
                        '*'  | '/'   | '**' | '~'  | '<<' | '>>'  | '%'  ;

  # A list of all user-definable operators not covered by groups above.
  operator_rest       = '=~' | '!~' | '==' | '!=' | '!'   | '===' |
                        '<'  | '<=' | '>'  | '>=' | '<=>' | '=>'  ;

  # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
  # as they are ambiguous with interpolation `#{}` and should be counted.
  # These braces are not present in punctuation lists.

  # A list of punctuation which has different meaning when used at the
  # beginning of expression.
  punctuation_begin   = '-'  | '+'  | '::' | '('  | '['  |
                        '*'  | '**' | '&'  ;

  # A list of all punctuation except punctuation_begin.
  punctuation_end     = ','  | '='  | '->' | '('  | '['  | ']'   |
                        '::' | '?'  | ':'  | '.'  | '..' | '...' ;

  # A list of keywords which have different meaning at the beginning of expression.
  keyword_modifier    = 'if'     | 'unless' | 'while'  | 'until' | 'rescue' ;

  # A list of keywords which accept an argument-like expression, i.e. have the
  # same post-processing as method calls or commands. Example: `yield 1`,
  # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
  keyword_with_arg    = 'yield'  | 'super'  | 'not'    | 'defined?' ;

  # A list of keywords which accept a literal function name as an argument.
  keyword_with_fname  = 'def'    | 'undef'  | 'alias'  ;

  # A list of keywords which accept an expression after them.
  keyword_with_value  = 'else'   | 'case'   | 'ensure' | 'module' | 'elsif' | 'then'  |
                        'for'    | 'in'     | 'do'     | 'when'   | 'begin' | 'class' |
                        'and'    | 'or'     ;

  # A list of keywords which accept a value, and treat the keywords from
  # `keyword_modifier` list as modifiers.
  keyword_with_mid    = 'rescue' | 'return' | 'break'  | 'next'   ;

  # A list of keywords which do not accept an expression after them.
  keyword_with_end    = 'end'    | 'self'   | 'true'   | 'false'  | 'retry'    |
                        'redo'   | 'nil'    | 'BEGIN'  | 'END'    | '__FILE__' |
                        '__LINE__' | '__ENCODING__';

  # All keywords.
  keyword             = keyword_with_value | keyword_with_mid |
                        keyword_with_end   | keyword_with_arg |
                        keyword_with_fname | keyword_modifier ;

  constant       = c_upper c_alnum*;
  bareword       = c_alpha c_alnum*;

  call_or_var    = c_lower c_alnum*;
  class_var      = '@@' bareword;
  instance_var   = '@' bareword;
  global_var     = '$'
      ( bareword | digit+
      | [`'+~*$&?!@/\\;,.=:<>"] # `
      | '-' c_alnum
      )
  ;

  # Ruby accepts (and fails on) variables with leading digit
  # in literal context, but not in unquoted symbol body.
  class_var_v    = '@@' c_alnum+;
  instance_var_v = '@' c_alnum+;

  label          = bareword [?!]? ':';

  #
  # === NUMERIC PARSING ===
  #

  int_hex  = ( xdigit+ '_' )* xdigit* '_'? ;
  int_dec  = ( digit+ '_' )* digit* '_'? ;
  int_bin  = ( [01]+ '_' )* [01]* '_'? ;

  flo_int  = [1-9] [0-9]* ( '_' digit+ )* | '0';
  flo_frac = '.' ( digit+ '_' )* digit+;
  flo_pow  = [eE] [+\-]? ( digit+ '_' )* digit+;

  int_suffix =
    ''       % { @num_xfrm = @emit_integer }
  | 'r'      % { @num_xfrm = @emit_rational }
  | 'i'      % { @num_xfrm = @emit_imaginary }
  | 'ri'     % { @num_xfrm = @emit_imaginary_rational }
  | 're'     % { @num_xfrm = @emit_integer_re }
  | 'if'     % { @num_xfrm = @emit_integer_if }
  | 'rescue' % { @num_xfrm = @emit_integer_rescue };

  flo_pow_suffix =
    ''   % { @num_xfrm = @emit_float }
  | 'i'  % { @num_xfrm = @emit_imaginary_float }
  | 'if' % { @num_xfrm = @emit_float_if };

  flo_suffix =
    flo_pow_suffix
  | 'r'      % { @num_xfrm = @emit_rational }
  | 'ri'     % { @num_xfrm = @emit_imaginary_rational }
  | 'rescue' % { @num_xfrm = @emit_float_rescue };

  #
  # === INTERPOLATION PARSING ===
  #

  e_lbrace = '{' % {
    e_lbrace
  };

  e_rbrace = '}' % {
    if @strings.close_interp_on_current_literal(p)
      fhold;
      fnext inside_string;
      fbreak;
    end

    @paren_nest -= 1
  };

  #
  # === WHITESPACE HANDLING ===
  #

  # Various contexts in Ruby allow various kinds of whitespace
  # to be used. They are grouped to clarify the lexing machines
  # and ease collection of comments.

  # A line of code with inline #comment at end is always equivalent
  # to a line of code ending with just a newline, so an inline
  # comment is deemed equivalent to non-newline whitespace
  # (c_space character class).

  e_nl = c_nl % {
    p = on_newline(p)
  };

  w_space =
      c_space+
    | '\\' e_nl
    ;

  w_comment =
      '#'     %{ @sharp_s = p - 1 }
      # The (p == pe) condition compensates for added "\0" and
      # the way Ragel handles EOF.
      c_line* %{ emit_comment_from_range(p, pe) }
    ;

  w_space_comment =
      w_space
    | w_comment
    ;

  # A newline in non-literal context always interoperates with
  # here document logic and can always be escaped by a backslash,
  # still interoperating with here document logic in the same way,
  # yet being invisible to anything else.
  #
  # To demonstrate:
  #
  #     foo = <<FOO \
  #     bar
  #     FOO
  #      + 2
  #
  # is equivalent to `foo = "bar\n" + 2`.

  w_newline =
      e_nl;

  w_any =
      w_space
    | w_comment
    | w_newline
    ;


  #
  # === EXPRESSION PARSING ===
  #

  # These rules implement a form of manually defined lookahead.
  # The default longest-match scanning does not work here due
  # to sheer ambiguity.

  ambiguous_fid_suffix =         # actual    parsed
      [?!]    %{ tm = p }      | # a?        a?
      [?!]'=' %{ tm = p - 2 }    # a!=b      a != b
  ;

  ambiguous_ident_suffix =       # actual    parsed
      ambiguous_fid_suffix     |
      '='     %{ tm = p }      | # a=        a=
      '=='    %{ tm = p - 2 }  | # a==b      a == b
      '=~'    %{ tm = p - 2 }  | # a=~b      a =~ b
      '=>'    %{ tm = p - 2 }  | # a=>b      a => b
      '==='   %{ tm = p - 3 }    # a===b     a === b
  ;

  ambiguous_symbol_suffix =      # actual    parsed
      ambiguous_ident_suffix |
      '==>'   %{ tm = p - 2 }    # :a==>b    :a= => b
  ;

  # Ambiguous with 1.9 hash labels.
  ambiguous_const_suffix =       # actual    parsed
      '::'    %{ tm = p - 2 }    # A::B      A :: B
  ;

  # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
  # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.

  e_lbrack = '[' % {
    @cond.push(false); @cmdarg.push(false)

    @paren_nest += 1
  };

  e_rbrack = ']' % {
    @paren_nest -= 1
  };

  # Ruby 1.9 lambdas require parentheses counting in order to
  # emit correct opening kDO/tLBRACE.

  e_lparen = '(' % {
    @cond.push(false); @cmdarg.push(false)

    @paren_nest += 1

    if version?(18)
      @command_start = true
    end
  };

  e_rparen = ')' % {
    @paren_nest -= 1
  };

  # Ruby is context-sensitive wrt/ local identifiers.
  action local_ident {
    emit(:tIDENTIFIER)

    if !@static_env.nil? && @static_env.declared?(tok)
      fnext expr_endfn; fbreak;
    elsif @version >= 32 && tok =~ /\A_[1-9]\z/
      fnext expr_endfn; fbreak;
    else
      fnext *arg_or_cmdarg(cmd_state); fbreak;
    end
  }

  # Variable lexing code is accessed from both expressions and
  # string interpolation related code.
  #
  expr_variable := |*
      global_var
      => {
        emit_global_var

        fnext *stack_pop; fbreak;
      };

      class_var_v
      => {
        emit_class_var

        fnext *stack_pop; fbreak;
      };

      instance_var_v
      => {
        emit_instance_var

        fnext *stack_pop; fbreak;
      };
  *|;

  # Literal function name in definition (e.g. `def class`).
  # Keywords are returned as their respective tokens; this is used
  # to support singleton def `def self.foo`. Global variables are
  # returned as `tGVAR`; this is used in global variable alias
  # statements `alias $a $b`. Symbols are returned verbatim; this
  # is used in `alias :a :"b#{foo}"` and `undef :a`.
  #
  # Transitions to `expr_endfn` afterwards.
  #
  expr_fname := |*
      keyword
      => { emit_table(KEYWORDS_BEGIN);
           fnext expr_endfn; fbreak; };

      constant
      => { emit(:tCONSTANT)
           fnext expr_endfn; fbreak; };

      bareword [?=!]?
      => { emit(:tIDENTIFIER)
           fnext expr_endfn; fbreak; };

      global_var
      => { p = @ts - 1
           fnext expr_end; fcall expr_variable; };

      # If the handling was to be delegated to expr_end,
      # these cases would transition to something else than
      # expr_endfn, which is incorrect.
      operator_fname      |
      operator_arithmetic |
      operator_rest
      => { emit_table(PUNCTUATION)
           fnext expr_endfn; fbreak; };

      '::'
      => { fhold; fhold; fgoto expr_end; };

      ':'
      => { fhold; fgoto expr_beg; };

      '%s' (c_ascii - [A-Za-z0-9])
      => {
        if version?(23)
          type, delimiter = tok[0..-2], tok[-1].chr
          @strings.push_literal(type, delimiter, @ts)
          fgoto inside_string;
        else
          p = @ts - 1
          fgoto expr_end;
        end
      };

      w_any;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;

  # After literal function name in definition. Behaves like `expr_end`,
  # but allows a tLABEL.
  #
  # Transitions to `expr_end` afterwards.
  #
  expr_endfn := |*
      label ( any - ':' )
      => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
           fhold; fnext expr_labelarg; fbreak; };

      '...'
      => {
        if @version >= 31 && @context.in_argdef
          emit(:tBDOT3, '...'.freeze)
          # emit(:tNL, "\n".freeze, @te - 1, @te)
          fnext expr_end; fbreak;
        else
          p -= 3;
          fgoto expr_end;
        end
      };

      w_space_comment;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;

  # Literal function name in method call (e.g. `a.class`).
  #
  # Transitions to `expr_arg` afterwards.
  #
  expr_dot := |*
      constant
      => { emit(:tCONSTANT)
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      call_or_var
      => { emit(:tIDENTIFIER)
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      bareword ambiguous_fid_suffix
      => { emit(:tFID, tok(@ts, tm), @ts, tm)
           fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };

      # See the comment in `expr_fname`.
      operator_fname      |
      operator_arithmetic |
      operator_rest
      => { emit_table(PUNCTUATION)
           fnext expr_arg; fbreak; };

      w_any;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;

  # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
  # is consumed; the current expression is a command or method call.
  #
  expr_arg := |*
      #
      # COMMAND MODE SPECIFIC TOKENS
      #

      # cmd (1 + 2)
      # See below the rationale about expr_endarg.
      w_space+ e_lparen
      => {
        if version?(18)
          emit(:tLPAREN2, '('.freeze, @te - 1, @te)
          fnext expr_value; fbreak;
        else
          emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
          fnext expr_beg; fbreak;
        end
      };

      # meth(1 + 2)
      # Regular method call.
      e_lparen
      => { emit(:tLPAREN2, '('.freeze)
           fnext expr_beg; fbreak; };

      # meth [...]
      # Array argument. Compare with indexing `meth[...]`.
      w_space+ e_lbrack
      => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
           fnext expr_beg; fbreak; };

      # cmd {}
      # Command: method call without parentheses.
      w_space* e_lbrace
      => {
        if @lambda_stack.last == @paren_nest
          @lambda_stack.pop
          emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
        else
          emit(:tLCURLY, '{'.freeze, @te - 1, @te)
        end
        @command_start = true
        @paren_nest += 1
        fnext expr_value; fbreak;
      };

      #
      # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
      #

      # a??
      # Ternary operator
      '?' c_space_nl
      => {
        # Unlike expr_beg as invoked in the next rule, do not warn
        p = @ts - 1
        fgoto expr_end;
      };

      # a ?b, a? ?
      # Character literal or ternary operator
      w_space* '?'
      => { fhold; fgoto expr_beg; };

      # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
      # a /foo/ (but not "a / foo" or "a /=foo")
      # a <<HEREDOC
      w_space+ %{ tm = p }
      ( [%/] ( c_any - c_space_nl - '=' ) # /
      | '<<'
      )
      => {
        check_ambiguous_slash(tm)

        p = tm - 1
        fgoto expr_beg;
      };

      # x *1
      # Ambiguous splat, kwsplat or block-pass.
      w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
      => {
        diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
                   range(tm, @te)

        p = tm - 1
        fgoto expr_beg;
      };

      # x ::Foo
      # Ambiguous toplevel constant access.
      w_space+ '::'
      => { fhold; fhold; fgoto expr_beg; };

      # x:b
      # Symbol.
      w_space* ':'
      => { fhold; fgoto expr_beg; };

      w_space+ label
      => { p = @ts - 1; fgoto expr_beg; };

      #
      # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
      #

      # a ? b
      # Ternary operator.
      w_space+ %{ tm = p } '?' c_space_nl
      => { p = tm - 1; fgoto expr_end; };

      # x + 1: Binary operator or operator-assignment.
      w_space* operator_arithmetic
                  ( '=' | c_space_nl )?    |
      # x rescue y: Modifier keyword.
      w_space* keyword_modifier            |
      # a &. b: Safe navigation operator.
      w_space* '&.'                        |
      # Miscellanea.
      w_space* punctuation_end
      => {
        p = @ts - 1
        fgoto expr_end;
      };

      w_space;

      w_comment
      => { fgoto expr_end; };

      w_newline
      => { fhold; fgoto expr_end; };

      c_any
      => { fhold; fgoto expr_beg; };

      c_eof => do_eof;
  *|;

  # The previous token was an identifier which was seen while in the
  # command mode (that is, the state at the beginning of #advance was
  # expr_value). This state is very similar to expr_arg, but disambiguates
  # two very rare and specific condition:
  #   * In 1.8 mode, "foo (lambda do end)".
  #   * In 1.9+ mode, "f x: -> do foo do end end".
  expr_cmdarg := |*
      w_space+ e_lparen
      => {
        emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
        if version?(18)
          fnext expr_value; fbreak;
        else
          fnext expr_beg; fbreak;
        end
      };

      w_space* 'do'
      => {
        if @cond.active?
          emit(:kDO_COND, 'do'.freeze, @te - 2, @te)
        else
          emit(:kDO, 'do'.freeze, @te - 2, @te)
        end
        fnext expr_value; fbreak;
      };

      c_any             |
      # Disambiguate with the `do' rule above.
      w_space* bareword |
      w_space* label
      => { p = @ts - 1
           fgoto expr_arg; };

      c_eof => do_eof;
  *|;

  # The rationale for this state is pretty complex. Normally, if an argument
  # is passed to a command and then there is a block (tLCURLY...tRCURLY),
  # the block is attached to the innermost argument (`f` in `m f {}`), or it
  # is a parse error (`m 1 {}`). But there is a special case for passing a single
  # primary expression grouped with parentheses: if you write `m (1) {}` or
  # (2.0 only) `m () {}`, then the block is attached to `m`.
  #
  # Thus, we recognize the opening `(` of a command (remember, a command is
  # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
  # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
  # lexer's state to `expr_endarg`, which makes it emit the possibly following
  # `{` as `tLBRACE_ARG`.
  #
  # The default post-`expr_endarg` state is `expr_end`, so this state also handles
  # `do` (as `kDO_BLOCK` in `expr_beg`).
  expr_endarg := |*
      e_lbrace
      => {
        if @lambda_stack.last == @paren_nest
          @lambda_stack.pop
          emit(:tLAMBEG, '{'.freeze)
        else
          emit(:tLBRACE_ARG, '{'.freeze)
        end
        @paren_nest += 1
        @command_start = true
        fnext expr_value; fbreak;
      };

      'do'
      => { emit_do(true)
           fnext expr_value; fbreak; };

      w_space_comment;

      c_any
      => { fhold; fgoto expr_end; };

      c_eof => do_eof;
  *|;

  # The rationale for this state is that several keywords accept value
  # (i.e. should transition to `expr_beg`), do not accept it like a command
  # (i.e. not an `expr_arg`), and must behave like a statement, that is,
  # accept a modifier if/while/etc.
  #
  expr_mid := |*
      keyword_modifier
      => { emit_table(KEYWORDS)
           fnext expr_beg; fbreak; };

      bareword
      => { p = @ts - 1; fgoto expr_beg; };

      w_space_comment;

      w_newline
      => { fhold; fgoto expr_end; };

      c_any
      => { fhold; fgoto expr_beg; };

      c_eof => do_eof;
  *|;

  # Beginning of an expression.
  #
  # Don't fallthrough to this state from `c_any`; make sure to handle
  # `c_space* c_nl` and let `expr_end` handle the newline.
  # Otherwise code like `f\ndef x` gets glued together and the parser
  # explodes.
  #
  expr_beg := |*
      # +5, -5, -  5
      [+\-] w_any* [0-9]
      => {
        emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
        fhold; fnext expr_end; fbreak;
      };

      # splat *a
      '*'
      => { emit(:tSTAR, '*'.freeze)
           fbreak; };

      #
      # STRING AND REGEXP LITERALS
      #

      # /regexp/oui
      # /=/ (disambiguation with /=)
      '/' c_any
      => {
        type = delimiter = tok[0].chr
        @strings.push_literal(type, delimiter, @ts)

        fhold;
        fgoto inside_string;
      };

      # %<string>
      '%' ( c_ascii - [A-Za-z0-9] )
      => {
        type, delimiter = @source_buffer.slice(@ts, 1).chr, tok[-1].chr
        @strings.push_literal(type, delimiter, @ts)
        fgoto inside_string;
      };

      # %w(we are the people)
      '%' [A-Za-z] (c_ascii - [A-Za-z0-9])
      => {
        type, delimiter = tok[0..-2], tok[-1].chr
        @strings.push_literal(type, delimiter, @ts)
        fgoto inside_string;
      };

      '%' c_eof
      => {
        diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
      };

      # Heredoc start.
      # <<END  | <<'END'  | <<"END"  | <<`END`  |
      # <<-END | <<-'END' | <<-"END" | <<-`END` |
      # <<~END | <<~'END' | <<~"END" | <<~`END`
      '<<' [~\-]?
        ( '"' ( any - '"' )* '"'
        | "'" ( any - "'" )* "'"
        | "`" ( any - "`" )* "`"
        | bareword ) % { heredoc_e      = p }
        c_line* c_nl % { new_herebody_s = p }
      => {
        tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m

        indent      = !$1.empty? || !$2.empty?
        dedent_body = !$2.empty?
        type        =  $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
        delimiter   =  $4

        if @version >= 27
          if delimiter.count("\n") > 0 || delimiter.count("\r") > 0
            diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
          end
        elsif @version >= 24
          if delimiter.count("\n") > 0
            if delimiter.end_with?("\n")
              diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
              delimiter = delimiter.rstrip
            else
              diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
            end
          end
        end

        if dedent_body && version?(18, 19, 20, 21, 22)
          emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
          p = @ts + 1
          fnext expr_beg; fbreak;
        else
          @strings.push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
          @strings.herebody_s ||= new_herebody_s

          p = @strings.herebody_s - 1
          fnext inside_string;
        end
      };

      # Escaped unterminated heredoc start
      # <<'END  | <<"END  | <<`END  |
      # <<-'END | <<-"END | <<-`END |
      # <<~'END | <<~"END | <<~`END
      #
      # If the heredoc is terminated the rule above should handle it
      '<<' [~\-]?
        ('"' (any - c_nl - '"')*
        |"'" (any - c_nl - "'")*
        |"`" (any - c_nl - "`")
        )
      => {
        diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
      };

      #
      # SYMBOL LITERALS
      #

      # :&&, :||
      ':' ('&&' | '||') => {
        fhold; fhold;
        emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
        fgoto expr_fname;
      };

      # :"bar", :'baz'
      ':' ['"] # '
      => {
        type, delimiter = tok, tok[-1].chr
        @strings.push_literal(type, delimiter, @ts);

        fgoto inside_string;
      };

      # :!@ is :!
      # :~@ is :~
      ':' [!~] '@'
      => {
        emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
        fnext expr_end; fbreak;
      };

      ':' bareword ambiguous_symbol_suffix
      => {
        emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
        p = tm - 1
        fnext expr_end; fbreak;
      };

      ':' ( bareword | global_var | class_var | instance_var |
            operator_fname | operator_arithmetic | operator_rest )
      => {
        gvar_name = tok(@ts + 1)

        if @version >= 33 && gvar_name.start_with?('$0') && gvar_name.length > 2
          diagnostic :error, :gvar_name, { :name => gvar_name }, range(@ts + 1, @te)
        end

        emit(:tSYMBOL, gvar_name, @ts)
        fnext expr_end; fbreak;
      };

      ':' ( '@'  %{ tm = p - 1; diag_msg = :ivar_name }
          | '@@' %{ tm = p - 2; diag_msg = :cvar_name }
          ) [0-9]*
      => {
        emit_colon_with_digits(p, tm, diag_msg)

        fnext expr_end; fbreak;
      };

      #
      # AMBIGUOUS TERNARY OPERATOR
      #

      # Character constant, like ?a, ?\n, ?\u1000, and so on
      # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
      '?' c_any
      => {
        p, next_state = @strings.read_character_constant(@ts)
        fhold; # Ragel will do `p += 1` to consume input, prevent it

        # If strings lexer founds a character constant (?a) emit it,
        # otherwise read ternary operator
        if @token_queue.empty?
          fgoto *next_state;
        else
          fnext *next_state;
          fbreak;
        end
      };

      '?' c_eof
      => {
        diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
      };

      #
      # AMBIGUOUS EMPTY BLOCK ARGUMENTS
      #

      # Ruby >= 2.7 emits it as two tPIPE terminals
      # while Ruby < 2.7 as a single tOROP (like in `a || b`)
      '||'
      => {
        if @version >= 27
          emit(:tPIPE, tok(@ts, @ts + 1), @ts, @ts + 1)
          fhold;
          fnext expr_beg; fbreak;
        else
          p -= 2
          fgoto expr_end;
        end
      };

      #
      # KEYWORDS AND PUNCTUATION
      #

      # a({b=>c})
      e_lbrace
      => {
        if @lambda_stack.last == @paren_nest
          @lambda_stack.pop
          @command_start = true
          emit(:tLAMBEG, '{'.freeze)
        else
          emit(:tLBRACE, '{'.freeze)
        end
        @paren_nest += 1
        fbreak;
      };

      # a([1, 2])
      e_lbrack
      => { emit(:tLBRACK, '['.freeze)
           fbreak; };

      # a()
      e_lparen
      => { emit(:tLPAREN, '('.freeze)
           fbreak; };

      # a(+b)
      punctuation_begin
      => { emit_table(PUNCTUATION_BEGIN)
           fbreak; };

      # rescue Exception => e: Block rescue.
      # Special because it should transition to expr_mid.
      'rescue' %{ tm = p } '=>'?
      => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
           p = tm - 1
           fnext expr_mid; fbreak; };

      # if a: Statement if.
      keyword_modifier
      => { emit_table(KEYWORDS_BEGIN)
           @command_start = true
           fnext expr_value; fbreak; };

      #
      # RUBY 1.9 HASH LABELS
      #

      label ( any - ':' )
      => {
        fhold;

        if version?(18)
          ident = tok(@ts, @te - 2)

          emit((@source_buffer.slice(@ts, 1) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
               ident, @ts, @te - 2)
          fhold; # continue as a symbol

          if !@static_env.nil? && @static_env.declared?(ident)
            fnext expr_end;
          else
            fnext *arg_or_cmdarg(cmd_state);
          end
        else
          emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
          fnext expr_labelarg;
        end

        fbreak;
      };

      #
      # RUBY 2.7 BEGINLESS RANGE

      '..'
      => {
        if @version >= 27
          emit(:tBDOT2)
        else
          emit(:tDOT2)
        end

        fnext expr_beg; fbreak;
      };

      '...' c_nl?
      => {
        # Here we scan and conditionally emit "\n":
        # + if it's there
        #   + and emitted we do nothing
        #   + and not emitted we return `p` to "\n" to process it on the next scan
        # + if it's not there we do nothing
        followed_by_nl = @te - 1 == @newline_s
        nl_emitted = false
        dots_te = followed_by_nl ? @te - 1 : @te

        if @version >= 30
          if @lambda_stack.any? && @lambda_stack.last + 1 == @paren_nest
            # To reject `->(...)` like `->...`
            emit(:tDOT3, '...'.freeze, @ts, dots_te)
          else
            emit(:tBDOT3, '...'.freeze, @ts, dots_te)

            if @version >= 31 && followed_by_nl && @context.in_argdef
              emit(:tNL, @te - 1, @te)
              nl_emitted = true
            end
          end
        elsif @version >= 27
          emit(:tBDOT3, '...'.freeze, @ts, dots_te)
        else
          emit(:tDOT3, '...'.freeze, @ts, dots_te)
        end

        if followed_by_nl && !nl_emitted
          # return "\n" to process it on the next scan
          fhold;
        end

        fnext expr_beg; fbreak;
      };

      #
      # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
      #

      # foo= bar:  Disambiguate with bareword rule below.
      bareword ambiguous_ident_suffix |
      # def foo:   Disambiguate with bareword rule below.
      keyword
      => { p = @ts - 1
           fgoto expr_end; };

      # a = 42;     a [42]: Indexing.
      # def a; end; a [42]: Array argument.
      call_or_var
      => local_ident;

      (call_or_var - keyword)
        % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
      w_space+ '('
      => {
        emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
        p = ident_te - 1

        if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
          fnext expr_endfn;
        else
          fnext expr_cmdarg;
        end
        fbreak;
      };

      #
      # WHITESPACE
      #

      w_any;

      e_nl '=begin' ( c_space | c_nl_zlen )
      => {
        p = @ts - 1
        @cs_before_block_comment = @cs
        fgoto line_begin;
      };

      #
      # DEFAULT TRANSITION
      #

      # The following rules match most binary and all unary operators.
      # Rules for binary operators provide better error reporting.
      operator_arithmetic '='    |
      operator_rest              |
      punctuation_end            |
      c_any
      => { p = @ts - 1; fgoto expr_end; };

      c_eof => do_eof;
  *|;

  # Special newline handling for "def a b:"
  #
  expr_labelarg := |*
    w_space_comment;

    w_newline
    => {
      if @context.in_kwarg
        fhold; fgoto expr_end;
      else
        fgoto line_begin;
      end
    };

    c_any
    => { fhold; fgoto expr_beg; };

    c_eof => do_eof;
  *|;

  # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
  #
  expr_value := |*
      # a:b: a(:b), a::B, A::B
      label (any - ':')
      => { p = @ts - 1
           fgoto expr_end; };

      # "bar", 'baz'
      ['"] # '
      => {
        @strings.push_literal(tok, tok, @ts)
        fgoto inside_string;
      };

      w_space_comment;

      w_newline
      => { fgoto line_begin; };

      c_any
      => { fhold; fgoto expr_beg; };

      c_eof => do_eof;
  *|;

  expr_end := |*
      #
      # STABBY LAMBDA
      #

      '->'
      => {
        emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)

        @lambda_stack.push @paren_nest
        fnext expr_endfn; fbreak;
      };

      e_lbrace | 'do'
      => {
        if @lambda_stack.last == @paren_nest
          @lambda_stack.pop

          if tok == '{'.freeze
            emit(:tLAMBEG, '{'.freeze)
          else # 'do'
            emit(:kDO_LAMBDA, 'do'.freeze)
          end
        else
          if tok == '{'.freeze
            emit(:tLCURLY, '{'.freeze)
          else # 'do'
            emit_do
          end
        end
        if tok == '{'.freeze
          @paren_nest += 1
        end
        @command_start = true

        fnext expr_value; fbreak;
      };

      #
      # KEYWORDS
      #

      keyword_with_fname
      => { emit_table(KEYWORDS)
           fnext expr_fname; fbreak; };

      'class' w_any* '<<'
      => { emit_singleton_class
           fnext expr_value; fbreak; };

      # a if b:c: Syntax error.
      keyword_modifier
      => { emit_table(KEYWORDS)
           fnext expr_beg; fbreak; };

      # elsif b:c: elsif b(:c)
      keyword_with_value
      => { emit_table(KEYWORDS)
           @command_start = true
           fnext expr_value; fbreak; };

      keyword_with_mid
      => { emit_table(KEYWORDS)
           fnext expr_mid; fbreak; };

      keyword_with_arg
      => {
        emit_table(KEYWORDS)

        if version?(18) && tok == 'not'.freeze
          fnext expr_beg; fbreak;
        else
          fnext expr_arg; fbreak;
        end
      };

      '__ENCODING__'
      => {
        if version?(18)
          emit(:tIDENTIFIER)

          unless !@static_env.nil? && @static_env.declared?(tok)
            fnext *arg_or_cmdarg(cmd_state);
          end
        else
          emit(:k__ENCODING__, '__ENCODING__'.freeze)
        end
        fbreak;
      };

      keyword_with_end
      => { emit_table(KEYWORDS)
           fbreak; };

      #
      # NUMERIC LITERALS
      #

      ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex
      | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
      | '0' [Oo] %{ @num_base = 8;  @num_digits_s = p } int_dec
      | '0' [Bb] %{ @num_base = 2;  @num_digits_s = p } int_bin
      | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec
      | '0'   digit* '_'? %{ @num_base = 8;  @num_digits_s = @ts } int_dec
      ) %{ @num_suffix_s = p } int_suffix
      => {
        digits = numeric_literal_int

        if version?(18, 19, 20)
          emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s)
          p = @num_suffix_s - 1
        else
          p = @num_xfrm.call(digits.to_i(@num_base), p)
        end
        fbreak;
      };

      flo_frac flo_pow?
      => {
        diagnostic :error, :no_dot_digit_literal
      };

      flo_int [eE]
      => {
        if version?(18, 19, 20)
          diagnostic :error,
                     :trailing_in_number, { :character => tok(@te - 1, @te) },
                     range(@te - 1, @te)
        else
          emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1)
          fhold; fbreak;
        end
      };

      flo_int flo_frac [eE]
      => {
        if version?(18, 19, 20)
          diagnostic :error,
                     :trailing_in_number, { :character => tok(@te - 1, @te) },
                     range(@te - 1, @te)
        else
          emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1)
          fhold; fbreak;
        end
      };

      flo_int
      ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
      | flo_frac          %{ @num_suffix_s = p } flo_suffix
      )
      => {
        digits = tok(@ts, @num_suffix_s)

        if version?(18, 19, 20)
          emit(:tFLOAT, Float(digits), @ts, @num_suffix_s)
          p = @num_suffix_s - 1
        else
          p = @num_xfrm.call(digits, p)
        end
        fbreak;
      };

      #
      # STRING AND XSTRING LITERALS
      #

      # `echo foo`, "bar", 'baz'
      '`' | ['"] # '
      => {
        type, delimiter = tok, tok[-1].chr
        @strings.push_literal(type, delimiter, @ts, nil, false, false, true);
        fgoto inside_string;
      };

      #
      # CONSTANTS AND VARIABLES
      #

      constant
      => { emit(:tCONSTANT)
           fnext *arg_or_cmdarg(cmd_state); fbreak; };

      constant ambiguous_const_suffix
      => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
           p = tm - 1; fbreak; };

      global_var | class_var_v | instance_var_v
      => { p = @ts - 1; fcall expr_variable; };

      #
      # METHOD CALLS
      #

      '.' | '&.' | '::'
      => { emit_table(PUNCTUATION)
           fnext expr_dot; fbreak; };

      call_or_var
      => local_ident;

      bareword ambiguous_fid_suffix
      => {
        if tm == @te
          # Suffix was consumed, e.g. foo!
          emit(:tFID)
        else
          # Suffix was not consumed, e.g. foo!=
          emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
          p = tm - 1
        end
        fnext expr_arg; fbreak;
      };

      #
      # OPERATORS
      #

      '*' | '=>'
      => {
        emit_table(PUNCTUATION)
        fnext expr_value; fbreak;
      };

      # When '|', '~', '!', '=>' are used as operators
      # they do not accept any symbols (or quoted labels) after.
      # Other binary operators accept it.
      ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
      => {
        emit_table(PUNCTUATION);
        fnext expr_value; fbreak;
      };

      ( e_lparen | '|' | '~' | '!' )
      => { emit_table(PUNCTUATION)
           fnext expr_beg; fbreak; };

      e_rbrace | e_rparen | e_rbrack
      => {
        emit_rbrace_rparen_rbrack

        if tok == '}'.freeze || tok == ']'.freeze
          if @version >= 25
            fnext expr_end;
          else
            fnext expr_endarg;
          end
        else # )
          # fnext expr_endfn; ?
        end

        fbreak;
      };

      operator_arithmetic '='
      => { emit(:tOP_ASGN, tok(@ts, @te - 1))
           fnext expr_beg; fbreak; };

      '?'
      => { emit(:tEH, '?'.freeze)
           fnext expr_value; fbreak; };

      e_lbrack
      => { emit(:tLBRACK2, '['.freeze)
           fnext expr_beg; fbreak; };

      '...' c_nl
      => {
        if @paren_nest == 0
          diagnostic :warning, :triple_dot_at_eol, nil, range(@ts, @te - 1)
        end

        emit(:tDOT3, '...'.freeze, @ts, @te - 1)
        fhold;
        fnext expr_beg; fbreak;
      };

      punctuation_end
      => { emit_table(PUNCTUATION)
           fnext expr_beg; fbreak; };

      #
      # WHITESPACE
      #

      w_space_comment;

      w_newline
      => { fgoto leading_dot; };

      ';'
      => { emit(:tSEMI, ';'.freeze)
           @command_start = true
           fnext expr_value; fbreak; };

      '\\' c_line {
        diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1)
        fhold;
      };

      c_any
      => {
        diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] }
      };

      c_eof => do_eof;
  *|;

  leading_dot := |*
      # Insane leading dots:
      # a #comment
      #  # post-2.7 comment
      #  .b: a.b

      # Here we use '\n' instead of w_newline to not modify @newline_s
      # and eventually properly emit tNL
      (c_space* w_space_comment '\n')+
      => {
        if @version < 27
          # Ruby before 2.7 doesn't support comments before leading dot.
          # If a line after "a" starts with a comment then "a" is a self-contained statement.
          # So in that case we emit a special tNL token and start reading the
          # next line as a separate statement.
          #
          # Note: block comments before leading dot are not supported on any version of Ruby.
          emit(:tNL, nil, @newline_s, @newline_s + 1)
          fhold; fnext line_begin; fbreak;
        end
      };

      c_space* '..'
      => {
        emit(:tNL, nil, @newline_s, @newline_s + 1)
        if @version < 27
          fhold; fnext line_begin; fbreak;
        else
          emit(:tBDOT2)
          fnext expr_beg; fbreak;
        end
      };

      c_space* '...'
      => {
        emit(:tNL, nil, @newline_s, @newline_s + 1)
        if @version < 27
          fhold; fnext line_begin; fbreak;
        else
          emit(:tBDOT3)
          fnext expr_beg; fbreak;
        end
      };

      c_space* %{ tm = p } ('.' | '&.')
      => { p = tm - 1; fgoto expr_end; };

      any
      => { emit(:tNL, nil, @newline_s, @newline_s + 1)
           fhold; fnext line_begin; fbreak; };
  *|;

  #
  # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
  #

  line_comment := |*
      '=end' c_line* c_nl_zlen
      => {
        emit_comment(@eq_begin_s, @te)
        fgoto *@cs_before_block_comment;
      };

      c_line* c_nl;

      c_line* zlen
      => {
        diagnostic :fatal, :embedded_document, nil,
                   range(@eq_begin_s, @eq_begin_s + '=begin'.length)
      };
  *|;

  line_begin := |*
      w_any;

      '=begin' ( c_space | c_nl_zlen )
      => { @eq_begin_s = @ts
           fgoto line_comment; };

      '__END__' ( c_eol - zlen )
      => { p = pe - 3 };

      c_any
      => { cmd_state = true; fhold; fgoto expr_value; };

      c_eof => do_eof;
  *|;

  inside_string := |*
      any
      => {
        p, next_state = @strings.advance(p)

        fhold; # Ragel will do `p += 1` to consume input, prevent it
        fnext *next_state;
        fbreak;
      };
  *|;

  }%%
  # %
end