whitequark/parser

View on GitHub
lib/parser/lexer/literal.rb

Summary

Maintainability
C
7 hrs
Test Coverage
# encoding: binary
# frozen_string_literal: true

module Parser

  # Tracks the state of a single string-like literal while it is being lexed:
  # plain and percent strings, symbols, regexps, xstrings, word/symbol arrays
  # and heredocs (see TYPES for the full list of recognised type specifiers).
  #
  # An instance buffers literal content, counts delimiter nesting and
  # interpolation braces, and produces tokens by calling the owning lexer's
  # `emit` method.
  class Lexer::Literal
    # Opening delimiters whose closing counterpart differs from the opener.
    # Only these delimiters can nest; any other delimiter closes with itself.
    DELIMITERS = { '(' => ')', '[' => ']', '{' => '}', '<' => '>' }.freeze

    # Byte values accepted in front of a heredoc terminator (see #delimiter?).
    SPACE = ' '.ord
    TAB = "\t".ord

    # Maps a literal type specifier to the token that opens it and to whether
    # the literal supports interpolation.
    TYPES = {
    # type       start token     interpolate?
      "'"   => [ :tSTRING_BEG,   false ],
      "<<'" => [ :tSTRING_BEG,   false ],
      '%q'  => [ :tSTRING_BEG,   false ],
      '"'   => [ :tSTRING_BEG,   true  ],
      '<<"' => [ :tSTRING_BEG,   true  ],
      '%'   => [ :tSTRING_BEG,   true  ],
      '%Q'  => [ :tSTRING_BEG,   true  ],

      '%w'  => [ :tQWORDS_BEG,   false ],
      '%W'  => [ :tWORDS_BEG,    true  ],

      '%i'  => [ :tQSYMBOLS_BEG, false ],
      '%I'  => [ :tSYMBOLS_BEG,  true  ],

      ":'"  => [ :tSYMBEG,       false ],
      '%s'  => [ :tSYMBEG,       false ],
      ':"'  => [ :tSYMBEG,       true  ],

      '/'   => [ :tREGEXP_BEG,   true  ],
      '%r'  => [ :tREGEXP_BEG,   true  ],

      '%x'  => [ :tXSTRING_BEG,  true  ],
      '`'   => [ :tXSTRING_BEG,  true  ],
      '<<`' => [ :tXSTRING_BEG,  true  ],
    }.freeze

    # heredoc_e    - the `heredoc_e` value given to #initialize (nil when the
    #                literal is not a heredoc).
    # str_s        - start position of the string type specifier.
    # dedent_level - smallest indentation recorded by #infer_indent_level,
    #                or nil when nothing has been recorded yet.
    attr_reader   :heredoc_e, :str_s, :dedent_level

    # Never read or written inside this class; presumably bookkeeping owned
    # by the lexer -- confirm against the caller.
    attr_accessor :saved_herebody_s

    # Sets up the literal and, unless it is monolithic, immediately emits its
    # start token.
    #
    # lexer         - object that receives `diagnostic`, `range` and `emit`
    #                 (via `send`) and exposes `source_buffer.source.encoding`.
    # str_type      - type specifier; expected to be a key of TYPES. An
    #                 unknown value is reported through the lexer as an
    #                 :unexpected_percent_str error diagnostic.
    # delimiter     - opening delimiter of the literal.
    # str_s         - start position of the type specifier.
    # heredoc_e     - when truthy the literal is treated as a heredoc, and
    #                 this value is used as the end position of the start
    #                 token.
    # indent        - for non-heredoc literals, compare the closing delimiter
    #                 after stripping leading whitespace (see #delimiter?).
    # dedent_body   - enables indentation tracking in #infer_indent_level.
    # label_allowed - allows the literal to be closed as a quoted label
    #                 (tLABEL_END) in #nest_and_try_closing.
    def initialize(lexer, str_type, delimiter, str_s, heredoc_e = nil,
                   indent = false, dedent_body = false, label_allowed = false)
      @lexer       = lexer
      @nesting     = 1

      # DELIMITERS and TYPES are hashes with keys encoded in binary.
      # Coerce incoming data to the same encoding.
      str_type     = coerce_encoding(str_type)
      delimiter    = coerce_encoding(delimiter)

      unless TYPES.include?(str_type)
        @lexer.send(:diagnostic, :error, :unexpected_percent_str,
                    { :type => str_type }, @lexer.send(:range, str_s, str_s + 2))
      end

      # String type. For :'foo', it is :'
      @str_type    = str_type
      # Start of the string type specifier.
      @str_s       = str_s

      @start_tok, @interpolate = TYPES[str_type]
      # Only paired delimiters can nest, so @start_delim stays nil for
      # self-closing delimiters such as quotes.
      @start_delim = DELIMITERS.include?(delimiter) ? delimiter : nil
      @end_delim   = DELIMITERS.fetch(delimiter, delimiter)

      @heredoc_e     = heredoc_e
      @indent        = indent
      @label_allowed = label_allowed

      @dedent_body   = dedent_body
      @dedent_level  = nil

      @interp_braces = 0

      # Starts out true so that #extend_space emits nothing until some
      # content has been flushed.
      @space_emitted = true

      # Monolithic strings are glued into a single token, e.g.
      # tSTRING_BEG tSTRING_CONTENT tSTRING_END -> tSTRING.
      @monolithic  = (@start_tok == :tSTRING_BEG  &&
                      %w(' ").include?(str_type) &&
                      !heredoc?)

      # Capture opening delimiter in percent-literals.
      @str_type += delimiter if @str_type.start_with?('%'.freeze)

      clear_buffer

      # A monolithic literal postpones its start token: it is emitted either
      # by #flush_string or replaced by a single tSTRING on close.
      emit_start_tok unless @monolithic
    end

    # Whether the literal type supports interpolation (second TYPES column).
    def interpolate?
      @interpolate
    end

    # True for word and symbol array literals (%w, %W, %i, %I).
    def words?
      type == :tWORDS_BEG || type == :tQWORDS_BEG ||
        type == :tSYMBOLS_BEG || type == :tQSYMBOLS_BEG
    end

    # True for regexp literals (/ and %r).
    def regexp?
      type == :tREGEXP_BEG
    end

    # True when a heredoc end position was supplied to #initialize.
    def heredoc?
      !!@heredoc_e
    end

    # Heredoc created without the dedent_body flag.
    def plain_heredoc?
      heredoc? && !@dedent_body
    end

    # Heredoc created with the dedent_body flag.
    def squiggly_heredoc?
      heredoc? && @dedent_body
    end

    # True when the closing delimiter is a backslash.
    def backslash_delimited?
      @end_delim == '\\'.freeze
    end

    # The start token of the literal (first TYPES column).
    def type
      @start_tok
    end

    # Returns true when `character` is a backslash, one of this literal's
    # delimiters, or -- in word/symbol arrays -- a whitespace character.
    def munge_escape?(character)
      character = coerce_encoding(character)

      if words? && character =~ /[ \t\v\r\f\n]/
        true
      else
        ['\\'.freeze, @start_delim, @end_delim].include?(character)
      end
    end

    # Updates the nesting counter for `delimiter` and, once the outermost
    # delimiter has been closed, emits the closing token(s).
    #
    # delimiter - candidate delimiter text.
    # ts, te    - start and end positions of the delimiter.
    # lookahead - optional string inspected for a leading single ':' in order
    #             to detect quoted labels; presumably the source text that
    #             follows `te` -- confirm against the caller.
    #
    # Returns the result of the final `emit` call when the literal has been
    # closed, nil otherwise.
    def nest_and_try_closing(delimiter, ts, te, lookahead=nil)
      delimiter = coerce_encoding(delimiter)

      if @start_delim && @start_delim == delimiter
        @nesting += 1
      elsif delimiter?(delimiter)
        @nesting -= 1
      end

      # Finalize if last matching delimiter is closed.
      if @nesting == 0
        if words?
          # Flush the last element and emit a zero-width tSPACE at `ts`
          # unless a space has already been emitted.
          extend_space(ts, ts)
        end

        if lookahead && @label_allowed && lookahead[0] == ?: &&
           lookahead[1] != ?: && @start_tok == :tSTRING_BEG
          # This is a quoted label.
          flush_string
          # The token range is one position longer than the delimiter.
          emit(:tLABEL_END, @end_delim, ts, te + 1)
        elsif @monolithic
          # Emit the string as a single token.
          emit(:tSTRING, @buffer, @str_s, te)
        else
          # If this is a heredoc, @buffer contains the sentinel now.
          # Just throw it out. Lexer flushes the heredoc after each
          # non-heredoc-terminating \n anyway, so no data will be lost.
          flush_string unless heredoc?

          emit(:tSTRING_END, @end_delim, ts, te)
        end
      end
    end

    # Records the indentation of `line` in @dedent_level when it is smaller
    # than the level recorded so far. Does nothing unless dedent_body was
    # requested. A space counts as one column; a tab advances to the next
    # multiple of eight. A line made only of spaces and tabs (or an empty
    # line) never reaches a non-blank character and so leaves the level
    # untouched.
    def infer_indent_level(line)
      return unless @dedent_body

      indent_level = 0
      line.each_char do |char|
        case char
        when ?\s
          indent_level += 1
        when ?\t
          indent_level += (8 - indent_level % 8)
        else
          if @dedent_level.nil? || @dedent_level > indent_level
            @dedent_level = indent_level
          end
          break
        end
      end
    end

    # Notes that an interpolation brace has been opened.
    def start_interp_brace
      @interp_braces += 1
    end

    # Notes that an interpolation brace has been closed. Returns true when
    # the brace counter is back at zero.
    def end_interp_brace_and_try_closing
      @interp_braces -= 1

      (@interp_braces == 0)
    end

    # Appends `string` to the content buffer. The buffered range starts at
    # the first `ts` seen since the last flush and ends at the latest `te`.
    def extend_string(string, ts, te)
      @buffer_s ||= ts
      @buffer_e = te

      @buffer << string
    end

    # Emits buffered content as tSTRING_CONTENT and resets the buffer.
    # A monolithic literal is first demoted to a regular one: its postponed
    # start token is emitted and the flag is cleared.
    def flush_string
      if @monolithic
        emit_start_tok
        @monolithic = false
      end

      unless @buffer.empty?
        emit(:tSTRING_CONTENT, @buffer, @buffer_s, @buffer_e)

        clear_buffer
        extend_content
      end
    end

    # Marks that content has been seen, so the next #extend_space call will
    # emit a tSPACE.
    def extend_content
      @space_emitted = false
    end

    # Flushes pending content and emits one tSPACE for the range ts...te,
    # unless a space has already been emitted since the last content.
    def extend_space(ts, te)
      flush_string

      unless @space_emitted
        emit(:tSPACE, nil, ts, te)

        @space_emitted = true
      end
    end

    # True for interpolating literals that are not word/symbol arrays.
    def supports_line_continuation_via_slash?
      !words? && @interpolate
    end

    protected

    # Whether `delimiter` closes this literal.
    def delimiter?(delimiter)
      if heredoc?
        # This heredoc is valid:
        # <<~E
        #   E
        # and this:
        # <<~E
        # E
        # but this one is not:
        # <<~'  E'
        # E
        # because there are not enough leading spaces in the closing delimiter.
        #
        # NOTE(review): this branch never consults @indent, so any heredoc
        # accepts a terminator preceded by spaces/tabs -- confirm that this
        # is intended for non-indented heredocs as well.
        return false unless delimiter.end_with?(@end_delim)

        # Everything in front of the terminator must be spaces or tabs.
        # The suffix is already known to match, so slice it off by size
        # instead of building a Regexp on every call.
        padding = delimiter.byteslice(0, delimiter.bytesize - @end_delim.bytesize)
        padding.each_byte.all? { |c| c == SPACE || c == TAB }
      elsif @indent
        @end_delim == delimiter.lstrip
      else
        @end_delim == delimiter
      end
    end

    # Returns a binary-encoded (ASCII-8BIT) copy of `string`.
    def coerce_encoding(string)
      string.b
    end

    # Replaces the content buffer with a new empty string and forgets the
    # buffered source range.
    def clear_buffer
      # String literals are frozen in this file, hence the dup.
      @buffer = ''.dup

      # Prime the buffer with lexer encoding; otherwise,
      # concatenation will produce varying results.
      @buffer.force_encoding(@lexer.source_buffer.source.encoding)

      @buffer_s = nil
      @buffer_e = nil
    end

    # Emits the start token. It ends at @heredoc_e for heredocs; otherwise it
    # spans the type specifier (which, for percent-literals, includes the
    # opening delimiter).
    def emit_start_tok
      str_e = @heredoc_e || @str_s + @str_type.length
      emit(@start_tok, @str_type, @str_s, str_e)
    end

    # Forwards a token to the lexer's `emit` method.
    def emit(token, type, s, e)
      @lexer.send(:emit, token, type, s, e)
    end
  end

end