guerilla-di/tracksperanto

lib/import/shake_grammar/lexer.rb

require 'bychar'

module Tracksperanto::ShakeGrammar
  class WrongInputError < RuntimeError; end
  
  # Since Shake uses a C-like language for its scripts, we rig up a very sloppy
  # but concise C-like lexer to cope
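  #
  # A rough usage sketch (illustrative only, assuming the bychar gem can wrap a
  # StringIO and the script is small enough to parse in one go):
  #
  #   require 'stringio'
  #   lexer = Tracksperanto::ShakeGrammar::Lexer.new(StringIO.new('Blur(image, 2);'))
  #   lexer.stack #=> roughly [[:funcall, "Blur", [:atom, "image"], 2]]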
  class Lexer
    
    # Parsed stack
    attr_reader :stack
    
    # Access to the sentinel object
    attr_reader :sentinel
    
    STOP_TOKEN = :__stop #:nodoc:
    MAX_BUFFER_SIZE = 32000 # Bail out if a single atom accumulates more bytes than this
    MAX_STACK_DEPTH = 127 # Bail out if nested structures go deeper than this
    
    # The first argument is the IO handle to the data of the Shake script.
    # The second argument is a "sentinel" that is going to be passed
    # to the downstream lexers instantiated for nested data structures.
    # You can use the sentinel, for example, to collect data from child nodes.
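    #
    # For example (a sketch, not from the original docs), you could pass an Array
    # as the sentinel and have a Lexer subclass push collected values into it
    # (script_io stands in for your opened script here):
    #
    #   collected = []
    #   Tracksperanto::ShakeGrammar::Lexer.new(script_io, collected)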
    def initialize(with_io, sentinel = nil, limit_to_one_stmt = false, stack_depth = 0)
      # We parse byte by byte, but reading byte by byte is very slow. We therefore use a buffering reader
      # that will cache in chunks, and then read from there byte by byte.
      # This yields a substantial speedup (4.9 seconds for the test
      # as opposed to 7.9 without this). We check whether wrapping is needed only once, so that
      # nested lexers do not re-wrap the passed IO, and wrapping only happens if necessary.
      with_io = Bychar.wrap(with_io) unless with_io.respond_to?(:read_one_char)
      @io, @stack, @buf, @sentinel, @limit_to_one_stmt, @stack_depth  = with_io, [], '', sentinel, limit_to_one_stmt, stack_depth
      
      catch(STOP_TOKEN) do
        loop { parse }
      end
      
      @in_comment ? consume_comment! : consume_atom!
    end
    
    private
    
    def push_comment
      push [:comment, @buf.gsub(/(\s+?)\/\/{1}/, '')]
    end
    
    def consume_comment!
      push_comment
      erase_buffer
    end
    
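    # Roughly, the branches in parse below turn source fragments into stack
    # entries like these (hand-traced sketches, not from the original docs):
    #
    #   curveX = 15;    pushes [:assign, [:vardef, :image, "curveX"], 15]
    #   [1, 2, 3]       pushes [:arr, [1, 2, 3]]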
    def parse
      
      if @buf.length > MAX_BUFFER_SIZE # Wrong format and the buffer is filled up, bail
        raise WrongInputError, "Atom buffer overflow at #{MAX_BUFFER_SIZE} bytes, this is definitely not a Shake script"
      end
      
      if @stack_depth > MAX_STACK_DEPTH # Wrong format - parentheses overload
        raise WrongInputError, "Stack overflow at level #{MAX_STACK_DEPTH}, this is probably a LISP program uploaded by accident"
      end
      
      c = @io.read_one_char
      throw STOP_TOKEN if c.nil? # IO has run out
      
      if c == '/' && (@buf[-1].chr rescue nil) == '/' # Comment start
        # If some other data from this line has been accumulated we first consume that
        @buf = @buf[0..-2] # everything except the opening slash of the comment
        consume_atom!
        erase_buffer
        @in_comment = true
      elsif @in_comment && c == "\n" # Comment end
        consume_comment!
        @in_comment = false
      elsif @in_comment
        @buf << c
      elsif !@buf.empty? && (c == "(") # Funcall
        push([:funcall, @buf.strip] + self.class.new(@io, @sentinel, limit_to_one_stmt = false, @stack_depth + 1).stack)
        erase_buffer
      elsif c == '{' # OFX curly braces or a subexpression in a node's knob
        # Parse the subexpression so that it gets consumed from the IO, but discard
        # its contents and only record that an expression was present
        substack = self.class.new(@io, @sentinel, limit_to_one_stmt = true, @stack_depth + 1).stack
        push(:expr)
      elsif c == "[" # Array, booring
        push([:arr, self.class.new(@io).stack])
      elsif c == "}"# && @limit_to_one_stmt
        throw STOP_TOKEN
      elsif (c == "]" || c == ")" || c == ";" && @limit_to_one_stmt)
        # Bailing out of a subexpression
        consume_atom!
        throw STOP_TOKEN
      elsif (c == "," && @limit_to_one_stmt)
        consume_atom!
        throw STOP_TOKEN
      elsif (c == ",")
        consume_atom!
      elsif (c == "@")
        consume_atom!
        @buf << c
      elsif (c == ";" || c == "\n")
        # Skip these - the subexpression already is expanded anyway
      elsif (c == "=")
        vardef_atom = vardef(@buf.strip)
        push [:assign, vardef_atom, self.class.new(@io, @sentinel, limit_to_one_stmt = true, @stack_depth + 1).stack.shift]
        
        erase_buffer
      else
        @buf << c
      end
    end
    
    INT_ATOM = /^(\d+)$/
    FLOAT_ATOM = /^([\-\d\.]+)$/
    STR_ATOM = /^\"/
    AT_FRAME = /^@(-?\d+)/
    
    # Grab the smallest atomic value from the buffer and push it onto the stack
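    #
    # For example (illustrative only), buffered text maps to stack entries
    # roughly like this:
    #
    #   "15"          becomes 15
    #   "1.15"        becomes 1.15
    #   "\"Linear\""  becomes "Linear"
    #   "@12"         becomes [:value_at, 12, <the previously pushed atom>]
    #   "xFilter"     becomes [:atom, "xFilter"]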
    def consume_atom!
      at = @buf.strip
      erase_buffer
      return if at.empty?
      
      the_atom = case at
        when INT_ATOM
          at.to_i
        when STR_ATOM
          unquote_s(at)
        when FLOAT_ATOM
          at.to_f
        when AT_FRAME
          if $1.include?(".")
            [:value_at, $1.to_f, @stack.pop]
          else
            [:value_at, $1.to_i, @stack.pop]
          end
        else
          [:atom, at]
      end
      
      push(the_atom)
    end
    
    # Strip the surrounding double quotes and unescape embedded \" sequences
    def unquote_s(string)
      string.strip.gsub(/^\"/, '').gsub(/\"$/, '').gsub(/\\\"/, '"')
    end
    
    # In the default implementation this just puts things on the stack. However,
    # if you want to unwrap structures as they come along (which you do for big files)
    # you have to override this
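    #
    # A hypothetical override (sketch only, not part of this file) could look like:
    #
    #   class StreamingLexer < Lexer
    #     def push(atom_array)
    #       # Hypothetical sentinel callback; hands each funcall off as it is parsed
    #       @sentinel.handle_funcall(atom_array) if atom_array.is_a?(Array) && atom_array[0] == :funcall
    #       super
    #     end
    #   end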
    def push(atom_array)
      @stack << atom_array
    end
    
    def vardef(var_specifier)
      # Since we can have two-word pointers as typedefs (char *) we only use the last
      # word of the specifier as the variable name. Nodes get the :image type implicitly.
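      # For example (illustrative): "float v1" yields [:vardef, "float", "v1"],
      # "char * label" yields [:vardef, "char *", "label"], and a bare node name
      # like "Blur1" yields [:vardef, :image, "Blur1"].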
      varname_re = /\w+$/
      varname = var_specifier.scan(varname_re).flatten.join
      typedef = var_specifier.gsub(varname_re, '').strip
      typedef = :image if typedef.empty?
      
      [:vardef, typedef, varname]
    end
    
    def erase_buffer
      @buf = ''
    end
  end
end