jeffijoe/awilix

View on GitHub
src/function-tokenizer.ts

Summary

Maintainability
D
2 days
Test Coverage
/**
 * Every kind of token the tokenizer can produce.
 *
 * Only `'ident'` tokens carry a `value`; all other kinds are fully
 * described by their type alone.
 */
export type TokenType =
  // Keywords the tokenizer recognizes; emitted without a value.
  | 'function'
  | 'class'
  // Any other identifier-like run of characters (may contain `.`).
  | 'ident'
  // Punctuation the tokenizer reports as-is.
  | '('
  | ')'
  | ','
  // Signals that a default value follows the preceding parameter.
  | '='
  // Presumably the generator marker (`function*`) — confirm against the parser.
  | '*'
  // Emitted once the end of the source has been reached.
  | 'EOF'

/**
 * Lexer Token.
 *
 * A token as handed out by the tokenizer's `next()` function.
 */
export interface Token {
  /**
   * The kind of token.
   */
  type: TokenType
  /**
   * The token text. The tokenizer only sets this for `'ident'` tokens;
   * for every other token type the property is omitted entirely.
   */
  value?: string
}

/**
 * Bit flags accepted by the tokenizer's `next()` function to switch
 * optional behavior on or off for that single call.
 */
export const enum TokenizerFlags {
  /**
   * No special behavior requested.
   */
  None = 0,
  /**
   * When set, the tokenizer does not try to be clever about skipping
   * expressions (such as default values following a `=`).
   */
  Dumb = 1 << 0,
}

/**
 * Creates a tokenizer for the specified source.
 *
 * The tokenizer is deliberately minimal: it only reports the tokens listed in
 * `TokenType` and silently steps over every other character. Comments are
 * skipped, and (unless `TokenizerFlags.Dumb` is passed to `next`) the
 * expression following a `=` is skipped as well.
 *
 * Known limitations: regular expression literals and comments that appear
 * *inside* a skipped default-value expression are not recognized.
 *
 * @param source the source text to tokenize.
 * @returns an object exposing `next(flags?)`, which advances and returns the
 * next token, and `done()`, which tells whether the last token was `EOF`.
 */
export function createTokenizer(source: string) {
  const end = source.length
  let pos = 0
  let type: TokenType = 'EOF'
  let value = ''
  let flags = TokenizerFlags.None
  // Running totals of the `(` and `)` characters consumed so far. They let
  // `skipExpression` tell whether it is at the root of a parameter list
  // (exactly one unclosed paren) or inside a nested call/grouping.
  let parenLeft = 0
  let parenRight = 0

  return {
    next,
    done,
  }

  /**
   * Advances the tokenizer and returns the next token.
   *
   * @param nextFlags flags that apply to this single advance only.
   */
  function next(nextFlags = TokenizerFlags.None): Token {
    flags = nextFlags
    advance()
    return createToken()
  }

  /**
   * Advances the tokenizer state to the next token, leaving the result
   * in `type` and `value`.
   */
  function advance(): void {
    value = ''
    // If the loop runs off the end of the source, this is the result.
    type = 'EOF'
    while (pos < end) {
      const ch = source.charAt(pos)
      // Whitespace is irrelevant
      if (isWhiteSpace(ch)) {
        pos++
        continue
      }

      switch (ch) {
        case '(':
          pos++
          parenLeft++
          type = ch
          return
        case ')':
          pos++
          parenRight++
          type = ch
          return
        case '*':
          pos++
          type = ch
          return
        case ',':
          pos++
          type = ch
          return
        case '=':
          pos++
          if ((flags & TokenizerFlags.Dumb) === 0) {
            // Not in dumb-mode, so attempt to skip the default value.
            skipExpression()
          }
          // The token is still emitted: callers need to know that there is
          // a default value so they can treat the parameter as optional.
          type = ch
          return
        case '/': {
          const nextCh = source.charAt(pos + 1)
          if (nextCh === '/') {
            // Line comment: skip to the newline, then step past it.
            pos += 2
            skipUntil((c) => c === '\n', true)
            pos++
          } else if (nextCh === '*') {
            // Block comment: start searching *after* the opening `/*` so
            // that its `*` cannot be mistaken for the start of `*/`.
            pos += 2
            skipUntil((c) => c === '*' && source.charAt(pos + 1) === '/', true)
            // Step past the whole `*/`; leaving the cursor on the `/` would
            // let it pair up with a following `*` or `/` as a new comment.
            pos += 2
          } else {
            // A lone slash (e.g. division) is of no interest.
            pos++
          }
          continue
        }
        default:
          // Scans an identifier.
          if (isIdentifierStart(ch)) {
            scanIdentifier()
            return
          }

          // Anything else is a character we don't care about.
          pos++
      }
    }
  }

  /**
   * Scans an identifier, given it's already been proven
   * we are ready to do so (the current character is an identifier start).
   *
   * The keywords `function` and `class` get their own token type and no
   * value; everything else becomes an `ident` token carrying its text.
   */
  function scanIdentifier(): void {
    const start = pos++
    while (isIdentifierPart(source.charAt(pos))) {
      pos++
    }
    const text = source.substring(start, pos)
    if (text === 'function' || text === 'class') {
      type = text
      value = ''
    } else {
      type = 'ident'
      value = text
    }
  }

  /**
   * Skips everything until the next top-level comma or the end of the
   * parameter list, leaving the cursor on that `,` or `)`.
   *
   * Parenthesis balance is tracked so function calls are skipped correctly,
   * and `[]`/`{}` nesting is tracked so commas inside array and object
   * literals (e.g. `a = [1, 2]`) do not end the expression early.
   */
  function skipExpression(): void {
    // Nesting depth of `[`/`{` within the expression being skipped.
    let bracketDepth = 0
    skipUntil((ch) => {
      // Exactly one unclosed paren means we are directly inside the
      // parameter list rather than inside a nested call or grouping.
      const isAtRoot = parenLeft === parenRight + 1
      switch (ch) {
        case ',':
          return isAtRoot && bracketDepth === 0
        case '(':
          parenLeft++
          return false
        case ')':
          if (isAtRoot) {
            // This closes the parameter list. It is not consumed here, so
            // `advance` will count it; counting it here too would record
            // the same paren twice.
            return true
          }
          parenRight++
          return false
        case '[':
        case '{':
          bracketDepth++
          return false
        case ']':
        case '}':
          // Never go negative on unbalanced input.
          if (bracketDepth > 0) {
            bracketDepth--
          }
          return false
        default:
          return false
      }
    })
  }

  /**
   * Skips strings and whitespace until the predicate is true.
   * When it returns, the cursor is on the character that satisfied the
   * predicate, or at the end of the source if none did.
   *
   * @param callback stops skipping when this returns `true`. It is not
   * invoked for characters inside strings that get skipped.
   * @param dumb if `true`, does not skip whitespace and strings;
   * it only stops once the callback returns `true`.
   */
  function skipUntil(callback: (ch: string) => boolean, dumb = false): void {
    while (pos < end) {
      const ch = source.charAt(pos)
      if (callback(ch)) {
        return
      }

      if (!dumb) {
        if (isWhiteSpace(ch)) {
          pos++
          continue
        }

        if (isStringQuote(ch)) {
          skipString()
          continue
        }
      }
      pos++
    }
  }

  /**
   * Given the current position is at a string quote, skips the entire string,
   * leaving the cursor just past the closing quote (or at/after the end of
   * the source if the string is unterminated).
   */
  function skipString(): void {
    const quote = source.charAt(pos)
    pos++
    while (pos < end) {
      const ch = source.charAt(pos)

      // Step over escape sequences as a unit. Looking only at the previous
      // character is not enough: in `'\\'` the closing quote is preceded by
      // a backslash, yet that backslash is itself escaped.
      if (ch === '\\') {
        pos += 2
        continue
      }

      if (ch === quote) {
        pos++
        return
      }

      // Template strings are a bit tougher, we want to skip the interpolated values.
      if (quote === '`' && ch === '$' && source.charAt(pos + 1) === '{') {
        // Move past the `${` and skip strings and whitespace until the
        // matching `}`. Braces are counted so that object literals or blocks
        // inside the interpolation don't end it early; nested template
        // strings are handled by `skipUntil` calling back into `skipString`.
        pos += 2
        let braceDepth = 1
        skipUntil((c) => {
          if (c === '{') {
            braceDepth++
          } else if (c === '}') {
            braceDepth--
          }
          return braceDepth === 0
        })
        // Step past the closing `}`.
        pos++
        continue
      }

      pos++
    }
  }

  /**
   * Creates a token from the current state. The `value` property is only
   * included when there is a non-empty value.
   */
  function createToken(): Token {
    if (value) {
      return { value, type }
    }
    return { type }
  }

  /**
   * Determines if we are done parsing, i.e. the most recent token was `EOF`.
   * Note that this is also `true` before the first call to `next`.
   */
  function done(): boolean {
    return type === 'EOF'
  }
}

// Matches a string consisting of exactly one whitespace character. In
// JavaScript, `\s` covers the ECMAScript WhiteSpace and LineTerminator sets:
// space, tab, vertical tab, form feed, CR, LF, no-break space, BOM,
// U+2028/U+2029 and the Unicode space separators.
const WHITE_SPACE_EXPR = /^\s$/

/**
 * Determines if the given character is a whitespace character.
 *
 * Recognizes every character JavaScript itself treats as whitespace or as a
 * line terminator, not just space/CR/LF. An empty string (what `charAt`
 * yields past the end of the input) is not whitespace.
 *
 * @param ch a single character.
 * @returns `true` if `ch` is whitespace, `false` otherwise.
 */
function isWhiteSpace(ch: string): boolean {
  return WHITE_SPACE_EXPR.test(ch)
}

/**
 * Tells whether the given character opens (or closes) a string literal:
 * a single quote, a double quote or a backtick.
 *
 * @param ch a single character.
 * @returns `true` for `'`, `"` and `` ` ``; `false` for anything else.
 */
function isStringQuote(ch: string): boolean {
  return ch === "'" || ch === '"' || ch === '`'
}

// Single-character patterns used to scan identifiers.
//
// A start character is `_`, `$`, an ASCII letter, or any code unit in the
// range U+00A0–U+FFFF. A part character additionally allows ASCII digits
// and `.`.
//
// NOTE: The `.` character is accepted as an identifier part so that member
// expression paths are seen as a single identifier. This is so we don't get
// a constructor token for stuff like `MyClass.prototype.constructor()`.
//
// NOTE(review): the U+00A0–U+FFFF range is a coarse approximation; it also
// admits code units that are not valid in JS identifiers (U+00A0 itself is
// the no-break space).
const IDENT_START_EXPR = /^[_$a-zA-Z\xA0-\uFFFF]$/
const IDENT_PART_EXPR = /^[._$a-zA-Z0-9\xA0-\uFFFF]$/

/**
 * Determines if the character is accepted as the first character of an
 * identifier, by testing it against `IDENT_START_EXPR`.
 *
 * @param ch a single character; an empty or multi-character string
 * never matches.
 * @returns `true` if an identifier may start with `ch`.
 */
function isIdentifierStart(ch: string) {
  return IDENT_START_EXPR.test(ch)
}

/**
 * Determines if the character is accepted as a subsequent (non-first)
 * character of an identifier, by testing it against `IDENT_PART_EXPR`.
 * Unlike the start check, this also accepts digits and `.`.
 *
 * @param ch a single character; an empty or multi-character string
 * never matches.
 * @returns `true` if an identifier may continue with `ch`.
 */
function isIdentifierPart(ch: string) {
  return IDENT_PART_EXPR.test(ch)
}