src/Tokenizer.js from benmarch/spel2js

src/Tokenizer.js
Summary

Maintainability

1 wk
Test Coverage

Issues
/*
 * Copyright 2002-2015 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @author Andy Clement
 * @author Phillip Webb
 * @author Ben March
 * @since 0.2.0
 */

import {Token} from './Token';
import {TokenKind} from './TokenKind';

// Textual aliases for symbolic operators. Identifiers of length two or three
// are upper-cased and looked up in this list by the tokenizer.
var ALTERNATIVE_OPERATOR_NAMES = [
    'DIV',
    'EQ',
    'GE',
    'GT',
    'LE',
    'LT',
    'MOD',
    'NE',
    'NOT'
];

// Character-class table indexed by char code; each entry is a bitwise OR of
// the IS_* masks below. It starts empty and is populated by init().
var FLAGS = [];

// Bit masks stored in FLAGS.
var IS_DIGIT = 0x01;
var IS_HEXDIGIT = 0x02;
var IS_ALPHA = 0x04;

/**
 * Populate FLAGS with the character classes used by the tokenizer.
 *
 * Each row below is an inclusive character range plus the mask to OR into
 * every entry of that range:
 *   '0'-'9'            -> digit and hex digit
 *   'A'-'F', 'a'-'f'   -> hex digit
 *   'A'-'Z', 'a'-'z'   -> alphabetic
 */
function init() {
    var ranges = [
        ['0', '9', IS_DIGIT | IS_HEXDIGIT],
        ['A', 'F', IS_HEXDIGIT],
        ['a', 'f', IS_HEXDIGIT],
        ['A', 'Z', IS_ALPHA],
        ['a', 'z', IS_ALPHA]
    ];

    ranges.forEach(function (range) {
        var code = range[0].charCodeAt(0),
            last = range[1].charCodeAt(0),
            mask = range[2];

        while (code <= last) {
            // An unset entry is undefined, which |= treats as 0.
            FLAGS[code] |= mask;
            code += 1;
        }
    });
}

// Fill the FLAGS character-class table once, when this module is evaluated.
init();

/**
 * Split an expression string into an ordered list of tokens.
 *
 * Each token is created as `new Token(kind, data, start, end)`, where `start`
 * and `end` are offsets into the input. Operator tokens produced from symbols
 * carry `null` data; identifiers, literals and textual operator aliases carry
 * the matched text.
 *
 * @param {string} inputData the expression to tokenize
 * @returns {Array} the tokens in the order they appear in the input
 * @throws {{name: string, message: string}} a plain object whose `name` is
 *         'SpelParseException' when the input cannot be tokenized
 */
function tokenize(inputData) {
    var expressionString = inputData,
        // A NUL sentinel is appended so the lexers can read toProcess[pos] and
        // toProcess[pos + 1] and recognise the end of input from the character
        // itself.
        toProcess = inputData + '\0',
        max = toProcess.length,
        pos = 0,
        tokens = [];

    /**
     * Build the exception object for a lexing failure at the current position.
     * The message format is "<reason> in expression (<input>) at position <pos>".
     */
    function parseError(reason) {
        return {
            name: 'SpelParseException',
            message: reason + ' in expression (' + expressionString + ') at position ' + pos
        };
    }

    /**
     * Main loop: dispatch on the current character until the input (including
     * the sentinel) has been consumed.
     */
    function process() {
        var ch;

        while (pos < max) {
            ch = toProcess[pos];
            if (isAlphabetic(ch)) {
                lexIdentifier();
            }
            else {
                switch (ch) {
                case '+':
                    pushPairOrCharToken(TokenKind.INC, TokenKind.PLUS);
                    break;
                case '_': // the other way to start an identifier
                    lexIdentifier();
                    break;
                case '-':
                    pushPairOrCharToken(TokenKind.DEC, TokenKind.MINUS);
                    break;
                case ':':
                    pushCharToken(TokenKind.COLON);
                    break;
                case '.':
                    pushCharToken(TokenKind.DOT);
                    break;
                case ',':
                    pushCharToken(TokenKind.COMMA);
                    break;
                case '*':
                    pushCharToken(TokenKind.STAR);
                    break;
                case '/':
                    pushCharToken(TokenKind.DIV);
                    break;
                case '%':
                    pushCharToken(TokenKind.MOD);
                    break;
                case '(':
                    pushCharToken(TokenKind.LPAREN);
                    break;
                case ')':
                    pushCharToken(TokenKind.RPAREN);
                    break;
                case '[':
                    pushCharToken(TokenKind.LSQUARE);
                    break;
                case '#':
                    pushCharToken(TokenKind.HASH);
                    break;
                case ']':
                    pushCharToken(TokenKind.RSQUARE);
                    break;
                case '{':
                    pushCharToken(TokenKind.LCURLY);
                    break;
                case '}':
                    pushCharToken(TokenKind.RCURLY);
                    break;
                case '@':
                    pushCharToken(TokenKind.BEAN_REF);
                    break;
                case '^':
                    pushPairOrCharToken(TokenKind.SELECT_FIRST, TokenKind.POWER);
                    break;
                case '!':
                    if (isTwoCharToken(TokenKind.NE)) {
                        pushPairToken(TokenKind.NE);
                    }
                    else if (isTwoCharToken(TokenKind.PROJECT)) {
                        pushPairToken(TokenKind.PROJECT);
                    }
                    else {
                        pushCharToken(TokenKind.NOT);
                    }
                    break;
                case '=':
                    pushPairOrCharToken(TokenKind.EQ, TokenKind.ASSIGN);
                    break;
                case '&':
                    // a single '&' is not an operator; only the pair is accepted
                    if (!isTwoCharToken(TokenKind.SYMBOLIC_AND)) {
                        throw parseError('Missing character \'&\'');
                    }
                    pushPairToken(TokenKind.SYMBOLIC_AND);
                    break;
                case '|':
                    // a single '|' is not an operator; only the pair is accepted
                    if (!isTwoCharToken(TokenKind.SYMBOLIC_OR)) {
                        throw parseError('Missing character \'|\'');
                    }
                    pushPairToken(TokenKind.SYMBOLIC_OR);
                    break;
                case '?':
                    if (isTwoCharToken(TokenKind.SELECT)) {
                        pushPairToken(TokenKind.SELECT);
                    }
                    else if (isTwoCharToken(TokenKind.ELVIS)) {
                        pushPairToken(TokenKind.ELVIS);
                    }
                    else if (isTwoCharToken(TokenKind.SAFE_NAVI)) {
                        pushPairToken(TokenKind.SAFE_NAVI);
                    }
                    else {
                        pushCharToken(TokenKind.QMARK);
                    }
                    break;
                case '$':
                    // '$' either begins the SELECT_LAST pair or starts an identifier
                    if (isTwoCharToken(TokenKind.SELECT_LAST)) {
                        pushPairToken(TokenKind.SELECT_LAST);
                    }
                    else {
                        lexIdentifier();
                    }
                    break;
                case '>':
                    pushPairOrCharToken(TokenKind.GE, TokenKind.GT);
                    break;
                case '<':
                    pushPairOrCharToken(TokenKind.LE, TokenKind.LT);
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    lexNumericLiteral(ch === '0');
                    break;
                case ' ':
                case '\t':
                case '\r':
                case '\n':
                    // drift over white space
                    pos += 1;
                    break;
                case '\'':
                    lexStringLiteral('\'', 'Non-terminating quoted string');
                    break;
                case '"':
                    lexStringLiteral('"', 'Non-terminating double-quoted string');
                    break;
                case '\0':
                    // hit sentinel at end of value
                    pos += 1; // will take us to the end
                    break;
                case '\\':
                    throw parseError('Unexpected escape character');
                default:
                    throw parseError('Cannot handle character \'' + ch + '\'');
                }
            }
        }
    }

    /**
     * Lex a string literal delimited by `quote`, starting at the opening quote.
     * A doubled delimiter inside the literal is skipped over rather than
     * ending it. The emitted token data includes both delimiters.
     *
     * @param quote the delimiter character (single or double quote)
     * @param unterminatedReason error text used when the sentinel is reached
     *        before a closing delimiter
     */
    function lexStringLiteral(quote, unterminatedReason) {
        var start = pos,
            terminated = false,
            ch;

        while (!terminated) {
            pos += 1;
            ch = toProcess[pos];
            if (ch === quote) {
                // may not be the end if the char after is also the delimiter
                if (toProcess[pos + 1] === quote) {
                    pos += 1; // skip over that too, and continue
                }
                else {
                    terminated = true;
                }
            }
            if (ch.charCodeAt(0) === 0) {
                throw parseError(unterminatedReason);
            }
        }
        pos += 1;
        tokens.push(new Token(TokenKind.LITERAL_STRING, subarray(start, pos), start, pos));
    }

    // REAL_LITERAL :
    // ('.' (DECIMAL_DIGIT)+ (EXPONENT_PART)? (REAL_TYPE_SUFFIX)?) |
    // ((DECIMAL_DIGIT)+ '.' (DECIMAL_DIGIT)+ (EXPONENT_PART)? (REAL_TYPE_SUFFIX)?) |
    // ((DECIMAL_DIGIT)+ (EXPONENT_PART) (REAL_TYPE_SUFFIX)?) |
    // ((DECIMAL_DIGIT)+ (REAL_TYPE_SUFFIX));
    // fragment INTEGER_TYPE_SUFFIX : ( 'L' | 'l' );
    // fragment HEX_DIGIT :
    // '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'|'9'|'A'|'B'|'C'|'D'|'E'|'F'|'a'|'b'|'c'|'d'|'e'|'f';
    //
    // fragment EXPONENT_PART : 'e' (SIGN)* (DECIMAL_DIGIT)+ | 'E' (SIGN)*
    // (DECIMAL_DIGIT)+ ;
    // fragment SIGN : '+' | '-' ;
    // fragment REAL_TYPE_SUFFIX : 'F' | 'f' | 'D' | 'd';
    // INTEGER_LITERAL
    // : (DECIMAL_DIGIT)+ (INTEGER_TYPE_SUFFIX)?;

    /**
     * Lex a numeric literal starting at the current digit and push an int,
     * long, hex int, hex long, real or real-float token.
     *
     * @param firstCharIsZero true when the current digit is '0', which allows
     *        a following 'x'/'X' to start a hexadecimal literal
     */
    function lexNumericLiteral(firstCharIsZero) {
        var isReal = false,
            isFloat = false,
            start = pos,
            ch = toProcess[pos + 1],
            isHex = ch === 'x' || ch === 'X',
            dotpos,
            endOfNumber;

        // deal with hexadecimal
        if (firstCharIsZero && isHex) {
            pos = pos + 1; // now on the 'x'
            do {
                pos += 1;
            }
            while (isHexadecimalDigit(toProcess[pos]));
            // token data excludes the "0x" prefix and the optional long suffix
            if (isChar('L', 'l')) {
                pushHexIntToken(subarray(start + 2, pos), true, start, pos);
                pos += 1;
            }
            else {
                pushHexIntToken(subarray(start + 2, pos), false, start, pos);
            }
            return;
        }

        // real numbers must have leading digits

        // Consume first part of number
        do {
            pos += 1;
        }
        while (isDigit(toProcess[pos]));

        // a '.' indicates this number is a real
        if (toProcess[pos] === '.') {
            isReal = true;
            dotpos = pos;
            // carry on consuming digits
            do {
                pos += 1;
            }
            while (isDigit(toProcess[pos]));
            if (pos === dotpos + 1) {
                // the number is something like '3.'. It is really an int but may be
                // part of something like '3.toString()'. In this case process it as
                // an int and leave the dot as a separate token.
                pos = dotpos;
                pushIntToken(subarray(start, pos), false, start, pos);
                return;
            }
        }

        endOfNumber = pos;

        // is it a long ?
        if (isChar('L', 'l')) {
            if (isReal) { // 3.4L - not allowed
                throw parseError('Real cannot be long');
            }
            // the suffix is consumed but is not part of the token data
            pushIntToken(subarray(start, endOfNumber), true, start, endOfNumber);
            pos += 1;
            return;
        }

        // is there an exponent ?
        if (isExponentChar(toProcess[pos])) {
            pos += 1;
            if (isSign(toProcess[pos])) {
                pos += 1;
            }

            // The grammar requires at least one exponent digit. Without this
            // check the loop below would step over whatever character follows
            // (possibly the sentinel) and read past the end of the input.
            if (!isDigit(toProcess[pos])) {
                throw parseError('Missing exponent digits');
            }

            // exponent digits
            do {
                pos += 1;
            }
            while (isDigit(toProcess[pos]));

            // optional type suffix is included in the token data
            if (isFloatSuffix(toProcess[pos])) {
                isFloat = true;
                pos += 1;
            }
            else if (isDoubleSuffix(toProcess[pos])) {
                pos += 1;
            }
            pushRealToken(subarray(start, pos), isFloat, start, pos);
            return;
        }

        // no long suffix and no exponent: a float/double suffix makes it a real
        ch = toProcess[pos];
        if (isFloatSuffix(ch)) {
            isReal = true;
            isFloat = true;
            pos += 1;
            endOfNumber = pos;
        }
        else if (isDoubleSuffix(ch)) {
            isReal = true;
            pos += 1;
            endOfNumber = pos;
        }

        if (isReal) {
            pushRealToken(subarray(start, endOfNumber), isFloat, start, endOfNumber);
        }
        else {
            pushIntToken(subarray(start, endOfNumber), false, start, endOfNumber);
        }
    }

    /**
     * Lex an identifier starting at the current character. Identifiers of
     * length two or three whose upper-cased text is listed in
     * ALTERNATIVE_OPERATOR_NAMES are emitted as the corresponding operator
     * token instead.
     */
    function lexIdentifier() {
        var start = pos,
            substring,
            asString;

        do {
            pos += 1;
        }
        while (isIdentifier(toProcess[pos]));
        substring = subarray(start, pos);

        // Check if this is the alternative (textual) representation of an operator
        if ((pos - start) === 2 || (pos - start) === 3) {
            asString = substring.toUpperCase();
            if (ALTERNATIVE_OPERATOR_NAMES.indexOf(asString) >= 0) {
                pushOneCharOrTwoCharToken(TokenKind.valueOf(asString), start, substring);
                return;
            }
        }
        tokens.push(new Token(TokenKind.IDENTIFIER, substring.replace('\0', ''), start, pos));
    }

    /** Push a LITERAL_LONG token when isLong is true, otherwise LITERAL_INT. */
    function pushIntToken(data, isLong, start, end) {
        var kind = isLong ? TokenKind.LITERAL_LONG : TokenKind.LITERAL_INT;

        tokens.push(new Token(kind, data, start, end));
    }

    /**
     * Push a LITERAL_HEXLONG or LITERAL_HEXINT token. Throws when no hex
     * digits followed the "0x" prefix (empty data).
     */
    function pushHexIntToken(data, isLong, start, end) {
        if (data.length === 0) {
            throw parseError(isLong ? 'Not a long' : 'Not an int');
        }
        tokens.push(new Token(isLong ? TokenKind.LITERAL_HEXLONG : TokenKind.LITERAL_HEXINT, data, start, end));
    }

    /** Push a LITERAL_REAL_FLOAT token when isFloat is true, otherwise LITERAL_REAL. */
    function pushRealToken(data, isFloat, start, end) {
        var kind = isFloat ? TokenKind.LITERAL_REAL_FLOAT : TokenKind.LITERAL_REAL;

        tokens.push(new Token(kind, data, start, end));
    }

    /** Return the input text in the half-open range [start, end). */
    function subarray(start, end) {
        return toProcess.substring(start, end);
    }

    /**
     * Check if this might be a two character token.
     */
    function isTwoCharToken(kind) {
        if (kind.tokenChars.length === 2 && toProcess[pos] === kind.tokenChars[0]) {
            return toProcess[pos + 1] === kind.tokenChars[1];
        }
        return false;
    }

    /**
     * Push a token of just one character in length.
     */
    function pushCharToken(kind) {
        tokens.push(new Token(kind, null, pos, pos + 1));
        pos += 1;
    }

    /**
     * Push a token of two characters in length.
     */
    function pushPairToken(kind) {
        tokens.push(new Token(kind, null, pos, pos + 2));
        pos += 2;
    }

    /**
     * Push pairKind if the next two characters spell it, otherwise push the
     * single-character charKind.
     */
    function pushPairOrCharToken(pairKind, charKind) {
        if (isTwoCharToken(pairKind)) {
            pushPairToken(pairKind);
        }
        else {
            pushCharToken(charKind);
        }
    }

    /**
     * Push an operator token for a textual alias. The token's end offset is
     * start + kind.getLength(), not the end of the alias text; the read
     * position is not advanced here (the caller has already consumed the text).
     */
    function pushOneCharOrTwoCharToken(kind, start, data) {
        tokens.push(new Token(kind, data, start, start + kind.getLength()));
    }

    // ID: ('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'$'|'0'..'9'|DOT_ESCAPED)*;
    function isIdentifier(ch) {
        return isAlphabetic(ch) || isDigit(ch) || ch === '_' || ch === '$';
    }

    /** True when the current character equals either a or b. */
    function isChar(a, b) {
        var ch = toProcess[pos];
        return ch === a || ch === b;
    }

    function isExponentChar(ch) {
        return ch === 'e' || ch === 'E';
    }

    function isFloatSuffix(ch) {
        return ch === 'f' || ch === 'F';
    }

    function isDoubleSuffix(ch) {
        return ch === 'd' || ch === 'D';
    }

    function isSign(ch) {
        return ch === '+' || ch === '-';
    }

    /**
     * True when the FLAGS entry for ch has the given mask bit set. Characters
     * with a code above 255 never match.
     */
    function hasFlag(ch, mask) {
        var code = ch.charCodeAt(0);

        if (code > 255) {
            return false;
        }
        // an unset FLAGS entry is undefined, which & treats as 0
        return (FLAGS[code] & mask) !== 0;
    }

    function isDigit(ch) {
        return hasFlag(ch, IS_DIGIT);
    }

    function isAlphabetic(ch) {
        return hasFlag(ch, IS_ALPHA);
    }

    function isHexadecimalDigit(ch) {
        return hasFlag(ch, IS_HEXDIGIT);
    }

    process();

    return tokens;

}

/**
 * Public API of this module: an object exposing the `tokenize` function.
 */
export var Tokenizer = {
    tokenize
};