src/Parser/DefaultParser.php from krixon/rules

src/Parser/DefaultParser.php
Summary

Maintainability

5 hrs
Test Coverage

Issues
<?php

namespace Krixon\Rules\Parser;

use DateTimeImmutable;
use DateTimeZone;
use Exception;
use Krixon\Rules\Ast;
use Krixon\Rules\Exception\SyntaxError;
use Krixon\Rules\Lexer\Lexer;
use Krixon\Rules\Lexer\Token;
use function in_array;
use function sprintf;

class DefaultParser implements Parser
{
    private const HINT_DATE     = 'date';
    private const HINT_TIMEZONE = 'timezone';

    private const HINTS = [
        self::HINT_DATE,
        self::HINT_TIMEZONE,
    ];

    /**
     * @var Token[]
     */
    private $tokens;
    private $pointer;
    private $pointerMax;
    private $lexer;
    private $expression;


    public function __construct(Lexer $lexer = null)
    {
        $this->lexer = $lexer ?: new Lexer();
    }


    public function parse(string $expression) : Ast\Node
    {
        $this->expression = $expression;
        $this->tokens     = $this->lexer->tokenize($this->expression);
        $this->pointer    = 0;
        $this->pointerMax = count($this->tokens) - 1;

        // Check for an empty expression explicitly.
        // This is not technically necessary, but allows for a more helpful error message over the more
        // generic "Expected 'IDENTIFIER', got 'EOF'." that would otherwise be produced.
        if ($this->token()->is(Token::EOF)) {
            throw new SyntaxError('Empty expression.', '', 0);
        }

        $node = $this->parseExpression();

        $this->match(Token::EOF);

        return $node;
    }


    /**
     * @throws SyntaxError
     */
    private function parseExpression($precedence = 0) : Ast\Node
    {
        $left = $this->parsePrimaryNode();

        // Represents the precedence of the current token if it's an operator. Right now every operator has the same
        // level of precedence and everything is left-associate but this will allow us to easily add new operators in
        // the future with varying levels of precedence.
        $operatorPrecedence = 1;

        while ($this->is(...Token::OPERATORS) && $operatorPrecedence > $precedence) {
            $left = $this->parseLogicalExpression($left);
        }

        return $left;
    }


    /**
     * @throws SyntaxError
     */
    private function parsePrimaryNode() : Ast\Node
    {
        if ($this->token()->is(Token::LEFT_PAREN)) {
            $node = $this->parseSubExpression();
        } else {
            $node = $this->parseComparisonExpression();
        }

        return $node;
    }


    /**
     * @throws SyntaxError
     */
    private function parseLogicalExpression(Ast\Node $left) : Ast\Node
    {
        $this->matchLogicalOperator();

        $token = $this->token();

        $this->next();

        switch ($token->type()) {
            case Token::AND:
                return Ast\LogicalNode::and($left, $this->parseExpression(1));
            case Token::OR:
                return Ast\LogicalNode::or($left, $this->parseExpression(1));
            case Token::XOR:
                return Ast\LogicalNode::xor($left, $this->parseExpression(1));
            default:
                // @codeCoverageIgnoreStart
                // This has already been validated by matchLogicalOperator(), but is thrown here to
                // help prevent future bugs if a new token type is implemented without a corresponding branch
                // in this case statement.
                throw SyntaxError::unexpectedToken($this->expression, 'LOGICAL_OPERATOR', $token);
                // @codeCoverageIgnoreEnd
        }
    }


    /**
     * @throws SyntaxError
     */
    private function parseComparisonExpression() : Ast\Node
    {
        $left    = $this->parseIdentifier();
        $negated = false;

        // NOT negates the following comparison operator.

        if ($this->is(Token::NOT)) {
            $next = $this->peek();

            // Consume the NOT.
            $this->next();

            if ($next && $this->isLiteralOrTypeHint($next)) {
                // NOT was found immediately before a literal rather than a comparison operator (e.g. foo not "bar").
                // This is considered shorthand for NOT EQUALS.
                return new Ast\NegationNode(Ast\ComparisonNode::equals($left, $this->parseLiteral()));
            }

            // This is regular negation of the subsequent comparison operator.
            $negated = true;
        }

        $token = $this->token();

        $this->matchComparisonOperator();
        $this->next();

        switch ($token->type()) {
            case TOKEN::EQUALS:
                $node = Ast\ComparisonNode::equals($left, $this->parseLiteral());
                break;
            case TOKEN::GREATER:
                $node = Ast\ComparisonNode::greaterThan($left, $this->parseLiteral());
                break;
            case TOKEN::GREATER_EQUALS:
                $node = Ast\ComparisonNode::greaterThanOrEqualTo($left, $this->parseLiteral());
                break;
            case TOKEN::LESS:
                $node = Ast\ComparisonNode::lessThan($left, $this->parseLiteral());
                break;
            case TOKEN::LESS_EQUALS:
                $node = Ast\ComparisonNode::lessThanOrEqualTo($left, $this->parseLiteral());
                break;
            case TOKEN::IN:
                $node = Ast\ComparisonNode::in($left, $this->parseLiteralList());
                break;
            case TOKEN::CONTAINS:
                $node = $this->parseContainsComparison($left);
                break;
            case TOKEN::MATCHES:
                $node = $this->parseMatchesComparison($left);
                break;
            case TOKEN::BETWEEN:
                $node = $this->parseBetweenComparison($left);
                break;
            default:
                // @codeCoverageIgnoreStart
                // This has already been validated by matchComparisonOperator(), but is thrown here to
                // help prevent future bugs if a new token type is implemented without a corresponding branch
                // in this case statement.
                throw SyntaxError::unexpectedToken($this->expression, 'COMPARISON_OPERATOR', $token);
                // @codeCoverageIgnoreEnd
        }

        if ($negated) {
            $node = new Ast\NegationNode($node);
        }

        return $node;
    }


    /**
     * @throws SyntaxError
     */
    private function parseContainsComparison(Ast\IdentifierNode $identifier) : Ast\ComparisonNode
    {
        // There are a two main forms of this comparison, ANY and ALL.
        //
        // ANY means that the specification is satisfied when any of of the values is present.
        // `foo contains any [1,2]` -> CONTAINS_ANY
        //
        // ALL means that the specification is satisfied when all of of the values are present.
        // `foo contains all [1,2]` -> CONTAINS_ALL
        //
        // Note that there is no EXACTLY modifier - for this the existing `is` comparison can be used.
        //
        // If no modifier is specified, the comparison is assumed to be ANY:
        // `foo contains [1,2]` -> CONTAINS_ANY
        //
        // The `of` keyword can be added for readability. This is just syntactic sugar and is ignored if present.
        // `foo contains any of [1,2]` -> CONTAINS_ANY
        // `foo contains all of [1,2]` -> CONTAINS_ALL
        //
        // Finally, it is possible to check if a single value is contained. It doesn't matter if ANY or ALL is used
        // here as they are equivalent for a single value.
        // `foo contains 1` -> CONTAINS_ANY
        // Note that for consistency, if the comparison specifies ALL with a single value, it is converted to an ANY.
        // This avoids the need for the compiler to handle both cases. The single value is also converted into a
        // single-item list, again to simplify things for the compiler.

        $modifier = Token::ANY;

        if ($this->is(Token::ANY, Token::ALL)) {
            $modifier = $this->token()->type();
            $this->next();
        }

        if ($this->is(Token::OF)) {
            $this->next();
        }

        $value = $this->parseLiteralSingleOrList();

        if (!$value instanceof Ast\LiteralNodeList) {
            $modifier = Token::ANY;
            $value    = new Ast\LiteralNodeList($value);
        }

        if ($modifier === Token::ANY) {
            return Ast\ComparisonNode::containsAny($identifier, $value);
        }

        return Ast\ComparisonNode::containsAll($identifier, $value);
    }


    /**
     * @throws SyntaxError
     */
    private function parseMatchesComparison(Ast\IdentifierNode $identifier) : Ast\ComparisonNode
    {
        $this->match(Token::STRING);

        $value = $this->token()->value();

        $this->next();

        return Ast\ComparisonNode::matches($identifier, new Ast\StringNode($value));
    }


    /**
     * @throws SyntaxError
     */
    private function parseBetweenComparison(Ast\IdentifierNode $identifier) : Ast\LogicalNode
    {
        // There are two forms of between comparison.
        // The first uses interval notation (https://en.wikipedia.org/wiki/Interval_%28mathematics%29) to explicitly
        // define the boundaries of the interval.
        // The second is a simplified version (`foo between 10 and 20`) which implies a closed interval.
        // Both versions are really just syntactic sugar for two separate comparisons joined with AND.
        // For example, `foo between (10, 20]` is the same as `foo > 10 and foo <= 20`.

        // Simplified version.
        if ($this->is(...Token::LITERALS)) {
            $a = $this->parseLiteral();

            $this->match(Token::AND);
            $this->next();

            $b = $this->parseLiteral();

            return Ast\LogicalNode::and(
                Ast\ComparisonNode::greaterThanOrEqualTo($identifier, $a),
                Ast\ComparisonNode::lessThanOrEqualTo($identifier, $b)
            );
        }

        // Full interval notation.
        return $this->parseIntervalNotation($identifier);
    }


    /**
     * @throws SyntaxError
     */
    private function parseIntervalNotation(Ast\IdentifierNode $identifier) : Ast\LogicalNode
    {
        // (0,1) is open; it does not include its endpoints, so becomes "> 0 && < 1".
        // [0,1] is closed; it includes its endpoints, so becomes ">= 0 && <= 1".
        // [0,1) and (0,1] are half-open; they include one endpoint, so become "> 0 && <= 1" or ">= 0 && < 1".

        $this->match(Token::LEFT_BRACKET, Token::LEFT_PAREN);

        $start = $this->token();
        $this->next();

        $a = $this->parseLiteral();

        $this->match(Token::COMMA);
        $this->next();

        $b = $this->parseLiteral();

        $this->match(Token::RIGHT_BRACKET, Token::RIGHT_PAREN);

        $end = $this->token();
        $this->next();

        if ($start->is(Token::LEFT_BRACKET)) {
            $left = Ast\ComparisonNode::greaterThanOrEqualTo($identifier, $a);
        } else {
            $left = Ast\ComparisonNode::greaterThan($identifier, $a);
        }

        if ($end->is(Token::RIGHT_BRACKET)) {
            $right = Ast\ComparisonNode::lessThanOrEqualTo($identifier, $b);
        } else {
            $right = Ast\ComparisonNode::lessThan($identifier, $b);
        }

        return Ast\LogicalNode::and($left, $right);
    }


    /**
     * @throws SyntaxError
     */
    private function parseIdentifier() : Ast\IdentifierNode
    {
        $this->match(Token::IDENTIFIER);

        $token = $this->token();

        $this->next();

        if (!$this->is(Token::DOT)) {
            return new Ast\IdentifierNode($token->value());
        }

        $this->next();

        return new Ast\IdentifierNode($token->value(), $this->parseIdentifier());
    }


    /**
     * @throws SyntaxError
     */
    private function parseLiteralSingleOrList() : Ast\LiteralNode
    {
        if ($this->is(Token::LEFT_BRACKET)) {
            return $this->parseLiteralList();
        }

        return $this->parseLiteral();
    }


    /**
     * @throws SyntaxError
     */
    private function parseLiteral() : Ast\LiteralNode
    {
        // Literals can be prefixed with a specific type hint, e.g. date:"1st Jan 2019".
        // This can affect the type of LiteralNode which is produced.

        $type = $this->parseLiteralTypeHint();

        if ($type === 'date') {
            return $this->parseDateLiteral();
        }

        if ($type === 'timezone') {
            return $this->parseTimezoneLiteral();
        }

        // Type hints for other types are unsupported as they have distinct literal representations
        // which correspond to specific Tokens.

        $this->matchLiteral();

        $token = $this->token();

        $this->next();

        if ($token->is(Token::STRING)) {
            return new Ast\StringNode($token->value());
        }

        if ($token->is(Token::BOOLEAN)) {
            return new Ast\BooleanNode($token->value() === 'true');
        }

        return new Ast\NumberNode($token->value());
    }


    /**
     * @throws SyntaxError
     */
    private function parseLiteralTypeHint(string ...$allowed) : ?string
    {
        $type = null;

        if ($this->is(Token::IDENTIFIER)) {
            $this->matchLiteralTypeHint(...$allowed);
            $type = $this->token()->value();
            $this->next();

            $this->match(Token::COLON);
            $this->next();
        }

        return $type;
    }


    /**
     * @throws SyntaxError
     */
    private function parseDateLiteral() : Ast\DateNode
    {
        $this->match(TOKEN::STRING);

        $token = $this->token();

        $this->next();

        // Allow an optional timezone to be specified using 'in timezone:"<tz>"'.

        $timezone = null;

        if ($this->is(Token::IN)) {
            $this->next();

            // The type hint is optional, but must be "timezone" if present.
            $this->parseLiteralTypeHint(self::HINT_TIMEZONE);

            $timezone = $this->parseTimezoneLiteral()->value();
        }

        try {
            $date = new DateTimeImmutable($token->value(), $timezone);
        } catch (Exception $e) {
            throw new SyntaxError(
                sprintf("Invalid date literal '%s'.", $token->value()),
                $this->expression,
                $token->position(),
                $e
            );
        }

        return new Ast\DateNode($date);
    }


    /**
     * @throws SyntaxError
     */
    private function parseTimezoneLiteral() : Ast\TimezoneNode
    {
        $this->match(Token::STRING);

        $token = $this->token();

        try {
            $timezone = new DateTimeZone($token->value());
        } catch (Exception $e) {
            throw new SyntaxError(
                sprintf("Invalid timezone literal '%s'.", $token->value()),
                $this->expression,
                $token->position(),
                $e
            );
        }

        $this->next();

        return new Ast\TimezoneNode($timezone);
    }


    /**
     * @throws SyntaxError
     */
    private function parseLiteralList() : Ast\LiteralNodeList
    {
        $this->match(Token::LEFT_BRACKET);

        $this->next();

        $literals = [];
        $type     = null;

        while (true) {
            // In case of mismatched types error, this is the first token with a previously unseen type.
            // Get hold of it before continuing to parse so its position can be used for the error.
            $token = $this->token();

            // Technically it might be better to allow expressions rather than just literals.
            // This would require that a literal on its own is a valid expression though, which doesn't make
            // a lot of sense in the context of a rule.
            $literals[] = $literal = $this->parseLiteral();

            if ($type === null) {
                $type = $literal::type();
            } elseif ($literal::type() !== $type) {
                throw new SyntaxError(
                    sprintf(
                        "Lists must not contain different data types. This list contains '%s' and '%s' data.",
                        $type,
                        $literal::type()
                    ),
                    $this->expression,
                    $token->position()
                );
            }

            if (!$this->is(Token::COMMA)) {
                break;
            }

            $this->next();
        }

        $this->match(Token::RIGHT_BRACKET);

        $this->next();

        return new Ast\LiteralNodeList(...$literals);
    }


    /**
     * @throws SyntaxError
     */
    private function parseSubExpression() : Ast\Node
    {
        $this->match(Token::LEFT_PAREN);

        $this->next();

        $node = $this->parseExpression();

        $this->match(Token::RIGHT_PAREN);

        $this->next();

        return $node;
    }


    private function token() : ?Token
    {
        return $this->pointer > $this->pointerMax ? null : $this->tokens[$this->pointer];
    }


    private function peek() : ?Token
    {
        return $this->tokens[$this->pointer + 1] ?? null;
    }


    private function next() : void
    {
        $this->pointer++;
    }


    private function is(string ...$tokenType) : bool
    {
        $token = $this->token();

        return $token && $token->is(...$tokenType);
    }


    /**
     * @throws SyntaxError
     */
    private function match(string ...$tokenType) : void
    {
        if (!$this->is(...$tokenType)) {
            throw SyntaxError::unexpectedToken($this->expression, $tokenType, $this->token());
        }
    }


    /**
     * @throws SyntaxError
     */
    private function matchComparisonOperator() : void
    {
        $this->match(...Token::COMPARISON_OPERATORS);
    }


    /**
     * @throws SyntaxError
     */
    private function matchLogicalOperator() : void
    {
        $this->match(...Token::LOGICAL_OPERATORS);
    }


    /**
     * @throws SyntaxError
     */
    private function matchLiteral() : void
    {
        $this->match(...Token::LITERALS);
    }


    /**
     * @throws SyntaxError
     */
    private function matchLiteralTypeHint(string ...$hints) : void
    {
        $this->match(Token::IDENTIFIER);

        if (empty($hints)) {
            $hints = self::HINTS;
        }

        $token = $this->token();
        $hint  = $token->value();

        if (!in_array($hint, $hints, true)) {
            throw SyntaxError::unexpectedToken($this->expression, $hints, $token);
        }
    }


    private function isLiteralOrTypeHint(Token $token) : bool
    {
        if ($token->isLiteral()) {
            return true;
        }

        if (!$token->is(Token::IDENTIFIER)) {
            return false;
        }

        return in_array($token->value(), self::HINTS, true);
    }
}