thunderer/Shortcode

View on GitHub
src/Parser/RegularParser.php

Summary

Maintainability
D
2 days
Test Coverage
<?php
namespace Thunder\Shortcode\Parser;

use Thunder\Shortcode\Shortcode\ParsedShortcode;
use Thunder\Shortcode\Shortcode\Shortcode;
use Thunder\Shortcode\Syntax\CommonSyntax;
use Thunder\Shortcode\Syntax\SyntaxInterface;
use Thunder\Shortcode\Utility\RegexBuilderUtility;

/**
 * @author Tomasz Kowalczyk <tomasz@kowalczyk.cc>
 */
final class RegularParser implements ParserInterface
{
    /** @var string */
    private $lexerRegex;
    /** @var string */
    private $nameRegex;
    /** @psalm-var list<array{0:int,1:string,2:int}> */
    private $tokens = array();
    /** @var int */
    private $tokensCount = 0;
    /** @var int */
    private $position = 0;
    /** @var int[] */
    private $backtracks = array();
    /** @var int */
    private $lastBacktrack = 0;

    const TOKEN_OPEN = 1;
    const TOKEN_CLOSE = 2;
    const TOKEN_MARKER = 3;
    const TOKEN_SEPARATOR = 4;
    const TOKEN_DELIMITER = 5;
    const TOKEN_STRING = 6;
    const TOKEN_WS = 7;

    public function __construct(SyntaxInterface $syntax = null)
    {
        $this->lexerRegex = $this->prepareLexer($syntax ?: new CommonSyntax());
        $this->nameRegex = '~^'.RegexBuilderUtility::buildNameRegex().'$~us';
    }

    /**
     * @param string $text
     *
     * @return ParsedShortcode[]
     */
    public function parse($text)
    {
        $nestingLevel = ini_set('xdebug.max_nesting_level', '-1');
        $this->tokens = $this->tokenize($text);
        $this->backtracks = array();
        $this->lastBacktrack = 0;
        $this->position = 0;
        $this->tokensCount = \count($this->tokens);

        $shortcodes = array();
        while($this->position < $this->tokensCount) {
            while($this->position < $this->tokensCount && false === $this->lookahead(self::TOKEN_OPEN)) {
                $this->position++;
            }
            $names = array();
            $this->beginBacktrack();
            $matches = $this->shortcode($names);
            if(false === $matches) {
                $this->backtrack();
                $this->match(null, true);
                continue;
            }
            if(\is_array($matches)) {
                foreach($matches as $shortcode) {
                    $shortcodes[] = $shortcode;
                }
            }
        }
        ini_set('xdebug.max_nesting_level', $nestingLevel);

        return $shortcodes;
    }

    /**
     * @param string $name
     * @psalm-param array<string,string|null> $parameters
     * @param string|null $bbCode
     * @param int $offset
     * @param string|null $content
     * @param string $text
     *
     * @return ParsedShortcode
     */
    private function getObject($name, $parameters, $bbCode, $offset, $content, $text)
    {
        return new ParsedShortcode(new Shortcode($name, $parameters, $content, $bbCode), $text, $offset);
    }

    /* --- RULES ----------------------------------------------------------- */

    /**
     * @param string[] $names
     * @psalm-param list<string> $names
     * FIXME: investigate the reason Psalm complains about references
     * @psalm-suppress ReferenceConstraintViolation
     *
     * @return ParsedShortcode[]|string|false
     */
    private function shortcode(array &$names)
    {
        if(!$this->match(self::TOKEN_OPEN, false)) { return false; }
        $offset = $this->tokens[$this->position - 1][2];
        $this->match(self::TOKEN_WS, false);
        if('' === $name = $this->match(self::TOKEN_STRING, false)) { return false; }
        if($this->lookahead(self::TOKEN_STRING)) { return false; }
        if(1 !== preg_match($this->nameRegex, $name, $matches)) { return false; }
        $this->match(self::TOKEN_WS, false);
        // bbCode
        $bbCode = $this->match(self::TOKEN_SEPARATOR, true) ? $this->value() : null;
        if(false === $bbCode) { return false; }
        // parameters
        if(false === ($parameters = $this->parameters())) { return false; }

        // self-closing
        if($this->match(self::TOKEN_MARKER, true)) {
            if(!$this->match(self::TOKEN_CLOSE, false)) { return false; }

            return array($this->getObject($name, $parameters, $bbCode, $offset, null, $this->getBacktrack()));
        }

        // just-closed or with-content
        if(!$this->match(self::TOKEN_CLOSE, false)) { return false; }
        $this->beginBacktrack();
        $names[] = $name;

        // begin inlined content()
        $content = '';
        $shortcodes = array();
        $closingName = null;

        while($this->position < $this->tokensCount) {
            while($this->position < $this->tokensCount && false === $this->lookahead(self::TOKEN_OPEN)) {
                $content .= $this->match(null, true);
            }

            $this->beginBacktrack();
            $contentMatchedShortcodes = $this->shortcode($names);
            if(\is_string($contentMatchedShortcodes)) {
                $closingName = $contentMatchedShortcodes;
                break;
            }
            if(\is_array($contentMatchedShortcodes)) {
                foreach($contentMatchedShortcodes as $matchedShortcode) {
                    $shortcodes[] = $matchedShortcode;
                }
                continue;
            }
            $this->backtrack();

            $this->beginBacktrack();
            if(false !== ($closingName = $this->close($names))) {
                $this->backtrack();
                $shortcodes = array();
                break;
            }
            $closingName = null;
            $this->backtrack();

            $content .= $this->match(null, false);
        }
        $content = $this->position < $this->tokensCount ? $content : false;
        // end inlined content()

        if(null !== $closingName && $closingName !== $name) {
            array_pop($names);
            array_pop($this->backtracks);
            array_pop($this->backtracks);

            return $closingName;
        }
        if(false === $content || $closingName !== $name) {
            $this->backtrack(false);
            $text = $this->backtrack(false);
            array_pop($names);

            return array_merge(array($this->getObject($name, $parameters, $bbCode, $offset, null, $text)), $shortcodes);
        }
        $content = $this->getBacktrack();
        if(!$this->close($names)) { return false; }
        array_pop($names);

        return array($this->getObject($name, $parameters, $bbCode, $offset, $content, $this->getBacktrack()));
    }

    /**
     * @param string[] $names
     *
     * @return string|false
     */
    private function close(array &$names)
    {
        if(!$this->match(self::TOKEN_OPEN, true)) { return false; }
        if(!$this->match(self::TOKEN_MARKER, true)) { return false; }
        if(!$closingName = $this->match(self::TOKEN_STRING, true)) { return false; }
        if(!$this->match(self::TOKEN_CLOSE, false)) { return false; }

        return \in_array($closingName, $names, true) ? $closingName : false;
    }

    /** @psalm-return array<string,string|null>|false */
    private function parameters()
    {
        $parameters = array();

        while(true) {
            $this->match(self::TOKEN_WS, false);
            if($this->lookahead(self::TOKEN_MARKER) || $this->lookahead(self::TOKEN_CLOSE)) { break; }
            if(!$name = $this->match(self::TOKEN_STRING, true)) { return false; }
            if(!$this->match(self::TOKEN_SEPARATOR, true)) { $parameters[$name] = null; continue; }
            if(false === ($value = $this->value())) { return false; }
            $this->match(self::TOKEN_WS, false);

            $parameters[$name] = $value;
        }

        return $parameters;
    }

    /** @return false|string */
    private function value()
    {
        $value = '';

        if($this->match(self::TOKEN_DELIMITER, false)) {
            while($this->position < $this->tokensCount && false === $this->lookahead(self::TOKEN_DELIMITER)) {
                $value .= $this->match(null, false);
            }

            return $this->match(self::TOKEN_DELIMITER, false) ? $value : false;
        }

        if('' !== $tmp = $this->match(self::TOKEN_STRING, false)) {
            $value .= $tmp;
            while('' !== $tmp = $this->match(self::TOKEN_STRING, false)) {
                $value .= $tmp;
            }

            return $value;
        }

        return false;
    }

    /* --- PARSER ---------------------------------------------------------- */

    /** @return void */
    private function beginBacktrack()
    {
        $this->backtracks[] = $this->position;
        $this->lastBacktrack = $this->position;
    }

    /** @return string */
    private function getBacktrack()
    {
        $position = array_pop($this->backtracks);
        $backtrack = '';
        for($i = $position; $i < $this->position; $i++) {
            $backtrack .= $this->tokens[$i][1];
        }

        return $backtrack;
    }

    /**
     * @param bool $modifyPosition
     *
     * @return string
     */
    private function backtrack($modifyPosition = true)
    {
        $position = array_pop($this->backtracks);
        if($modifyPosition) {
            $this->position = $position;
        }

        $backtrack = '';
        for($i = $position; $i < $this->lastBacktrack; $i++) {
            $backtrack .= $this->tokens[$i][1];
        }
        $this->lastBacktrack = $position;

        return $backtrack;
    }

    /**
     * @param int $type
     *
     * @return bool
     */
    private function lookahead($type)
    {
        return $this->position < $this->tokensCount && $this->tokens[$this->position][0] === $type;
    }

    /**
     * @param int|null $type
     * @param bool $ws
     *
     * @return string
     */
    private function match($type, $ws)
    {
        if($this->position >= $this->tokensCount) {
            return '';
        }

        $token = $this->tokens[$this->position];
        if(!empty($type) && $token[0] !== $type) {
            return '';
        }

        $this->position++;
        if($ws && $this->position < $this->tokensCount && $this->tokens[$this->position][0] === self::TOKEN_WS) {
            $this->position++;
        }

        return $token[1];
    }

    /* --- LEXER ----------------------------------------------------------- */

    /**
     * @param string $text
     *
     * @psalm-return list<array{0:int,1:string,2:int}>
     */
    private function tokenize($text)
    {
        $count = preg_match_all($this->lexerRegex, $text, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        if(false === $count || preg_last_error() !== PREG_NO_ERROR) {
            throw new \RuntimeException(sprintf('PCRE failure `%s`.', preg_last_error()));
        }

        $tokens = array();
        $position = 0;

        foreach($matches as $match) {
            switch(true) {
                case -1 !== $match['string'][1]: { $token = $match['string'][0]; $type = self::TOKEN_STRING; break; }
                case -1 !== $match['ws'][1]: { $token = $match['ws'][0]; $type = self::TOKEN_WS; break; }
                case -1 !== $match['marker'][1]: { $token = $match['marker'][0]; $type = self::TOKEN_MARKER; break; }
                case -1 !== $match['delimiter'][1]: { $token = $match['delimiter'][0]; $type = self::TOKEN_DELIMITER; break; }
                case -1 !== $match['separator'][1]: { $token = $match['separator'][0]; $type = self::TOKEN_SEPARATOR; break; }
                case -1 !== $match['open'][1]: { $token = $match['open'][0]; $type = self::TOKEN_OPEN; break; }
                case -1 !== $match['close'][1]: { $token = $match['close'][0]; $type = self::TOKEN_CLOSE; break; }
                default: { throw new \RuntimeException(sprintf('Invalid token.')); }
            }
            $tokens[] = array($type, $token, $position);
            $position += mb_strlen($token, 'utf-8');
        }

        return $tokens;
    }

    /** @return string */
    private function prepareLexer(SyntaxInterface $syntax)
    {
        // FIXME: for some reason Psalm does not understand the `@psalm-var callable() $var` annotation
        /** @psalm-suppress MissingClosureParamType, MissingClosureReturnType */
        $group = function($text, $group) {
            return '(?<'.(string)$group.'>'.preg_replace('/(.)/us', '\\\\$0', (string)$text).')';
        };
        /** @psalm-suppress MissingClosureParamType, MissingClosureReturnType */
        $quote = function($text) {
            return preg_replace('/(.)/us', '\\\\$0', (string)$text);
        };

        $rules = array(
            '(?<string>\\\\.|(?:(?!'.implode('|', array(
                $quote($syntax->getOpeningTag()),
                $quote($syntax->getClosingTag()),
                $quote($syntax->getClosingTagMarker()),
                $quote($syntax->getParameterValueSeparator()),
                $quote($syntax->getParameterValueDelimiter()),
                '\s+',
            )).').)+)',
            '(?<ws>\s+)',
            $group($syntax->getClosingTagMarker(), 'marker'),
            $group($syntax->getParameterValueDelimiter(), 'delimiter'),
            $group($syntax->getParameterValueSeparator(), 'separator'),
            $group($syntax->getOpeningTag(), 'open'),
            $group($syntax->getClosingTag(), 'close'),
        );

        return '~('.implode('|', $rules).')~us';
    }
}