phplrt/phplrt

View on GitHub
libs/lexer/src/Runtime/Compiler.php

Summary

Maintainability
A
0 mins
Test Coverage
<?php

declare(strict_types=1);

namespace Phplrt\Lexer\Runtime;

use Phplrt\Lexer\Exception\CompilationException;

/**
 * Compiles a collection of named PCRE token definitions into one combined
 * regular expression of the form:
 *
 * <code>
 *  /\G(?|(?:(?:PCRE_1)(*MARK:NAME_1))|(?:(?:PCRE_2)(*MARK:NAME_2)))/MODIFIERS
 * </code>
 *
 * In debug mode every token chunk and the final expression are additionally
 * test-compiled, and PCRE errors are converted to {@see CompilationException}.
 *
 * @internal This is an internal library class, please do not use it in your code.
 * @psalm-internal Phplrt\Lexer\Runtime
 */
class Compiler
{
    /**
     * Default PCRE delimiter.
     *
     * @var non-empty-string
     */
    final public const DEFAULT_DELIMITER = '/';

    /**
     * @link https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php
     *
     * @var list<non-empty-string>
     */
    final public const DEFAULT_MODIFIERS = [
        //
        // - "S": When a pattern is going to be used several times, it is worth
        // spending more time analyzing it in order to speed up the time taken
        // for matching. If this modifier is set, then this extra analysis is
        // performed. At present, studying a pattern is useful only for
        // non-anchored patterns that do not have a single fixed starting
        // character.
        //
        'S',
        //
        // - "s": If this modifier is set, a dot metacharacter in the pattern
        // matches all characters, including newlines. Without it, newlines are
        // excluded. This modifier is equivalent to Perl's /s modifier. A
        // negative class such as [^a] always matches a newline character,
        // independent of the setting of this modifier.
        //
        's',
        //
        // - "u": This modifier turns on additional functionality of PCRE that
        // is incompatible with Perl. Pattern and subject strings are treated as
        // UTF-8. An invalid subject will cause the preg_* function to match
        // nothing; an invalid pattern will trigger an error of level E_WARNING.
        // Five and six octet UTF-8 sequences are regarded as invalid since
        // PHP 5.3.4 (resp. PCRE 7.3 2007-08-28); formerly those have been
        // regarded as valid UTF-8.
        //
        'u',
        //
        // - "m": By default, PCRE treats the subject string as consisting of a
        // single "line" of characters (even if it actually contains several
        // newlines).
        //
        // The "start of line" metacharacter (^) matches only at the start of
        // the string, while the "end of line" metacharacter ($) matches only at
        // the end of the string, or before a terminating newline (unless D
        // modifier is set). This is the same as Perl. When this modifier is
        // set, the "start of line" and "end of line" constructs match
        // immediately following or immediately before any newline in the
        // subject string, respectively, as well as at the very start and end.
        // This is equivalent to Perl's /m modifier. If there are no "\n"
        // characters in a subject string, or no occurrences of ^ or $ in a
        // pattern, setting this modifier has no effect.
        //
        'm',
    ];

    /**
     * @param bool $debug Enables or disables debug mode with validation of
     *        generated code.
     * @param non-empty-string $delimiter Escape character for regular
     *        expressions based on which special characters in token names and
     *        values will be escaped.
     */
    public function __construct(
        private readonly bool $debug,
        private readonly string $delimiter = self::DEFAULT_DELIMITER,
    ) {}

    /**
     * Builds the combined PCRE expression for the given token definitions.
     *
     * @param iterable<non-empty-string, non-empty-string> $tokens List of
     *        PCRE-compatible tokens in format [string(NAME) => string(VALUE)].
     * @param list<non-empty-string> $modifiers List of PCRE modifiers. See
     *        also {@link https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php}
     *
     * @return non-empty-string
     *
     * @throws CompilationException If a token name is empty or, in debug mode,
     *         if a token chunk or the final expression fails to compile.
     */
    public function compile(iterable $tokens, array $modifiers = self::DEFAULT_MODIFIERS): string
    {
        // List of compilable string chunks.
        $chunks = [];

        // Build a collection of "[NAME => PCRE]" tokens to a list
        // from a specific marker "(?:(?:PCRE)(*MARK:NAME))".
        foreach ($tokens as $name => $pcre) {
            if ($name === '') {
                throw CompilationException::fromEmptyTokenName($pcre);
            }

            $chunk = \sprintf(
                '(?:(?:%s)(*MARK:%s))',
                $this->pattern($pcre),
                $this->name($name),
            );

            // Validate each chunk on its own so that the error message
            // can point at the exact token that is broken.
            if ($this->debug) {
                $this->assertValidToken($name, $chunk, $modifiers);
            }

            $chunks[] = $chunk;
        }

        // Finalize compilation to the format:
        //  \G(?|
        //      (?:(?:PCRE_1)(*MARK:NAME_1)) |
        //      (?:(?:PCRE_2)(*MARK:NAME_2)) |
        //  )
        $body = \sprintf('\\G(?|%s)', \implode('|', $chunks));

        if ($this->debug) {
            $this->assertValidResult($body, $modifiers);
        }

        return $this->build($body, $modifiers);
    }

    /**
     * Escapes a token name so it can be embedded into the "(*MARK:NAME)" verb.
     *
     * @param non-empty-string|int $name
     * @return non-empty-string
     */
    private function name(string|int $name): string
    {
        /** @var non-empty-string */
        return \preg_quote((string) $name, $this->delimiter);
    }

    /**
     * Escapes occurrences of the configured delimiter inside a token pattern.
     *
     * NOTE(review): addcslashes() prefixes every occurrence of the delimiter
     * with a backslash, including ones that are already escaped in the
     * pattern; callers are presumably expected to pass the delimiter
     * unescaped.
     *
     * @param non-empty-string $pattern
     * @return non-empty-string
     */
    private function pattern(string $pattern): string
    {
        /** @var non-empty-string */
        return \addcslashes($pattern, $this->delimiter);
    }

    /**
     * Validates a single token chunk. Integer token names are reported
     * as "#N" in the error message.
     *
     * @param non-empty-string|int $name
     * @param non-empty-string $pattern
     * @param list<non-empty-string> $modifiers
     *
     * @throws CompilationException
     */
    private function assertValidToken(string|int $name, string $pattern, array $modifiers): void
    {
        if (\is_int($name)) {
            $name = '#' . $name;
        }

        $this->assertValid($pattern, $modifiers, $name);
    }

    /**
     * Validates the final (combined) expression; the error message does not
     * reference any particular token.
     *
     * @param non-empty-string $pattern
     * @param list<non-empty-string> $modifiers
     *
     * @throws CompilationException
     */
    private function assertValidResult(string $pattern, array $modifiers): void
    {
        $this->assertValid($pattern, $modifiers);
    }

    /**
     * Test-compiles the given expression by matching it against an empty
     * subject and converts any PCRE diagnostic into an exception.
     *
     * @param non-empty-string $pattern
     * @param list<non-empty-string> $modifiers
     * @param non-empty-string|null $original Token name used in the error
     *        message or {@see null} if the error is not token-specific.
     *
     * @throws CompilationException
     */
    private function assertValid(string $pattern, array $modifiers, ?string $original = null): void
    {
        $error = null;

        // A scoped handler is used instead of "@" + error_get_last(): the
        // latter is not populated when an application-level error handler
        // handles the warning, and it requires wiping the global last error.
        \set_error_handler(static function (int $code, string $message) use (&$error): bool {
            $error = $message;

            return true;
        });

        try {
            $result = \preg_match($this->build($pattern, $modifiers), '');
        } finally {
            \restore_error_handler();
        }

        // Failures that do not emit a warning are only
        // visible through the PCRE error state.
        if ($error === null && $result === false && \preg_last_error() !== \PREG_NO_ERROR) {
            $error = \preg_last_error_msg();
        }

        if ($error !== null) {
            throw new CompilationException(
                $this->formatException($error, $original),
            );
        }
    }

    /**
     * Wraps the expression body into delimiters and appends the modifiers.
     *
     * @param non-empty-string $pcre
     * @param list<non-empty-string> $modifiers
     *
     * @return non-empty-string
     */
    private function build(string $pcre, array $modifiers): string
    {
        return $this->delimiter . $pcre . $this->delimiter
            . \implode('', $modifiers);
    }

    /**
     * Converts a raw PHP/PCRE warning into a human-readable message: strips
     * the "function():" prefix, the "Compilation failed:" marker and the
     * "at offset N" part, then appends the token name (if any).
     *
     * @param string|null $token Token name or {@see null} when the message
     *        must not contain the "in ... token definition" suffix.
     *
     * @return non-empty-string
     */
    private function formatException(string $message, ?string $token = null): string
    {
        $message = \str_replace('Compilation failed: ', '', $message);

        // preg_replace() returns null on failure: keep the previous text
        // in that case instead of passing null to the next string function.
        $message = \preg_replace('/\w+\(\):\h+/', '', $message) ?? $message;
        $message = \preg_replace('/\h*at\h+offset\h+\d+/', '', $message) ?? $message;

        $message = \ucfirst($message);

        if ($token !== null) {
            // Explicit comparison: a token named "0" is a valid name
            // and must not be reported as "<unknown>".
            $message .= \sprintf(' in %s token definition', $token === '' ? '<unknown>' : $token);
        }

        /** @var non-empty-string */
        return $message;
    }
}