wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Parser/QueryStringRegex/KeywordParser.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

namespace CirrusSearch\Parser\QueryStringRegex;

use CirrusSearch\Parser\AST\KeywordFeatureNode;
use CirrusSearch\Parser\AST\NegatedNode;
use CirrusSearch\Parser\AST\ParsedNode;
use CirrusSearch\Parser\AST\ParseWarning;
use CirrusSearch\Query\KeywordFeature;
use CirrusSearch\WarningCollector;
use Wikimedia\Assert\Assert;

/**
 * Parser for KeywordFeature.
 *
 * Scans a query string for the keyword prefixes declared by a KeywordFeature
 * and turns every occurrence into an AST node. The parser is also the
 * WarningCollector handed to KeywordFeature::parseValue(): warnings raised
 * through addWarning() are recorded at the offset held in $currentOffset.
 */
class KeywordParser implements WarningCollector {

    /**
     * Offset attached to the warnings created by addWarning(). parse() sets it
     * to the start of the value side (right after the colon) before it calls
     * KeywordFeature::parseValue().
     *
     * @var int
     */
    private $currentOffset;

    /**
     * Warnings collected so far; parse() never resets this list.
     *
     * @var ParseWarning[]
     */
    private $warnings = [];

    /**
     * Find all the occurrences of a keyword in the query.
     *
     * All offsets handled here are byte offsets: they come from
     * PREG_OFFSET_CAPTURE and are combined with strlen().
     *
     * @param string $query
     * @param KeywordFeature $feature
     * @param OffsetTracker $tracker
     * @param int $startOffset start offset of the query in $query
     * @return ParsedNode[] one KeywordFeatureNode per match (wrapped in a NegatedNode
     *  when the keyword is prefixed with '-'); matches overlapping a region already
     *  known by $tracker are skipped
     */
    public function parse( $query, KeywordFeature $feature, OffsetTracker $tracker, $startOffset = 0 ) {
        if ( $feature->greedy() ) {
            Assert::precondition( !$feature->allowEmptyValue(),
                "greedy keywords must not accept empty value" );
            // XXX: we ignore value delimiter for greedy keywords
            Assert::precondition( $feature->getValueDelimiters() === [ [ 'delimiter' => '"' ] ],
                "getValueDelimiters() must not be overridden with greedy keywords" );
        }
        $offset = $tracker->getMinimalUnconsumedOffset( $startOffset );
        $regex = $this->buildKeywordRegex( $feature );
        // Loop invariant: read it once instead of once per match.
        $hasValue = $feature->hasValue();

        $matches = [];
        // NOTE(review): the return value of preg_match_all() is not checked; a PCRE
        // failure (e.g. malformed UTF-8 with the /u flag) is not reported here.
        preg_match_all( $regex, $query, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE, $offset );

        $output = [];
        foreach ( $matches as $match ) {
            $key = $match['key'][0];
            Assert::invariant( $hasValue === isset( $match['value'] ),
                'a value must have matched if the keyword wants a value.' );
            $quotedValue = '';
            $value = '';
            $valueDelimiter = '';
            $valueSuffix = '';
            // The value side starts right after the colon.
            $valueStart = $match['colon'][1] + strlen( $match['colon'][0] );
            if ( $hasValue ) {
                $quotedValue = $match['value'][0];
                // With PREG_OFFSET_CAPTURE a group that did not participate in the
                // match is either absent or reported with offset -1.
                if ( isset( $match['unquoted'] ) && $match['unquoted'][1] >= 0 ) {
                    $value = $match['unquoted'][0];
                } else {
                    $valueDelimiter = $match['delim'][0];
                    // Unescape the delimiter inside the quoted value
                    $value = str_replace( "\\$valueDelimiter", $valueDelimiter, $match['quoted'][0] );
                }
                if ( isset( $match['suffixes'] ) && $match['suffixes'][1] >= 0
                    && $match['suffixes'][0] !== ''
                ) {
                    $valueSuffix = $match['suffixes'][0];
                    // The suffix is the last thing captured by the value group: drop
                    // exactly its length. rtrim() must not be used here, its second
                    // argument is a list of characters, not a literal suffix.
                    $quotedValue = substr( $quotedValue, 0, -strlen( $valueSuffix ) );
                }
            }

            $negationChar = '';
            // Guard against an empty key before looking at its first byte.
            if ( $key !== '' && $key[0] === '-' ) {
                $negationChar = $key[0];
                $key = substr( $key, 1 );
            }
            // We take the key as start offset, the whole match can eat some spaces
            // at the beginning for query headers.
            $wholeStart = $match['key'][1];
            $kwStart = $wholeStart + strlen( $negationChar );
            // End of the whole match (start of the match + its length)
            $end = $match[0][1] + strlen( $match[0][0] );
            $parsedValue = null;
            if ( $hasValue && $quotedValue !== '' ) {
                // Set the current offset so that warnings raised by parseValue()
                // are collected at the offset of the value side
                $this->currentOffset = $valueStart;
                $parsedValue = $feature->parseValue(
                    $key, $value, $quotedValue, $valueDelimiter, $valueSuffix, $this );
                if ( $parsedValue === false ) {
                    Assert::postcondition( $feature->allowEmptyValue(),
                        'Only features accepting empty value can reject a value' );
                    // The value was rejected: keep the keyword with an empty value
                    // and make the node end right after the colon.
                    $value = '';
                    $quotedValue = '';
                    $end = $valueStart;
                    $parsedValue = null;
                }
            }
            if ( !$tracker->overlap( $wholeStart, $end ) ) {
                $node = new KeywordFeatureNode( $kwStart, $end, $feature, $key, $value, $quotedValue,
                    $valueDelimiter, $valueSuffix, $parsedValue );
                if ( $negationChar !== '' ) {
                    // The negated node also spans the leading '-'
                    $node = new NegatedNode( $wholeStart, $end, $node, $negationChar );
                }
                $output[] = $node;
            }
        }
        return $output;
    }

    /**
     * Build the regular expression matching the keyword, its colon and, when the
     * feature takes one, its value.
     *
     * Named groups: key, colon and, if hasValue() is true, value plus the groups
     * declared by getValueRegex().
     *
     * @param KeywordFeature $feature
     * @return string the complete regex, delimiters and flags included
     */
    private function buildKeywordRegex( KeywordFeature $feature ) {
        $keyListRegex = implode(
            '|',
            array_map(
                static function ( $kw ) {
                    return preg_quote( $kw, '/' );
                },
                $feature->getKeywordPrefixes()
            )
        );
        // Hook to the beginning allowing optional spaces if we are a queryHeader
        // otherwise lookbehind allowing begin or space.
        // \G is similar to ^ but also works when an offset is set, as if we ran substr on it
        $begin = $feature->queryHeader() ? '(?:\G[\pZ\pC]*)' : '(?<=\G|[\pZ\pC])';
        // The optional leading '-' (negation) is captured as part of the key
        $keywordRegex = '(?<key>-?(?:' . $keyListRegex . '))';
        $valueSideRegex = '';
        if ( $feature->hasValue() ) {
            $valueRegex = '(?<value>' . $this->getValueRegex( $feature ) . ')';
            // If we allow empty values we don't allow spaces between
            // the keyword and its value, a space would mean "empty value"
            $spacesAfterSep = $feature->allowEmptyValue() ? '' : '[\pZ\pC]*';
            $valueSideRegex = "{$spacesAfterSep}{$valueRegex}";
        }
        return "/{$begin}{$keywordRegex}(?<colon>:){$valueSideRegex}/u";
    }

    /**
     * Build the regex fragment matching the value of a keyword.
     *
     * Named groups: unquoted for a bare value; delim, quoted and (when at least
     * one delimiter declares suffixes) suffixes for a delimited value.
     *
     * @param KeywordFeature $feature a feature for which hasValue() is true
     * @return string regex fragment without delimiters
     */
    private function getValueRegex( KeywordFeature $feature ) {
        Assert::invariant( $feature->hasValue(), __METHOD__ . ' called but hasValue() is false' );
        if ( $feature->greedy() ) {
            // XXX: we send raw value to the keyword
            return '(?<unquoted>.+)';
        } else {
            $quantifier = $feature->allowEmptyValue() ? '*' : '+';
            // Collect all quoted value delimiters (usually only " but can be / for regexes)
            $allDelims = '';
            $optionalSuffixes = [];
            foreach ( $feature->getValueDelimiters() as $delimConfig ) {
                Assert::precondition( strlen( $delimConfig['delimiter'] ) === 1,
                    "Value delimiter must be a single byte char" );
                $delim = preg_quote( $delimConfig['delimiter'], '/' );
                $allDelims .= $delim;
                if ( isset( $delimConfig['suffixes'] ) ) {
                    // Use lookbehind to only match the suffix if it was used with the proper delimiter
                    // i.e i should only be matched in /regex/i not "regex"i
                    $optionalSuffixes[] = "(?<=$delim)" . preg_quote( $delimConfig['suffixes'], '/' );
                }
            }
            $quotedValue = "(?<delim>[$allDelims])" . // Capture the delimiter so it can be backreferenced
                           // use a negative lookahead to consume any char that is not the captured
                           // delimiter but also accept an escaped (backslash-prefixed) delimiter
                           "(?<quoted>(?:\\\\\g{delim}|(?!\g{delim}).)*)" .
                           "\g{delim}";
            if ( $optionalSuffixes ) {
                $quotedValue .= "(?<suffixes>" . implode( '|', $optionalSuffixes ) . ')?';
            }
            // XXX: we support only " to break the unquoted value
            $unquotedValue = "(?<unquoted>[^\"\pZ\pC]$quantifier)";
            return "(?:$quotedValue|$unquotedValue)";
        }
    }

    /**
     * @return ParseWarning[] the warnings collected so far
     */
    public function getWarnings() {
        return $this->warnings;
    }

    /**
     * Add a warning, positioned at the current offset.
     *
     * @param string $message i18n message key
     * @param mixed ...$params
     */
    public function addWarning( $message, ...$params ) {
        $this->warnings[] = new ParseWarning( $message, $this->currentOffset, [], null, $params );
    }
}