includes/Parser/QueryStringRegex/NonPhraseParser.php from wikimedia/mediawiki-extensions-CirrusSearch

includes/Parser/QueryStringRegex/NonPhraseParser.php
Summary

Maintainability

5 hrs
Test Coverage

Issues
<?php

namespace CirrusSearch\Parser\QueryStringRegex;

use CirrusSearch\Parser\AST\FuzzyNode;
use CirrusSearch\Parser\AST\NegatedNode;
use CirrusSearch\Parser\AST\ParsedNode;
use CirrusSearch\Parser\AST\PrefixNode;
use CirrusSearch\Parser\AST\WildcardNode;
use CirrusSearch\Parser\AST\WordsQueryNode;
use CirrusSearch\Search\Escaper;
use Wikimedia\Assert\Assert;

/**
 * Parse non-phrase query parts (a single "word" that is not a quoted phrase).
 *
 * Depending on the shape of the word the parser emits:
 * - a FuzzyNode for "word~" / "word~N",
 * - a PrefixNode for "word*",
 * - a WildcardNode when the word matches the configured wildcard pattern,
 * - a WordsQueryNode otherwise.
 * The node is wrapped in a NegatedNode when the word is prefixed with - or !.
 */
class NonPhraseParser {

    /**
     * Detects prefixed negation but ignores negation if not followed by a word character (\w)
     *   -word: properly negated
     *   --word: eaten as "--word"
     *
     * Anchored with \G so it only matches exactly at the offset given to preg_match.
     */
    private const NEGATION = '/\G[-!](?=[\w])/u';

    /**
     * Find the next interesting position while consuming non quoted chars
     * (negated phrase queries as well).
     * Matches, in order of preference:
     * - any escaped sequence (backslash followed by one char): consumed
     * - ! or - when not followed by " (this includes ! or - at the very end
     *   of the string): consumed
     * - a stop character, captured in the "stop" group: ", a separator (\pZ),
     *   a control/other char (\pC), or a ! or - that is followed by "
     *
     * The pattern is not anchored: preg_match searches forward from the offset.
     */
    private const NON_QUOTE = '/\\\\.|[!-](?!")|(?<stop>["!\pZ\pC-])/u';

    /**
     * Detect simple prefix nodes
     * only word characters (\w) followed by one or more *
     */
    private const PREFIX_QUERY = '/^(?<prefix>\w+)[*]+$/u';

    /**
     * Wildcards disallowed at the beginning
     * we arbitrarily allow 3 wildcards to avoid catching random garbage
     * and too costly queries.
     */
    private const DISALLOWED_LEADING_WILDCARD = '/^(?:\w+[?*]){1,3}\w*$/u';

    /**
     * Wildcards allowed at the beginning
     * but we still force the wildcards to be surrounded by letters
     * we allow only 3 wildcards
     */
    private const ALLOWED_LEADING_WILDCARD = '/^(?:(?:[?*](?=\w)(?:\w+[?*]|\w+){1,2}\w*)|(?:(?:\w+[?*]){1,3}\w*))$/u';

    /**
     * We force fuzzy words to have letters in them
     * NOTE that we disallow * or ? here so we can't
     * match fuzzy and wildcard at the same time
     */
    private const FUZZY_WORD = '/^(?<word>\w+)~(?<fuzzyness>[0-2])?$/u';

    /**
     * @var Escaper used to unescape plain words and to know whether leading wildcards are allowed
     */
    private $escaper;

    /**
     * @var string regex used to detect wildcards
     */
    private $wildcardRegex;

    /**
     * @param Escaper $escaper
     */
    public function __construct( Escaper $escaper ) {
        $this->escaper = $escaper;
        // The wildcard pattern only depends on the escaper configuration: pick it once.
        $this->wildcardRegex = $this->escaper->getAllowLeadingWildcard()
            ? self::ALLOWED_LEADING_WILDCARD
            : self::DISALLOWED_LEADING_WILDCARD;
    }

    /**
     * Parse the word starting at byte offset $start.
     *
     * NOTE: the regexes are run against the whole $query starting at $start,
     * they are not bounded by $end. $end is only used as the end of the word
     * when no further match is found and as the limit of the scan loop.
     *
     * @param string $query
     * @param int $start byte offset where the word (or its negation marker) starts
     * @param int $end byte offset, must satisfy $start < $end <= strlen( $query )
     * @return ParsedNode|null the parsed node, null if the word is empty
     */
    public function parse( string $query, int $start, int $end ) {
        Assert::precondition( $start < $end, '$start < $end' );
        Assert::precondition( $end <= strlen( $query ), '$end <= strlen( $query )' );

        $match = [];
        $ret = preg_match( self::NEGATION, $query, $match, PREG_OFFSET_CAPTURE, $start );
        Assert::postcondition( $ret !== false, 'Regex failed: ' . preg_last_error_msg() );

        // $wholeStart includes the negation marker, $wordStart does not.
        $wholeStart = $start;
        $wordStart = $start;
        $negationType = '';
        if ( $ret ) {
            $negationType = $match[0][0];
            $wordStart = $start + strlen( $negationType );
            // Move the scan cursor past the negation marker so that the
            // loop below starts on the first char of the word.
            $start = $wordStart;
        }

        // If the loop below never runs (the negation marker was the last
        // byte before $end) the word is empty and null is returned.
        $wholeEnd = $wordStart;

        while ( $start < $end ) {
            $ret = preg_match( self::NON_QUOTE, $query, $match, PREG_OFFSET_CAPTURE, $start );
            Assert::postcondition( $ret !== false, 'Regex failed: ' . preg_last_error_msg() );
            if ( !$ret ) {
                // Nothing special up to the end of the string: the word runs to $end.
                $wholeEnd = $end;
                break;
            }
            if ( isset( $match['stop'] ) && $match['stop'][1] >= 0 ) {
                // The word ends right before the stop character.
                $wholeEnd = $match['stop'][1];
                break;
            }
            // Escaped sequence or harmless !/-: consume it and keep scanning.
            $start = $match[0][1] + strlen( $match[0][0] );
            $wholeEnd = $start;
        }

        if ( $wholeEnd === $wordStart ) {
            return null;
        }

        $word = substr( $query, $wordStart, $wholeEnd - $wordStart );
        $node = null;
        $match = [];
        if ( str_contains( $word, '~' ) && preg_match( self::FUZZY_WORD, $word, $match ) ) {
            $word = $match['word'];
            if ( isset( $match['fuzzyness'] ) && strlen( $match['fuzzyness'] ) > 0 ) {
                $fuzzyness = intval( $match['fuzzyness'] );
            } else {
                // No explicit fuzziness given after the ~
                $fuzzyness = -1;
            }
            // No need to unescape here, we don't match any punctuation except _
            $node = new FuzzyNode( $wordStart, $wholeEnd, $word, $fuzzyness );
        } elseif ( str_contains( $word, '*' ) || str_contains( $word, '?' ) ) {
            if ( preg_match( self::PREFIX_QUERY, $word, $match ) ) {
                $node = new PrefixNode( $wordStart, $wholeEnd, $match['prefix'] );
            } elseif ( preg_match( $this->wildcardRegex, $word ) ) {
                $node = new WildcardNode( $wordStart, $wholeEnd, $word );
            }
        }

        if ( $node === null ) {
            // Neither fuzzy, prefix nor wildcard: plain words.
            $node = new WordsQueryNode( $wordStart, $wholeEnd, $this->escaper->unescape( $word ) );
        }
        if ( $negationType !== '' ) {
            $node = new NegatedNode( $wholeStart, $wholeEnd, $node, $negationType );
        }
        return $node;
    }
}