wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Parser/QueryStringRegex/QueryStringRegexParser.php

Summary

Maintainability
F
3 days
Test Coverage
<?php

namespace CirrusSearch\Parser\QueryStringRegex;

use CirrusSearch\Parser\AST\BooleanClause;
use CirrusSearch\Parser\AST\EmptyQueryNode;
use CirrusSearch\Parser\AST\KeywordFeatureNode;
use CirrusSearch\Parser\AST\NamespaceHeaderNode;
use CirrusSearch\Parser\AST\NegatedNode;
use CirrusSearch\Parser\AST\ParsedBooleanNode;
use CirrusSearch\Parser\AST\ParsedNode;
use CirrusSearch\Parser\AST\ParsedQuery;
use CirrusSearch\Parser\AST\ParseWarning;
use CirrusSearch\Parser\AST\PhraseQueryNode;
use CirrusSearch\Parser\AST\Visitor\KeywordNodeVisitor;
use CirrusSearch\Parser\AST\WordsQueryNode;
use CirrusSearch\Parser\NamespacePrefixParser;
use CirrusSearch\Parser\ParsedQueryClassifiersRepository;
use CirrusSearch\Parser\QueryParser;
use CirrusSearch\Query\KeywordFeature;
use CirrusSearch\Query\PrefixFeature;
use CirrusSearch\Search\Escaper;
use CirrusSearch\Util;
use LogicException;
use MediaWiki\Message\Message;
use Wikimedia\Assert\Assert;

/**
 * Full text query parser that uses regex to parse its token.
 *
 * Far from being a state of the art parser it detects most of its
 * tokens using regular expression. And make arbitrary decisions
 * at tokenization.
 *
 * The tokenizer will understand few token types:
 * - WHITESPACE: all unicode whitespace and control chars ([\pZ\pC])
 *   the WHITESPACE token is ignored and never presented to the parser
 * - EOF: dummy type used to mark end of string
 * - BOOL_AND/BOOL_OR/BOOL_NOT: explicit boolean opeartors
 * - PARSED_NODE: complex type (usually part of the query)
 *
 * PARSED_NODE is a type that groups:
 * - Keywords
 * - Phrase
 * - Words
 * - Wildcards/Prefix
 *
 * Phrase does not have its own token " and is part the tokenization and is never exposed
 * to the parser.
 * Same for negation prefix (! and -), they are parsed at tokenization time.
 *
 * NOTE that this parser is broken by design:
 * - no lexical context support, we first parse keywords
 * - no support for groupings (parenthesis)
 */
class QueryStringRegexParser implements QueryParser {
    /**
     * Whitespace regex including unicode and some control chars
     */
    private const WHITESPACE_REGEX = '/\G[\pZ\pC]+/u';

    public const QUERY_LEN_HARD_LIMIT = 4096;

    /**
     * see T66350
     */
    private const GERSHAYIM_REGEX = '/(\p{L}{2,})(?:")(\p{L})(?=[^\p{L}]|$)/u';

    /**
     * Supported explicit boolean operator
     *
     */
    private const EXPLICIT_BOOLEAN_OPERATOR = '/\G(?:(?<AND>AND|&&)|(?<OR>OR|\|\|)|(?<NOT>NOT))(?![^\pZ\pC"])/u';

    /**
     * @var \CirrusSearch\Parser\KeywordRegistry
     */
    private $keywordRegistry;

    /**
     * @var Escaper
     */
    private $escaper;

    /**
     * @var ParsedQueryClassifiersRepository
     */
    private $classifierRepository;

    /**
     * @var string|null user query (null when not yet cleaned up)
     */
    private $query;

    /**
     * @var string Either "all", "break", or "final"
     */
    private $questionMarkStripLevel;

    /**
     * @var string the raw query as received by the search engine
     */
    private $rawQuery;

    /**
     * @var KeywordParser
     */
    private $keywordParser;

    /**
     * @var PhraseQueryParser
     */
    private $phraseQueryParser;

    /**
     * @var NonPhraseParser
     */
    private $nonPhraseParser;

    /**
     * @var OffsetTracker track offsets of parsed keywords
     */
    private $keywordOffsetsTracker;

    /**
     * @var ParsedNode[]
     */
    private $preTaggedNodes = [];

    /**
     * Token set after calling nextToken
     * @var Token|null
     */
    private $token;

    /**
     * Last token seen (set within nextToken)
     * @var Token|null
     */
    private $lookBehind;

    /**
     * Current offset
     * NOTE: offset is moved after call advance
     * @var int
     */
    private $offset;

    /**
     * @var bool[] indexed cleanups applied (indexed by the cleanup type)
     * @see ParsedQuery::hasCleanup()
     */
    private $queryCleanups = [];

    /**
     * Errors detected while parsing the query
     * @var ParseWarning[]
     */
    private $warnings = [];

    /**
     * @var NamespaceHeaderNode|null
     */
    private $namespaceHeader;

    /**
     * @var NamespacePrefixParser
     */
    private $namespacePrefixParser;

    private const DEFAULT_OCCUR = BooleanClause::MUST;

    /**
     * @var int
     */
    private $maxQueryLen;

    /**
     * @param \CirrusSearch\Parser\KeywordRegistry $keywordRegistry
     * @param Escaper $escaper
     * @param string $qmarkStripLevel Level of question mark stripping to apply, either "all",
     *  "break", or "final"
     * @param ParsedQueryClassifiersRepository $classifierRepository
     * @param NamespacePrefixParser $namespacePrefixParser
     * @param int|null $maxQueryLen maximum length of the query in chars
     * @see Util::stripQuestionMarks() for acceptable $qmarkStripLevel values
     */
    public function __construct(
        \CirrusSearch\Parser\KeywordRegistry $keywordRegistry,
        Escaper $escaper,
        $qmarkStripLevel,
        ParsedQueryClassifiersRepository $classifierRepository,
        NamespacePrefixParser $namespacePrefixParser,
        ?int $maxQueryLen
    ) {
        $this->keywordRegistry = $keywordRegistry;
        $this->escaper = $escaper;
        $this->keywordParser = new KeywordParser();
        $this->phraseQueryParser = new PhraseQueryParser( $escaper );
        $this->nonPhraseParser = new NonPhraseParser( $escaper );
        $this->questionMarkStripLevel = $qmarkStripLevel;
        $this->classifierRepository = $classifierRepository;
        $this->namespacePrefixParser = $namespacePrefixParser;
        $this->maxQueryLen = $maxQueryLen ?: 300;
    }

    /**
     * Reinit internal parser states
     * @param string $rawQuery
     */
    private function reInit( $rawQuery ) {
        $this->rawQuery = $rawQuery;
        $this->query = null;
        $this->keywordOffsetsTracker = new OffsetTracker();
        $this->token = null;
        $this->lookBehind = null;
        $this->preTaggedNodes = [];
        $this->warnings = [];
        $this->queryCleanups = [];
        $this->namespaceHeader = null;
        $this->offset = 0;
    }

    /**
     * Apply some cleanups to the input query prior to parsing it
     * Ideally the parser should be able to handle the query without modifying it
     * but in some cases it simply way easier to handle this this way.
     * Cleanups applied:
     * - Question mark stripping depending on $this->questionMarkStripLevel
     * - gershayim quirks if $this->escaper->getLanguage() is hebrew
     */
    private function cleanup() {
        $query = $this->rawQuery;
        $nquery = Util::stripQuestionMarks( $query, $this->questionMarkStripLevel );
        if ( $nquery !== $query ) {
            $this->queryCleanups[ParsedQuery::CLEANUP_QMARK_STRIPPING] = true;
            $query = $nquery;
        }
        if ( $this->escaper->getLanguage() === 'he' ) {
            $nquery = preg_replace( self::GERSHAYIM_REGEX, '$1\"$2', $query );
            if ( $nquery !== $query ) {
                $this->queryCleanups[ParsedQuery::CLEANUP_GERSHAYIM_QUIRKS] = true;
                $query = $nquery;
            }
        }
        if ( strlen( $query ) > 0 && $query[0] === '~' ) {
            $query = substr( $query, 1 );
            $this->queryCleanups[ParsedQuery::TILDE_HEADER] = true;
        }
        $this->query = $query;
    }

    /**
     * @param string $query
     * @return \CirrusSearch\Parser\AST\ParsedQuery
     * @throws SearchQueryParseException
     */
    public function parse( string $query ): ParsedQuery {
        $this->reInit( $query );
        $queryLen = mb_strlen( $query );
        if ( $queryLen > self::QUERY_LEN_HARD_LIMIT ) {
            throw new SearchQueryParseException( 'cirrussearch-query-too-long',
                $queryLen, self::QUERY_LEN_HARD_LIMIT );
        }
        $this->cleanup();
        $this->parseNsHeader();
        $this->token = new Token( $this->query );
        $this->lookBehind = new Token( $this->query );

        // First parse keywords
        $nonGreedyHeaders = [];
        $greedyHeaders = [];
        $greedy = [];
        $allowingEmpty = [];
        $normalKeywords = [];
        foreach ( $this->keywordRegistry->getKeywords() as $keyword ) {
            // Parsing depends on the nature of the keyword
            // 1. non greedy query headers
            // 2. greedy query headers
            // 3. greedy
            // 4. allowed empty values (see prefer-recent)
            // 5. normal
            // FIXME: refactor this so that parsing is less dependent on keyword ordering
            // we could try to identify all keyword prefixes in a single regex like /(?<=\G|[\pZ\pC])(intitle|prefer-recent|...):/
            // and iterate over there. Currently we workaround this issue by separating keywords into categories but that is not
            // sufficient it's still dependent on ordering within a specific category.
            if ( !$keyword->greedy() && $keyword->queryHeader() ) {
                $nonGreedyHeaders[] = $keyword;
            } elseif ( $keyword->greedy() && $keyword->queryHeader() ) {
                $greedyHeaders[] = $keyword;
            } elseif ( $keyword->greedy() ) {
                $greedy[] = $keyword;
            } elseif ( $keyword->allowEmptyValue() ) {
                $allowingEmpty[] = $keyword;
            } else {
                $normalKeywords[] = $keyword;
            }
        }
        $this->parseKeywords( $nonGreedyHeaders );
        $this->parseKeywords( $greedyHeaders );
        $this->parseKeywords( $greedy );
        $this->parseKeywords( $allowingEmpty );
        $this->parseKeywords( $normalKeywords );
        $this->warnings = array_merge( $this->warnings, $this->keywordParser->getWarnings() );
        // All parsed keywords have their offsets marked in $this->keywordOffsetsTracker
        // We then reparse the query from the beginning finding holes between keyword offsets
        uasort( $this->preTaggedNodes, static function ( ParsedNode $a, ParsedNode $b ) {
            if ( $a->getStartOffset() < $b->getStartOffset() ) {
                return -1;
            } else {
                // We cannot have equality here
                return 1;
            }
        } );
        reset( $this->preTaggedNodes );

        $this->checkQueryLen();
        $root = $this->expression();
        $additionalNamespaces = $this->extractRequiredNamespaces( $root );
        return new ParsedQuery( $root, $this->query, $this->rawQuery, $this->queryCleanups,
            $this->namespaceHeader, $additionalNamespaces, $this->warnings, $this->classifierRepository );
    }

    private function createClause( ParsedNode $node, $explicit = false, $occur = null ) {
        if ( $node instanceof NegatedNode ) {
            // OR NOT is simply MUST_NOT, there's no SHOULD_NOT in lucene
            // so simply do what lucene QueryString does: force MUST_NOT whenever
            // we encounter a negated clause.
            return new BooleanClause( $node->getChild(), BooleanClause::MUST_NOT,
                $explicit || $node->getNegationType() === 'NOT', $node );
        }
        return new BooleanClause( $node, $occur ?? self::DEFAULT_OCCUR, $explicit );
    }

    /**
     * (([NOT] Node) (AND|OR)?)*
     * Tries to follow behavior with backward order precedence:
     * - A AND B OR C => MUST:A SHOULD:B SHOULD:C
     * - A OR B AND C => SHOULD:A MUST:B MUST:C
     *
     * NOT is always MUST_NOT:
     * - A OR NOT B: A -B
     *
     * Syntax errors fallback:
     * - NOT: MUST:NOT
     * - NOT NOT: MUST_NOT:NOT
     * - NOT AND FOO: MUST_NOT:AND MUST:FOO
     * - NOT !FOO: MUST:FOO
     * @return ParsedNode
     */
    private function expression() {
        $clauses = [];
        $left = null;
        // Last boolean operator seen, -1 means none
        $lastBoolType = -1;
        $explicitNegation = false;
        // TODO: simplify, this is a bit hairy
        while ( $this->nextToken() ) {
            if ( $left === null ) {
                // First iteration
                if ( !$this->isLeaf() ) {
                    $left = $this->fallbackToWord( [ Token::NOT, Token::PARSED_NODE ] );
                } else {
                    $left = $this->negatedLeaf();
                }
                Assert::postcondition( $left !== null, '$left must not be null' );
                continue;
            }

            $explicitNegation = false;
            // The last boolean operator seen before the last one, -1 means none
            // used to know if the node was attached explicitly by the user
            $beforeLastBoolType = $lastBoolType;
            $lastBoolType = -1;
            switch ( $this->token->getType() ) {
                case Token::NOT:
                    // NOT something
                    $this->advance();
                    if ( !$this->nextToken() ) {
                        // NOT<EOF>
                        // strategy is to simply eat the NOT token as a word query
                        $node = $this->unexpectedEOF( [ Token::PARSED_NODE ] );
                        if ( $left instanceof WordsQueryNode ) {
                            // Collapse it to previous words
                            $left = $this->mergeWords( $left, $node );
                        } else {
                            // or add new boolean clause
                            $clauses[] = $this->createClause( $left, $beforeLastBoolType !== -1 );
                            $left = $node;
                        }
                    } else {
                        $explicitNegation = true;
                        $clauses[] = $this->createClause( $left );
                        $left = $this->explicitlyNegatedNode();
                    }
                    Assert::postcondition( $left !== null, '$left must not be null' );
                    break;
                case Token::PARSED_NODE:
                    // A word or a keyword
                    // note: negation prefix is eaten by the tokenizer
                    if ( $left instanceof WordsQueryNode
                            && $this->token->getNode() instanceof WordsQueryNode
                    ) {
                        $lastBoolType = $beforeLastBoolType;
                        $left = $this->collapseWords( $left );

                    } else {
                        $clauses[] = $this->createClause( $left, $explicitNegation );
                        $left = $this->leaf();
                    }
                    Assert::postcondition( $left !== null, '$left must not be null' );
                    break;
                case Token::BOOL_AND:
                case Token::BOOL_OR:
                    $lastBoolType = $this->token->getType();
                    $this->advance();
                    if ( !$this->nextToken() ) {
                        $lastBoolType = $beforeLastBoolType;
                        $node = $this->unexpectedEOF( [ Token::NOT, Token::PARSED_NODE ] );
                        if ( $left instanceof WordsQueryNode ) {
                            // "catapult ||"
                            $left = $this->mergeWords( $left, $node );
                        } else {
                            // "!catapult ||"
                            $clauses[] = $this->createClause( $left, $beforeLastBoolType !== -1 );
                            $left = $node;
                        }
                        Assert::postcondition( $left !== null, '$left must not be null' );
                        break;
                    }
                    $occur = $this->boolToOccur( $lastBoolType );
                    if ( $this->isLeaf() ) {
                        $node = $this->negatedLeaf();
                        $clauses[] = $this->createClause( $left, true, $occur );
                        $left = $node;
                    } else {
                        $clauses[] = $this->createClause( $left );
                        $left = $this->fallbackToWord( [ Token::NOT, Token::PARSED_NODE ] );
                    }
                    Assert::postcondition( $left !== null, '$left must not be null' );
                    break;
                default:
                    throw new LogicException( "BUG: unexpected token type {$this->token->getType()}" );
            }
        }

        if ( $left === null ) {
            return new EmptyQueryNode( 0, strlen( $this->query ) );
        }
        if ( $clauses !== [] ) {
            if ( $lastBoolType !== -1 ) {
                $occur = $this->boolToOccur( $lastBoolType );
                $clauses[] = $this->createClause( $left, true, $occur );
            } else {
                $clauses[] = $this->createClause( $left, $explicitNegation );
            }
            return $this->createBoolNode( $clauses );
        } elseif ( $left instanceof NegatedNode ) {
            $clauses[] = $this->createClause( $left );
            return $this->createBoolNode( $clauses );
        }
        return $left;
    }

    /**
     * @param BooleanClause[] $clauses
     * @return ParsedBooleanNode
     */
    private function createBoolNode( array $clauses ) {
        $end = end( $clauses )->getNode()->getEndOffset();
        $start = reset( $clauses )->getNode()->getStartOffset();
        return new ParsedBooleanNode( $start, $end, $clauses );
    }

    /**
     * Collapse the current token with $left.
     * @param WordsQueryNode $left
     * @return WordsQueryNode
     */
    private function collapseWords( WordsQueryNode $left ) {
        $start = $left->getStartOffset();
        $end = $this->token->getEnd();
        $word = new WordsQueryNode( $start, $end,
            $this->escaper->unescape( substr( $this->query, $start, $end - $start ) ) );
        $this->advance();
        return $word;
    }

    private function mergeWords( WordsQueryNode $left, WordsQueryNode $right ) {
        $start = $left->getStartOffset();
        $end = $right->getEndOffset();
        return new WordsQueryNode( $start, $end,
            $this->escaper->unescape( substr( $this->query, $start, $end - $start ) ) );
    }

    /**
     * @param int[] $expected token types
     * @return WordsQueryNode
     */
    private function fallbackToWord( array $expected ) {
        $this->warnings[] = new ParseWarning(
            'cirrussearch-parse-error-unexpected-token',
            $this->token->getStart(),
            Token::getTypeLabels( $expected ),
            Token::getTypeLabel( $this->token->getType() )
        );
        $word = new WordsQueryNode( $this->token->getStart(), $this->token->getEnd(),
            $this->token->getImage() );
        $this->advance();
        return $word;
    }

    /**
     * @param int[] $expected token types
     * @return WordsQueryNode
     */
    private function unexpectedEOF( array $expected ) {
        $this->warnings[] = new ParseWarning(
            'cirrussearch-parse-error-unexpected-end',
            strlen( $this->query ),
            Token::getTypeLabels( $expected ),
            Token::getTypeLabel( $this->token->getType() )
        );
        return new WordsQueryNode( $this->lookBehind->getStart(), $this->lookBehind->getEnd(),
            $this->lookBehind->getImage() );
    }

    private function negatedLeaf() {
        if ( $this->token->getType() === Token::NOT ) {
            $this->advance();
            if ( !$this->nextToken() ) {
                return $this->unexpectedEOF( [ Token::PARSED_NODE ] );
            }
            return $this->explicitlyNegatedNode();
        }
        return $this->leaf();
    }

    private function leaf() {
        $node = $this->token->getNode();
        $this->advance();
        return $node;
    }

    /**
     * (word|phrase|keyword|wildcard|fuzzy)
     *
     * Warnings on :
     *
     * !(word|phrase|keyword|special_word) => double negation dropped
     * (AND|OR) => NOT("AND|OR")
     * EOF => "NOT"
     *
     * @return ParsedNode
     */
    private function explicitlyNegatedNode() {
        if ( $this->token->getType() === Token::PARSED_NODE ) {
            $node = $this->leaf();
            if ( $node instanceof NegatedNode ) {
                $this->warnings[] = new ParseWarning( 'cirrussearch-parse-error-double-negation',
                    $node->getStartOffset() );
                $node = $node->getChild();
            } else {
                $node = new NegatedNode( $this->lookBehind->getStart(), $node->getEndOffset(),
                    // @phan-suppress-next-line PhanTypeMismatchArgumentNullable $node not null after getType
                    $node, $this->lookBehind->getImage() );
            }
        } else {
            $node = $this->fallbackToWord( [ Token::PARSED_NODE ] );
        }
        return $node;
    }

    /**
     * @return bool true if it's a simple word token
     */
    private function isLeaf() {
        return $this->token->getType() === Token::PARSED_NODE
            || $this->token->getType() === Token::NOT;
    }

    /**
     * @param KeywordFeature[] $keywords
     */
    private function parseKeywords( array $keywords ) {
        foreach ( $keywords as $kw ) {
            $parsedKeywords =
                $this->keywordParser->parse( $this->query, $kw, $this->keywordOffsetsTracker, $this->offset );
            $this->keywordOffsetsTracker->appendNodes( $parsedKeywords );
            foreach ( $parsedKeywords as $keyword ) {
                $this->preTaggedNodes[] = $keyword;
            }
        }
    }

    /**
     * @return bool
     */
    private function nextToken() {
        Assert::precondition( $this->token->getStart() < $this->offset,
            'You should not call nextToken() twice on the same offsets' );
        $this->token->copyTo( $this->lookBehind );
        $this->token->reset();
        $nextPretagged = current( $this->preTaggedNodes );
        $queryLen = strlen( $this->query );
        $maxOffset = $nextPretagged !== false ?
            $nextPretagged->getStartOffset() : $queryLen;

        $this->consumeWS();

        if ( $this->offset >= $queryLen ) {
            $this->token->eof();
            return false;
        }

        if ( $nextPretagged !== false && $this->offset === $nextPretagged->getStartOffset() ) {
            $this->token->node( $nextPretagged );
            return true;
        }

        Assert::precondition( $this->offset < $maxOffset, '$this->offset < $maxOffset' );
        if ( $this->consumeBoolOp() ) {
            return true;
        }

        if ( $this->consumePhrase( $maxOffset ) ) {
            return true;
        }

        if ( $this->consumeWord( $maxOffset ) ) {
            return true;
        }

        if ( $this->consumeUnbalancedPhrase( $maxOffset ) ) {
            $this->warnings[] = new ParseWarning( "cirrus-parse-error-unbalanced-phrase", $this->token->getStart() );
            // this is theorically the only remaining option (unbalanced phrase query)
            // if not it means the above is broken and needs a fix
            return true;
        }
        throw new \RuntimeException( "BUG: cannot consume query at offset $this->offset (need to go to $maxOffset)" );
    }

    /**
     * @return bool
     */
    private function consumeWS() {
        $matches = [];
        if ( preg_match( self::WHITESPACE_REGEX, $this->query, $matches, 0, $this->offset ) === 1 ) {
            $this->offset += strlen( $matches[0] );
            return true;
        }
        return false;
    }

    /**
     * Consume an explicit boolean operator (and/or/not)
     * @return bool true if consumed, false otherwise.
     */
    private function consumeBoolOp() {
        $match = [];
        if ( preg_match( self::EXPLICIT_BOOLEAN_OPERATOR, $this->query, $match, 0, $this->offset ) ) {
            // Check captured backward so no need to check that the group actually matched
            if ( isset( $match['NOT'] ) ) {
                $this->token->setType( Token::NOT, $this->offset, $this->offset + strlen( $match['NOT'] ) );
            } elseif ( isset( $match['OR'] ) ) {
                $this->token->setType( Token::BOOL_OR, $this->offset, $this->offset + strlen( $match['OR'] ) );
            } else {
                $this->token->setType( Token::BOOL_AND, $this->offset,
                $this->offset + strlen( $match['AND'] ) );
            }
            return true;
        }
        return false;
    }

    /**
     * @param int $maxOffset
     * @return bool true if consumed, false otherwise.
     */
    private function consumePhrase( $maxOffset ) {
        $node = $this->phraseQueryParser->parse( $this->query, $this->offset, $maxOffset );
        if ( $node !== null ) {
            $this->token->node( $node );
            return true;
        }
        return false;
    }

    /**
     * @param int $maxOffset
     * @return bool
     */
    private function consumeWord( int $maxOffset ) {
        $node = $this->nonPhraseParser->parse( $this->query, $this->offset, $maxOffset );
        if ( $node !== null ) {
            $this->token->node( $node );
            return true;
        }
        return false;
    }

    /**
     * @param int $maxOffset
     * @return bool
     */
    private function consumeUnbalancedPhrase( $maxOffset ) {
        $matches = [];
        if ( preg_match( PhraseQueryParser::PHRASE_START, $this->query, $matches, 0, $this->offset ) === 1 ) {
            $negated = isset( $matches['negate'] ) && strlen( $matches['negate'] ) > 0 ? $matches['negate'] : '';
            $wholeStart = $this->offset;
            $phraseStart = $wholeStart + strlen( $negated );
            $innerStart = $phraseStart + strlen( '"' );
            Assert::invariant( $wholeStart <= $maxOffset, '$start <= $to' );
            $phrase = $maxOffset > $innerStart ? substr( $this->query, $innerStart, $maxOffset - $innerStart ) : "";
            $node = PhraseQueryNode::unbalanced( $phraseStart, $maxOffset, $phrase );
            if ( $negated !== '' ) {
                $node = new NegatedNode( $wholeStart, $maxOffset, $node, $negated );
            }
            $this->token->node( $node );
            return true;
        }
        return false;
    }

    /**
     * advance current offset to the end of the current token
     */
    private function advance() {
        $pretagged = current( $this->preTaggedNodes );
        if ( $pretagged !== false && $this->token->getStart() === $pretagged->getStartOffset() ) {
            next( $this->preTaggedNodes );
        }
        $this->offset = $this->token->getEnd();
    }

    /**
     * @param int $boolOperator
     * @return string
     */
    private function boolToOccur( $boolOperator ) {
        return $boolOperator === Token::BOOL_OR ? BooleanClause::SHOULD : BooleanClause::MUST;
    }

    /**
     * @param ParsedNode $root
     * @return array|string array of additional namespaces, 'all' for everything
     */
    private function extractRequiredNamespaces( ParsedNode $root ) {
        $visitor = new class( [], [ PrefixFeature::class ] ) extends KeywordNodeVisitor {
            /** @var string|int[] */
            public $total = [];

            /**
             * @param KeywordFeatureNode $node
             */
            public function doVisitKeyword( KeywordFeatureNode $node ) {
                if ( $this->total === 'all' ) {
                    return;
                }

                Assert::parameter( $node->getKeyword() instanceof PrefixFeature, '$node', 'must be parsed from PrefixFeature' );
                // @phan-suppress-next-line PhanTypeArraySuspiciousNullable
                $additional = $node->getParsedValue()[PrefixFeature::PARSED_NAMESPACES];
                if ( $additional === 'all' ) {
                    $this->total = 'all';
                    return;
                }
                Assert::precondition( is_array( $additional ),
                    'PrefixFeature::PARSED_NAMESPACES key must point to an array or "all"' );
                $this->total = array_merge( $this->total, array_filter( $additional, function ( $v ) {
                    return !in_array( $v, $this->total );
                } ) );
            }
        };
        $root->accept( $visitor );
        return $visitor->total;
    }

    /**
     * Inspect $this->query to see if it mentions a namespace in the first few chars
     * If yes $this->namespaceHeader will be set and this->offset will be advanced
     * to the actual start of the query.
     */
    private function parseNsHeader() {
        Assert::precondition( $this->offset === 0, 'ns header must be the first parsed bits or ' .
            'you must properly handle offset in this method.' );
        $queryAndNs = $this->namespacePrefixParser->parse( $this->query );
        if ( $queryAndNs !== false ) {
            Assert::postcondition( count( $queryAndNs ) === 2,
                '\SearchEngine::parseNamespacePrefixes() must return false or a 2 elements array' );
            $queryOffset = strlen( $this->query ) - strlen( $queryAndNs[0] );
            if ( $queryAndNs[1] ) {
                Assert::postcondition( is_array( $queryAndNs[1] ) && count( $queryAndNs[1] ) === 1,
                        '\SearchEngine::parseNamespacePrefixes() should return an array whose second ' .
                        'element is falsy or an array of size 1' );
                $this->namespaceHeader = new NamespaceHeaderNode( $this->offset, $queryOffset, reset( $queryAndNs[1] ) );
            } else {
                $this->namespaceHeader = new NamespaceHeaderNode( $this->offset, $queryOffset, 'all' );
            }
            $this->offset = $queryOffset;
        }
    }

    /**
     * Check the length of the query and throws SearchQueryParseException
     * if it's more than what we allow.
     *
     * @throws SearchQueryParseException
     */
    private function checkQueryLen(): void {
        Assert::precondition( $this->query !== null, "Query must be set" );
        $maxLen = $this->maxQueryLen;
        $exemptKeywords = [];
        foreach ( $this->preTaggedNodes as $node ) {
            $realNode = $node;
            if ( $node instanceof NegatedNode ) {
                $realNode = $node->getChild();
            }
            if ( $realNode instanceof KeywordFeatureNode ) {
                $maxLen += mb_strlen( substr( $this->query, $node->getStartOffset(), $node->getEndOffset() ) );
                $exemptKeywords[] = $realNode->getKey();
            }
        }
        $queryLen = mb_strlen( $this->query );
        if ( $queryLen > $maxLen ) {
            if ( $exemptKeywords ) {
                sort( $exemptKeywords );
                throw new SearchQueryParseException( 'cirrussearch-query-too-long-with-exemptions',
                    $queryLen, $this->maxQueryLen, Message::listParam( array_unique( $exemptKeywords ), 'comma' ) );
            } else {
                throw new SearchQueryParseException( 'cirrussearch-query-too-long', $queryLen,
                    $this->maxQueryLen );
            }
        }
    }
}