wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Parser/AST/Visitor/QueryFixer.php

Summary

Maintainability
A
3 hrs
Test Coverage
<?php

namespace CirrusSearch\Parser\AST\Visitor;

use CirrusSearch\Parser\AST\BooleanClause;
use CirrusSearch\Parser\AST\EmptyQueryNode;
use CirrusSearch\Parser\AST\FuzzyNode;
use CirrusSearch\Parser\AST\KeywordFeatureNode;
use CirrusSearch\Parser\AST\NamespaceHeaderNode;
use CirrusSearch\Parser\AST\NegatedNode;
use CirrusSearch\Parser\AST\ParsedBooleanNode;
use CirrusSearch\Parser\AST\ParsedNode;
use CirrusSearch\Parser\AST\ParsedQuery;
use CirrusSearch\Parser\AST\PhrasePrefixNode;
use CirrusSearch\Parser\AST\PhraseQueryNode;
use CirrusSearch\Parser\AST\PrefixNode;
use CirrusSearch\Parser\AST\WildcardNode;
use CirrusSearch\Parser\AST\WordsQueryNode;
use HtmlArmor;
use Wikimedia\Assert\Assert;

/**
 * Inspect a query and determine what parts of it can be sent to a typo correction mechanism and
 * provide a method to fix the query once the corrected substring is known.
 *
 * Typical usage: obtain an instance with build(), call getFixablePart() (which triggers the
 * visit of the parsed query) and then call fix() with the corrected text.
 */
class QueryFixer implements Visitor {
    /**
     * @var \SplObjectStorage|null QueryFixer instances keyed by their ParsedQuery, see build()
     */
    private static $cache;

    /**
     * @var ParsedQuery the query being inspected
     */
    private $parsedQuery;

    /**
     * @var bool true once the root of the parsed query has been visited
     */
    private $visited = false;

    /**
     * @var ParsedNode|null the best candidate found so far (null when nothing is fixable)
     */
    private $node;

    /**
     * @var bool true when a visited wildcard node contains a question mark
     */
    private $hasQMarkInWildcard = false;

    /**
     * @var int length (in characters) of the text held by the current candidate node
     */
    private $currentSize = 0;

    /**
     * @var bool true when this branch is "negated".
     */
    private $inNegation = false;

    /**
     * @var bool true when the query contains a construct that inhibits the fixer
     */
    private $isComplex = false;

    /**
     * @param ParsedQuery $query
     */
    public function __construct( ParsedQuery $query ) {
        $this->parsedQuery = $query;
    }

    /**
     * Get the QueryFixer attached to this query, creating and caching it when needed.
     *
     * @param ParsedQuery $query
     * @return QueryFixer
     */
    public static function build( ParsedQuery $query ) {
        if ( self::$cache === null || count( self::$cache ) > 100 ) {
            // Build the cache for the first time or drop it for a new empty one just in case this class
            // is used from a maint script that treats/parses millions of queries
            self::$cache = new \SplObjectStorage();
        }

        $fixer = self::$cache[$query] ?? null;
        if ( $fixer === null ) {
            $fixer = new self( $query );
            self::$cache[$query] = $fixer;
        }
        return $fixer;
    }

    /**
     * Get the longest phrase that is subject to typo correction.
     * It's generally a set of consecutive words.
     *
     * The first call visits the parsed query; subsequent calls reuse the result of that visit.
     *
     * @return string|null the text to send to the typo correction mechanism or null when
     *  no part of the query can be fixed
     */
    public function getFixablePart() {
        if ( !$this->visited ) {
            $this->visited = true;
            $this->parsedQuery->getRoot()->accept( $this );
        }

        if ( $this->isComplex ) {
            // A single complex construct anywhere in the query disables the fixer entirely
            $this->node = null;
        }

        if ( $this->hasQMarkInWildcard && $this->parsedQuery->hasCleanup( ParsedQuery::CLEANUP_QMARK_STRIPPING ) ) {
            // We may not be able to reconstruct this kind of queries properly
            // If a question mark is legitimately removed we agree that it's OK to present the user
            // with its original query minus the question marks.
            // But if the user explicitly escaped the question mark so that it generates a valid
            // wildcard query we don't attempt to re-escape the resulting query.
            $this->node = null;
        }

        // @phan-suppress-next-line PhanSuspiciousValueComparison
        if ( $this->node === null ) {
            return null;
        }

        if ( $this->node instanceof KeywordFeatureNode ) {
            return $this->node->getValue();
        } elseif ( $this->node instanceof WordsQueryNode ) {
            return $this->node->getWords();
        } else {
            /** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
            Assert::invariant( false, "Unsupported node type " . get_class( $this->node ) );
            return null;
        }
    }

    /**
     * Replace the fixable part of the visited query with the provided replacement
     *
     * The characters ~ ? * " and \ found in the replacement are escaped with a backslash
     * before being inserted in the query.
     *
     * @param HtmlArmor|string $replacement If HtmlArmor is provided all modifications will be
     *  html safe and HtmlArmor will be returned. If a string is provided no escaping will occur.
     * @return HtmlArmor|string|null the fixed query or null when the query has no fixable part
     * @throws \InvalidArgumentException when the provided HtmlArmor wraps a null value
     */
    public function fix( $replacement ) {
        Assert::precondition( $this->visited, "getFixablePart must be called before trying to fix the query" );
        if ( $this->node === null ) {
            return null;
        }

        $escapeBoundaries = false;
        if ( $replacement instanceof HtmlArmor ) {
            $escapeBoundaries = true;
            $replacement = HtmlArmor::getHtml( $replacement );
            if ( $replacement === null ) {
                throw new \InvalidArgumentException( '$replacement cannot be null nor wrap a null value' );
            }
        }
        $replacement = preg_replace( '/[~?*"\\\\]/', '\\\\$0', $replacement );

        // Everything that precedes the fixable node is kept verbatim
        $prefix = "";
        if ( $this->parsedQuery->hasCleanup( ParsedQuery::TILDE_HEADER ) ) {
            $prefix .= "~";
        }
        $prefix .= substr( $this->parsedQuery->getQuery(), 0, $this->node->getStartOffset() );
        if ( $this->node instanceof KeywordFeatureNode ) {
            // Only the value of the keyword is replaced, the "key:" part has to be restored
            $prefix .= $this->node->getKey() . ':';
        }

        // Everything that follows the fixable node is kept verbatim
        $suffix = substr( $this->parsedQuery->getQuery(), $this->node->getEndOffset() );

        if ( $escapeBoundaries ) {
            // The replacement is already html, only the parts taken from the query need escaping
            $prefix = htmlspecialchars( $prefix );
            $suffix = htmlspecialchars( $suffix );
            $fixed = $prefix . $replacement . $suffix;
            return new HtmlArmor( $fixed );
        }

        return $prefix . $replacement . $suffix;
    }

    /**
     * Keep this node as the candidate if it is outside a negation, longer than the current
     * candidate and made of an acceptable string.
     *
     * @param WordsQueryNode $node
     */
    public function visitWordsQueryNode( WordsQueryNode $node ) {
        if ( $this->inNegation ) {
            return;
        }
        $words = $node->getWords();
        $siz = mb_strlen( $words );
        if ( $siz > $this->currentSize && $this->acceptableString( $words ) ) {
            $this->node = $node;
            $this->currentSize = $siz;
        }
    }

    /**
     * Determine if this substring of the query is suitable for being fixed.
     * Excludes string with chars that may require escaping (*, ?, " and \)
     * @param string $str
     * @return bool
     */
    private function acceptableString( $str ) {
        // We ignore word parts that we may have to escape
        // when presenting the query back to the user
        return preg_match( '/[*?"\\\\]/', $str ) !== 1;
    }

    /**
     * Determine if this keyword is an intitle keyword whose delimiter is the empty string,
     * the only kind of keyword the fixer considers.
     *
     * @param KeywordFeatureNode $node
     * @return bool
     */
    private function isUndelimitedIntitle( KeywordFeatureNode $node ) {
        return $node->getKey() === 'intitle' && $node->getDelimiter() === '';
    }

    /**
     * Phrase queries inhibit the fixer.
     *
     * @param PhraseQueryNode $node
     */
    public function visitPhraseQueryNode( PhraseQueryNode $node ) {
        $this->isComplex = true;
    }

    /**
     * Phrase prefix queries inhibit the fixer.
     *
     * @param PhrasePrefixNode $node
     */
    public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
        $this->isComplex = true;
    }

    /**
     * Fuzzy queries inhibit the fixer.
     *
     * @param FuzzyNode $node
     */
    public function visitFuzzyNode( FuzzyNode $node ) {
        $this->isComplex = true;
    }

    /**
     * Prefix queries inhibit the fixer.
     *
     * @param PrefixNode $node
     */
    public function visitPrefixNode( PrefixNode $node ) {
        $this->isComplex = true;
    }

    /**
     * Wildcard queries inhibit the fixer. The presence of a question mark is also recorded
     * as it is checked separately by getFixablePart().
     *
     * @param WildcardNode $node
     */
    public function visitWildcardNode( WildcardNode $node ) {
        if ( str_contains( $node->getWildcardQuery(), '?' ) ) {
            $this->hasQMarkInWildcard = true;
        }
        $this->isComplex = true;
    }

    /**
     * Nothing to inspect in an empty query.
     *
     * @param EmptyQueryNode $node
     */
    public function visitEmptyQueryNode( EmptyQueryNode $node ) {
    }

    /**
     * Keep this node as the candidate if it is an undelimited intitle keyword outside a
     * negation whose value is longer than the current candidate and acceptable.
     *
     * @param KeywordFeatureNode $node
     */
    public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
        // FIXME: fixing intitle is perhaps a side effect of the original cirrus query parser
        if ( !$this->inNegation && $this->isUndelimitedIntitle( $node ) ) {
            $value = $node->getValue();
            // Measured in characters (not bytes) so that the size is comparable with the one
            // recorded by visitWordsQueryNode()
            $siz = mb_strlen( $value );
            if ( $siz > $this->currentSize && $this->acceptableString( $value ) ) {
                $this->node = $node;
                $this->currentSize = $siz;
            }
        }
    }

    /**
     * Visit every clause of the boolean node.
     *
     * @param ParsedBooleanNode $node
     */
    public function visitParsedBooleanNode( ParsedBooleanNode $node ) {
        foreach ( $node->getClauses() as $clause ) {
            $this->visitBooleanClause( $clause );
        }
    }

    /**
     * Flag the constructs that inhibit the fixer, then visit the node of the clause while
     * tracking whether or not it sits in a negated branch.
     *
     * @param BooleanClause $clause
     */
    public function visitBooleanClause( BooleanClause $clause ) {
        if ( $clause->isExplicit() ) {
            $this->isComplex = true;
        }
        $oldNegated = $this->inNegation;
        $node = $clause->getNode();
        if ( $node instanceof KeywordFeatureNode && $this->isUndelimitedIntitle( $node ) ) {
            // Inhibits the fixer when it sees an un-acceptable value inside a keyword (legacy browsertest_176)
            $this->isComplex = $this->isComplex || !$this->acceptableString( $node->getValue() );
        }
        if ( $clause->getOccur() === BooleanClause::MUST_NOT ) {
            if ( !$node instanceof KeywordFeatureNode ) {
                // FIXME: (legacy) only negated keywords were accepted
                $this->isComplex = true;
            }
            $this->inNegation = !$this->inNegation;
        }

        $node->accept( $this );
        // Restore the negation state of the enclosing branch
        $this->inNegation = $oldNegated;
    }

    /**
     * @param NegatedNode $node
     */
    final public function visitNegatedNode( NegatedNode $node ) {
        /** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
        Assert::invariant( false, 'NegatedNode should be optimized at parse time' );
    }

    /**
     * @param NamespaceHeaderNode $node
     */
    final public function visitNamespaceHeader( NamespaceHeaderNode $node ) {
        /** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
        Assert::invariant( false, 'Not yet part of the AST, should not be visited.' );
    }
}