wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Query/Builder/NearMatchFieldQueryBuilder.php

Summary

Maintainability
A
2 hrs
Test Coverage
<?php

namespace CirrusSearch\Query\Builder;

use CirrusSearch\Parser\AST\EmptyQueryNode;
use CirrusSearch\Parser\AST\FuzzyNode;
use CirrusSearch\Parser\AST\KeywordFeatureNode;
use CirrusSearch\Parser\AST\ParsedNode;
use CirrusSearch\Parser\AST\ParsedQuery;
use CirrusSearch\Parser\AST\PhrasePrefixNode;
use CirrusSearch\Parser\AST\PhraseQueryNode;
use CirrusSearch\Parser\AST\PrefixNode;
use CirrusSearch\Parser\AST\Visitor\LeafVisitor;
use CirrusSearch\Parser\AST\WildcardNode;
use CirrusSearch\Parser\AST\WordsQueryNode;
use CirrusSearch\Query\InTitleFeature;
use CirrusSearch\SearchConfig;
use Elastica\Query\AbstractQuery;
use Elastica\Query\MatchNone;
use Elastica\Query\MultiMatch;
use Wikimedia\Assert\Assert;

/**
 * ParseQuery visitor that attempts to extract a form that resembles to the near match query.
 * This implementation tries to mimic the strategy of the old query parser that works by removing
 * keywords. It might make sense in the future to reconsider this approach and see if there are
 * better strategies to apply with the help of the ParsedQuery.
 */
class NearMatchFieldQueryBuilder {
    public const ALL_NEAR_MATCH = "all_near_match";
    public const ALL_NEAR_MATCH_ACCENT_FOLDED = self::ALL_NEAR_MATCH . ".asciifolding";
    private array $profile;

    public static function defaultFromSearchConfig( SearchConfig $config ): self {
        return self::defaultFromWeight( $config->get( 'CirrusSearchNearMatchWeight' ) ?: 2 );
    }

    public static function defaultFromWeight( float $weight ): self {
        return new self(
            [ "fields" => [
                [ "name" => self::ALL_NEAR_MATCH, "weight" => round( $weight, 3 ) ],
                [ "name" => self::ALL_NEAR_MATCH_ACCENT_FOLDED, "weight" => round( $weight * 0.75, 3 ) ]
            ] ]
        );
    }

    public function __construct( array $profile ) {
        $this->profile = $profile;
    }

    public function buildFromParsedQuery( ParsedQuery $query ): AbstractQuery {
        $visitor = new class( $query ) extends LeafVisitor {
            public string $nearMatch;

            public function __construct( ParsedQuery $query ) {
                parent::__construct();
                $this->nearMatch = $query->getQuery();
                $nsHeader = $query->getNamespaceHeader();
                if ( $nsHeader != null ) {
                    $this->blank( $nsHeader );
                }
            }

            /**
             * Blank the portion of the search query located at the same location as the $node.
             * A custom replacement can be passed but must not have a length greater than this location.
             * @param ParsedNode $node the node holding the location of the query string we want to blank
             * @param string $replacement optional replacement string to use
             */
            private function blank( ParsedNode $node, string $replacement = "" ): void {
                $l = $node->getEndOffset() - $node->getStartOffset();
                Assert::parameter( strlen( $replacement ) < $l, '$replacement',
                    'must be shorter than the replaced ParsedNode' );
                $this->nearMatch = substr_replace(
                    $this->nearMatch,
                    str_pad( $replacement, $l, " " ),
                    $node->getStartOffset(), $l
                );
            }

            /** {@inheritdoc} */
            public function visitWordsQueryNode( WordsQueryNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitPhraseQueryNode( PhraseQueryNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitFuzzyNode( FuzzyNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitPrefixNode( PrefixNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitWildcardNode( WildcardNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitEmptyQueryNode( EmptyQueryNode $node ) {
            }

            /** {@inheritdoc} */
            public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
                if ( !$this->negated() && ( $node->getKeyword() instanceof InTitleFeature ) && $node->getParsedValue() == [] ) {
                    // TODO: generalize this InTitleFeature behavior
                    // We want to keep the text of the intitle keyword on if:
                    // - it's not negated
                    // - it's not a regular expression (using $node->getParsedValue() == [] )
                    $this->blank( $node, $node->getQuotedValue() );
                } else {
                    $clause = $this->getCurrentBooleanClause();
                    // painful attempt to keep a weird edge-case of the old query parser that does not
                    // support negating keyword clause with an explicit NOT.
                    // Might be interesting to re-consider the usefulness of such edge-case
                    // "NOT keyword:value" becomes "NOT"
                    // but "-keyword:value" becomes ""
                    // we detect the use of NOT or - using BooleanClause::isExplicit
                    $negatedNode = $clause != null ? $clause->getNegatedNode() : null;
                    if ( $negatedNode !== null && !$clause->isExplicit() ) {
                        // the negated node should have the proper offsets to blank the "-"
                        $this->blank( $negatedNode );
                    } else {
                        $this->blank( $node );
                    }
                }
            }
        };
        $query->getRoot()->accept( $visitor );
        $queryString = trim( preg_replace( '/\s{2,}/', ' ', $visitor->nearMatch ) );

        return $this->buildFromQueryString( $queryString );
    }

    public function buildFromQueryString( string $query ): AbstractQuery {
        if ( preg_match( '/^\s*$/', $query ) === 1 ) {
            return new MatchNone();
        }
        $allQuery = new MultiMatch();
        $allQuery->setQuery( $query );
        $allQuery->setFields(
            array_map(
                static function ( array $fieldDef ): string {
                    return $fieldDef["name"] . "^" . $fieldDef["weight"];
                },
                $this->profile["fields"]
            )
        );
        return $allQuery;
    }

}