wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Query/FullTextSimpleMatchQueryBuilder.php

Summary

Maintainability
B
6 hrs
Test Coverage
<?php

namespace CirrusSearch\Query;

use CirrusSearch\Search\SearchContext;
use CirrusSearch\SearchConfig;
use Elastica\Query\AbstractQuery;
use Elastica\Query\MatchNone;

/**
 * Simple Match query builder, currently based on
 * FullTextQueryStringQueryBuilder to reuse its parsing logic.
 * It only supports queries that do not use the Lucene QueryString syntax
 * and falls back to FullTextQueryStringQueryBuilder when that syntax is used.
 * It generates only simple match/multi_match queries. It supports merging
 * multiple clauses into a dismax query with 'in_dismax'.
 */
class FullTextSimpleMatchQueryBuilder extends FullTextQueryStringQueryBuilder {
    /**
     * @var bool true if the main query was built with the experimental
     *  (simple match) query instead of the parent's query_string query
     */
    private $usedExpQuery = false;

    /**
     * @var float[]|array[] mixed array of field settings used for the main query,
     *  keyed by field name; a value is either a boost or an array of settings
     */
    private $fields;

    /**
     * @var float[]|array[] mixed array of field settings used for the phrase rescore query
     */
    private $phraseFields;

    /**
     * @var float default weight to use for stems
     */
    private $defaultStemWeight;

    /**
     * @var string default multimatch query type
     */
    private $defaultQueryType;

    /**
     * @var string default multimatch min should match
     */
    private $defaultMinShouldMatch;

    /**
     * @var array[] dismax query settings, keyed by dismax name
     */
    private $dismaxSettings;

    /**
     * @var array filter settings
     */
    private $filter;

    /**
     * @param SearchConfig $config
     * @param array $feature forwarded as-is to the parent constructor
     * @param array $settings profile settings. Reads the keys 'fields',
     *  'phrase_rescore_fields', 'default_stem_weight', 'default_query_type'
     *  and 'default_min_should_match', plus the optional 'filter'
     *  (defaults to [ 'type' => 'default' ]) and 'dismax_settings'
     *  (defaults to an empty array).
     */
    public function __construct( SearchConfig $config, array $feature, array $settings ) {
        parent::__construct( $config, $feature );
        $this->fields = $settings['fields'];
        $this->filter = $settings['filter'] ?? [ 'type' => 'default' ];
        $this->phraseFields = $settings['phrase_rescore_fields'];
        $this->defaultStemWeight = $settings['default_stem_weight'];
        $this->defaultQueryType = $settings['default_query_type'];
        $this->defaultMinShouldMatch = $settings['default_min_should_match'];
        $this->dismaxSettings = $settings['dismax_settings'] ?? [];
    }

    /**
     * Build the primary query used for full text search.
     * If query_string syntax is not used the experimental query is built.
     * We fall back to the parent implementation otherwise.
     *
     * @param SearchContext $context
     * @param string[] $fields
     * @param AbstractQuery $nearMatchQuery
     * @param string $queryString
     * @return \Elastica\Query\AbstractQuery
     */
    protected function buildSearchTextQuery(
        SearchContext $context,
        array $fields,
        AbstractQuery $nearMatchQuery,
        $queryString
    ) {
        if ( $context->isSyntaxUsed( 'query_string' ) ) {
            return parent::buildSearchTextQuery( $context, $fields,
                $nearMatchQuery, $queryString );
        }

        $context->addSyntaxUsed( 'full_text_simple_match', 5 );
        // Remembered so that the highlight and phrase rescore builders
        // know which flavour of main query was generated.
        $this->usedExpQuery = true;
        $queryForMostFields = $this->buildExpQuery( $queryString );
        if ( $nearMatchQuery instanceof MatchNone ) {
            // No near match clause to combine with.
            return $queryForMostFields;
        }

        // Build one query for the full text fields and one for the near match fields so that
        // the near match can run unescaped.
        $bool = new \Elastica\Query\BoolQuery();
        $bool->setMinimumShouldMatch( 1 );
        $bool->addShould( $queryForMostFields );
        $bool->addShould( $nearMatchQuery );

        return $bool;
    }

    /**
     * Builds the highlight query
     * @param SearchContext $context
     * @param string[] $fields
     * @param string $queryText
     * @param int $slop
     * @return \Elastica\Query\AbstractQuery
     */
    protected function buildHighlightQuery( SearchContext $context, array $fields, $queryText, $slop ) {
        $query = parent::buildHighlightQuery( $context, $fields, $queryText, $slop );
        if ( $this->usedExpQuery && $query instanceof \Elastica\Query\QueryString ) {
            // the exp query accepts more docs (stopwords in query are not required)
            $query->setDefaultOperator( 'OR' );
        }
        return $query;
    }

    /**
     * Builds the phrase rescore query.
     *
     * When the experimental query was used, a phrase multi_match over the
     * configured phrase rescore fields is built; otherwise the parent
     * implementation is used.
     *
     * @param SearchContext $context
     * @param string[] $fields
     * @param string $queryText
     * @param int $slop
     * @return \Elastica\Query\AbstractQuery
     */
    protected function buildPhraseRescoreQuery( SearchContext $context, array $fields, $queryText, $slop ) {
        if ( !$this->usedExpQuery ) {
            return parent::buildPhraseRescoreQuery( $context, $fields, $queryText, $slop );
        }

        // The $fields argument is ignored here: the profile's own
        // phrase rescore fields are used instead.
        $boostedFields = [];
        foreach ( $this->phraseFields as $name => $boost ) {
            $boostedFields[] = "$name^$boost";
        }

        $phrase = new \Elastica\Query\MultiMatch();
        $phrase->setParam( 'type', 'phrase' );
        $phrase->setParam( 'slop', $slop );
        $phrase->setFields( $boostedFields );
        $phrase->setQuery( $queryText );
        return $this->maybeWrapWithTokenCountRouter( $queryText, $phrase );
    }

    /**
     * @inheritDoc
     */
    protected function getMultiTermRewriteMethod() {
        // Use blended freq as a rewrite method. The
        // top_terms_boost_1024 method used by the parent is not well
        // suited for a weighted sum and for some reasons uses the
        // queryNorms which depends on the number of terms found by the
        // wildcard. Using this one we'll use the similarity configured
        // for this field instead of a constant score and in the case
        // of BM25 queryNorm is ignored (removed in lucene 7)
        return 'top_terms_blended_freqs_1024';
    }

    /**
     * Generate an elasticsearch query by reading profile settings.
     *
     * One multi_match clause is generated per configured field. Fields whose
     * boost is zero are skipped. Clauses flagged with 'in_dismax' are grouped
     * by dismax name and wrapped in a DisMax query; all other clauses are
     * added directly as "should" clauses of the returned bool query. The
     * configured filter is attached as a filter clause of that bool query.
     *
     * @param string $queryString the query text
     * @return \Elastica\Query\AbstractQuery
     */
    private function buildExpQuery( $queryString ) {
        $query = new \Elastica\Query\BoolQuery();
        $query->setMinimumShouldMatch( 0 );
        $this->attachFilter( $this->filter, $queryString, $query );
        $dismaxQueries = [];

        foreach ( $this->fields as $f => $settings ) {
            $queryType = $this->defaultQueryType;
            $minShouldMatch = $this->defaultMinShouldMatch;
            $stemWeight = $this->defaultStemWeight;
            $boost = 1;
            // By default query both the plain and the stemmed variant of the field.
            $fields = [ "$f.plain^1", "$f^$stemWeight" ];
            $inDismax = null;

            if ( is_array( $settings ) ) {
                $boost = $settings['boost'] ?? $boost;
                $queryType = $settings['query_type'] ?? $queryType;
                $minShouldMatch = $settings['min_should_match'] ?? $minShouldMatch;
                if ( !empty( $settings['is_plain'] ) ) {
                    // Plain fields are queried as-is, without a stemmed variant.
                    $fields = [ $f ];
                }
                $inDismax = $settings['in_dismax'] ?? null;
            } else {
                // Shorthand form: the value is the boost itself.
                $boost = $settings;
            }

            // Boosts are documented as floats, so compare numerically: a
            // strict comparison against the integer 0 would let 0.0 through
            // and emit a clause that cannot contribute to the score.
            if ( is_numeric( $boost ) && (float)$boost === 0.0 ) {
                continue;
            }

            // Parameters are set in a fixed order (query, boost,
            // minimum_should_match, type, fields) to keep the serialized
            // query stable.
            $mmatch = new \Elastica\Query\MultiMatch();
            $mmatch->setQuery( $queryString );
            $mmatch->setParam( 'boost', $boost );
            $mmatch->setMinimumShouldMatch( $minShouldMatch );
            $mmatch->setType( $queryType );
            $mmatch->setFields( $fields );

            if ( $inDismax ) {
                $dismaxQueries[$inDismax][] = $mmatch;
            } else {
                $query->addShould( $mmatch );
            }
        }

        foreach ( $dismaxQueries as $name => $queries ) {
            $dismax = new \Elastica\Query\DisMax();
            $dismaxConf = $this->dismaxSettings[$name] ?? [];
            if ( isset( $dismaxConf['tie_breaker'] ) ) {
                $dismax->setTieBreaker( $dismaxConf['tie_breaker'] );
            }
            if ( isset( $dismaxConf['boost'] ) ) {
                $dismax->setBoost( $dismaxConf['boost'] );
            }
            foreach ( $queries as $q ) {
                $dismax->addQuery( $q );
            }
            $query->addShould( $dismax );
        }
        return $query;
    }

    /**
     * Attach the query filter to $boolQuery
     *
     * @param array $filterDef filter definition; its 'type' must be either
     *  'default' or 'constrain_title'
     * @param string $query query text
     * @param \Elastica\Query\BoolQuery $boolQuery the query to attach the filter to
     * @throws \RuntimeException when 'type' is missing or unknown
     */
    private function attachFilter( array $filterDef, $query, \Elastica\Query\BoolQuery $boolQuery ) {
        if ( !isset( $filterDef['type'] ) ) {
            throw new \RuntimeException( "Cannot configure the filter clause, 'type' must be defined." );
        }
        $type = $filterDef['type'];

        switch ( $type ) {
            case 'default':
                $filter = $this->buildSimpleAllFilter( $filterDef, $query );
                break;
            case 'constrain_title':
                $filter = $this->buildTitleFilter( $filterDef, $query );
                break;
            default:
                throw new \RuntimeException( "Cannot build the filter clause: unknown filter type $type" );
        }

        $boolQuery->addFilter( $filter );
    }

    /**
     * Builds a simple filter on all and all.plain when all terms must match
     *
     * A per-field 'minimum_should_match' can be provided under
     * $options['settings'][<field>]; it defaults to '100%', in which case
     * the AND operator is used instead of a minimum_should_match.
     *
     * @param array $options array containing filter options
     * @param string $query
     * @return \Elastica\Query\AbstractQuery
     */
    private function buildSimpleAllFilter( $options, $query ) {
        $filter = new \Elastica\Query\BoolQuery();
        $filter->setMinimumShouldMatch( 1 );
        // FIXME: We can't use solely the stem field here
        // - Depending on languages it may lack stopwords,
        // A dedicated field used for filtering would be nice
        foreach ( [ 'all', 'all.plain' ] as $field ) {
            $minShouldMatch = $options['settings'][$field]['minimum_should_match'] ?? '100%';

            $m = new \Elastica\Query\MatchQuery();
            $m->setFieldQuery( $field, $query );
            if ( $minShouldMatch === '100%' ) {
                $m->setFieldOperator( $field, 'AND' );
            } else {
                $m->setFieldMinimumShouldMatch( $field, $minShouldMatch );
            }
            $filter->addShould( $m );
        }
        return $filter;
    }

    /**
     * Builds a simple filter based on buildSimpleAllFilter + a constraint
     * on title/redirect :
     * (all:query OR all.plain:query) AND (title:query OR redirect:query)
     * where the filter on title/redirect can be controlled by setting
     * minimum_should_match to relax the constraint on title.
     * (defaults to '3<80%')
     *
     * @param array $options array containing filter options
     * @param string $query the user query
     * @return \Elastica\Query\AbstractQuery
     */
    private function buildTitleFilter( $options, $query ) {
        $minShouldMatch = $options['settings']['minimum_should_match'] ?? '3<80%';

        $titleFilter = new \Elastica\Query\BoolQuery();
        $titleFilter->setMinimumShouldMatch( 1 );
        foreach ( [ 'title', 'redirect.title' ] as $field ) {
            $m = new \Elastica\Query\MatchQuery();
            $m->setFieldQuery( $field, $query );
            $m->setFieldMinimumShouldMatch( $field, $minShouldMatch );
            $titleFilter->addShould( $m );
        }

        $filter = new \Elastica\Query\BoolQuery();
        $filter->addMust( $this->buildSimpleAllFilter( $options, $query ) );
        $filter->addMust( $titleFilter );
        return $filter;
    }
}