wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/CompletionSuggester.php

Summary

Maintainability
C
7 hrs
Test Coverage
<?php

namespace CirrusSearch;

use CirrusSearch\Profile\SearchProfileService;
use CirrusSearch\Query\CompSuggestQueryBuilder;
use CirrusSearch\Query\PrefixSearchQueryBuilder;
use CirrusSearch\Search\CompletionResultsCollector;
use CirrusSearch\Search\FancyTitleResultsType;
use CirrusSearch\Search\MSearchRequests;
use CirrusSearch\Search\SearchContext;
use CirrusSearch\Search\SearchRequestBuilder;
use Closure;
use Elastica\Index;
use Elastica\Multi\Search as MultiSearch;
use Elastica\Query;
use Elastica\ResultSet;
use Elastica\Search;
use MediaWiki\MediaWikiServices;
use MediaWiki\Status\Status;
use MediaWiki\User\User;
use SearchSuggestionSet;
use Wikimedia\Assert\Assert;

/**
 * Performs search as you type queries using Completion Suggester.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

/**
 * Completion Suggester Searcher
 *
 * NOTES:
 * The CompletionSuggester is built on top of the ElasticSearch Completion
 * Suggester.
 * (https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html).
 *
 * This class is used at query time, see
 * CirrusSearch\BuildDocument\SuggestBuilder for index time logic.
 *
 * Document model: Cirrus documents are indexed with 2 suggestions:
 *
 * 1. The title suggestion (and close redirects).
 * This helps to avoid displaying redirects with typos (e.g. Albert Enstein,
 * Unietd States) where we make the assumption that if the redirect is close
 * enough it's likely a typo and it's preferable to display the canonical title.
 * This decision is made at index-time in SuggestBuilder::extractTitleAndSimilarRedirects.
 *
 * 2. The redirect suggestions
 * Because the same canonical title can be returned twice we support fetch_limit_factor
 * in suggest profiles to fetch more than what the use asked.
 *
 * Additionally if the namespaces request include non NS_MAIN a prefix search query
 * is sent to the main index. Results are appended to the suggest results. Appending
 * is far from ideal but in the current state scores between the suggest index and prefix
 * search are not comparable.
 * TODO: investigate computing the comp suggest score on main indices to properly merge
 * results.
 */
class CompletionSuggester extends ElasticsearchIntermediary {
    /**
     * @const string multisearch key to identify the comp suggest request
     */
    private const MSEARCH_KEY_SUGGEST = "suggest";

    /**
     * @const string multisearch key to identify the prefix search request
     */
    private const MSEARCH_KEY_PREFIX = "prefix";

    /**
     * Search type (used for logs & timeout configs)
     */
    private const SEARCH_TYPE = 'comp_suggest';

    /**
     * @var int maximum number of result (final)
     */
    private $limit;

    /**
     * @var int offset (final)
     */
    private $offset;

    /**
     * @var string index base name to use (final)
     */
    private $indexBaseName;

    /**
     * @var Index (final)
     */
    private $completionIndex;

    /**
     * Search environment configuration (final)
     * @var SearchConfig
     */
    private $config;

    /**
     * @var SearchContext (final)
     */
    private $searchContext;

    /**
     * @var CompSuggestQueryBuilder (final)
     */
    private $compSuggestBuilder;

    /**
     * @var PrefixSearchQueryBuilder (final)
     */
    private $prefixSearchQueryBuilder;

    /**
     * @var SearchRequestBuilder the builder to build the search for prefix search queries
     */
    private $prefixSearchRequestBuilder;

    /**
     * @param Connection $conn
     * @param int $limit Limit the results to this many
     * @param int $offset
     * @param SearchConfig|null $config Configuration settings
     * @param int[]|null $namespaces Array of namespace numbers to search or null to search all namespaces.
     * @param User|null $user user for which this search is being performed.  Attached to slow request logs.
     * @param string|bool $index Base name for index to search from, defaults to $wgCirrusSearchIndexBaseName
     * @param string|null $profileName force the profile to use otherwise SearchProfileService defaults will be used
     * @param CirrusDebugOptions|null $debugOptions
     */
    public function __construct( Connection $conn, $limit, $offset = 0, SearchConfig $config = null, array $namespaces = null,
        User $user = null, $index = false, $profileName = null,
                                 CirrusDebugOptions $debugOptions = null ) {
        if ( $config === null ) {
            // @todo connection has an embedded config ... reuse that? somehow should
            // at least ensure they are the same.
            $config = MediaWikiServices::getInstance()
                ->getConfigFactory()
                ->makeConfig( 'CirrusSearch' );
        }

        parent::__construct( $conn, $user, $config->get( 'CirrusSearchSlowSearch' ) );
        $this->config = $config;
        $this->limit = $limit;
        $this->offset = $offset;
        $this->indexBaseName = $index ?: $config->get( SearchConfig::INDEX_BASE_NAME );
        $this->completionIndex = $this->connection->getIndex( $this->indexBaseName,
            Connection::TITLE_SUGGEST_INDEX_SUFFIX );
        $this->searchContext = new SearchContext( $this->config, $namespaces, $debugOptions );

        $profileDefinition = $this->config->getProfileService()
            ->loadProfile( SearchProfileService::COMPLETION, SearchProfileService::CONTEXT_DEFAULT, $profileName );
        $this->compSuggestBuilder = new CompSuggestQueryBuilder(
            $this->searchContext,
            $profileDefinition,
            $limit,
            $offset
        );
        $this->prefixSearchQueryBuilder = new PrefixSearchQueryBuilder();
    }

    /**
     * Produce a set of completion suggestions for text using _suggest
     * See https://www.elastic.co/guide/en/elasticsearch/reference/1.6/search-suggesters-completion.html
     *
     * WARNING: experimental API
     *
     * @param string $text Search term
     * @param string[]|null $variants Search term variants
     *  Usually issued via LanguageConverter::autoConvertToAllVariants( $text ) for the content language.
     * @return Status
     */
    public function suggest( $text, $variants = null ) {
        $suggestSearch = $this->getSuggestSearchRequest( $text, $variants );
        $mSearchRequests = new MSearchRequests();

        if ( $suggestSearch !== null ) {
            $mSearchRequests->addRequest( self::MSEARCH_KEY_SUGGEST, $suggestSearch );
        }

        $prefixSearch = $this->getPrefixSearchRequest( $text, $variants );
        if ( $prefixSearch !== null ) {
            $mSearchRequests->addRequest( self::MSEARCH_KEY_PREFIX, $prefixSearch );
        }

        if ( !$mSearchRequests->getRequests() ) {
            return Status::newGood( SearchSuggestionSet::emptySuggestionSet() );
        }
        $description = "{queryType} search for '{query}'";

        if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
            return $mSearchRequests->dumpQuery( $description );
        }

        $multiSearch = new MultiSearch( $this->connection->getClient() );
        $multiSearch->addSearches( $mSearchRequests->getRequests() );

        $this->connection->setTimeout( $this->getClientTimeout( self::SEARCH_TYPE ) );

        $status = Util::doPoolCounterWork( 'CirrusSearch-Completion', $this->user,
                function () use ( $multiSearch, $text, $description ) {
                    $log = $this->newLog( $description, self::SEARCH_TYPE, [
                        'query' => $text,
                        'offset' => $this->offset,
                    ] );

                    $resultsTransformer = $this->getResultsTransformer( $log );

                    return $this->runMSearch( $multiSearch, $log, $this->connection,
                        $resultsTransformer );
                } );

        if ( $status->isOk() && $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
            $resultSets = $status->getValue()->getResultSets();
            $responses = $mSearchRequests->toMSearchResponses( $resultSets );

            return $responses->dumpResults( $description );
        }

        return $status;
    }

    /**
     * @param ResultSet[] $results
     * @param CompletionRequestLog $log
     * @return SearchSuggestionSet
     */
    private function processMSearchResponse( array $results, CompletionRequestLog $log ) {
        $collector = new CompletionResultsCollector(
            $this->limit, $this->offset, $this->config->get( 'CirrusSearchCompletionBannedPageIds' ) );
        $totalHits = $this->collectCompSuggestResults( $collector, $results, $log );
        $totalHits += $this->collectPrefixSearchResults( $collector, $results, $log );
        $log->setTotalHits( $totalHits );
        return $collector->logAndGetSet( $log );
    }

    /**
     * @param CompletionResultsCollector $collector
     * @param ResultSet[] $results
     * @param CompletionRequestLog $log
     * @return int
     */
    private function collectCompSuggestResults( CompletionResultsCollector $collector, array $results, CompletionRequestLog $log ) {
        if ( !isset( $results[self::MSEARCH_KEY_SUGGEST] ) ) {
            return 0;
        }
        $log->addIndex( $this->completionIndex->getName() );
        $suggestResults = $results[self::MSEARCH_KEY_SUGGEST];
        $log->setSuggestTookMs( intval( $suggestResults->getResponse()->getQueryTime() * 1000 ) );
        return $this->compSuggestBuilder->postProcess(
            $collector,
            $suggestResults,
            $this->completionIndex->getName()
        );
    }

    /**
     * @param CompletionResultsCollector $collector
     * @param ResultSet[] $results
     * @param CompletionRequestLog $log
     * @return int
     * @throws \Exception
     */
    private function collectPrefixSearchResults( CompletionResultsCollector $collector, array $results, CompletionRequestLog $log ) {
        if ( !isset( $results[self::MSEARCH_KEY_PREFIX] ) ) {
            return 0;
        }
        $indexName = $this->prefixSearchRequestBuilder->getIndex()->getName();
        $prefixResults = $results[self::MSEARCH_KEY_PREFIX];
        $totalHits = $prefixResults->getTotalHits();
        $log->addIndex( $indexName );
        $log->setPrefixTookMs( intval( $prefixResults->getResponse()->getQueryTime() * 1000 ) );
        // We only append as we can't really compare scores without more complex code/evaluation
        if ( $collector->isFull() ) {
            return $totalHits;
        }
        /** @var FancyTitleResultsType $rType */
        $rType = $this->prefixSearchRequestBuilder->getSearchContext()->getResultsType();
        // the code below highly depends on the array format built by
        // FancyTitleResultsType::transformOneElasticResult assert that this type
        // is properly set so that we fail during unit tests if someone changes it
        // inadvertently.
        Assert::precondition( $rType instanceof FancyTitleResultsType, '$rType must be a FancyTitleResultsType' );
        // scores can go negative, it's not a problem we only use scores for sorting
        // they'll be forgotten in client response
        $score = $collector->getMinScore() !== null ? $collector->getMinScore() - 1 : count( $prefixResults->getResults() );

        $namespaces = $this->prefixSearchRequestBuilder->getSearchContext()->getNamespaces();
        foreach ( $prefixResults->getResults() as $res ) {
            $pageId = $this->config->makePageId( $res->getId() );
            $title = FancyTitleResultsType::chooseBestTitleOrRedirect( $rType->transformOneElasticResult( $res, $namespaces ) );
            if ( $title === false ) {
                continue;
            }
            $suggestion = new \SearchSuggestion( $score--, $title->getPrefixedText(), $title, $pageId );
            if ( !$collector->collect( $suggestion, 'prefix', $indexName ) && $collector->isFull() ) {
                break;
            }
        }
        return $totalHits;
    }

    /**
     * @param string $text Search term
     * @param string[]|null $variants Search term variants
     *  Usually issued via LanguageConverter::autoConvertToAllVariants( $text ) for the content language.
     * @return Search|null
     */
    private function getSuggestSearchRequest( $text, $variants ) {
        if ( !$this->compSuggestBuilder->areResultsPossible() ) {
            return null;
        }

        $suggest = $this->compSuggestBuilder->build( $text, $variants );
        $query = new Query( new Query\MatchNone() );
        $query->setSize( 0 );
        $query->setSuggest( $suggest );
        $query->setSource( [ 'target_title' ] );
        $search = new Search( $this->connection->getClient() );
        $search->addIndex( $this->completionIndex );
        $search->setQuery( $query );
        return $search;
    }

    /**
     * @param string $term Search term
     * @param string[]|null $variants Search term variants
     *  Usually issued via LanguageConverter::autoConvertToAllVariants( $text ) for the content language.
     * @return Search|null
     */
    private function getPrefixSearchRequest( $term, $variants ) {
        $namespaces = $this->searchContext->getNamespaces();
        if ( $namespaces === null ) {
            return null;
        }

        foreach ( $namespaces as $k => $v ) {
            // non-strict comparison, it can be strings
            if ( $v === NS_MAIN ) {
                unset( $namespaces[$k] );
            }
        }

        if ( $namespaces === [] ) {
            return null;
        }
        $limit = CompSuggestQueryBuilder::computeHardLimit( $this->limit, $this->offset, $this->config );
        if ( $this->offset > $limit ) {
            return null;
        }
        $prefixSearchContext = new SearchContext( $this->config, $namespaces );
        $prefixSearchContext->setResultsType( new FancyTitleResultsType( 'prefix' ) );
        $this->prefixSearchQueryBuilder->build( $prefixSearchContext, $term, $variants );
        if ( !$prefixSearchContext->areResultsPossible() ) {
            // $prefixSearchContext might contain warnings, but these are lost.
            return null;
        }
        $this->prefixSearchRequestBuilder = new SearchRequestBuilder( $prefixSearchContext, $this->connection, $this->indexBaseName );
        $this->prefixSearchRequestBuilder->setTimeout( $this->getTimeout( self::SEARCH_TYPE ) );
        return $this->prefixSearchRequestBuilder->setLimit( $limit )
            // collect all results up to $limit, $this->offset is the offset the client wants
            // not the offset in prefix search results.
            ->setOffset( 0 )
            ->build();
    }

    /**
     * @param string $description
     * @param string $queryType
     * @param array $extra
     * @return CompletionRequestLog
     */
    protected function newLog( $description, $queryType, array $extra = [] ) {
        return new CompletionRequestLog(
            $description,
            $queryType,
            $extra,
            $this->searchContext->getNamespaces()
        );
    }

    /**
     * @return Index
     */
    public function getCompletionIndex() {
        return $this->completionIndex;
    }

    /**
     * @param CompletionRequestLog $log
     * @return Closure|null
     */
    private function getResultsTransformer( CompletionRequestLog $log ): ?Closure {
        $resultsTransformer = null;
        if ( !$this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
            $resultsTransformer = function ( \Elastica\Multi\ResultSet $results ) use ( $log ) {
                return $this->processMSearchResponse( $results->getResultSets(), $log );
            };
        }

        return $resultsTransformer;
    }

}