wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Searcher.php

Summary

Maintainability
F
5 days
Test Coverage
<?php

namespace CirrusSearch;

use CirrusSearch\Fallbacks\FallbackRunner;
use CirrusSearch\Fallbacks\SearcherFactory;
use CirrusSearch\Maintenance\NullPrinter;
use CirrusSearch\MetaStore\MetaStoreIndex;
use CirrusSearch\Parser\BasicQueryClassifier;
use CirrusSearch\Parser\FullTextKeywordRegistry;
use CirrusSearch\Parser\NamespacePrefixParser;
use CirrusSearch\Profile\SearchProfileService;
use CirrusSearch\Query\CountContentWordsBuilder;
use CirrusSearch\Query\FullTextQueryBuilder;
use CirrusSearch\Query\KeywordFeature;
use CirrusSearch\Query\NearMatchQueryBuilder;
use CirrusSearch\Query\PrefixSearchQueryBuilder;
use CirrusSearch\Search\BaseCirrusSearchResultSet;
use CirrusSearch\Search\FullTextResultsType;
use CirrusSearch\Search\MSearchRequests;
use CirrusSearch\Search\MSearchResponses;
use CirrusSearch\Search\ResultsType;
use CirrusSearch\Search\SearchContext;
use CirrusSearch\Search\SearchQuery;
use CirrusSearch\Search\SearchRequestBuilder;
use CirrusSearch\Search\TeamDraftInterleaver;
use CirrusSearch\Search\TitleHelper;
use CirrusSearch\Search\TitleResultsType;
use Elastica\Exception\ResponseException;
use Elastica\Exception\RuntimeException;
use Elastica\Multi\Search as MultiSearch;
use Elastica\Query;
use Elastica\Query\BoolQuery;
use Elastica\Query\MultiMatch;
use Elastica\Search;
use MediaWiki\Context\RequestContext;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Request\WebRequest;
use MediaWiki\Status\Status;
use MediaWiki\Title\Title;
use MediaWiki\User\User;
use MediaWiki\WikiMap\WikiMap;
use Wikimedia\Assert\Assert;
use Wikimedia\ObjectFactory\ObjectFactory;
use Wikimedia\Stats\StatsFactory;

/**
 * Performs searches using Elasticsearch.  Note that each instance of this class
 * is single use only.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class Searcher extends ElasticsearchIntermediary implements SearcherFactory {
    public const SUGGESTION_HIGHLIGHT_PRE = '<em>';
    public const SUGGESTION_HIGHLIGHT_POST = '</em>';
    public const HIGHLIGHT_PRE_MARKER = ''; // \uE000. Can't be a unicode literal until php7
    public const HIGHLIGHT_PRE = '<span class="searchmatch">';
    public const HIGHLIGHT_POST_MARKER = ''; // \uE001
    public const HIGHLIGHT_POST = '</span>';

    /**
     * Maximum offset + limit depth allowed. As in the deepest possible result
     * to return. Too deep will cause very slow queries. 10,000 feels plenty
     * deep. This should be <= index.max_result_window in elasticsearch.
     */
    private const MAX_OFFSET_LIMIT = 10000;

    /**
     * Identifies the main search in MSearchRequests/MSearchResponses
     */
    public const MAINSEARCH_MSEARCH_KEY = '__main__';

    /**
     * Identifies the "tested" search request in MSearchRequests/MSearchResponses
     */
    private const INTERLEAVED_MSEARCH_KEY = '__interleaved__';

    /**
     * @var int search offset
     */
    protected $offset;

    /**
     * @var int maximum number of result
     */
    protected $limit;

    /**
     * @var string sort type
     */
    private $sort = 'relevance';

    /**
     * @var string index base name to use
     */
    protected $indexBaseName;

    /**
     * Search environment configuration
     * @var SearchConfig
     */
    protected $config;

    /**
     * @var SearchContext
     */
    protected $searchContext;

    /**
     * Indexing type we'll be using.
     * @var string|\Elastica\Index
     */
    private $index;

    /**
     * @var NamespacePrefixParser|null
     */
    private $namespacePrefixParser;
    /**
     * @var InterwikiResolver
     */
    protected $interwikiResolver;

    /** @var TitleHelper */
    protected $titleHelper;
    /**
     * @var CirrusSearchHookRunner
     */
    protected $cirrusSearchHookRunner;

    /**
     * Build a searcher bound to a connection, a result window and a configuration.
     *
     * Optional collaborators that are not supplied are resolved from MediaWikiServices
     * (interwiki resolver, hook container) or constructed on the spot (title helper).
     * Nullable parameters are declared with an explicit `?Type`, since relying on a
     * `= null` default to make a typed parameter nullable is deprecated in PHP 8.4.
     *
     * @param Connection $conn
     * @param int $offset Offset the results by this much
     * @param int $limit Limit the results to this many
     * @param SearchConfig $config Configuration settings
     * @param int[]|null $namespaces Array of namespace numbers to search or null to search all namespaces.
     * @param User|null $user user for which this search is being performed.  Attached to slow request logs.
     * @param string|bool $index Base name for index to search from, defaults to $wgCirrusSearchIndexBaseName
     * @param CirrusDebugOptions|null $options the debugging options to use or null to use defaults
     * @param NamespacePrefixParser|null $namespacePrefixParser
     * @param InterwikiResolver|null $interwikiResolver
     * @param TitleHelper|null $titleHelper
     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
     * @see CirrusDebugOptions::defaultOptions()
     */
    public function __construct(
        Connection $conn,
        $offset,
        $limit,
        SearchConfig $config,
        ?array $namespaces = null,
        ?User $user = null,
        $index = false,
        ?CirrusDebugOptions $options = null,
        ?NamespacePrefixParser $namespacePrefixParser = null,
        ?InterwikiResolver $interwikiResolver = null,
        ?TitleHelper $titleHelper = null,
        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
    ) {
        parent::__construct(
            $conn,
            $user,
            $config->get( 'CirrusSearchSlowSearch' ),
            $config->get( 'CirrusSearchExtraBackendLatency' )
        );
        $this->config = $config;
        $this->setOffsetLimit( $offset, $limit );
        // Any falsy $index (false, empty string) selects the configured base name.
        $this->indexBaseName = $index ?: $config->get( SearchConfig::INDEX_BASE_NAME );
        // TODO: Make these params mandatory once WBCS stops extending this class
        $this->namespacePrefixParser = $namespacePrefixParser;
        $this->interwikiResolver = $interwikiResolver
            ?? MediaWikiServices::getInstance()->getService( InterwikiResolver::SERVICE );
        $this->titleHelper = $titleHelper
            ?? new TitleHelper( WikiMap::getCurrentWikiId(), $this->interwikiResolver );
        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner
            ?? new CirrusSearchHookRunner( MediaWikiServices::getInstance()->getHookContainer() );
        $this->searchContext = new SearchContext( $this->config, $namespaces, $options, null, null, $this->cirrusSearchHookRunner );
    }

    /**
     * Single public entry point for running a SearchQuery.
     *
     * Replaces this searcher's context, offset/limit, config and sort with the
     * values carried by the query before dispatching.
     *
     * NOTE: only the fulltext entry point is handled for now; any other entry
     * point ends in a \RuntimeException.
     *
     * @param SearchQuery $query
     * @return Status the parsed query AST when the AST dump debug option is set,
     *  otherwise the outcome of the full text search
     */
    public function search( SearchQuery $query ) {
        if ( $query->getDebugOptions()->isCirrusDumpQueryAST() ) {
            $ast = $query->getParsedQuery()->toArray();
            return Status::newGood( [ 'ast' => $ast ] );
        }

        // TODO: properly pass the profile context name and its params once we have a dispatch service.
        $fallbackRunner = FallbackRunner::create( $query, $this->interwikiResolver );
        $this->searchContext = SearchContext::fromSearchQuery( $query, $fallbackRunner, $this->cirrusSearchHookRunner );
        $this->setOffsetLimit( $query->getOffset(), $query->getLimit() );
        $this->config = $query->getSearchConfig();
        $this->sort = $query->getSort();

        if ( $query->getSearchEngineEntryPoint() !== SearchQuery::SEARCH_TEXT ) {
            throw new \RuntimeException( 'Only ' . SearchQuery::SEARCH_TEXT . ' is supported for now' );
        }

        $resultsType = new FullTextResultsType(
            $this->searchContext->getFetchPhaseBuilder(),
            $query->getParsedQuery()->isQueryOfClass( BasicQueryClassifier::COMPLEX_QUERY ),
            $this->titleHelper,
            $query->getExtraFieldsToExtract(),
            $this->searchContext->getConfig()->getElement( 'CirrusSearchDeduplicateInMemory' ) === true
        );
        $this->searchContext->setResultsType( $resultsType );

        $term = $query->getParsedQuery()->getQueryWithoutNsHeader();
        return $this->searchTextInternal( $term );
    }

    /**
     * Choose the results type stored on the current search context.
     *
     * @param ResultsType $resultsType results type to return
     */
    public function setResultsType( $resultsType ) {
        $context = $this->searchContext;
        $context->setResultsType( $resultsType );
    }

    /**
     * Tells whether this searcher is being used to return debugging info,
     * as reported by the debug options of the current search context.
     *
     * @return bool true if the search will return raw output
     */
    public function isReturnRaw() {
        $debugOptions = $this->searchContext->getDebugOptions();
        return $debugOptions->isReturnRaw();
    }

    /**
     * Set the type of sort to perform.  Must be 'relevance', 'title_asc', 'title_desc'.
     *
     * The value is stored as given; this method performs no validation itself.
     *
     * @param string $sort sort type
     */
    public function setSort( $sort ) {
        $this->sort = $sort;
    }

    /**
     * Record on the search context whether results must be restricted to the
     * local wiki.  If this is never called the default is false.
     *
     * @param bool $limitSearchToLocalWiki should the results be limited?
     */
    public function limitSearchToLocalWiki( $limitSearchToLocalWiki ) {
        $context = $this->searchContext;
        $context->setLimitSearchToLocalWiki( $limitSearchToLocalWiki );
    }

    /**
     * Run a "near match" title search, which is pretty much a prefix match
     * without the prefixes.
     *
     * The query is built onto the current search context and then executed
     * as a single search.
     *
     * @param string $term text by which to search
     * @return Status status containing results defined by resultsType on success
     */
    public function nearMatchTitleSearch( $term ) {
        $queryBuilder = new NearMatchQueryBuilder();
        $queryBuilder->build( $this->searchContext, $term );

        return $this->searchOne();
    }

    /**
     * Sum the number of words in the content index.
     *
     * Overrides the limit of this searcher to 1 before running the search.
     *
     * @return Status status containing a single integer
     */
    public function countContentWords() {
        $queryBuilder = new CountContentWordsBuilder();
        $queryBuilder->build( $this->searchContext );

        $this->limit = 1;

        return $this->searchOne();
    }

    /**
     * Run a prefix search.
     *
     * The query is built onto the current search context and then executed
     * as a single search.
     *
     * @param string $term text by which to search
     * @param string[] $variants variants to search for
     * @return Status status containing results defined by resultsType on success
     */
    public function prefixSearch( $term, $variants = [] ) {
        $queryBuilder = new PrefixSearchQueryBuilder();
        $queryBuilder->build( $this->searchContext, $term, $variants );

        return $this->searchOne();
    }

    /**
     * Prepare a full text search for the given term. Everything that gets
     * built is applied to $this->searchContext; the query builder is handed
     * back so that the caller can build a degraded query later if needed.
     *
     * @param string $term term to search
     * @return FullTextQueryBuilder
     */
    protected function buildFullTextSearch( $term ) {
        // The unicode 'ideographic whitespace' character is turned into a plain
        // space first: Cirrussearch treats both as normal whitespace, but trim()
        // does not strip the ideographic form.
        $term = str_replace( "\xE3\x80\x80", " ", $term );
        $term = trim( $term );
        if ( $term === '' ) {
            // No searching for nothing! That takes forever!
            $this->searchContext->setResultsPossible( false );
        }

        $profileService = $this->config->getProfileService();
        $builderSettings = $profileService->loadProfileByName(
            SearchProfileService::FT_QUERY_BUILDER,
            $this->searchContext->getFulltextQueryBuilderProfile()
        );
        $keywordRegistry = new FullTextKeywordRegistry( $this->config );
        $qb = self::buildFullTextBuilder( $builderSettings, $this->config, $keywordRegistry->getKeywords() );

        $qb->build( $this->searchContext, $term );

        $searchQuery = $this->searchContext->getSearchQuery();
        if ( $searchQuery === null ) {
            return $qb;
        }

        $degradeOnParseWarnings = [
            // && test, test AND && test
            'cirrussearch-parse-error-unexpected-token',
            // test AND
            'cirrussearch-parse-error-unexpected-end'
        ];
        // Quick hack to avoid sending bad queries to the backend: the first
        // matching warning switches to the degraded query.
        foreach ( $searchQuery->getParsedQuery()->getParseWarnings() as $parseWarning ) {
            if ( in_array( $parseWarning->getMessage(), $degradeOnParseWarnings ) ) {
                $qb->buildDegraded( $this->searchContext );
                break;
            }
        }

        return $qb;
    }

    /**
     * Run a full text search for $term, optionally interleaved with a second
     * "tested" search, then apply the fallback methods to the outcome.
     *
     * Flow: build the main (and, if any, interleaved) request, short-circuit when
     * no results are possible or when a query dump was requested, send everything
     * as one multi-search, retry once with a degraded query on a parse error, and
     * finally run the fallback runner when a namespace prefix parser is available.
     *
     * @param string $term
     * @return Status on success holds the result set (after fallbacks when a
     *  namespace prefix parser is available); holds debugging output when a
     *  query/result dump was requested; otherwise the failure status
     */
    private function searchTextInternal( $term ) {
        // Searcher needs to be cloned before any actual query building is done.
        $interleaveSearcher = $this->buildInterleaveSearcher();

        $qb = $this->buildFullTextSearch( $term );
        $mainSearch = $this->buildSearch();
        $searches = MSearchRequests::build( self::MAINSEARCH_MSEARCH_KEY, $mainSearch );
        $description = "{$this->searchContext->getSearchType()} search for '{$this->searchContext->getOriginalSearchTerm()}'";

        if ( !$this->searchContext->areResultsPossible() ) {
            if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
                // return the empty array to suggest that no query will be run
                return Status::newGood( [] );
            }
            $status = $this->emptyResultSet();
            if ( $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
                return Status::newGood(
                    ( new MSearchResponses( [ $status->getValue() ], [] ) )->dumpResults( $description )
                );
            }
            return $status;
        }

        if ( $interleaveSearcher !== null ) {
            $interleaveSearcher->buildFullTextSearch( $term );
            $interleaveSearch = $interleaveSearcher->buildSearch();
            if ( $this->areSearchesTheSame( $mainSearch, $interleaveSearch ) ) {
                // Both searchers produced the same request: nothing to interleave.
                $interleaveSearcher = null;
            } else {
                $searches->addRequest( self::INTERLEAVED_MSEARCH_KEY, $interleaveSearch );
            }
        }

        $fallbackRunner = $this->searchContext->getFallbackRunner();
        $fallbackRunner->attachSearchRequests( $searches, $this->connection->getClient() );

        if ( $this->searchContext->getDebugOptions()->isCirrusDumpQuery() ) {
            return $searches->dumpQuery( $description );
        }

        $responses = $this->searchMulti( $searches );
        if ( $responses->hasFailure() ) {
            $status = $responses->getFailure();
            if ( ElasticaErrorHandler::isParseError( $status ) ) {
                // Rebuild the search context because we need a fresh fetchPhaseBuilder
                $this->searchContext = $this->searchContext->withConfig( $this->config );
                if ( $qb->buildDegraded( $this->searchContext ) ) {
                    // If that doesn't work we're out of luck but it should.
                    // There no guarantee it'll work properly with the syntax
                    // we've built above but it'll do _something_ and we'll
                    // still work on fixing all the parse errors that come in.
                    $status = $this->searchOne();
                }
            }
            return $status;
        }

        if ( $this->searchContext->getDebugOptions()->isCirrusDumpResult() ) {
            return $responses->dumpResults( $description );
        }

        $rType = $this->getSearchContext()->getResultsType();
        $mainSet = $responses->transformAsResultSet( $rType, self::MAINSEARCH_MSEARCH_KEY );
        if ( $interleaveSearcher !== null ) {
            $interleaver = new TeamDraftInterleaver( $this->searchContext->getOriginalSearchTerm() );
            $testedSet = $responses->transformAsResultSet( $rType, self::INTERLEAVED_MSEARCH_KEY );
            $response = $interleaver->interleave( $mainSet, $testedSet, $this->limit );
        } else {
            $response = $mainSet;
        }

        // The fallback runner is only invoked when a namespace prefix parser is
        // available. Without one, hand back the result set as-is instead of a
        // value-less status that would silently drop the results.
        $status = Status::newGood( $response );
        if ( $this->namespacePrefixParser !== null ) {
            $status = Status::newGood( $fallbackRunner->run( $this, $response, $responses,
                $this->namespacePrefixParser, $this->cirrusSearchHookRunner ) );
            $this->appendMetrics( $fallbackRunner );
        }

        // Surface every warning accumulated on the search context.
        foreach ( $this->searchContext->getWarnings() as $warning ) {
            $status->warning( ...$warning );
        }
        return $status;
    }

    /**
     * Fetch the pages matching the given document ids.  The returned status holds
     * _all_ pages found: more than one page can come back for a single id when the
     * page is present in multiple indexes.
     *
     * @param string[] $docIds array of document ids
     * @param string[]|bool $sourceFiltering source filtering to apply
     * @param bool $usePoolCounter false to disable the pool counter
     * @return Status containing pages found, containing an empty array if not found,
     *    or an error if there was an error
     */
    public function get( array $docIds, $sourceFiltering, $usePoolCounter = true ) {
        $connection = $this->getOverriddenConnection();
        $namespaces = $this->searchContext->getNamespaces();
        $indexSuffix = $connection->pickIndexSuffixForNamespaces( $namespaces );

        // Worst case: every id is duplicated in every available index, so the
        // size is (number of index suffixes) x (number of ids).
        $size = count( $connection->getAllIndexSuffixesForNamespaces( $namespaces ) ) * count( $docIds );

        $work = function () use ( $docIds, $sourceFiltering, $indexSuffix, $size, $connection ) {
            try {
                $this->startNewLog( 'get of {indexSuffix}.{docIds}', 'get', [
                    'indexSuffix' => $indexSuffix,
                    'docIds' => $docIds,
                ] );
                // Shard timeout not supported on get requests so we just use the client side timeout
                $connection->setTimeout( $this->getClientTimeout( 'get' ) );

                // A search query is used rather than _get/_mget: those are
                // theoretically well suited for this job but are not supported
                // on aliases with multiple indices (content/general).
                $index = $connection->getIndex( $this->indexBaseName, $indexSuffix );
                $query = new \Elastica\Query( new \Elastica\Query\Ids( $docIds ) );
                if ( is_array( $sourceFiltering ) ) {
                    // The title is a required field in the ApiTrait
                    if ( !in_array( "title", $sourceFiltering ) ) {
                        $sourceFiltering[] = "title";
                    }
                    $query->setParam( '_source', $sourceFiltering );
                }
                $query->addParam( 'stats', 'get' );
                // Limits provided to the searcher are ignored here, otherwise
                // fewer results than the requested ids could be returned.
                $query->setFrom( 0 );
                $query->setSize( $size );

                $resultSet = $index->search( $query, [ 'search_type' => 'query_then_fetch' ] );
                $response = $resultSet->getResponse();
                if ( $response->isOK() ) {
                    return $this->success( $resultSet->getResults(), $connection );
                }

                $request = $connection->getClient()->getLastRequest();
                if ( $request == null ) {
                    // I can't imagine how this would happen, but the type signature allows
                    // for a null last request so we provide a minimal workaround.
                    throw new \Elastica\Exception\RuntimeException(
                        "Response reports failure, but no last request available" );
                }
                throw new ResponseException( $request, $response );
            } catch ( \Elastica\Exception\NotFoundException $e ) {
                // NotFoundException just means the field didn't exist.
                // It is up to the caller to decide if that is an error.
                return $this->success( [], $connection );
            } catch ( \Elastica\Exception\ExceptionInterface $e ) {
                return $this->failure( $e, $connection );
            }
        };

        return $usePoolCounter
            ? Util::doPoolCounterWork( $this->getPoolCounterType(), $this->user, $work )
            : $work();
    }

    /**
     * Look up a namespace by name in the metastore namespace store, under the
     * 'CirrusSearch-NamespaceLookup' pool counter.
     *
     * @param string $name namespace name to look for
     * @return Status holding the results found by the namespace store on success,
     *  or the failure reported for the Elastica exception that was caught
     */
    private function findNamespace( $name ) {
        return Util::doPoolCounterWork(
            'CirrusSearch-NamespaceLookup',
            $this->user,
            function () use ( $name ) {
                // Resolved before the try block so that the catch handler can
                // always pass a defined connection to failure(), whichever call
                // inside the try block throws.
                $connection = $this->getOverriddenConnection();
                try {
                    $this->startNewLog( 'lookup namespace for {namespaceName}', 'namespace', [
                        'namespaceName' => $name,
                        'query' => $name,
                    ] );
                    $connection->setTimeout( $this->getClientTimeout( 'namespace' ) );

                    // A bit awkward, but accepted as this is the backup
                    // implementation of namespace lookup. Deployments should
                    // prefer to install php-intl and use utr30.
                    $store = ( new MetaStoreIndex( $connection, new NullPrinter(), $this->config ) )
                        ->namespaceStore();
                    $resultSet = $store->find( $name, [
                        'timeout' => $this->getTimeout( 'namespace' ),
                    ] );
                    return $this->success( $resultSet->getResults(), $connection );
                } catch ( \Elastica\Exception\ExceptionInterface $e ) {
                    return $this->failure( $e, $connection );
                }
            } );
    }

    /**
     * Assemble the Elastica search request from the current search context and
     * this searcher's limit, offset, index, sort and timeout.
     *
     * @return \Elastica\Search
     */
    protected function buildSearch() {
        $requestBuilder = new SearchRequestBuilder(
            $this->searchContext,
            $this->getOverriddenConnection(),
            $this->indexBaseName
        );

        $search = $requestBuilder
            ->setLimit( $this->limit )
            ->setOffset( $this->offset )
            ->setIndex( $this->index )
            ->setSort( $this->sort )
            ->setTimeout( $this->getTimeout( $this->searchContext->getSearchType() ) )
            ->build();

        return $search;
    }

    /**
     * Run a single-query search built from the current search context.
     *
     * Checks happen in this order: query dump requested, results not possible,
     * backend failure, raw output requested, and finally the regular
     * transformation through the results type of the context.
     *
     * @return Status
     */
    protected function searchOne() {
        $search = $this->buildSearch();

        $searchType = $this->searchContext->getSearchType();
        $originalTerm = $this->searchContext->getOriginalSearchTerm();
        $description = "{$searchType} search for '{$originalTerm}'";

        $msearch = MSearchRequests::build( self::MAINSEARCH_MSEARCH_KEY, $search );

        $debugOptions = $this->searchContext->getDebugOptions();
        if ( $debugOptions->isCirrusDumpQuery() ) {
            return $msearch->dumpQuery( $description );
        }

        if ( !$this->searchContext->areResultsPossible() ) {
            return $this->emptyResultSet();
        }

        $mresults = $this->searchMulti( $msearch );
        if ( $mresults->hasFailure() ) {
            return $mresults->getFailure();
        }

        return $this->searchContext->getDebugOptions()->isReturnRaw()
            ? $mresults->dumpResults( $description )
            : $mresults->transformAndGetSingle( $this->searchContext->getResultsType(), self::MAINSEARCH_MSEARCH_KEY );
    }

    /**
     * Powers full-text-like searches including prefix search.
     *
     * Sends every request held by $msearches as one Elastica multi-search, run
     * under the pool counter. When the search context has a positive cache TTL
     * and the debug options allow caching, the call is wrapped with a lookup in
     * (and a write to) the main WAN object cache. A timeout reported by the
     * responses is logged and added as a warning on the search context.
     *
     * Must not be called when the query dump debug option is set (asserted).
     *
     * @param MSearchRequests $msearches
     * @return MSearchResponses search responses; a failure built from $msearches when
     *  the limit is not positive, when the backend status is not OK, or when the number
     *  of responses does not match the number of requests
     */
    protected function searchMulti( MSearchRequests $msearches ) {
        $searches = $msearches->getRequests();
        $contextResultsType = $this->searchContext->getResultsType();
        $cirrusDebugOptions = $this->searchContext->getDebugOptions();
        Assert::precondition( !$cirrusDebugOptions->isCirrusDumpQuery(), 'Must not reach this method when dumping the query' );

        // A non-positive limit is reported as "offset too large".
        // NOTE(review): presumably setOffsetLimit() (not visible here) reduces the limit
        // when offset + limit exceeds MAX_OFFSET_LIMIT — confirm.
        // TODO: should this be moved upper in the stack?
        if ( $this->limit <= 0 ) {
            return $msearches->failure( Status::newFatal( 'cirrussearch-offset-too-large',
                self::MAX_OFFSET_LIMIT, $this->offset ) );
        }

        $connection = $this->getOverriddenConnection();
        $log = new MultiSearchRequestLog(
            $connection->getClient(),
            "{queryType} search for '{query}'",
            $this->searchContext->getSearchType(),
            [
                'query' => $this->searchContext->getOriginalSearchTerm(),
                'limit' => $this->limit ?: null,
                // Used syntax
                'syntax' => $this->searchContext->getSyntaxUsed(),
            ],
            $this->searchContext->getNamespaces() ?: []
        );

        // Similar to indexing support only the bulk code path, rather than
        // single and bulk. The extra overhead should be minimal, and the
        // reduced complexity is welcomed.
        $search = new MultiSearch( $connection->getClient() );
        $search->addSearches( $searches );

        $connection->setTimeout( $this->getClientTimeout( $this->searchContext->getSearchType() ) );

        if ( $this->config->get( 'CirrusSearchMoreAccurateScoringMode' ) ) {
            $search->setSearchType( \Elastica\Search::OPTION_SEARCH_TYPE_DFS_QUERY_THEN_FETCH );
        }

        // Perform the search
        $work = function () use ( $search, $log, $connection ) {
            return Util::doPoolCounterWork(
                $this->getPoolCounterType(),
                $this->user,
                function () use ( $search, $log, $connection ) {
                    // @todo only reports the first error, also turns
                    // a partial (single search) error into a complete
                    // failure across the board. Should be addressed
                    // at some point.
                    return $this->runMSearch( $search, $log, $connection );
                },
                // Regex searches get a dedicated "too busy" error message key.
                $this->searchContext->isSyntaxUsed( 'regex' ) ?
                    'cirrussearch-regex-too-busy-error' : null
            );
        };

        // Wrap with caching if needed, but don't cache debugging queries
        $skipCache = $cirrusDebugOptions->mustNeverBeCached();
        if ( $this->searchContext->getCacheTtl() > 0 && !$skipCache ) {
            // The previous $work closure is captured and only invoked on a cache
            // miss or when the cached entry turns out to be unusable.
            $work = function () use ( $work, $searches, $log, $contextResultsType ) {
                $services = MediaWikiServices::getInstance();
                $requestStats = Util::getStatsFactory();
                $cache = $services->getMainWANObjectCache();
                // The cache key is derived from each request's path, options and
                // query body, plus the class of the results type.
                // NOTE(review): the loop variable $key is unused and is overwritten
                // by the cache key right after the loop.
                $keyParts = [];
                foreach ( $searches as $key => $search ) {
                    $keyParts[] = $search->getPath() .
                        serialize( $search->getOptions() ) .
                        serialize( $search->getQuery()->toArray() ) .
                        ( $contextResultsType !== null ? get_class( $contextResultsType ) : "NONE" );
                }
                $key = $cache->makeKey( 'cirrussearch', 'search', 'v2', md5(
                    implode( '|', $keyParts )
                ) );
                $cacheResult = $cache->get( $key );
                if ( $cacheResult ) {
                    // Cached entries are a pair: [ log variables, multi-search status ].
                    [ $logVariables, $multiResultSet ] = $cacheResult;
                    $this->recordQueryCacheMetrics( $requestStats, "hit" );
                    $log->setCachedResult( $logVariables );
                    $this->successViaCache( $log );

                    if ( $multiResultSet->isOK() ) {
                        /**
                         * @var $cachedMResultSet \Elastica\Multi\ResultSet
                         */
                        $cachedMResultSet = $multiResultSet->getValue();
                        if ( count( $cachedMResultSet->getResultSets() ) !== count( $searches ) ) {
                            // Unusable cached entry: log it and fall through to
                            // run the query against the backend.
                            LoggerFactory::getInstance( 'CirrusSearch' )
                                ->warning( 'Ignoring a cached Multi/ResultSet wanted {nb_queries} response(s) but received {nb_responses}',
                                    [
                                        'nb_queries' => count( $searches ),
                                        'nb_responses' => count( $cachedMResultSet->getResultSets() )
                                    ] );
                            $this->recordQueryCacheMetrics( $requestStats, "incoherent" );
                        } else {
                            return $multiResultSet;
                        }
                    } else {
                        // A non-OK status should never have been stored; fall
                        // through to run the query against the backend.
                        LoggerFactory::getInstance( 'CirrusSearch' )
                            ->warning( 'Cached a Status value that is not OK' );
                        $this->recordQueryCacheMetrics( $requestStats, "nok" );
                    }
                } else {
                    $this->recordQueryCacheMetrics( $requestStats, "miss" );
                }

                $multiResultSet = $work();

                if ( $multiResultSet->isOK() ) {
                    // Only complete results are cached: any result set flagged
                    // as timed_out marks the whole response as partial.
                    $isPartialResult = false;
                    foreach ( $multiResultSet->getValue()->getResultSets() as $resultSet ) {
                        $responseData = $resultSet->getResponse()->getData();
                        if ( isset( $responseData['timed_out'] ) && $responseData['timed_out'] ) {
                            $isPartialResult = true;
                            break;
                        }
                    }
                    if ( !$isPartialResult ) {
                        $this->recordQueryCacheMetrics( $requestStats, "set" );
                        $cache->set(
                            $key,
                            [ $log->getLogVariables(), $multiResultSet ],
                            $this->searchContext->getCacheTtl()
                        );
                    }
                }

                return $multiResultSet;
            };
        }

        $status = $work();

        // @todo Does this need anything special for multi-search changes?
        if ( !$status->isOK() ) {
            return $msearches->failure( $status );
        }

        $response = $status->getValue();
        /**
         * @var $response \Elastica\Multi\ResultSet
         */
        if ( count( $response->getResultSets() ) !== count( $msearches->getRequests() ) ) {
            // Temp hack to investigate T231023 (use php serialize just in case it has some invalid
            // UTF8 sequences that would prevent this message from being sent to logstash
            LoggerFactory::getInstance( 'CirrusSearch' )
                ->warning( "Incoherent response received (#searches != #responses) for {query}: {response}",
                    [ 'query' => $this->searchContext->getOriginalSearchTerm(), 'response' => serialize( $response->getResponse() ) ] );
            return $msearches->failure( Status::newFatal( 'cirrussearch-backend-error' ) );
        }
        $mreponses = $msearches->toMSearchResponses( $response->getResultSets() );
        if ( $mreponses->hasTimeout() ) {
            // Partial results are still returned; the timeout is logged and
            // surfaced as a warning on the search context.
            LoggerFactory::getInstance( 'CirrusSearch' )->warning(
                $log->getDescription() . " timed out and only returned partial results!",
                $log->getLogVariables()
            );
            $this->searchContext->addWarning( $this->searchContext->isSyntaxUsed( 'regex' )
                ? 'cirrussearch-regex-timed-out'
                : 'cirrussearch-timed-out'
            );
        }
        return $mreponses;
    }

    /**
     * Attempt to suck a leading namespace followed by a colon from the query string.
     * Reaches out to Elasticsearch to perform normalized lookup against the namespaces.
     * Should be fast but for the network hop.
     *
     * @param string &$query
     */
    public function updateNamespacesFromQuery( &$query ) {
        $separatorPos = strpos( $query, ':' );
        if ( $separatorPos === false ) {
            // Without a colon there is no candidate namespace prefix.
            return;
        }

        // Everything before the first colon is the candidate namespace name.
        $lookup = $this->findNamespace( substr( $query, 0, $separatorPos ) );

        // Failures are already logged, so only the success path does any work.
        if ( $lookup->isOK() ) {
            $matches = $lookup->getValue();
            if ( $matches ) {
                // Only the first match is used. Strip the prefix (and its
                // colon) from the caller's query and restrict the search
                // context to that single namespace.
                $bestMatch = $matches[ 0 ];
                $query = substr( $query, $separatorPos + 1 );
                $this->searchContext->setNamespaces( [ $bestMatch->namespace_id ] );
            }
        }
    }

    /**
     * @return SearchContext
     */
    public function getSearchContext() {
        // Hands back the context object held by this searcher (not a copy).
        return $this->searchContext;
    }

    /**
     * Determine the pool counter type name to use for this request, based on
     * the syntax used by the query and on whether the request looks automated.
     *
     * @return string Pool counter type name
     */
    private function getPoolCounterType(): string {
        // Pool counter overrides based on query syntax. Goal is to separate
        // expensive or high-volume traffic into dedicated pools with specific
        // limits. Prefix is only high volume when completion is disabled.
        // Entries are checked in declaration order and the first syntax in
        // use decides the pool.
        $syntaxPools = [
            'deepcat' => 'CirrusSearch-ExpensiveFullText',
            'regex' => 'CirrusSearch-ExpensiveFullText',
            'prefix' => 'CirrusSearch-Prefix',
            'more_like' => 'CirrusSearch-MoreLike',
        ];

        // Default pool counter for all search requests. Note that not all
        // possible requests go through Searcher, so this isn't globally
        // definitive.
        $selected = 'CirrusSearch-Search';
        foreach ( $syntaxPools as $syntax => $poolName ) {
            if ( $this->searchContext->isSyntaxUsed( $syntax ) ) {
                $selected = $poolName;
                break;
            }
        }

        // Put external automated requests into their own bucket. The main
        // idea here is to allow automated access, but prevent that automation
        // from capping out the pools used by interactive queries.
        // It's not clear when the automation bucket should not override other
        // bucketing decisions; for now override everything except the
        // expensive full text pool, since those queries can be very expensive
        // and usually use a small pool. If both the automation and regex
        // pools filled with regexes it would be significantly more load than
        // expected.
        $isExpensive = $selected === 'CirrusSearch-ExpensiveFullText';
        if ( !$isExpensive && $this->isAutomatedRequest() ) {
            return 'CirrusSearch-Automated';
        }

        return $selected;
    }

    /**
     * Check whether the main request looks like it comes from automation,
     * judged by its IP address and request headers.
     *
     * @return bool False when no IP address can be determined
     */
    private function isAutomatedRequest(): bool {
        $request = RequestContext::getMain()->getRequest();

        try {
            $clientIp = $request->getIP();
        } catch ( \MWException $e ) {
            // No IP, typically this means a CLI invocation. We are attempting
            // to segregate external automation; internal automation has its
            // own ability to control configuration and shouldn't be flagged.
            // Outside of the CLI it is unclear when this can happen, so leave
            // a trace in the logs for that case.
            if ( MW_ENTRY_POINT !== 'cli' ) {
                LoggerFactory::getInstance( 'CirrusSearch' )->info(
                    'No IP available during automated request check' );
            }
            return false;
        }

        $headers = $request->getAllHeaders();
        return Util::looksLikeAutomation( $this->config, $clientIp, $headers );
    }

    /**
     * Some queries, like more like this, are quite expensive and can cause
     * latency spikes. This allows redirecting queries using particular
     * features to specific clusters.
     * @return Connection
     */
    private function getOverriddenConnection() {
        // Unless an override applies, the searcher's own connection is used.
        $selected = $this->connection;

        $clusterByFeature = $this->config->get( 'CirrusSearchClusterOverrides' );
        foreach ( $clusterByFeature as $syntax => $clusterName ) {
            if ( !$this->searchContext->isSyntaxUsed( $syntax ) ) {
                continue;
            }
            // The first configured feature used by the query decides the cluster.
            $selected = Connection::getPool( $this->config, $clusterName );
            break;
        }

        return $selected;
    }

    /**
     * Increment the query cache counter for the given cache status.
     *
     * @param StatsFactory $requestStats Factory the counter is obtained from
     * @param string $cacheStatus Value of the "status" label
     * @param string|null $type Value of the "type" label; when empty the
     *  search type of the current search context is used
     */
    protected function recordQueryCacheMetrics( StatsFactory $requestStats, string $cacheStatus, ?string $type = null ): void {
        if ( !$type ) {
            $type = $this->getSearchContext()->getSearchType();
        }

        // Each call's return value is carried forward (rather than reusing
        // the first object) so this behaves exactly like a fluent chain.
        $metric = $requestStats->getCounter( "query_cache_total" );
        $metric = $metric->setLabel( "type", $type );
        $metric = $metric->setLabel( "status", $cacheStatus );
        $metric = $metric->copyToStatsdAt( "CirrusSearch.query_cache.$type.$cacheStatus" );
        $metric->increment();
    }

    /**
     * @param string $description
     * @param string $queryType
     * @param string[] $extra
     * @return SearchRequestLog
     */
    protected function newLog( $description, $queryType, array $extra = [] ) {
        // The log is bound to the client of the connection that will actually
        // be used, i.e. after any per-feature cluster override.
        $client = $this->getOverriddenConnection()->getClient();

        return new SearchRequestLog( $client, $description, $queryType, $extra );
    }

    /**
     * If we're supposed to create raw result, create and return it,
     * or output it and finish.
     * @param mixed $result Search result data
     * @param WebRequest $request Request context
     * @return string The new raw result.
     */
    public function processRawReturn( $result, WebRequest $request ) {
        // Delegates to Util, passing along this search's debug options.
        $debugOptions = $this->searchContext->getDebugOptions();

        return Util::processSearchRawReturn( $result, $request, $debugOptions );
    }

    /**
     * Search titles in archive
     * @param string $term
     * @return Status<Title[]>
     */
    public function searchArchive( $term ) {
        // Record the term as given, then work with the fixed-up version.
        $this->searchContext->setOriginalSearchTerm( $term );
        $fixedTerm = $this->searchContext->escaper()->fixupWholeQueryString( $term );
        $this->setResultsType( new TitleResultsType() );

        // This does not support cross-cluster search, but there is also no
        // use case for cross-wiki archive search.
        $archiveConnection = $this->getOverriddenConnection();
        $this->index = $archiveConnection->getArchiveIndex( $this->indexBaseName );

        // First clause: best_fields match over the title sub-fields, each
        // with its own boost, combined with the AND operator.
        $boostedTitleFields = [
            'title.near_match^100',
            'title.near_match_asciifolding^75',
            'title.plain^50',
            'title^25'
        ];
        $titleMatch = new MultiMatch();
        $titleMatch->setType( 'best_fields' );
        $titleMatch->setTieBreaker( 0 );
        $titleMatch->setQuery( $fixedTerm );
        $titleMatch->setFields( $boostedTitleFields );
        $titleMatch->setOperator( 'AND' );

        // Second clause: fuzzy match against the plain title field, also
        // combined with the AND operator.
        $plainField = 'title.plain';
        $fuzzyMatch = new \Elastica\Query\MatchQuery();
        $fuzzyMatch->setFieldQuery( $plainField, $fixedTerm );
        $fuzzyMatch->setFieldFuzziness( $plainField, 'AUTO' );
        $fuzzyMatch->setFieldOperator( $plainField, 'AND' );

        // At least one of the two clauses has to match.
        $archiveQuery = new BoolQuery();
        $archiveQuery->addShould( $titleMatch );
        $archiveQuery->addShould( $fuzzyMatch );
        $archiveQuery->setMinimumShouldMatch( 1 );

        $this->sort = 'just_match';

        $this->searchContext->setMainQuery( $archiveQuery );
        $this->searchContext->addSyntaxUsed( 'archive' );
        $this->searchContext->setRescoreProfile( 'empty' );

        return $this->searchOne();
    }

    /**
     * Tests if two search objects are equivalent
     *
     * @param Search $a
     * @param Search $b
     * @return bool
     */
    private function areSearchesTheSame( Search $a, Search $b ) {
        // The very same object is trivially equivalent.
        if ( $a === $b ) {
            return true;
        }

        // Path and options are not part of toArray(), so compare them first.
        if ( $a->getPath() !== $b->getPath() ) {
            return false;
        }
        if ( $a->getOptions() != $b->getOptions() ) {
            return false;
        }

        // 'now' entries contain a timestamp that may vary between the two
        // queries; overwrite integer ones with a fixed placeholder so they
        // never cause a mismatch.
        $pinNow = static function ( &$value, $key ) {
            if ( $key === 'now' && is_int( $value ) ) {
                $value = 12345678;
            }
        };

        $normalized = [];
        foreach ( [ $a, $b ] as $search ) {
            $queryArray = $search->getQuery()->toArray();
            array_walk_recursive( $queryArray, $pinNow );
            $normalized[] = $queryArray;
        }

        // Simplest form: requires both arrays to have the exact same
        // ordering, types, keys, etc. We could try much harder to remove edge
        // cases, but they probably don't matter too much. The main thing we
        // are looking for is whether the configuration used for interleaved
        // search had no effect on query building. If we get it wrong in some
        // rare cases it should have minimal effects on the interleaved
        // search test.
        return $normalized[0] === $normalized[1];
    }

    /**
     * Build a second searcher, configured with the interleave overrides, to
     * run alongside this one.
     *
     * @return self|null Clone of this searcher using the overridden config,
     *  or null when interleaving does not apply
     */
    private function buildInterleaveSearcher() {
        // Interleaving only happens on the first page of results...
        if ( $this->offset > 0 ) {
            return null;
        }
        // ...and not when the user has specified some custom magic query
        // options (override rescore profile, etc).
        if ( $this->searchContext->isDirty() ) {
            return null;
        }

        // Is interleaving configured?
        $interleaveOverrides = $this->config->get( 'CirrusSearchInterleaveConfig' );
        if ( $interleaveOverrides === null ) {
            return null;
        }

        $interleaveConfig = new HashSearchConfig(
            $interleaveOverrides,
            [ HashSearchConfig::FLAG_INHERIT ]
        );

        // Work on a clone so this searcher keeps its own config and context.
        $interleaved = clone $this;
        $interleaved->config = $interleaveConfig;
        $interleaved->searchContext = $interleaved->searchContext->withConfig( $interleaveConfig );

        return $interleaved;
    }

    /**
     * @return Status
     */
    private function emptyResultSet() {
        $fromResultsType = $this->searchContext->getResultsType()->createEmptyResult();

        // TODO: Keywords are very specific to full-text search, while
        // ResultsType and this method are much more general.
        // While awkward, this maintains BC until we decide what to do.
        $results = $fromResultsType instanceof BaseCirrusSearchResultSet
            ? BaseCirrusSearchResultSet::emptyResultSet( $this->searchContext->isSpecialKeywordUsed() )
            : $fromResultsType;

        // Carry the warnings accumulated on the context over to the status.
        $status = Status::newGood( $results );
        foreach ( $this->searchContext->getWarnings() as $warningArgs ) {
            $status->warning( ...$warningArgs );
        }

        return $status;
    }

    /**
     * Apply debug options to the elastica query
     * @param Query $query
     * @return Query
     */
    public function applyDebugOptionsToQuery( Query $query ) {
        // The debug options of the current search context do the actual work.
        $debugOptions = $this->searchContext->getDebugOptions();

        return $debugOptions->applyDebugOptions( $query );
    }

    /**
     * @param SearchQuery $query
     * @return Searcher
     */
    public function makeSearcher( SearchQuery $query ) {
        // Paging, config, namespaces and debug options come from the query;
        // the connection, user and helper services are shared with this
        // searcher.
        return new self(
            $this->connection,
            $query->getOffset(),
            $query->getLimit(),
            $query->getSearchConfig(),
            $query->getNamespaces(),
            $this->user,
            // NOTE(review): positional constructor argument whose meaning is
            // not visible from here; confirm against the constructor.
            false,
            $query->getDebugOptions(),
            $this->namespacePrefixParser,
            $this->interwikiResolver,
            $this->titleHelper,
            $this->cirrusSearchHookRunner
        );
    }

    /**
     * @param int $offset
     * @param int $limit
     */
    private function setOffsetLimit( $offset, $limit ) {
        $this->offset = $offset;

        // Shrink the limit so that offset + limit never goes past
        // MAX_OFFSET_LIMIT; otherwise the requested limit is kept as is.
        $pastMaximum = $offset + $limit > self::MAX_OFFSET_LIMIT;
        $this->limit = $pastMaximum
            ? self::MAX_OFFSET_LIMIT - $offset
            : $limit;
    }

    /**
     * Visible for testing
     * @return int[] 2 elements array
     */
    public function getOffsetLimit() {
        // Guard against use outside of the PHPUnit test environment.
        $inTestRun = defined( 'MW_PHPUNIT_TEST' );
        Assert::precondition(
            $inTestRun,
            'getOffsetLimit must only be called for testing purposes'
        );

        $offsetAndLimit = [ $this->offset, $this->limit ];
        return $offsetAndLimit;
    }

    /**
     * Build a FullTextQueryBuilder defined in the $builderSettings:
     * format is:
     * [
     *     'builder_factory' => callback
     *     'settings' => ...
     * ]
     * where callback must be function that accepts the settings array and returns a FullTextQueryBuilder
     *
     * Legacy version:
     * [
     *     'builder_class' => ClassName
     *     'settings' => ...
     * ]
     * where ClassName must declare a constructor with these arguments:
     *   SearchConfig $config, KeywordFeature[] $features, $settings
     *
     * Visible for testing only
     * @param array $builderSettings
     * @param SearchConfig $config
     * @param KeywordFeature[] $features
     * @return FullTextQueryBuilder
     * @throws \ReflectionException
     */
    final public static function buildFullTextBuilder(
        array $builderSettings,
        SearchConfig $config,
        array $features
    ): FullTextQueryBuilder {
        if ( isset( $builderSettings['builder_class'] ) ) {
            // Legacy form: instantiate the class with (config, features, settings).
            $objectFactorySpecs = [
                'class' => $builderSettings['builder_class'],
                'args' => [
                    $config,
                    $features,
                    $builderSettings['settings']
                ]
            ];
        } elseif ( $builderSettings['builder_factory'] ?? false ) {
            // The key is read through ?? so that settings lacking both
            // builder_class and builder_factory reach the explicit exception
            // below instead of first raising an undefined array key warning.
            // Factory form: the callback only receives the settings.
            $objectFactorySpecs = [
                'factory' => $builderSettings['builder_factory'],
                'args' => [
                    $builderSettings['settings']
                ]
            ];
        } else {
            throw new \InvalidArgumentException( 'Missing builder_class or builder_factory in the builderSettings' );
        }

        /** @var FullTextQueryBuilder $qb */
        // @phan-suppress-next-line PhanTypeInvalidCallableArraySize
        $qb = ObjectFactory::getObjectFromSpec( $objectFactorySpecs );
        // Whatever the configuration produced must really be a query builder.
        if ( !( $qb instanceof FullTextQueryBuilder ) ) {
            throw new RuntimeException( 'Bad builder class configured.' );
        }

        return $qb;
    }
}