wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Search/CompletionResultsCollector.php

Summary

Maintainability
A
2 hrs
Test Coverage
<?php

namespace CirrusSearch\Search;

use CirrusSearch\CompletionRequestLog;
use SearchSuggestion;
use SearchSuggestionSet;

/**
 * Collect results from multiple result sets
 */
class CompletionResultsCollector {
    /**
     * @var SearchSuggestion[] suggestions indexed by pageId (mutable)
     */
    private $suggestionsByDocId = [];

    /**
     * @var string[][] profile names indexed by pageId (mutable)
     */
    private $suggestionMetadataByDocId = [];

    /**
     * @var float|null maintains the minScore (mutable)
     */
    private $minScore = null;

    /**
     * @var int|null maintains the doc that has minScore (mutable)
     */
    private $minDoc = null;

    /**
     * @var int how many results we want to keep (final)
     */
    private $limit;

    /**
     * @var int the offset (final)
     */
    private $offset;

    /**
     * Allows immediate removal of unwanted results while appropriate
     * processes are worked out for communities to influence ranking
     * as desired.
     *
     * @var int[] Set of id's to never return as results
     */
    private $bannedIds;

    /**
     * @param int $limit number of results we want to display
     * @param int $offset
     * @param int[] $bannedIds Set of id's to never return
     */
    public function __construct( $limit, $offset = 0, $bannedIds = [] ) {
        if ( $limit <= 0 ) {
            throw new \RuntimeException( "limit must be strictly positive" );
        }
        $this->limit = $limit;
        $this->offset = $offset;
        $this->bannedIds = $bannedIds;
    }

    /**
     * @param int $pageId
     * @param float $score
     * @return bool
     * @internal param int $docId
     */
    private function canCollect( $pageId, $score ) {
        if ( in_array( $pageId, $this->bannedIds ) ) {
            return false;
        }

        // First element
        if ( $this->minScore === null && $this->limit > 0 ) {
            return true;
        }

        // If we have the doc we do not accept it if it has lower score
        if ( isset( $this->suggestionsByDocId[$pageId] ) &&
                $score <= $this->suggestionsByDocId[$pageId]->getScore() ) {
            return false;
        }

        // We always accept docs that are better
        if ( $score > $this->minScore ) {
            return true;
        }

        // For everything else we accept until we are full
        return !$this->isFull();
    }

    /**
     * Collect a doc if possible.
     * The doc will be collected if the capacity is not yet reached or if its score
     * is better than a suggestion already collected.
     * @param SearchSuggestion $suggestion
     * @param string $profileName
     * @param string $index
     * @return bool true if the doc was added false otherwise
     */
    public function collect( SearchSuggestion $suggestion, $profileName, $index ) {
        if ( !$this->canCollect( $suggestion->getSuggestedTitleID(), $suggestion->getScore() ) ) {
            return false;
        }

        if ( isset( $this->suggestionsByDocId[$suggestion->getSuggestedTitleID()] ) ) {
            $oldSugg = $this->suggestionsByDocId[$suggestion->getSuggestedTitleID()];
            if ( $oldSugg->getScore() > $suggestion->getScore() ) {
                return false;
            }
            unset( $this->suggestionsByDocId[$suggestion->getSuggestedTitleID()] );
            unset( $this->suggestionMetadataByDocId[$suggestion->getSuggestedTitleID()] );
            // worst case 1: existing doc with better score
            $this->updateMinDoc();
        }

        if ( $this->isFull() ) {
            unset( $this->suggestionsByDocId[$this->minDoc] );
            unset( $this->suggestionMetadataByDocId[$this->minDoc] );
            // worst case 2: collector full but better score found
            $this->updateMinDoc();
        }
        if ( $this->minScore === null || $this->minScore > $suggestion->getScore() ) {
            $this->minScore = $suggestion->getScore();
            $this->minDoc = $suggestion->getSuggestedTitleID();
        }
        $this->suggestionsByDocId[$suggestion->getSuggestedTitleID()] = $suggestion;
        $this->suggestionMetadataByDocId[$suggestion->getSuggestedTitleID()] = [
            'profile' => $profileName,
            'index' => $index
        ];
        return true;
    }

    /**
     * Test whether the collector is full
     * @return bool true if it's full
     */
    public function isFull() {
        return !( $this->size() < ( $this->limit + $this->offset ) );
    }

    /**
     * Number of suggestions collected
     * @return int
     */
    public function size() {
        return count( $this->suggestionsByDocId );
    }

    /**
     * Find the min doc.
     * This is called on worst case scenario:
     * - when the collector is full but a better doc is found
     * - when an already collected doc is found with a better score
     *
     * Realistically this should not happen too frequently since
     * docs are usually fetched from elastic which returns them
     * in order. If it appears to cause perf issues we might
     * want to investigate an approach based on SplMinHeap.
     */
    private function updateMinDoc() {
        $minScore = null;
        $minDoc = null;
        foreach ( $this->suggestionsByDocId as $sugg ) {
            if ( $minScore === null || $minScore > $sugg->getScore() ) {
                $minScore = $sugg->getScore();
                $minDoc = $sugg->getSuggestedTitleID();
            }
        }
        $this->minDoc = $minDoc;
        $this->minScore = $minScore;
    }

    /**
     * Return the set of suggestions collected so far and log
     * its states to CompletionRequestLog.
     *
     * @param CompletionRequestLog $log
     * @return SearchSuggestionSet
     */
    public function logAndGetSet( CompletionRequestLog $log ) {
        uasort( $this->suggestionsByDocId, static function ( SearchSuggestion $a, SearchSuggestion $b ) {
            if ( $b->getScore() > $a->getScore() ) {
                return 1;
            } elseif ( $b->getScore() < $a->getScore() ) {
                return -1;
            }
            return 0;
        } );
        $results = array_slice( $this->suggestionsByDocId, $this->offset,
            $this->limit, true );
        $log->setResult( $results, $this->suggestionMetadataByDocId );
        return new SearchSuggestionSet( $results );
    }

    /**
     * @return float|null
     */
    public function getMinScore() {
        return $this->minScore;
    }
}