wikimedia/mediawiki-extensions-CirrusSearch
includes/ChangeListener.php

<?php

namespace CirrusSearch;

use CirrusSearch\Job\CirrusTitleJob;
use CirrusSearch\Job\DeletePages;
use CirrusSearch\Job\LinksUpdate;
use JobQueueGroup;
use ManualLogEntry;
use MediaWiki\Config\ConfigFactory;
use MediaWiki\Deferred\DeferredUpdates;
use MediaWiki\Deferred\LinksUpdate\LinksTable;
use MediaWiki\Hook\ArticleRevisionVisibilitySetHook;
use MediaWiki\Hook\LinksUpdateCompleteHook;
use MediaWiki\Hook\PageMoveCompleteHook;
use MediaWiki\Hook\TitleMoveHook;
use MediaWiki\Hook\UploadCompleteHook;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\Page\Hook\PageDeleteCompleteHook;
use MediaWiki\Page\Hook\PageDeleteHook;
use MediaWiki\Page\PageReference;
use MediaWiki\Page\ProperPageIdentity;
use MediaWiki\Page\RedirectLookup;
use MediaWiki\Permissions\Authority;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Status\Status;
use MediaWiki\Title\Title;
use MediaWiki\User\User;
use MediaWiki\Utils\MWTimestamp;
use Wikimedia\Assert\Assert;
use Wikimedia\Rdbms\IConnectionProvider;

/**
 * Implementation of all the hooks that CirrusSearch needs to listen to in order to keep its
 * index in sync with the main SQL database.
 */
class ChangeListener extends PageChangeTracker implements
    LinksUpdateCompleteHook,
    TitleMoveHook,
    PageMoveCompleteHook,
    UploadCompleteHook,
    ArticleRevisionVisibilitySetHook,
    PageDeleteHook,
    PageDeleteCompleteHook
{
    private JobQueueGroup $jobQueue;
    private SearchConfig $searchConfig;
    private IConnectionProvider $dbProvider;
    private RedirectLookup $redirectLookup;

    /** @var Connection|null */
    private $connection;

    /** @var string[] prefixed DB keys of the titles currently being moved */
    private $movingTitles = [];

    public static function create(
        JobQueueGroup $jobQueue,
        ConfigFactory $configFactory,
        IConnectionProvider $dbProvider,
        RedirectLookup $redirectLookup
    ): ChangeListener {
        /** @phan-suppress-next-line PhanTypeMismatchArgumentSuperType $config is actually a SearchConfig */
        return new self( $jobQueue, $configFactory->makeConfig( "CirrusSearch" ), $dbProvider, $redirectLookup );
    }
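
    // Illustrative sketch only: a handler like this is typically wired up through extension.json's
    // "HookHandlers" mechanism, roughly as below. The handler name and service names here are
    // assumptions for illustration, not copied from CirrusSearch's actual extension.json.
    //
    //   "HookHandlers": {
    //       "CirrusSearchChangeListener": {
    //           "factory": "CirrusSearch\\ChangeListener::create",
    //           "services": [ "JobQueueGroup", "ConfigFactory", "ConnectionProvider", "RedirectLookup" ]
    //       }
    //   }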

    public function __construct(
        JobQueueGroup $jobQueue,
        SearchConfig $searchConfig,
        IConnectionProvider $dbProvider,
        RedirectLookup $redirectLookup
    ) {
        parent::__construct();
        $this->jobQueue = $jobQueue;
        $this->searchConfig = $searchConfig;
        $this->dbProvider = $dbProvider;
        $this->redirectLookup = $redirectLookup;
    }

    /**
     * Check whether at least one cluster is writable.
     * If not, there is no reason to schedule a job.
     *
     * @return bool true if at least one cluster is writable
     */
    private function isEnabled(): bool {
        return $this->searchConfig
            ->getClusterAssignment()
            ->getWritableClusters( UpdateGroup::PAGE ) != [];
    }

    /**
     * Called when a revision is deleted. In theory we shouldn't need to do this, since you
     * can't delete the current text of a page (so we should already have updated the index
     * when the page was last edited). But we're paranoid, because deleted revisions absolutely
     * should not be in the index.
     *
     * @param Title $title The page title we've had a revision deleted on
     * @param int[] $ids IDs to set the visibility for
     * @param array $visibilityChangeMap Map of revision ID to oldBits and newBits.
     *   This array can be examined to determine exactly what visibility bits
     *   have changed for each revision. This array is of the form:
     *   [id => ['oldBits' => $oldBits, 'newBits' => $newBits], ... ]
     */
    public function onArticleRevisionVisibilitySet( $title, $ids, $visibilityChangeMap ) {
        if ( !$this->isEnabled() ) {
            return;
        }
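        // Which particular revisions changed visibility ($ids / $visibilityChangeMap) is not
        // inspected here; any visibility change simply triggers a re-index of the page.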
        $this->jobQueue->lazyPush( LinksUpdate::newPastRevisionVisibilityChange( $title ) );
    }

    /**
     * Hooked to update the search index when pages change directly or when templates that
     * they include change.
     * @param \MediaWiki\Deferred\LinksUpdate\LinksUpdate $linksUpdate
     * @param mixed $ticket Prior result of LBFactory::getEmptyTransactionTicket()
     */
    public function onLinksUpdateComplete( $linksUpdate, $ticket ) {
        if ( !$this->isEnabled() ) {
            return;
        }
        // Defer processing the LinksUpdateComplete hook until the other hooks tracked by
        // PageChangeTracker have had a chance to run. The reason is that we want to detect which
        // links updates were triggered by a "page change". The definition of a "page change" we
        // use is the one used by the EventBus PageChangeHooks.
        DeferredUpdates::addCallableUpdate( function () use ( $linksUpdate ) {
            $linkedArticlesToUpdate = $this->searchConfig->get( 'CirrusSearchLinkedArticlesToUpdate' );
            $unLinkedArticlesToUpdate = $this->searchConfig->get( 'CirrusSearchUnlinkedArticlesToUpdate' );
            $updateDelay = $this->searchConfig->get( 'CirrusSearchUpdateDelay' );

            // Titles that are created by a move don't need their own job.
            if ( in_array( $linksUpdate->getTitle()->getPrefixedDBkey(), $this->movingTitles ) ) {
                return;
            }

            $params = [];
            if ( $this->searchConfig->get( 'CirrusSearchEnableIncomingLinkCounting' ) ) {
                $params['addedLinks'] = self::preparePageReferencesForLinksUpdate(
                    $linksUpdate->getPageReferenceArray( 'pagelinks', LinksTable::INSERTED ),
                    $linkedArticlesToUpdate
                );
                // We exclude links that contain invalid UTF-8 sequences; pages created before
                // T13143 was fixed might still have bad links in the pagelinks table, which would
                // cause LinksUpdate to believe those links were removed.
                $params['removedLinks'] = self::preparePageReferencesForLinksUpdate(
                    $linksUpdate->getPageReferenceArray( 'pagelinks', LinksTable::DELETED ),
                    $unLinkedArticlesToUpdate,
                    true
                );
            }
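
            // Updates caused directly by a change to this page (as detected by PageChangeTracker)
            // take the prioritized delay; refreshes triggered indirectly, e.g. by an edit to a
            // transcluded template, fall through to the slower default delay below.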

            if ( $this->isPageChange( $linksUpdate->getPageId() ) ) {
                $jobParams = $params + LinksUpdate::buildJobDelayOptions( LinksUpdate::class,
                        $updateDelay['prioritized'], $this->jobQueue );
                $job = LinksUpdate::newPageChangeUpdate( $linksUpdate->getTitle(),
                    $linksUpdate->getRevisionRecord(), $jobParams );
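                // Log page-change updates whose triggering revision is more than a day old so
                // unexpectedly delayed updates can be traced back to their cause.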
                if ( ( MWTimestamp::time() - $job->params[CirrusTitleJob::ROOT_EVENT_TIME] ) > ( 3600 * 24 ) ) {
                    LoggerFactory::getInstance( 'CirrusSearch' )->debug(
                        "Scheduled a page-change-update for {title} on a revision created more than 24hours ago, " .
                        "the cause is {causeAction}",
                        [
                            'title' => $linksUpdate->getTitle()->getPrefixedDBkey(),
                            'causeAction' => $linksUpdate->getCauseAction()
                        ] );
                }
            } else {
                $job = LinksUpdate::newPageRefreshUpdate( $linksUpdate->getTitle(),
                    $params + LinksUpdate::buildJobDelayOptions( LinksUpdate::class, $updateDelay['default'], $this->jobQueue ) );
            }

            $this->jobQueue->lazyPush( $job );
        } );
    }

    /**
     * Hook into UploadComplete, because overwritten files mistakenly do not trigger
     * LinksUpdateComplete (T344285). Since files do contain indexed metadata,
     * we need to refresh the search index when a file is overwritten on an
     * existing title.
     *
     * @param \UploadBase $uploadBase
     */
    public function onUploadComplete( $uploadBase ) {
        if ( !$this->isEnabled() ) {
            return;
        }
        if ( $uploadBase->getTitle()->exists() ) {
            $this->jobQueue->lazyPush( LinksUpdate::newPageChangeUpdate( $uploadBase->getTitle(), null, [] ) );
        }
    }

    /**
     * This hook is called before a page is deleted.
     *
     * @since 1.37
     *
     * @param ProperPageIdentity $page Page being deleted.
     * @param Authority $deleter Who is deleting the page
     * @param string $reason Reason the page is being deleted
     * @param \StatusValue $status Add any error here
     * @param bool $suppress Whether this is a suppression deletion or not
     * @return bool|void True or no return value to continue; false to abort, which also requires adding
     * a fatal error to $status.
     */
    public function onPageDelete(
        ProperPageIdentity $page,
        Authority $deleter,
        string $reason,
        \StatusValue $status,
        bool $suppress
    ) {
        if ( !$this->isEnabled() ) {
            return;
        }
        parent::onPageDelete( $page, $deleter, $reason, $status, $suppress );
        // We use this hook to pick up redirects so we can update their targets.
        // We can't reuse PageDeleteComplete because the page info is already gone by then.
        // If the deletion is aborted or fails it's no big deal, because the job will
        // end up being a no-op when it executes.
        $targetLink = $this->redirectLookup->getRedirectTarget( $page );
        $target = $targetLink !== null ? Title::castFromLinkTarget( $targetLink ) : null;
        if ( $target !== null ) {
            $this->jobQueue->lazyPush( new Job\LinksUpdate( $target, [] ) );
        }
    }

    /**
     * @param ProperPageIdentity $page
     * @param Authority $deleter
     * @param string $reason
     * @param int $pageID
     * @param RevisionRecord $deletedRev
     * @param ManualLogEntry $logEntry
     * @param int $archivedRevisionCount
     * @return void
     */
    public function onPageDeleteComplete( ProperPageIdentity $page, Authority $deleter,
        string $reason, int $pageID, RevisionRecord $deletedRev, ManualLogEntry $logEntry,
        int $archivedRevisionCount
    ) {
        if ( !$this->isEnabled() ) {
            return;
        }
        parent::onPageDeleteComplete( $page, $deleter, $reason, $pageID, $deletedRev, $logEntry, $archivedRevisionCount );
        // Note that we must use the article id provided or it'll be lost in the ether.  The job can't
        // load it from the title because the page row has already been deleted.
        $title = Title::castFromPageIdentity( $page );
        Assert::postcondition( $title !== null, '$page can be cast to a Title' );
        $this->jobQueue->lazyPush(
            DeletePages::build(
                $title,
                $this->searchConfig->makeId( $pageID ),
                $logEntry->getTimestamp() !== false ? MWTimestamp::convert( TS_UNIX, $logEntry->getTimestamp() ) : MWTimestamp::time()
            )
        );
    }

    /**
     * Called before a title is moved from $old to $nt.
     *
     * @param Title $old Old title
     * @param Title $nt New title
     * @param User $user User who does the move
     * @param string $reason Reason provided by the user
     * @param Status &$status To abort the move, add a fatal error to this object
     *       (i.e. call $status->fatal())
     * @return bool|void True or no return value to continue or false to abort
     */
    public function onTitleMove( Title $old, Title $nt, User $user, $reason, Status &$status ) {
        if ( !$this->isEnabled() ) {
            return;
        }
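        // Remember the source title so that onLinksUpdateComplete can skip the LinksUpdate fired
        // for the redirect created at the old title once the move completes.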
        $this->movingTitles[] = $old->getPrefixedDBkey();
    }

    /**
     * When we've moved a Title from A to B.
     * @param \MediaWiki\Linker\LinkTarget $old Old title
     * @param \MediaWiki\Linker\LinkTarget $new New title
     * @param \MediaWiki\User\UserIdentity $user User who did the move
     * @param int $pageid Database ID of the page that's been moved
     * @param int $redirid Database ID of the created redirect
     * @param string $reason Reason for the move
     * @param \MediaWiki\Revision\RevisionRecord $revision RevisionRecord created by the move
     * @return bool|void True or no return value to continue, or false to stop other hook handlers
     *     (this does not abort the move itself)
     */
    public function onPageMoveComplete(
        $old, $new, $user, $pageid, $redirid,
        $reason, $revision
    ) {
        if ( !$this->isEnabled() ) {
            return;
        }
        parent::onPageMoveComplete( $old, $new, $user, $pageid, $redirid, $reason, $revision );
        // When a page is moved the update and delete hooks are good enough to catch
        // almost everything.  The only thing they miss is if a page moves from one
        // index to another.  That only happens if it switches namespace.
        if ( $old->getNamespace() === $new->getNamespace() ) {
            return;
        }

        $conn = $this->getConnection();
        $oldIndexSuffix = $conn->getIndexSuffixForNamespace( $old->getNamespace() );
        $newIndexSuffix = $conn->getIndexSuffixForNamespace( $new->getNamespace() );
        if ( $oldIndexSuffix !== $newIndexSuffix ) {
            $title = Title::newFromLinkTarget( $old );
            $job = new Job\DeletePages( $title, [
                'indexSuffix' => $oldIndexSuffix,
                'docId' => $this->searchConfig->makeId( $pageid )
            ] );
            // Push the job after DB commit but cancel on rollback
            $this->dbProvider->getPrimaryDatabase()->onTransactionCommitOrIdle( function () use ( $job ) {
                $this->jobQueue->lazyPush( $job );
            }, __METHOD__ );
        }
    }

    /**
     * Take a list of titles either linked or unlinked and prepare them for Job\LinksUpdate.
     * This includes limiting them to $max titles.
     * @param PageReference[] $pageReferences titles to prepare
     * @param int $max maximum number of titles to return
     * @param bool $excludeBadUTF exclude links that contain invalid UTF-8 sequences
     * @return string[] prefixed DB keys of the prepared titles
     */
    public static function preparePageReferencesForLinksUpdate( $pageReferences, int $max, $excludeBadUTF = false ) {
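        // E.g. given references for [ "Foo", "Talk:Bar", "Baz" ] (invented titles) and $max = 2,
        // this returns two randomly chosen prefixed DB keys such as [ "Foo", "Talk:Bar" ]; with
        // $excludeBadUTF set, keys containing invalid UTF-8 sequences are logged and dropped.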
        $pageReferences = self::pickFromArray( $pageReferences, $max );
        $dBKeys = [];
        foreach ( $pageReferences as $pageReference ) {
            $title = Title::newFromPageReference( $pageReference );
            $key = $title->getPrefixedDBkey();
            if ( $excludeBadUTF ) {
                $fixedKey = mb_convert_encoding( $key, 'UTF-8', 'UTF-8' );
                if ( $fixedKey !== $key ) {
                    LoggerFactory::getInstance( 'CirrusSearch' )
                        ->warning( "Ignoring title {title} with invalid UTF-8 sequences.",
                            [ 'title' => $fixedKey ] );
                    continue;
                }
            }
            $dBKeys[] = $key;
        }
        return $dBKeys;
    }

    /**
     * Pick $num random entries from $array.
     * @param array $array Array to pick from
     * @param int $num Number of entries to pick
     * @return array of entries from $array
     */
    private static function pickFromArray( $array, $num ) {
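        // E.g. pickFromArray( [ 'a', 'b', 'c', 'd' ], 2 ) might return [ 'b', 'd' ]; asking for
        // more entries than exist returns the array unchanged, and asking for fewer than one
        // returns [].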
        if ( $num > count( $array ) ) {
            return $array;
        }
        if ( $num < 1 ) {
            return [];
        }
        $chosen = array_rand( $array, $num );
        // If $num === 1 then array_rand will return a key rather than an array of keys.
        if ( !is_array( $chosen ) ) {
            return [ $array[ $chosen ] ];
        }
        $result = [];
        foreach ( $chosen as $key ) {
            $result[] = $array[ $key ];
        }
        return $result;
    }

    private function getConnection(): Connection {
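        // Lazily create the Connection; within this class only cross-namespace page moves need it.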
        if ( $this->connection === null ) {
            $this->connection = new Connection( $this->searchConfig );
        }
        return $this->connection;
    }
}