wikimedia/mediawiki-core

View on GitHub
includes/deferred/LinksUpdate/LinksUpdate.php

Summary

Maintainability
C
1 day
Test Coverage
<?php
/**
 * Updater for link tracking tables after a page edit.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

namespace MediaWiki\Deferred\LinksUpdate;

use IDBAccessObject;
use Job;
use MediaWiki\Cache\BacklinkCache;
use MediaWiki\Deferred\AutoCommitUpdate;
use MediaWiki\Deferred\DataUpdate;
use MediaWiki\Deferred\DeferredUpdates;
use MediaWiki\HookContainer\ProtectedHookAccessorTrait;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\PageReference;
use MediaWiki\Page\PageReferenceValue;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Title\Title;
use MediaWiki\User\UserIdentity;
use RefreshLinksJob;
use RuntimeException;
use Wikimedia\Rdbms\IConnectionProvider;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\ScopedCallback;

/**
 * Class that manages updates of *_link tables as well as similar extension-managed tables
 *
 * @note: LinksUpdate is managed by DeferredUpdates::execute(). Do not run this in a transaction.
 *
 * See docs/deferred.txt
 */
class LinksUpdate extends DataUpdate {
    use ProtectedHookAccessorTrait;

    /** @var int|null Page ID of the article linked from; looked up in doUpdate() when unset */
    protected $mId;

    /** @var Title Title object of the article linked from */
    protected $mTitle;

    /** @var ParserOutput Parser output this update was constructed with */
    protected $mParserOutput;

    /** @var bool Whether to queue jobs for recursive updates */
    protected $mRecursive;

    /** @var bool Whether the page's redirect target may have changed in the latest revision */
    protected $mMaybeRedirectChanged;

    /** @var RevisionRecord|null Revision for which this update has been triggered, if one was set */
    private $mRevisionRecord;

    /**
     * @var UserIdentity|null User who triggered this update, if one was set
     */
    private $user;

    /** @var IDatabase|null Primary database handle, fetched on first use by getDB() */
    private $db;

    /** @var LinksTableGroup Group of link table objects that this update drives */
    private $tableFactory;

    /** @var IConnectionProvider Provider that getDB() obtains the primary database from */
    private IConnectionProvider $connectionProvider;

    /**
     * Set up a links update for a page from the output of parsing it.
     *
     * Besides storing the arguments, this builds the LinksTableGroup for the page,
     * hands it the parser output, and keeps the load balancer factory as the
     * connection provider used by getDB().
     *
     * @param PageIdentity $page The page we're updating
     * @param ParserOutput $parserOutput Output from a full parse of this page
     * @param bool $recursive Queue jobs for recursive updates?
     * @param bool $maybeRedirectChanged True if the page's redirect target may have changed in the
     *   latest revision. If false, this is used as a hint to skip some unnecessary updates.
     */
    public function __construct(
        PageIdentity $page,
        ParserOutput $parserOutput,
        $recursive = true,
        $maybeRedirectChanged = true
    ) {
        parent::__construct();

        $this->mTitle = Title::newFromPageIdentity( $page );
        $this->mParserOutput = $parserOutput;
        $this->mRecursive = $recursive;
        $this->mMaybeRedirectChanged = $maybeRedirectChanged;

        $services = MediaWikiServices::getInstance();
        $mainConfig = $services->getMainConfig();

        $tableGroup = new LinksTableGroup(
            $services->getObjectFactory(),
            $services->getDBLoadBalancerFactory(),
            $services->getCollationFactory(),
            $page,
            $services->getLinkTargetLookup(),
            $mainConfig->get( MainConfigNames::UpdateRowsPerQuery ),
            $mainConfig->get( MainConfigNames::TempCategoryCollations )
        );
        // TODO: this does not have to be called in LinksDeletionUpdate
        $tableGroup->setParserOutput( $parserOutput );

        $this->tableFactory = $tableGroup;
        $this->connectionProvider = $services->getDBLoadBalancerFactory();
    }

    /**
     * Hand the transaction ticket to the parent implementation and to the links
     * table group, so both work with the same ticket.
     *
     * @param mixed $ticket Forwarded unchanged to the parent class and the table group
     */
    public function setTransactionTicket( $ticket ) {
        parent::setTransactionTicket( $ticket );
        $this->tableFactory->setTransactionTicket( $ticket );
    }

    /**
     * Notify LinksUpdate that a move has just been completed and set the
     * original title
     *
     * The old page reference is passed straight on to the links table group.
     *
     * @param PageReference $oldPage The page reference from before the move
     */
    public function setMoveDetails( PageReference $oldPage ) {
        $this->tableFactory->setMoveDetails( $oldPage );
    }

    /**
     * Update link tables with outgoing links from an updated article
     *
     * Resolves the page ID if it is not already known, lets every links table do its
     * pre-lock work, takes the per-page lock when a transaction ticket is set, runs
     * the LinksUpdate hook and the incremental update, releases the lock, and finally
     * schedules the LinksUpdateComplete hook as a deferred update.
     *
     * @note this is managed by DeferredUpdates::execute(). Do not run this in a transaction.
     * @throws RuntimeException If a ticket is set and the page lock cannot be acquired
     */
    public function doUpdate() {
        if ( !$this->mId ) {
            // NOTE: subclasses may initialize mId directly!
            $this->mId = $this->mTitle->getArticleID( IDBAccessObject::READ_LATEST );
        }

        if ( !$this->mId ) {
            // Probably due to concurrent deletion or renaming of the page
            $logger = LoggerFactory::getInstance( 'SecondaryDataUpdate' );
            $logger->warning(
                'LinksUpdate: The Title object yields no ID. Perhaps the page was deleted?',
                [
                    'page_title' => $this->mTitle->getPrefixedDBkey(),
                    'cause_action' => $this->getCauseAction(),
                    'cause_agent' => $this->getCauseAgent()
                ]
            );

            // nothing to do
            return;
        }

        // Do any setup that needs to be done prior to acquiring the lock
        // Calling getAll() here has the side-effect of calling
        // LinksUpdateBatch::setParserOutput() on all subclasses, allowing
        // those methods to also do pre-lock operations.
        foreach ( $this->tableFactory->getAll() as $table ) {
            $table->beforeLock();
        }

        // Defined up front so the variable always exists: it stays null when there
        // is no ticket, in which case no lock is taken and there is nothing to
        // release further down.
        $scopedLock = null;
        if ( $this->ticket ) {
            // Make sure all links update threads see the changes of each other.
            // This handles the case when updates have to batched into several COMMITs.
            $scopedLock = self::acquirePageLock( $this->getDB(), $this->mId );
            if ( !$scopedLock ) {
                throw new RuntimeException( "Could not acquire lock for page ID '{$this->mId}'." );
            }
        }

        $this->getHookRunner()->onLinksUpdate( $this );
        $this->doIncrementalUpdate();

        // Commit and release the lock (if set)
        ScopedCallback::consume( $scopedLock );
        // Run post-commit hook handlers without DBO_TRX
        DeferredUpdates::addUpdate( new AutoCommitUpdate(
            $this->getDB(),
            __METHOD__,
            function () {
                $this->getHookRunner()->onLinksUpdateComplete( $this, $this->ticket );
            }
        ) );
    }

    /**
     * Acquire a session-level lock for performing link table updates for a page on a DB
     *
     * The lock key combines the database domain ID, the reason and the page ID.
     * A failure to obtain the lock is logged at info level to the
     * 'SecondaryDataUpdate' channel.
     *
     * @param IDatabase $dbw
     * @param int $pageId
     * @param string $why One of (job, atomicity)
     * @return ScopedCallback|null The scoped lock, or null if it could not be acquired
     * @since 1.27
     */
    public static function acquirePageLock( IDatabase $dbw, $pageId, $why = 'atomicity' ) {
        // The domain ID prefix makes the key per-wiki
        $lockKey = "{$dbw->getDomainID()}:LinksUpdate:$why:pageid:$pageId";
        $lock = $dbw->getScopedLockAndFlush( $lockKey, __METHOD__, 15 );
        if ( $lock ) {
            return $lock;
        }

        LoggerFactory::getInstance( 'SecondaryDataUpdate' )->info(
            "Could not acquire lock '{key}' for page ID '{page_id}'.",
            [
                'key' => $lockKey,
                'page_id' => $pageId,
            ]
        );

        return null;
    }

    /**
     * Run update() on every links table, queue recursive jobs if this update is
     * recursive, and then record the links-updated timestamp for the page.
     */
    protected function doIncrementalUpdate() {
        foreach ( $this->tableFactory->getAll() as $table ) {
            $table->update();
        }

        # Refresh links of all pages including this page
        # This will be in a separate transaction
        if ( $this->mRecursive ) {
            $this->queueRecursiveJobs();
        }

        # Update the links table freshness for this title
        $this->updateLinksTimestamp();
    }

    /**
     * Queue recursive jobs for this page
     *
     * This schedules, via the job queue, a LinksUpdate for the pages that include
     * the current page, plus prioritized jobs for cascade-protected backlinks.
     */
    protected function queueRecursiveJobs() {
        $services = MediaWikiServices::getInstance();
        $backlinks = $services->getBacklinkCacheFactory()->getBacklinkCache( $this->mTitle );
        $causeAction = $this->getCauseAction();
        $causeAgent = $this->getCauseAgent();

        // templatelinks is always processed; imagelinks only for file pages, in
        // case the redirect target has changed
        $tables = [ 'templatelinks' ];
        if ( $this->mMaybeRedirectChanged && $this->mTitle->getNamespace() === NS_FILE ) {
            $tables[] = 'imagelinks';
        }
        foreach ( $tables as $table ) {
            self::queueRecursiveJobsForTable(
                $this->mTitle, $table, $causeAction, $causeAgent, $backlinks
            );
        }

        // Get jobs for cascade-protected backlinks for a high priority queue.
        // If meta-templates change to using a new template, the new template
        // should be implicitly protected as soon as possible, if applicable.
        // These jobs duplicate a subset of the above ones, but can run sooner.
        // Which ever runs first generally no-ops the other one.
        $cause = [
            'causeAction' => $causeAction,
            'causeAgent' => $causeAgent
        ];
        $priorityJobs = [];
        foreach ( $backlinks->getCascadeProtectedLinkPages() as $protectedPage ) {
            $priorityJobs[] = RefreshLinksJob::newPrioritized( $protectedPage, $cause );
        }
        $services->getJobQueueGroup()->push( $priorityJobs );
    }

    /**
     * Queue a RefreshLinks job for any table.
     *
     * A job is only pushed when the backlink cache reports links for the table.
     * Omitting the backlink cache is deprecated; one is then fetched from the
     * service container after emitting a deprecation message.
     *
     * @param PageIdentity $page Page to do job for
     * @param string $table Table to use (e.g. 'templatelinks')
     * @param string $action Triggering action
     * @param string $userName Triggering user name
     * @param BacklinkCache|null $backlinkCache
     */
    public static function queueRecursiveJobsForTable(
        PageIdentity $page, $table, $action = 'LinksUpdate', $userName = 'unknown', ?BacklinkCache $backlinkCache = null
    ) {
        $title = Title::newFromPageIdentity( $page );
        if ( $backlinkCache === null ) {
            wfDeprecatedMsg( __METHOD__ . " needs a BacklinkCache object, null passed", '1.37' );
            $backlinkCache = MediaWikiServices::getInstance()->getBacklinkCacheFactory()
                ->getBacklinkCache( $title );
        }

        if ( !$backlinkCache->hasLinks( $table ) ) {
            // No backlinks through this table, so there is nothing to refresh
            return;
        }

        $tableParams = [
            'table' => $table,
            'recursive' => true,
        ];
        // "overall" refresh links job info
        $rootParams = Job::newRootJobParams(
            "refreshlinks:{$table}:{$title->getPrefixedText()}"
        );
        $causeParams = [ 'causeAction' => $action, 'causeAgent' => $userName ];

        // Array union keeps the first occurrence of a key, same precedence as before
        $job = new RefreshLinksJob( $title, $tableParams + $rootParams + $causeParams );

        MediaWikiServices::getInstance()->getJobQueueGroup()->push( $job );
    }

    /**
     * Omit conflict resolution options from the insert query so that testing
     * can confirm that the incremental update logic was correct.
     *
     * The flag is forwarded to the links table group.
     *
     * @param bool $mode Whether strict test mode should be on (defaults to true)
     */
    public function setStrictTestMode( $mode = true ) {
        $this->tableFactory->setStrictTestMode( $mode );
    }

    /**
     * Return the title object of the page being updated
     *
     * @return Title The Title built from the page passed to the constructor
     */
    public function getTitle() {
        return $this->mTitle;
    }

    /**
     * Get the page_id of the page being updated
     *
     * Uses the ID already stored on this update when there is one, and otherwise
     * asks the title for its article ID.
     *
     * @since 1.38
     * @return int
     */
    public function getPageId() {
        return $this->mId ?: $this->mTitle->getArticleID();
    }

    /**
     * Returns parser output
     *
     * @since 1.19
     * @return ParserOutput The parser output passed to the constructor
     */
    public function getParserOutput() {
        return $this->mParserOutput;
    }

    /**
     * Return the list of images used as generated by the parser
     *
     * This is the result of getImages() on the parser output of this update.
     *
     * @return array
     */
    public function getImages() {
        return $this->getParserOutput()->getImages();
    }

    /**
     * Set the RevisionRecord corresponding to this LinksUpdate
     *
     * The revision is stored on this object and also passed to the links table group.
     *
     * @since 1.35
     * @param RevisionRecord $revisionRecord
     */
    public function setRevisionRecord( RevisionRecord $revisionRecord ) {
        $this->mRevisionRecord = $revisionRecord;
        $this->tableFactory->setRevision( $revisionRecord );
    }

    /**
     * Get the RevisionRecord corresponding to this LinksUpdate
     *
     * @since 1.35
     * @return RevisionRecord|null The revision given to setRevisionRecord(), or null if
     *   none has been set
     */
    public function getRevisionRecord() {
        return $this->mRevisionRecord;
    }

    /**
     * Set the user who triggered this LinksUpdate
     *
     * @since 1.27
     * @param UserIdentity $user User later returned by getTriggeringUser()
     */
    public function setTriggeringUser( UserIdentity $user ) {
        $this->user = $user;
    }

    /**
     * Get the user who triggered this LinksUpdate
     *
     * @since 1.27
     * @return UserIdentity|null The user given to setTriggeringUser(), or null if
     *   none has been set
     */
    public function getTriggeringUser(): ?UserIdentity {
        return $this->user;
    }

    /**
     * Fetch the 'pagelinks' table object from the links table group
     *
     * @return PageLinksTable
     */
    protected function getPageLinksTable(): PageLinksTable {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return $this->tableFactory->get( 'pagelinks' );
    }

    /**
     * Fetch the 'externallinks' table object from the links table group
     *
     * @return ExternalLinksTable
     */
    protected function getExternalLinksTable(): ExternalLinksTable {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return $this->tableFactory->get( 'externallinks' );
    }

    /**
     * Fetch the 'page_props' table object from the links table group
     *
     * @return PagePropsTable
     */
    protected function getPagePropsTable(): PagePropsTable {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return $this->tableFactory->get( 'page_props' );
    }

    /**
     * Fetch page links added by this LinksUpdate.  Only available after the update is complete.
     *
     * Reads the LinksTable::INSERTED set of the 'pagelinks' table as Title objects.
     *
     * @since 1.22
     * @deprecated since 1.38 use getPageReferenceIterator() or getPageReferenceArray()
     * @return Title[] Array of Titles
     */
    public function getAddedLinks() {
        return $this->getPageLinksTable()->getTitleArray( LinksTable::INSERTED );
    }

    /**
     * Fetch page links removed by this LinksUpdate.  Only available after the update is complete.
     *
     * Reads the LinksTable::DELETED set of the 'pagelinks' table as Title objects.
     *
     * @since 1.22
     * @deprecated since 1.38 use getPageReferenceIterator() or getPageReferenceArray()
     * @return Title[] Array of Titles
     */
    public function getRemovedLinks() {
        return $this->getPageLinksTable()->getTitleArray( LinksTable::DELETED );
    }

    /**
     * Fetch external links added by this LinksUpdate. Only available after
     * the update is complete.
     *
     * Reads the LinksTable::INSERTED set of the 'externallinks' table.
     *
     * @since 1.33
     * @return null|string[] Array of strings
     */
    public function getAddedExternalLinks() {
        return $this->getExternalLinksTable()->getStringArray( LinksTable::INSERTED );
    }

    /**
     * Fetch external links removed by this LinksUpdate. Only available after
     * the update is complete.
     *
     * Reads the LinksTable::DELETED set of the 'externallinks' table.
     *
     * @since 1.33
     * @return null|string[]
     */
    public function getRemovedExternalLinks() {
        return $this->getExternalLinksTable()->getStringArray( LinksTable::DELETED );
    }

    /**
     * Fetch page properties added by this LinksUpdate.
     * Only available after the update is complete.
     *
     * Reads the LinksTable::INSERTED set of the 'page_props' table as an
     * associative array.
     *
     * @since 1.28
     * @return null|array
     */
    public function getAddedProperties() {
        return $this->getPagePropsTable()->getAssocArray( LinksTable::INSERTED );
    }

    /**
     * Fetch page properties removed by this LinksUpdate.
     * Only available after the update is complete.
     *
     * Reads the LinksTable::DELETED set of the 'page_props' table as an
     * associative array.
     *
     * @since 1.28
     * @return null|array
     */
    public function getRemovedProperties() {
        return $this->getPagePropsTable()->getAssocArray( LinksTable::DELETED );
    }

    /**
     * Get an iterator over PageReferenceValue objects corresponding to a given set
     * type in a given table.
     *
     * @since 1.38
     * @param string $tableName The name of any table that links to local titles
     * @param int $setType One of:
     *    - LinksTable::INSERTED: The inserted links
     *    - LinksTable::DELETED: The deleted links
     *    - LinksTable::CHANGED: Both the inserted and deleted links
     *    - LinksTable::OLD: The old set of links, loaded before the update
     *    - LinksTable::NEW: The new set of links from the ParserOutput
     * @return iterable<PageReferenceValue>
     * @phan-return \Traversable
     * @throws \InvalidArgumentException If the named table is not a TitleLinksTable
     */
    public function getPageReferenceIterator( $tableName, $setType ) {
        $linksTable = $this->tableFactory->get( $tableName );
        if ( !( $linksTable instanceof TitleLinksTable ) ) {
            // Only title-based tables can produce page references
            throw new \InvalidArgumentException(
                __METHOD__ . ": $tableName does not have a list of titles" );
        }

        return $linksTable->getPageReferenceIterator( $setType );
    }

    /**
     * Array form of getPageReferenceIterator(): the iterator is fully consumed and
     * its items are returned as an array, which costs extra time and memory.
     *
     * @since 1.38
     * @param string $tableName
     * @param int $setType
     * @return PageReferenceValue[]
     */
    public function getPageReferenceArray( $tableName, $setType ) {
        $references = $this->getPageReferenceIterator( $tableName, $setType );

        return iterator_to_array( $references );
    }

    /**
     * Update links table freshness
     *
     * Writes the parser output's cache time into page.page_links_updated for
     * this page. Does nothing when no page ID is known.
     */
    protected function updateLinksTimestamp() {
        if ( !$this->mId ) {
            return;
        }

        // The link updates made here only reflect the freshness of the parser output
        $cacheTime = $this->mParserOutput->getCacheTime();
        $dbw = $this->getDB();
        $dbw->newUpdateQueryBuilder()
            ->update( 'page' )
            ->set( [ 'page_links_updated' => $dbw->timestamp( $cacheTime ) ] )
            ->where( [ 'page_id' => $this->mId ] )
            ->caller( __METHOD__ )
            ->execute();
    }

    /**
     * Get the primary database handle, fetching it from the connection provider
     * on first use and reusing it afterwards.
     *
     * @return IDatabase
     */
    protected function getDB() {
        $this->db ??= $this->connectionProvider->getPrimaryDatabase();

        return $this->db;
    }

    /**
     * Whether or not this LinksUpdate will also update pages which transclude the
     * current page or otherwise depend on it.
     *
     * @return bool The $recursive value passed to the constructor
     */
    public function isRecursive() {
        return $this->mRecursive;
    }
}