wikimedia/mediawiki-extensions-Wikibase
repo/includes/PurgeEntityDataJob.php

<?php

declare( strict_types = 1 );

namespace Wikibase\Repo;

use BatchRowIterator;
use Job;
use MediaWiki\Cache\HTMLCacheUpdater;
use MediaWiki\MediaWikiServices;
use MediaWiki\Title\Title;
use Wikibase\DataModel\Entity\EntityIdParser;
use Wikibase\Lib\Rdbms\RepoDomainDb;
use Wikibase\Repo\LinkedData\EntityDataUriManager;

/**
 * Job to purge Special:EntityData URLs from the HTTP cache after a page was deleted.
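 *
 * Expected job parameters (as read by run() and getArchivedRevisionIds() below):
 * - entityId: serialized ID of the entity whose page was deleted
 * - pageId, title, namespace: identify the corresponding rows in the archive table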
 *
 * @license GPL-2.0-or-later
 */
class PurgeEntityDataJob extends Job {

    /** @var EntityIdParser */
    private $entityIdParser;

    /** @var EntityDataUriManager */
    private $entityDataUriManager;

    /** @var RepoDomainDb */
    private $repoDb;

    /** @var HTMLCacheUpdater */
    private $htmlCacheUpdater;

    /** @var int */
    private $batchSize;

    public function __construct(
        EntityIdParser $entityIdParser,
        EntityDataUriManager $entityDataUriManager,
        RepoDomainDb $repoDb,
        HTMLCacheUpdater $htmlCacheUpdater,
        int $batchSize,
        array $params
    ) {
        parent::__construct( 'PurgeEntityData', $params );
        $this->entityIdParser = $entityIdParser;
        $this->entityDataUriManager = $entityDataUriManager;
        $this->repoDb = $repoDb;
        $this->htmlCacheUpdater = $htmlCacheUpdater;
        $this->batchSize = $batchSize;
    }

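    /**
     * Instantiate the job with its services wired up from global state.
     * The Title parameter is unused; it is only part of the expected factory signature.
     */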
    public static function newFromGlobalState( Title $unused, array $params ): self {
        $services = MediaWikiServices::getInstance();
        return new self(
            WikibaseRepo::getEntityIdParser( $services ),
            WikibaseRepo::getEntityDataUriManager( $services ),
            WikibaseRepo::getRepoDomainDbFactory( $services )->newRepoDb(),
            $services->getHtmlCacheUpdater(),
            $services->getMainConfig()->get( 'UpdateRowsPerQuery' ),
            $params
        );
    }
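
    // A minimal sketch (hypothetical, not part of this class) of how such a job could be
    // queued after a deletion, assuming $title is the deleted page's Title and $entityId
    // the corresponding EntityId; the parameter names mirror what run() and
    // getArchivedRevisionIds() read from $this->params:
    //
    //     MediaWikiServices::getInstance()->getJobQueueGroup()->push(
    //         new JobSpecification( 'PurgeEntityData', [
    //             'entityId' => $entityId->getSerialization(),
    //             'pageId' => $title->getArticleID(),
    //             'title' => $title->getDBkey(),
    //             'namespace' => $title->getNamespace(),
    //         ] )
    //     );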

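    /**
     * Purge the Special:EntityData URLs for the latest entity data
     * and for every archived revision of the deleted page.
     */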
    public function run(): bool {
        $entityId = $this->entityIdParser->parse( $this->params['entityId'] );

        // URLs for latest data...
        $urls = $this->entityDataUriManager->getPotentiallyCachedUrls( $entityId );
        foreach ( $this->getArchivedRevisionIds() as $revisionId ) {
            // ...and URLs for each specific revision
            $urls = array_merge( $urls, $this->entityDataUriManager->getPotentiallyCachedUrls(
                $entityId,
                $revisionId
            ) );
        }

        if ( $urls !== [] ) {
            $this->htmlCacheUpdater->purgeUrls( $urls );
        }

        return true;
    }

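    /**
     * Yield the revision IDs of all archived revisions of the deleted page,
     * reading the archive table in batches.
     *
     * @return iterable<int>
     */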
    private function getArchivedRevisionIds(): iterable {
        // read archive rows from a replica, but only after it has caught up with the primary
        // (in theory we only need to wait for the primary pos as of when the job was pushed,
        // but DBPrimaryPos is an opaque object, so we can’t put it in the job params)
        $this->repoDb->replication()->wait();
        $dbr = $this->repoDb->connections()->getReadConnection();

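        // page through the archive table in primary-key (ar_id) order, $this->batchSize rows at a time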
        $iterator = new BatchRowIterator( $dbr, 'archive', [ 'ar_id' ], $this->batchSize );
        $iterator->setFetchColumns( [ 'ar_rev_id' ] );
        $iterator->addConditions( [
            'ar_page_id' => $this->params['pageId'],
            'ar_title' => $this->params['title'],
            'ar_namespace' => $this->params['namespace'],
        ] );
        $iterator->setCaller( __METHOD__ );

        foreach ( $iterator as $batch ) {
            foreach ( $batch as $row ) {
                yield (int)$row->ar_rev_id;
            }
        }
    }

}