maintenance/prewarmParsoidParserCache.php
<?php
use MediaWiki\Page\PageLookup;
use MediaWiki\Page\PageRecord;
use MediaWiki\Page\ParserOutputAccess;
use MediaWiki\Parser\Parsoid\Config\SiteConfig as ParsoidSiteConfig;
use MediaWiki\Revision\RevisionLookup;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Status\Status;
use Wikimedia\Parsoid\Core\ClientError;
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
use Wikimedia\Rdbms\SelectQueryBuilder;
// @codeCoverageIgnoreStart
require_once __DIR__ . '/Maintenance.php';
// @codeCoverageIgnoreEnd
/**
* Maintenance script for populating parser cache with parsoid output.
*
* @since 1.41
*
* @license GPL-2.0-or-later
* @author Richika Rana
*/
class PrewarmParsoidParserCache extends Maintenance {
private int $forceParse = 0;
private ParserOutputAccess $parserOutputAccess;
private PageLookup $pageLookup;
private RevisionLookup $revisionLookup;
private ParsoidSiteConfig $parsoidSiteConfig;
public function __construct() {
parent::__construct();
$this->addDescription(
'Populate parser cache with parsoid output. By default, script attempt to run' .
'for supported content model pages (in a specified batch if provided)'
);
$this->addOption(
'force',
'Re-parse pages even if the cached entry seems up to date',
false,
false
);
$this->addOption( 'start-from', 'Start from this page ID', false, true );
$this->addOption( 'namespace', 'Filter pages in this namespace', false, true );
$this->setBatchSize( 100 );
}
private function getPageLookup(): PageLookup {
$this->pageLookup = $this->getServiceContainer()->getPageStore();
return $this->pageLookup;
}
private function getRevisionLookup(): RevisionLookup {
$this->revisionLookup = $this->getServiceContainer()->getRevisionLookup();
return $this->revisionLookup;
}
private function getParserOutputAccess(): ParserOutputAccess {
$this->parserOutputAccess = $this->getServiceContainer()->getParserOutputAccess();
return $this->parserOutputAccess;
}
private function getParsoidSiteConfig(): ParsoidSiteConfig {
$this->parsoidSiteConfig = $this->getServiceContainer()->getParsoidSiteConfig();
return $this->parsoidSiteConfig;
}
private function getQueryBuilder(): SelectQueryBuilder {
$dbr = $this->getReplicaDB();
return $dbr->newSelectQueryBuilder()
->select( [ 'page_id' ] )
->from( 'page' )
->caller( __METHOD__ )
->orderBy( 'page_id', SelectQueryBuilder::SORT_ASC );
}
private function parse(
PageRecord $page,
RevisionRecord $revision
): Status {
$popts = ParserOptions::newFromAnon();
$popts->setUseParsoid();
try {
return $this->getParserOutputAccess()->getParserOutput(
$page,
$popts,
$revision,
$this->forceParse
);
} catch ( ClientError $e ) {
return Status::newFatal( 'parsoid-client-error', $e->getMessage() );
} catch ( ResourceLimitExceededException $e ) {
return Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
}
}
/*
* NamespaceInfo::getCanonicalIndex() requires the namespace to be in lowercase,
* so let's do some normalization and return its canonical index.
*
* @param string $namespace The namespace string from the command line
* @return int The canonical index of the namespace
*/
private function normalizeNamespace( string $namespace ): int {
return $this->getServiceContainer()->getNamespaceInfo()
->getCanonicalIndex( strtolower( $namespace ) );
}
/**
* Populate parser cache with parsoid output.
*
* @return bool
*/
public function execute() {
$force = $this->getOption( 'force' );
$startFrom = $this->getOption( 'start-from' );
// We need the namespace index instead of the name to perform the query
// on, because that's what the page table stores (in the page_namespace field).
$namespaceIndex = null;
$namespace = $this->getOption( 'namespace' );
if ( $namespace !== null ) {
$namespaceIndex = $this->normalizeNamespace( $namespace );
}
if ( $force !== null ) {
// If --force is supplied, for a parse for supported pages or supported
// pages in the specified batch.
$this->forceParse = ParserOutputAccess::OPT_FORCE_PARSE;
}
$startFrom = (int)$startFrom;
$this->output( "\nWarming parsoid parser cache with Parsoid output...\n\n" );
while ( true ) {
$query = $this->getQueryBuilder();
if ( $namespaceIndex !== null ) {
$query = $query->where( [ 'page_namespace' => $namespaceIndex ] );
}
$query = $query->where( $this->getReplicaDB()->expr( 'page_id', '>=', $startFrom ) )
->limit( $this->getBatchSize() );
$result = $query->fetchResultSet();
if ( !$result->numRows() ) {
break;
}
$currentBatch = $startFrom + ( $this->getBatchSize() - 1 );
$this->output( "\n\nBatch: $startFrom - $currentBatch\n----\n" );
// Look through pages by pageId and populate the parserCache
foreach ( $result as $row ) {
$page = $this->getPageLookup()->getPageById( $row->page_id );
$startFrom = ( (int)$row->page_id + 1 );
if ( $page === null ) {
$this->output( "\n[Skipped] Page ID: $row->page_id not found.\n" );
continue;
}
$latestRevision = $page->getLatest();
$revision = $this->getRevisionLookup()->getRevisionById( $latestRevision );
$mainSlot = $revision->getSlot( SlotRecord::MAIN );
// POA will write a dummy output to PC, but we don't want that here. Just skip!
if ( !$this->getParsoidSiteConfig()->supportsContentModel( $mainSlot->getModel() ) ) {
$this->output(
'[Skipped] Content model "' .
$mainSlot->getModel() .
"\" not supported for page ID: $row->page_id.\n"
);
continue;
}
$status = $this->parse( $page, $revision );
if ( !$status->isOK() ) {
$this->output(
__METHOD__ .
": Error parsing page ID: $row->page_id or writing to parser cache\n"
);
continue;
}
$this->output( "[Done] Page ID: $row->page_id ✔️\n" );
}
$this->waitForReplication();
}
$this->output( "\nDone pre-warming parsoid parser cache...\n" );
return true;
}
}
// @codeCoverageIgnoreStart
$maintClass = PrewarmParsoidParserCache::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd