wikimedia/mediawiki-core

View on GitHub
maintenance/prewarmParsoidParserCache.php

Summary

Maintainability
B
5 hrs
Test Coverage
<?php
use MediaWiki\Page\PageLookup;
use MediaWiki\Page\PageRecord;
use MediaWiki\Page\ParserOutputAccess;
use MediaWiki\Parser\Parsoid\Config\SiteConfig as ParsoidSiteConfig;
use MediaWiki\Revision\RevisionLookup;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Status\Status;
use Wikimedia\Parsoid\Core\ClientError;
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
use Wikimedia\Rdbms\SelectQueryBuilder;

// @codeCoverageIgnoreStart
require_once __DIR__ . '/Maintenance.php';
// @codeCoverageIgnoreEnd

/**
 * Maintenance script for populating parser cache with parsoid output.
 *
 * @since 1.41
 *
 * @license GPL-2.0-or-later
 * @author Richika Rana
 */
class PrewarmParsoidParserCache extends Maintenance {
    private int $forceParse = 0;
    private ParserOutputAccess $parserOutputAccess;
    private PageLookup $pageLookup;
    private RevisionLookup $revisionLookup;
    private ParsoidSiteConfig $parsoidSiteConfig;

    public function __construct() {
        parent::__construct();

        $this->addDescription(
            'Populate parser cache with parsoid output. By default, script attempt to run' .
            'for supported content model pages (in a specified batch if provided)'
        );
        $this->addOption(
            'force',
            'Re-parse pages even if the cached entry seems up to date',
            false,
            false
        );
        $this->addOption( 'start-from', 'Start from this page ID', false, true );
        $this->addOption( 'namespace', 'Filter pages in this namespace', false, true );
        $this->setBatchSize( 100 );
    }

    private function getPageLookup(): PageLookup {
        $this->pageLookup = $this->getServiceContainer()->getPageStore();
        return $this->pageLookup;
    }

    private function getRevisionLookup(): RevisionLookup {
        $this->revisionLookup = $this->getServiceContainer()->getRevisionLookup();
        return $this->revisionLookup;
    }

    private function getParserOutputAccess(): ParserOutputAccess {
        $this->parserOutputAccess = $this->getServiceContainer()->getParserOutputAccess();
        return $this->parserOutputAccess;
    }

    private function getParsoidSiteConfig(): ParsoidSiteConfig {
        $this->parsoidSiteConfig = $this->getServiceContainer()->getParsoidSiteConfig();
        return $this->parsoidSiteConfig;
    }

    private function getQueryBuilder(): SelectQueryBuilder {
        $dbr = $this->getReplicaDB();

        return $dbr->newSelectQueryBuilder()
            ->select( [ 'page_id' ] )
            ->from( 'page' )
            ->caller( __METHOD__ )
            ->orderBy( 'page_id', SelectQueryBuilder::SORT_ASC );
    }

    private function parse(
        PageRecord $page,
        RevisionRecord $revision
    ): Status {
        $popts = ParserOptions::newFromAnon();
        $popts->setUseParsoid();
        try {
            return $this->getParserOutputAccess()->getParserOutput(
                $page,
                $popts,
                $revision,
                $this->forceParse
            );
        } catch ( ClientError $e ) {
            return Status::newFatal( 'parsoid-client-error', $e->getMessage() );
        } catch ( ResourceLimitExceededException $e ) {
            return Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
        }
    }

    /*
     * NamespaceInfo::getCanonicalIndex() requires the namespace to be in lowercase,
     * so let's do some normalization and return its canonical index.
     *
     * @param string $namespace The namespace string from the command line
     * @return int The canonical index of the namespace
     */
    private function normalizeNamespace( string $namespace ): int {
        return $this->getServiceContainer()->getNamespaceInfo()
            ->getCanonicalIndex( strtolower( $namespace ) );
    }

    /**
     * Populate parser cache with parsoid output.
     *
     * @return bool
     */
    public function execute() {
        $force = $this->getOption( 'force' );
        $startFrom = $this->getOption( 'start-from' );

        // We need the namespace index instead of the name to perform the query
        // on, because that's what the page table stores (in the page_namespace field).
        $namespaceIndex = null;
        $namespace = $this->getOption( 'namespace' );
        if ( $namespace !== null ) {
            $namespaceIndex = $this->normalizeNamespace( $namespace );
        }

        if ( $force !== null ) {
            // If --force is supplied, for a parse for supported pages or supported
            // pages in the specified batch.
            $this->forceParse = ParserOutputAccess::OPT_FORCE_PARSE;
        }

        $startFrom = (int)$startFrom;

        $this->output( "\nWarming parsoid parser cache with Parsoid output...\n\n" );
        while ( true ) {
            $query = $this->getQueryBuilder();
            if ( $namespaceIndex !== null ) {
                $query = $query->where( [ 'page_namespace' => $namespaceIndex ] );
            }
            $query = $query->where( $this->getReplicaDB()->expr( 'page_id', '>=', $startFrom ) )
                ->limit( $this->getBatchSize() );

            $result = $query->fetchResultSet();

            if ( !$result->numRows() ) {
                break;
            }

            $currentBatch = $startFrom + ( $this->getBatchSize() - 1 );
            $this->output( "\n\nBatch: $startFrom - $currentBatch\n----\n" );

            // Look through pages by pageId and populate the parserCache
            foreach ( $result as $row ) {
                $page = $this->getPageLookup()->getPageById( $row->page_id );
                $startFrom = ( (int)$row->page_id + 1 );

                if ( $page === null ) {
                    $this->output( "\n[Skipped] Page ID: $row->page_id not found.\n" );
                    continue;
                }

                $latestRevision = $page->getLatest();
                $revision = $this->getRevisionLookup()->getRevisionById( $latestRevision );
                $mainSlot = $revision->getSlot( SlotRecord::MAIN );

                // POA will write a dummy output to PC, but we don't want that here. Just skip!
                if ( !$this->getParsoidSiteConfig()->supportsContentModel( $mainSlot->getModel() ) ) {
                    $this->output(
                        '[Skipped] Content model "' .
                        $mainSlot->getModel() .
                        "\" not supported for page ID: $row->page_id.\n"
                    );
                    continue;
                }

                $status = $this->parse( $page, $revision );
                if ( !$status->isOK() ) {
                    $this->output(
                        __METHOD__ .
                        ": Error parsing page ID: $row->page_id or writing to parser cache\n"
                    );
                    continue;
                }

                $this->output( "[Done] Page ID: $row->page_id ✔️\n" );
            }
            $this->waitForReplication();
        }

        $this->output( "\nDone pre-warming parsoid parser cache...\n" );

        return true;
    }
}

// @codeCoverageIgnoreStart
$maintClass = PrewarmParsoidParserCache::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd