includes/parser/Parsoid/Config/DataAccess.php
<?php
/**
* Copyright (C) 2011-2022 Wikimedia Foundation and others.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
namespace MediaWiki\Parser\Parsoid\Config;
use File;
use MediaTransformError;
use MediaWiki\Cache\LinkBatchFactory;
use MediaWiki\Category\TrackingCategories;
use MediaWiki\Config\ServiceOptions;
use MediaWiki\Content\ContentHandler;
use MediaWiki\Content\Transform\ContentTransformer;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Language\LanguageCode;
use MediaWiki\Linker\Linker;
use MediaWiki\MainConfigNames;
use MediaWiki\Page\File\BadFileLookup;
use MediaWiki\Parser\Parser;
use MediaWiki\Title\Title;
use ParserFactory;
use PPFrame;
use RepoGroup;
use Wikimedia\Assert\UnreachableException;
use Wikimedia\Parsoid\Config\DataAccess as IDataAccess;
use Wikimedia\Parsoid\Config\PageConfig as IPageConfig;
use Wikimedia\Parsoid\Config\PageContent as IPageContent;
use Wikimedia\Parsoid\Core\ContentMetadataCollector;
use Wikimedia\Parsoid\Core\LinkTarget as ParsoidLinkTarget;
use Wikimedia\Rdbms\ReadOnlyMode;
/**
* Implement Parsoid's abstract class for data access.
*
* @since 1.39
* @internal
*/
class DataAccess extends IDataAccess {
public const CONSTRUCTOR_OPTIONS = [
MainConfigNames::SVGMaxSize,
];
private RepoGroup $repoGroup;
private BadFileLookup $badFileLookup;
private HookContainer $hookContainer;
private HookRunner $hookRunner;
private ContentTransformer $contentTransformer;
private TrackingCategories $trackingCategories;
private ParserFactory $parserFactory;
/** Lazy-created via self::prepareParser() */
private ?Parser $parser = null;
private PPFrame $ppFrame;
private ?PageConfig $previousPageConfig = null;
private ServiceOptions $config;
private ReadOnlyMode $readOnlyMode;
private LinkBatchFactory $linkBatchFactory;
/**
* @param ServiceOptions $config MediaWiki main configuration object
* @param RepoGroup $repoGroup
* @param BadFileLookup $badFileLookup
* @param HookContainer $hookContainer
* @param ContentTransformer $contentTransformer
* @param TrackingCategories $trackingCategories
* @param ReadOnlyMode $readOnlyMode used to disable linting when the
* database is read-only.
* @param ParserFactory $parserFactory A legacy parser factory,
* for PST/preprocessing/extension handling
* @param LinkBatchFactory $linkBatchFactory
*/
public function __construct(
ServiceOptions $config,
RepoGroup $repoGroup,
BadFileLookup $badFileLookup,
HookContainer $hookContainer,
ContentTransformer $contentTransformer,
TrackingCategories $trackingCategories,
ReadOnlyMode $readOnlyMode,
ParserFactory $parserFactory,
LinkBatchFactory $linkBatchFactory
) {
$config->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );
$this->config = $config;
$this->repoGroup = $repoGroup;
$this->badFileLookup = $badFileLookup;
$this->hookContainer = $hookContainer;
$this->contentTransformer = $contentTransformer;
$this->trackingCategories = $trackingCategories;
$this->readOnlyMode = $readOnlyMode;
$this->linkBatchFactory = $linkBatchFactory;
$this->hookRunner = new HookRunner( $hookContainer );
$this->parserFactory = $parserFactory;
$this->previousPageConfig = null; // ensure we initialize parser options
}
/**
* @param IPageConfig $pageConfig
* @param File $file
* @param array $hp
* @return array
*/
private function makeTransformOptions( IPageConfig $pageConfig, $file, array $hp ): array {
// Validate the input parameters like Parser::makeImage()
$handler = $file->getHandler();
if ( !$handler ) {
return []; // will get iconThumb()
}
foreach ( $hp as $name => $value ) {
if ( !$handler->validateParam( $name, $value ) ) {
unset( $hp[$name] );
}
}
// This part is similar to Linker::makeImageLink(). If there is no width,
// set one based on the source file size.
$page = $hp['page'] ?? 0;
if ( !isset( $hp['width'] ) ) {
if ( isset( $hp['height'] ) && $file->isVectorized() ) {
// If it's a vector image, and user only specifies height
// we don't want it to be limited by its "normal" width.
$hp['width'] = $this->config->get( MainConfigNames::SVGMaxSize );
} else {
$hp['width'] = $file->getWidth( $page );
}
// We don't need to fill in a default thumbnail width here, since
// that is done by Parsoid. Parsoid always sets the width parameter
// for thumbnails.
}
// Parser::makeImage() always sets this
$hp['targetlang'] = LanguageCode::bcp47ToInternal(
$pageConfig->getPageLanguageBcp47()
);
return $hp;
}
/** @inheritDoc */
public function getPageInfo( $pageConfigOrTitle, array $titles ): array {
if ( $pageConfigOrTitle instanceof IPageConfig ) {
$context_title = Title::newFromLinkTarget(
$pageConfigOrTitle->getLinkTarget()
);
} elseif ( is_string( $pageConfigOrTitle ) ) {
// Temporary, deprecated.
$context_title = Title::newFromTextThrow( $pageConfigOrTitle );
} elseif ( $pageConfigOrTitle instanceof ParsoidLinkTarget ) {
$context_title = Title::newFromLinkTarget( $pageConfigOrTitle );
} else {
throw new UnreachableException( "Bad type for argument 1" );
}
$titleObjs = [];
$pagemap = [];
$classes = [];
$ret = [];
foreach ( $titles as $name ) {
$t = Title::newFromText( $name );
// Filter out invalid titles. Title::newFromText in core (not our bespoke
// version in src/Utils/Title.php) can return null for invalid titles.
if ( !$t ) {
// FIXME: This is a bandaid to patch up the fact that Env::makeTitle treats
// this as a valid title, but Title::newFromText treats it as invalid.
// T237535
// This matches what ApiQuery::outputGeneralPageInfo() would
// return for an invalid title.
$ret[$name] = [
'pageId' => -1,
'revId' => -1,
'invalid' => true,
'invalidreason' => 'The requested page title is invalid',
];
} else {
$titleObjs[$name] = $t;
}
}
$linkBatch = $this->linkBatchFactory->newLinkBatch( $titleObjs );
$linkBatch->setCaller( __METHOD__ );
$linkBatch->execute();
foreach ( $titleObjs as $obj ) {
$pdbk = $obj->getPrefixedDBkey();
$pagemap[$obj->getArticleID()] = $pdbk;
$classes[$pdbk] = $obj->isRedirect() ? 'mw-redirect' : '';
}
$this->hookRunner->onGetLinkColours(
# $classes is passed by reference and mutated
$pagemap, $classes, $context_title
);
foreach ( $titleObjs as $name => $obj ) {
/** @var Title $obj */
$pdbk = $obj->getPrefixedDBkey();
$c = preg_split(
'/\s+/', $classes[$pdbk] ?? '', -1, PREG_SPLIT_NO_EMPTY
);
$ret[$name] = [
'pageId' => $obj->getArticleID(),
'revId' => $obj->getLatestRevID(),
'missing' => !$obj->exists(),
'known' => $obj->isKnown(),
'redirect' => $obj->isRedirect(),
'linkclasses' => $c, # See ApiQueryInfo::getLinkClasses() in core
];
}
return $ret;
}
/** @inheritDoc */
public function getFileInfo( IPageConfig $pageConfig, array $files ): array {
$page = Title::newFromLinkTarget( $pageConfig->getLinkTarget() );
$keys = [];
foreach ( $files as $f ) {
$keys[] = $f[0];
}
$fileObjs = $this->repoGroup->findFiles( $keys );
$ret = [];
foreach ( $files as $f ) {
$filename = $f[0];
$dims = $f[1];
/** @var File $file */
$file = $fileObjs[$filename] ?? null;
if ( !$file ) {
$ret[] = null;
continue;
}
// See Linker::makeImageLink; 'page' is a key in $handlerParams
// core uses 'false' as the default then casts to (int) => 0
$pageNum = $dims['page'] ?? 0;
$result = [
'width' => $file->getWidth( $pageNum ),
'height' => $file->getHeight( $pageNum ),
'size' => $file->getSize(),
'mediatype' => $file->getMediaType(),
'mime' => $file->getMimeType(),
'url' => $file->getFullUrl(),
'mustRender' => $file->mustRender(),
'badFile' => $this->badFileLookup->isBadFile( $filename, $page ),
'timestamp' => $file->getTimestamp(),
'sha1' => $file->getSha1(),
];
$length = $file->getLength();
if ( $length ) {
$result['duration'] = (float)$length;
}
if ( isset( $dims['seek'] ) ) {
$dims['thumbtime'] = $dims['seek'];
}
$txopts = $this->makeTransformOptions( $pageConfig, $file, $dims );
$mto = $file->transform( $txopts );
if ( $mto ) {
if ( $mto->isError() && $mto instanceof MediaTransformError ) {
$result['thumberror'] = $mto->toText();
} else {
if ( $txopts ) {
// Do srcset scaling
Linker::processResponsiveImages( $file, $mto, $txopts );
if ( count( $mto->responsiveUrls ) ) {
$result['responsiveUrls'] = [];
foreach ( $mto->responsiveUrls as $density => $url ) {
$result['responsiveUrls'][$density] = $url;
}
}
}
// Proposed MediaTransformOutput serialization method for T51896 etc.
// Note that getAPIData(['fullurl']) would return
// UrlUtils::expand(), which wouldn't respect the wiki's
// protocol preferences -- instead it would use the
// protocol used for the API request.
if ( is_callable( [ $mto, 'getAPIData' ] ) ) {
$result['thumbdata'] = $mto->getAPIData( [ 'withhash' ] );
}
$result['thumburl'] = $mto->getUrl();
$result['thumbwidth'] = $mto->getWidth();
$result['thumbheight'] = $mto->getHeight();
}
} else {
$result['thumberror'] = "Presumably, invalid parameters, despite validation.";
}
$ret[] = $result;
}
return $ret;
}
/**
* Prepare MediaWiki's parser for preprocessing or extension tag parsing,
* clearing its state if necessary.
*
* @param IPageConfig $pageConfig
* @param int $outputType
* @return Parser
*/
private function prepareParser( IPageConfig $pageConfig, int $outputType ) {
'@phan-var PageConfig $pageConfig'; // @var PageConfig $pageConfig
// Clear the state only when the PageConfig changes, so that Parser's internal caches can
// be retained. This should also provide better compatibility with extension tags.
$clearState = $this->previousPageConfig !== $pageConfig;
$this->previousPageConfig = $pageConfig;
// Use the same legacy parser object for all calls to extension tag
// processing, for greater compatibility.
$this->parser ??= $this->parserFactory->create();
$this->parser->startExternalParse(
Title::newFromLinkTarget( $pageConfig->getLinkTarget() ),
$pageConfig->getParserOptions(),
$outputType, $clearState, $pageConfig->getRevisionId() );
$this->parser->resetOutput();
// Retain a PPFrame object between preprocess requests since it contains
// some useful caches.
if ( $clearState ) {
$this->ppFrame = $this->parser->getPreprocessor()->newFrame();
}
return $this->parser;
}
/** @inheritDoc */
public function doPst( IPageConfig $pageConfig, string $wikitext ): string {
'@phan-var PageConfig $pageConfig'; // @var PageConfig $pageConfig
// This could use prepareParser(), but it's only called once per page,
// so it's not essential.
$titleObj = Title::newFromLinkTarget( $pageConfig->getLinkTarget() );
$user = $pageConfig->getParserOptions()->getUserIdentity();
$content = ContentHandler::makeContent( $wikitext, $titleObj, CONTENT_MODEL_WIKITEXT );
return $this->contentTransformer->preSaveTransform(
$content,
$titleObj,
$user,
$pageConfig->getParserOptions()
)->serialize();
}
/** @inheritDoc */
public function parseWikitext(
IPageConfig $pageConfig,
ContentMetadataCollector $metadata,
string $wikitext
): string {
$parser = $this->prepareParser( $pageConfig, Parser::OT_HTML );
$html = $parser->parseExtensionTagAsTopLevelDoc( $wikitext );
// XXX: Ideally we will eventually have the legacy parser use our
// ContentMetadataCollector instead of having a new ParserOutput
// created (implicitly in ::prepareParser()/Parser::resetOutput() )
// which we then have to manually merge.
$out = $parser->getOutput();
$out->setRawText( $html );
$out->collectMetadata( $metadata ); # merges $out into $metadata
return Parser::extractBody( $out->getRawText() );
}
/** @inheritDoc */
public function preprocessWikitext(
IPageConfig $pageConfig,
ContentMetadataCollector $metadata,
string $wikitext
): string {
$parser = $this->prepareParser( $pageConfig, Parser::OT_PREPROCESS );
$this->hookRunner->onParserBeforePreprocess(
# $wikitext is passed by reference and mutated
$parser, $wikitext, $parser->getStripState()
);
$wikitext = $parser->replaceVariables( $wikitext, $this->ppFrame );
// FIXME (T289545): StripState markers protect content that need to be protected from further
// "wikitext processing". So, where the result has strip state markers, we actually
// need to tunnel this content through rather than unwrap and let it go through the
// rest of the parsoid pipeline. For example, some parser functions might return HTML
// not wikitext, and where the content might contain wikitext characters, we are now
// going to potentially mangle that output.
$wikitext = $parser->getStripState()->unstripBoth( $wikitext );
// XXX: Ideally we will eventually have the legacy parser use our
// ContentMetadataCollector instead of having a new ParserOutput
// created (implicitly in ::prepareParser()/Parser::resetOutput() )
// which we then have to manually merge.
$out = $parser->getOutput();
$out->collectMetadata( $metadata ); # merges $out into $metadata
return $wikitext;
}
/** @inheritDoc */
public function fetchTemplateSource(
IPageConfig $pageConfig, $title
): ?IPageContent {
'@phan-var PageConfig $pageConfig'; // @var PageConfig $pageConfig
if ( is_string( $title ) ) {
$titleObj = Title::newFromTextThrow( $title );
} else {
$titleObj = Title::newFromLinkTarget( $title );
}
// Use the PageConfig to take advantage of custom template
// fetch hooks like FlaggedRevisions, etc.
$revRecord = $pageConfig->fetchRevisionRecordOfTemplate( $titleObj );
return $revRecord ? new PageContent( $revRecord ) : null;
}
/** @inheritDoc */
public function fetchTemplateData( IPageConfig $pageConfig, $title ): ?array {
$ret = [];
if ( !is_string( $title ) ) {
$titleObj = Title::newFromLinkTarget( $title );
$title = $titleObj->getPrefixedText();
}
// @todo: This hook needs some clean up: T304899
$this->hookRunner->onParserFetchTemplateData(
[ $title ],
$ret # value returned by reference
);
// Cast value to array since the hook returns this as a stdclass
$tplData = $ret[$title] ?? null;
if ( $tplData ) {
// Deep convert to associative array
$tplData = json_decode( json_encode( $tplData ), true );
}
return $tplData;
}
/**
* Add a tracking category with the given key to the metadata for the page.
* @param IPageConfig $pageConfig the page on which the tracking category
* is to be added
* @param ContentMetadataCollector $metadata The metadata for the page
* @param string $key Message key (not localized)
*/
public function addTrackingCategory(
IPageConfig $pageConfig,
ContentMetadataCollector $metadata,
string $key
): void {
$page = Title::newFromLinkTarget( $pageConfig->getLinkTarget() );
$this->trackingCategories->addTrackingCategory(
$metadata, $key, $page
);
}
/** @inheritDoc */
public function logLinterData( IPageConfig $pageConfig, array $lints ): void {
if ( $this->readOnlyMode->isReadOnly() ) {
return;
}
$revId = $pageConfig->getRevisionId();
$title = Title::newFromLinkTarget(
$pageConfig->getLinkTarget()
)->getPrefixedText();
$pageInfo = $this->getPageInfo( $pageConfig, [ $title ] );
$latest = $pageInfo[$title]['revId'];
// Only send the request if it the latest revision
if ( $revId !== null && $revId === $latest ) {
$this->hookRunner->onParserLogLinterData(
$title, $revId, $lints
);
}
}
}