includes/Rest/Handler/ParsoidHandler.php
<?php
/**
* Copyright (C) 2011-2020 Wikimedia Foundation and others.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
namespace MediaWiki\Rest\Handler;
use Composer\Semver\Semver;
use ExtensionRegistry;
use InvalidArgumentException;
use LanguageCode;
use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
use LogicException;
use MediaWiki\Context\RequestContext;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\ProperPageIdentity;
use MediaWiki\Rest\Handler;
use MediaWiki\Rest\Handler\Helper\HtmlInputTransformHelper;
use MediaWiki\Rest\Handler\Helper\HtmlOutputRendererHelper;
use MediaWiki\Rest\Handler\Helper\ParsoidFormatHelper;
use MediaWiki\Rest\HttpException;
use MediaWiki\Rest\LocalizedHttpException;
use MediaWiki\Rest\Response;
use MediaWiki\Revision\MutableRevisionRecord;
use MediaWiki\Revision\RevisionAccessException;
use MediaWiki\Revision\RevisionLookup;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Revision\SuppressedDataException;
use MediaWiki\Title\MalformedTitleException;
use MediaWiki\Title\Title;
use MediaWiki\WikiMap\WikiMap;
use MobileContext;
use Wikimedia\Http\HttpAcceptParser;
use Wikimedia\Message\DataMessageValue;
use Wikimedia\Message\MessageValue;
use Wikimedia\Parsoid\Config\DataAccess;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Config\PageConfigFactory;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\ClientError;
use Wikimedia\Parsoid\Core\PageBundle;
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
use Wikimedia\Parsoid\DOM\Document;
use Wikimedia\Parsoid\Parsoid;
use Wikimedia\Parsoid\Utils\ContentUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\Timing;
use WikitextContent;
/**
* Base class for Parsoid handlers.
* @internal For use by the Parsoid extension
*/
abstract class ParsoidHandler extends Handler {
// TODO logging, timeouts(?), CORS
// TODO content negotiation (routes.js routes.acceptable)
// TODO handle MaxConcurrentCallsError (pool counter?)
/** @var SiteConfig */
protected $siteConfig;
/** @var PageConfigFactory */
protected $pageConfigFactory;
/** @var DataAccess */
protected $dataAccess;
/** @var ExtensionRegistry */
protected $extensionRegistry;
/** @var ?StatsdDataFactoryInterface A statistics aggregator */
protected $metrics;
/** @var array */
private $requestAttributes;
private RevisionLookup $revisionLookup;
/**
 * Build a handler instance wired with the revision lookup and the Parsoid
 * services taken from the global service container.
 *
 * @return static
 */
public static function factory(): ParsoidHandler {
	$container = MediaWikiServices::getInstance();
	$revisionLookup = $container->getRevisionLookup();
	$siteConfig = $container->getParsoidSiteConfig();
	$pageConfigFactory = $container->getParsoidPageConfigFactory();
	$dataAccess = $container->getParsoidDataAccess();
	// @phan-suppress-next-line PhanTypeInstantiateAbstractStatic
	return new static( $revisionLookup, $siteConfig, $pageConfigFactory, $dataAccess );
}
/**
 * Store the injected services. The extension registry singleton and the
 * metrics collector exposed by the site config are captured as well.
 *
 * @param RevisionLookup $revisionLookup
 * @param SiteConfig $siteConfig
 * @param PageConfigFactory $pageConfigFactory
 * @param DataAccess $dataAccess
 */
public function __construct(
	RevisionLookup $revisionLookup,
	SiteConfig $siteConfig,
	PageConfigFactory $pageConfigFactory,
	DataAccess $dataAccess
) {
	// Injected collaborators
	$this->dataAccess = $dataAccess;
	$this->pageConfigFactory = $pageConfigFactory;
	$this->siteConfig = $siteConfig;
	$this->revisionLookup = $revisionLookup;

	// Derived / globally obtained collaborators
	$this->extensionRegistry = ExtensionRegistry::getInstance();
	$this->metrics = $siteConfig->metrics();
}
/**
 * Check the {domain} path parameter against the host of $wgServer, and,
 * when MobileFrontend is loaded, against the host of the mobile URL too.
 * Returns silently on a match and throws a 400 otherwise.
 *
 * @todo Remove this when we no longer need to support the {domain}
 *       parameter with backwards compatibility with the parsoid
 *       extension.
 * @param string $domain Domain name parameter to validate
 */
protected function assertDomainIsCorrect( $domain ): void {
	// We are cutting some corners here (IDN, non-ASCII casing)
	// since domain name support is provisional.
	// TODO use a proper validator instead
	$server = RequestContext::getMain()->getConfig()->get( MainConfigNames::Server );
	$expectedDomain = parse_url( $server, PHP_URL_HOST );
	if ( !$expectedDomain ) {
		throw new LogicException( 'Cannot parse $wgServer' );
	}

	$isKnownDomain = strcasecmp( $expectedDomain, $domain ) === 0;

	// TODO: This should really go away! It's only acceptable because
	// this entire method is going to be removed once we no longer
	// need the parsoid extension endpoints with the {domain} parameter.
	if ( !$isKnownDomain && $this->extensionRegistry->isLoaded( 'MobileFrontend' ) ) {
		// @phan-suppress-next-line PhanUndeclaredClassMethod
		$mobileServer = MobileContext::singleton()->getMobileUrl( $server );
		$mobileHost = parse_url( $mobileServer, PHP_URL_HOST );
		$isKnownDomain = $mobileHost && strcasecmp( $mobileHost, $domain ) === 0;
	}

	if ( $isKnownDomain ) {
		return;
	}

	$failure = new DataMessageValue(
		'mwparsoid-invalid-domain',
		[],
		'invalid-domain',
		[ 'expected' => $expectedDomain, 'actual' => $domain, ]
	);
	$errorData = [
		'error' => 'parameter-validation-failed',
		'name' => 'domain',
		'value' => $domain,
		'failureCode' => $failure->getCode(),
		'failureData' => $failure->getData(),
	];
	throw new LocalizedHttpException( $failure, 400, $errorData );
}
/**
 * Get the parsed request body, selecting the parser by Content-Type.
 *
 * Form encoded and multipart bodies are taken from the request's POST
 * parameters; JSON bodies are decoded into an associative array.
 *
 * @return array
 * @throws LocalizedHttpException 400 if a JSON body does not decode to an
 *   array, 415 if the content type is not supported.
 */
protected function getParsedBody(): array {
	$request = $this->getRequest();
	// Drop any media type parameters (e.g. "; charset=utf-8").
	[ $rawType ] = explode( ';', $request->getHeader( 'Content-Type' )[0] ?? '', 2 );
	// Optional whitespace may precede the ';' and media types are
	// case-insensitive, so normalize before comparing.
	$rawType = trim( $rawType );
	switch ( strtolower( $rawType ) ) {
		case 'application/x-www-form-urlencoded':
		case 'multipart/form-data':
			return $request->getPostParams();
		case 'application/json':
			$json = json_decode( $request->getBody()->getContents(), true );
			if ( !is_array( $json ) ) {
				throw new LocalizedHttpException(
					new MessageValue( "rest-json-body-parse-error", [ 'not a valid JSON object' ] ), 400 );
			}
			return $json;
		default:
			// explode() never yields null, so test for the empty string to
			// report a missing Content-Type header.
			throw new LocalizedHttpException(
				new MessageValue(
					"rest-unsupported-content-type",
					[ $rawType !== '' ? $rawType : '(null)' ]
				),
				415
			);
	}
}
/**
 * Rough equivalent of req.local from Parsoid-JS.
 * FIXME most of these should be replaced with more native ways of handling the request.
 *
 * Collects path parameters, query parameters, headers and (for POST
 * requests) the parsed body into a single attribute array. The result is
 * cached on the handler and returned by reference, so that callers such as
 * acceptable() can update it in place.
 *
 * @return array
 * @throws HttpException If the body cannot be parsed or the {domain} path
 *   parameter does not match this wiki.
 */
protected function &getRequestAttributes(): array {
	if ( $this->requestAttributes ) {
		return $this->requestAttributes;
	}

	$request = $this->getRequest();
	$body = ( $request->getMethod() === 'POST' ) ? $this->getParsedBody() : [];
	// Path parameters take precedence over same-named body fields.
	$opts = array_merge( $body, array_intersect_key( $request->getPathParams(),
		[ 'from' => true, 'format' => true ] ) );
	'@phan-var array<string,array|bool|string> $opts'; // @var array<string,array|bool|string> $opts

	// 'format' may be absent (no such path parameter and no body field) or,
	// when taken from a JSON body, not a string. Read it defensively so it
	// can be used as an array key without warnings or type errors.
	$format = $opts['format'] ?? null;
	$formatKey = is_string( $format ) ? $format : '';

	$contentLanguage = $request->getHeaderLine( 'Content-Language' ) ?: null;
	if ( $contentLanguage ) {
		$contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
			$contentLanguage
		);
	}

	$attribs = [
		'pageName' => $request->getPathParam( 'title' ) ?? '',
		'oldid' => $request->getPathParam( 'revision' ),
		// "body_only" flag to return just the body (instead of the entire HTML doc)
		// We would like to deprecate use of this flag: T181657
		'body_only' => $request->getQueryParams()['body_only'] ?? $body['body_only'] ?? null,
		'errorEnc' => ParsoidFormatHelper::ERROR_ENCODING[$formatKey] ?? 'plain',
		'iwp' => WikiMap::getCurrentWikiId(), // PORT-FIXME verify
		'offsetType' => $body['offsetType']
			?? $request->getQueryParams()['offsetType']
			// Lint requests should return UCS2 offsets by default
			?? ( $format === ParsoidFormatHelper::FORMAT_LINT ? 'ucs2' : 'byte' ),
		'pagelanguage' => $contentLanguage,
	];

	// For use in getHtmlOutputRendererHelper
	$opts['stash'] = $request->getQueryParams()['stash'] ?? false;

	// On POST, the "original" section of the body overrides the path parameters.
	if ( $request->getMethod() === 'POST' ) {
		if ( isset( $opts['original']['revid'] ) ) {
			$attribs['oldid'] = $opts['original']['revid'];
		}
		if ( isset( $opts['original']['title'] ) ) {
			$attribs['pageName'] = $opts['original']['title'];
		}
	}

	// Normalize oldid: empty string means "no revision", anything else is cast to int.
	if ( $attribs['oldid'] !== null ) {
		if ( $attribs['oldid'] === '' ) {
			$attribs['oldid'] = null;
		} else {
			$attribs['oldid'] = (int)$attribs['oldid'];
		}
	}

	// For use in getHtmlOutputRendererHelper
	$opts['accept-language'] = $request->getHeaderLine( 'Accept-Language' ) ?: null;

	$acceptLanguage = null;
	if ( $opts['accept-language'] !== null ) {
		$acceptLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
			$opts['accept-language']
		);
	}

	// Init pageName if oldid is provided and is a valid revision
	if ( ( $attribs['pageName'] === '' ) && $attribs['oldid'] ) {
		$rev = $this->revisionLookup->getRevisionById( $attribs['oldid'] );
		if ( $rev ) {
			$attribs['pageName'] = $rev->getPage()->getDBkey();
		}
	}

	$attribs['envOptions'] = [
		// We use `prefix` but ought to use `domain` (T206764)
		'prefix' => $attribs['iwp'],
		// For the legacy "domain" path parameter used by the endpoints exposed
		// by the parsoid extension. Will be null for core endpoints.
		'domain' => $request->getPathParam( 'domain' ),
		'pageName' => $attribs['pageName'],
		'cookie' => $request->getHeaderLine( 'Cookie' ),
		'reqId' => $request->getHeaderLine( 'X-Request-Id' ),
		'userAgent' => $request->getHeaderLine( 'User-Agent' ),
		'htmlVariantLanguage' => $acceptLanguage,
		// Semver::satisfies checks below expect a valid outputContentVersion value.
		// Better to set it here instead of adding the default value at every check.
		'outputContentVersion' => Parsoid::defaultHTMLVersion(),
	];

	# Convert language codes in $opts['updates']['variant'] if present
	$sourceVariant = $opts['updates']['variant']['source'] ?? null;
	if ( $sourceVariant ) {
		$sourceVariant = LanguageCode::normalizeNonstandardCodeAndWarn(
			$sourceVariant
		);
		$opts['updates']['variant']['source'] = $sourceVariant;
	}
	$targetVariant = $opts['updates']['variant']['target'] ?? null;
	if ( $targetVariant ) {
		$targetVariant = LanguageCode::normalizeNonstandardCodeAndWarn(
			$targetVariant
		);
		$opts['updates']['variant']['target'] = $targetVariant;
	}

	// Normalize content-language codes embedded in the body's wikitext headers.
	if ( isset( $opts['wikitext']['headers']['content-language'] ) ) {
		$opts['wikitext']['headers']['content-language'] =
			LanguageCode::normalizeNonstandardCodeAndWarn(
				$opts['wikitext']['headers']['content-language']
			);
	}
	if ( isset( $opts['original']['wikitext']['headers']['content-language'] ) ) {
		$opts['original']['wikitext']['headers']['content-language'] =
			LanguageCode::normalizeNonstandardCodeAndWarn(
				$opts['original']['wikitext']['headers']['content-language']
			);
	}

	$attribs['opts'] = $opts;

	// TODO: Remove assertDomainIsCorrect() once we no longer need to support the {domain}
	//       parameter for the endpoints exposed by the parsoid extension.
	if ( $attribs['envOptions']['domain'] !== null ) {
		$this->assertDomainIsCorrect( $attribs['envOptions']['domain'] );
	}

	$this->requestAttributes = $attribs;
	return $this->requestAttributes;
}
/**
 * Create an HtmlOutputRendererHelper for the given page and revision and
 * configure it from the request attributes (content source, stashing,
 * output version, page language and variant language).
 *
 * @param array $attribs
 * @param ?string $source
 * @param PageIdentity $page
 * @param ?int $revId
 *
 * @return HtmlOutputRendererHelper
 */
private function getHtmlOutputRendererHelper(
	array $attribs,
	?string $source,
	PageIdentity $page,
	?int $revId
): HtmlOutputRendererHelper {
	$helperFactory = MediaWikiServices::getInstance()->getPageRestHelperFactory();

	// Request lenient rev handling
	$helper = $helperFactory->newHtmlOutputRendererHelper( true );

	$performer = $this->getAuthority();
	$noParams = [];
	$helper->init( $page, $noParams, $performer, $revId );

	if ( $source !== null ) {
		// XXX: should default to the page's content model?
		$model = $attribs['opts']['contentmodel']
			?? $attribs['envOptions']['contentmodel']
			?? CONTENT_MODEL_WIKITEXT;
		$helper->setContentSource( $source, $model );
	}

	if ( isset( $attribs['opts']['stash'] ) ) {
		$helper->setStashingEnabled( $attribs['opts']['stash'] );
	}

	if ( isset( $attribs['envOptions']['outputContentVersion'] ) ) {
		$helper->setOutputProfileVersion( $attribs['envOptions']['outputContentVersion'] );
	}

	if ( isset( $attribs['pagelanguage'] ) ) {
		$helper->setPageLanguage( $attribs['pagelanguage'] );
	}

	if ( isset( $attribs['opts']['accept-language'] ) ) {
		$helper->setVariantConversionLanguage( $attribs['opts']['accept-language'] );
	}

	return $helper;
}
/**
 * Create an HtmlInputTransformHelper for the given HTML and page.
 *
 * The helper's body is the request options with the HTML added under
 * 'html'; its parameters are the options merged over the other attributes.
 *
 * @param array $attribs
 * @param string $html
 * @param PageIdentity $page
 *
 * @return HtmlInputTransformHelper
 */
protected function getHtmlInputTransformHelper(
	array $attribs,
	string $html,
	PageIdentity $page
): HtmlInputTransformHelper {
	$helperFactory = MediaWikiServices::getInstance()->getPageRestHelperFactory();
	$helper = $helperFactory->newHtmlInputTransformHelper( $attribs['envOptions'] );

	$stats = $this->siteConfig->metrics();
	if ( $stats ) {
		$helper->setMetrics( $stats );
	}

	$requestOpts = $attribs['opts'];

	$helperBody = $requestOpts;
	$helperBody['html'] = $html;

	// Option keys win over same-named top-level attributes.
	$helperParams = $requestOpts + $attribs;

	$helper->init( $page, $helperBody, $helperParams );

	return $helper;
}
/**
 * Pattern for the profile URL of a content-type spec. Capture group 1 is
 * the format name (HTML or pagebundle), group 2 the semantic version.
 * The dots of the host name are escaped so they only match a literal '.'.
 *
 * FIXME: Combine with ParsoidFormatHelper::parseContentTypeHeader
 */
private const NEW_SPEC =
	'#^https://www\.mediawiki\.org/wiki/Specs/(HTML|pagebundle)/(\d+\.\d+\.\d+)$#D';
/**
 * Decide whether the requested output format can be served given the
 * request's Accept header.
 *
 * Side effect: when an accepted media type carries a spec profile whose
 * version can be resolved, $attribs['envOptions']['outputContentVersion']
 * is set to the resolved version.
 *
 * @param array &$attribs Request attributes from getRequestAttributes()
 * @return bool
 */
protected function acceptable( array &$attribs ): bool {
	$format = $attribs['opts']['format'];

	// Wikitext output is never negotiated.
	if ( $format === ParsoidFormatHelper::FORMAT_WIKITEXT ) {
		return true;
	}

	$acceptHeader = $this->getRequest()->getHeader( 'Accept' );
	if ( !$acceptHeader ) {
		return true;
	}

	// FIXME: Multiple headers valid?
	$acceptableTypes = ( new HttpAcceptParser() )->parseAccept( $acceptHeader[0] );
	if ( !$acceptableTypes ) {
		return true;
	}

	$wantsHtml = ( $format === ParsoidFormatHelper::FORMAT_HTML );
	$wantsPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );

	// `acceptableTypes` is already sorted by quality.
	foreach ( $acceptableTypes as $entry ) {
		$mediaType = "{$entry['type']}/{$entry['subtype']}";

		$isExactType = ( $wantsHtml && $mediaType === 'text/html' ) ||
			( $wantsPageBundle && $mediaType === 'application/json' );

		if ( !$isExactType ) {
			// Wildcards are accepted without looking at any profile.
			if ( $mediaType === '*/*' || ( $wantsHtml && $mediaType === 'text/*' ) ) {
				return true;
			}
			continue;
		}

		$profile = $entry['params']['profile'] ?? null;
		if ( !$profile ) {
			return true;
		}

		preg_match( self::NEW_SPEC, $profile, $matches );
		if ( !$matches || strtolower( $matches[1] ) !== $format ) {
			continue;
		}

		$contentVersion = Parsoid::resolveContentVersion( $matches[2] );
		if ( $contentVersion ) {
			// $attribs mutated here!
			$attribs['envOptions']['outputContentVersion'] = $contentVersion;
			return true;
		}
	}

	return false;
}
/**
 * Create a PageConfig for the page and revision named in the request
 * attributes, translating revision access failures into HTTP errors
 * (403 for suppressed data, 404 for other inaccessible revisions).
 *
 * @param array $attribs
 * @param ?string $wikitextOverride
 *        Custom wikitext to use instead of the real content of the page.
 * @param bool $html2WtMode
 * @return PageConfig
 * @throws HttpException
 */
protected function tryToCreatePageConfig(
	array $attribs, ?string $wikitextOverride = null, bool $html2WtMode = false
): PageConfig {
	$revId = $attribs['oldid'];
	$pagelanguageOverride = $attribs['pagelanguage'];
	$pageName = $attribs['pageName'];

	$title = ( $pageName === '' ) ? Title::newMainPage() : Title::newFromText( $pageName );
	if ( !$title ) {
		// TODO use proper validation
		throw new LogicException( 'Title not found!' );
	}

	$user = RequestContext::getMain()->getUser();

	// By default the factory is given the revision ID (possibly null).
	$revisionOrId = $revId;
	if ( $wikitextOverride !== null ) {
		// Use a fake, unsaved revision carrying the override wikitext.
		// Its id is 0 rather than $revId, because a revision corresponds
		// to specific wikitext, which $wikitextOverride might not be.
		$unsavedRevision = new MutableRevisionRecord( $title );
		$unsavedRevision->setId( 0 );
		$unsavedRevision->setSlot(
			SlotRecord::newUnsaved( SlotRecord::MAIN, new WikitextContent( $wikitextOverride ) )
		);
		$revisionOrId = $unsavedRevision;
	}

	// Accessibility is only waived for html2wt requests without a revision ID.
	$ensureAccessibleContent = !( $html2WtMode && $revId === null );

	try {
		// Note: Parsoid by design isn't supposed to use the user
		// context right now, and all user state is expected to be
		// introduced as a post-parse transform.  So although we pass a
		// User here, it only currently affects the output in obscure
		// corner cases; see PageConfigFactory::create() for more.
		// @phan-suppress-next-line PhanUndeclaredMethod method defined in subtype
		return $this->pageConfigFactory->create(
			$title, $user, $revisionOrId, null, $pagelanguageOverride,
			$ensureAccessibleContent
		);
	} catch ( SuppressedDataException $e ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-permission-denied-revision", [ $e->getMessage() ] ), 403
		);
	} catch ( RevisionAccessException $e ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-specified-revision-unavailable", [ $e->getMessage() ] ), 404
		);
	}
}
/**
 * Resolve the page named in the request attributes to a PageIdentity.
 * An empty page name resolves to the wiki's main page; a name the page
 * store cannot resolve results in a 400 error.
 *
 * @param array $attribs
 * @return PageIdentity
 * @throws HttpException
 */
protected function tryToCreatePageIdentity( array $attribs ): PageIdentity {
	$pageName = $attribs['pageName'];
	if ( $pageName === '' ) {
		return Title::newMainPage();
	}

	// XXX: Should be injected, but the Parsoid extension relies on the
	//      constructor signature. Also, ParsoidHandler should go away soon anyway.
	$identity = MediaWikiServices::getInstance()->getPageStore()->getPageByText( $pageName );
	if ( $identity ) {
		return $identity;
	}

	throw new LocalizedHttpException(
		new MessageValue( "rest-invalid-title", [ 'pageName' ] ), 400
	);
}
/**
 * Path template of the transform endpoint. Subclasses may override it.
 *
 * This is done in the parsoid extension, for backwards compatibility
 * with the old endpoint URLs.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   This implementation returns the same template for every format.
 *
 * @return string
 */
protected function getTransformEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
	$pathTemplate = '/coredev/v0/transform/{from}/to/{format}/{title}/{revision}';
	return $pathTemplate;
}
/**
 * Path template of the page content endpoint. Subclasses may override it.
 *
 * This is done in the parsoid extension, for backwards compatibility
 * with the old endpoint URLs.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   Only the HTML format is supported by this implementation.
 *
 * @return string
 * @throws InvalidArgumentException If $format is not the HTML format.
 */
protected function getPageContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
	if ( $format === ParsoidFormatHelper::FORMAT_HTML ) {
		return '/v1/page/{title}/html';
	}
	throw new InvalidArgumentException( 'Unsupported page content format: ' . $format );
}
/**
 * Path template of the revision content endpoint. Subclasses may override it.
 *
 * This is done in the parsoid extension, for backwards compatibility
 * with the old endpoint URLs.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   Only the HTML format is supported by this implementation.
 *
 * @return string
 * @throws InvalidArgumentException If $format is not the HTML format.
 */
protected function getRevisionContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
	if ( $format === ParsoidFormatHelper::FORMAT_HTML ) {
		return '/v1/revision/{revision}/html';
	}
	throw new InvalidArgumentException( 'Unsupported revision content format: ' . $format );
}
/**
 * Run Parsoid's wikitext linter on the given page.
 *
 * The linter overrides are added to the environment options under
 * 'linterOverrides', unless the request attributes already carry that key.
 *
 * @param PageConfig $pageConfig
 * @param array $attribs Request attributes from getRequestAttributes()
 * @param ?array $linterOverrides
 * @return mixed Whatever Parsoid::wikitext2lint() returns
 * @throws HttpException 400 on a Parsoid client error, 413 when a resource
 *   limit is exceeded.
 */
public function wtLint(
	PageConfig $pageConfig, array $attribs, ?array $linterOverrides = []
) {
	$lintEnvOptions = $attribs['envOptions'] + [ 'linterOverrides' => $linterOverrides ];

	try {
		return $this->newParsoid()->wikitext2lint( $pageConfig, $lintEnvOptions );
	} catch ( ClientError $clientError ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-parsoid-error", [ $clientError->getMessage() ] ),
			400
		);
	} catch ( ResourceLimitExceededException $limitError ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-parsoid-resource-exceeded", [ $limitError->getMessage() ] ),
			413
		);
	}
}
/**
* Wikitext -> HTML helper.
* Spec'd in https://phabricator.wikimedia.org/T75955 and the API tests.
*
* Depending on the requested format this returns lint results as JSON,
* a page bundle as JSON, or an HTML document. It also logs slow parses of
* stored revisions and sets Cache-Control headers on the response.
*
* @param PageConfig $pageConfig
* @param array $attribs Request attributes from getRequestAttributes()
* @param ?string $wikitext Wikitext to transform (or null to use the
* page specified in the request attributes).
*
* @return Response
* @throws HttpException
*/
protected function wt2html(
PageConfig $pageConfig, array $attribs, ?string $wikitext = null
) {
$request = $this->getRequest();
$opts = $attribs['opts'];
$format = $opts['format'];
$oldid = $attribs['oldid'];
$stash = $opts['stash'] ?? false;
// Lint requests are answered directly and never reach the HTML rendering below.
if ( $format === ParsoidFormatHelper::FORMAT_LINT ) {
$linterOverrides = [];
if ( $this->extensionRegistry->isLoaded( 'Linter' ) ) { // T360809
// Lint categories configured with priority 'none' are passed to Parsoid as disabled.
$disabled = [];
$services = MediaWikiServices::getInstance();
$linterCategories = $services->getMainConfig()->get( 'LinterCategories' );
foreach ( $linterCategories as $name => $cat ) {
if ( $cat['priority'] === 'none' ) {
$disabled[] = $name;
}
}
$linterOverrides['disabled'] = $disabled;
}
$lints = $this->wtLint( $pageConfig, $attribs, $linterOverrides );
$response = $this->getResponseFactory()->createJson( $lints );
return $response;
}
// TODO: This method should take a PageIdentity + revId,
// to reduce the usage of PageConfig in MW core.
$helper = $this->getHtmlOutputRendererHelper(
$attribs,
$wikitext,
$this->pageConfigToPageIdentity( $pageConfig ),
// Id will be 0 if we have $wikitext but that isn't valid
// to call $helper->setRevision with. In any case, the revision
// will be reset when $helper->setContent is called with $wikitext.
// Ideally, the revision would be passed through here instead of
// the id and wikitext.
$pageConfig->getRevisionId() ?: null
);
$needsPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );
if ( $attribs['body_only'] ) {
$helper->setFlavor( 'fragment' );
} elseif ( !$needsPageBundle ) {
// Inline data-parsoid. This will happen when no special params are set.
$helper->setFlavor( 'edit' );
}
// $mstr labels the kind of parse; only 'pageWithOldid' parses are
// considered for the slow-parse logging below.
if ( $wikitext === null && $oldid !== null ) {
$mstr = 'pageWithOldid';
} else {
$mstr = 'wt';
}
$parseTiming = Timing::start();
if ( $needsPageBundle ) {
$pb = $helper->getPageBundle();
// Handle custom offset requests as a pb2pb transform
if ( $attribs['offsetType'] !== 'byte' ) {
$parsoid = $this->newParsoid();
$pb = $parsoid->pb2pb(
$pageConfig,
'convertoffsets',
$pb,
[
'inputOffsetType' => 'byte',
'outputOffsetType' => $attribs['offsetType']
]
);
}
$response = $this->getResponseFactory()->createJson( $pb->responseData() );
$helper->putHeaders( $response, false );
ParsoidFormatHelper::setContentType(
$response,
ParsoidFormatHelper::FORMAT_PAGEBUNDLE,
$pb->version
);
} else {
$out = $helper->getHtml();
// TODO: offsetType conversion isn't supported right now for non-pagebundle endpoints
// Once the OutputTransform framework lands, we might revisit this.
$response = $this->getResponseFactory()->create();
$response->getBody()->write( $out->getRawText() );
$helper->putHeaders( $response, true );
// Emit an ETag only if stashing is enabled. It's not reliably useful otherwise.
if ( $stash ) {
$eTag = $helper->getETag();
if ( $eTag ) {
$response->setHeader( 'ETag', $eTag );
}
}
}
// XXX: For pagebundle requests, this can be somewhat inflated
// because of pagebundle json-encoding overheads
$outSize = $response->getBody()->getSize();
$parseTime = $parseTiming->end();
// Ignore slow parse metrics for non-oldid parses
if ( $mstr === 'pageWithOldid' ) {
// $parseTime is treated as milliseconds here: it is divided by 1000
// to report seconds in the log message.
if ( $parseTime > 3000 ) {
LoggerFactory::getInstance( 'slow-parsoid' )
->info( 'Parsing {title} was slow, took {time} seconds', [
'time' => number_format( $parseTime / 1000, 2 ),
'title' => Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedText(),
] );
}
if ( $parseTime > 10 && $outSize > 100 ) {
// * Don't bother with this metric for really small parse times
// p99 for initialization time is ~7ms according to grafana.
// So, 10ms ensures that startup overheads don't skew the metrics
// * For body_only=false requests, <head> section isn't generated
// and if the output is small, per-request overheads can skew
// the timePerKB metrics.
// NOTE: This is slightly misleading since there are fixed costs
// for generating output like the <head> section and should be factored in,
// but this is good enough for now as a useful first degree of approximation.
$timePerKB = $parseTime * 1024 / $outSize;
if ( $timePerKB > 500 ) {
// At 100ms/KB, even a 100KB page which isn't that large will take 10s.
// So, we probably want to shoot for a threshold under 100ms.
// But, let's start with 500ms+ outliers first and see what we uncover.
LoggerFactory::getInstance( 'slow-parsoid' )
->info( 'Parsing {title} was slow, timePerKB took {timePerKB} ms, total: {time} seconds', [
'time' => number_format( $parseTime / 1000, 2 ),
'timePerKB' => number_format( $timePerKB, 1 ),
'title' => Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedText(),
] );
}
}
}
if ( $wikitext !== null ) {
// Don't cache requests when wt is set in case somebody uses
// GET for wikitext parsing
// XXX: can we just refuse to do wikitext parsing in a GET request?
$response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
} elseif ( $oldid !== null ) {
// XXX: can this go away? Parsoid's PageContent class doesn't expose suppressed revision content.
if ( $request->getHeaderLine( 'Cookie' ) ||
$request->getHeaderLine( 'Authorization' ) ) {
// Don't cache requests with a session.
$response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
}
}
return $response;
}
/**
 * Create a Parsoid instance bound to this handler's site config and data access.
 *
 * @return Parsoid
 */
protected function newParsoid(): Parsoid {
	$parsoid = new Parsoid( $this->siteConfig, $this->dataAccess );
	return $parsoid;
}
/**
 * Parse an HTML string into a DOM document via DOMUtils::parseHTML().
 *
 * @param string $html
 * @param bool $validateXMLNames Passed through to DOMUtils::parseHTML()
 * @return Document
 */
protected function parseHTML( string $html, bool $validateXMLNames = false ): Document {
	$document = DOMUtils::parseHTML( $html, $validateXMLNames );
	return $document;
}
/**
 * HTML -> wikitext helper. Transforms the given HTML through an
 * HtmlInputTransformHelper and writes the result into a new response.
 *
 * @param PageConfig|PageIdentity $page
 * @param array $attribs Attributes gotten from requests
 * @param string $html Original HTML
 *
 * @return Response
 * @throws HttpException 400 when Parsoid reports a client error
 */
protected function html2wt(
	$page, array $attribs, string $html
) {
	// TODO: Deprecate passing a PageConfig.
	// Ideally, callers would use HtmlToContentTransform directly.
	$targetPage = ( $page instanceof PageConfig )
		? Title::newFromLinkTarget( $page->getLinkTarget() )
		: $page;

	try {
		$inputHelper = $this->getHtmlInputTransformHelper( $attribs, $html, $targetPage );
		$result = $this->getResponseFactory()->create();
		$inputHelper->putContent( $result );
		return $result;
	} catch ( ClientError $clientError ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-parsoid-error", [ $clientError->getMessage() ] ),
			400
		);
	}
}
/**
 * Pagebundle -> pagebundle helper.
 *
 * Takes the bundle from the body's 'previous' or 'original' section and
 * either applies the requested update (red links or variant conversion),
 * downgrades it to the requested content version, or re-parses the page
 * when the requested version is compatible with the input version.
 *
 * @param array<string,array|string> $attribs
 * @return Response
 * @throws HttpException 400 for missing input or an unknown update type,
 *   415 when no conversion between the content versions is available.
 */
protected function pb2pb( array $attribs ) {
	$opts = $attribs['opts'];

	$revision = $opts['previous'] ?? $opts['original'] ?? null;
	if ( !isset( $revision['html'] ) ) {
		throw new LocalizedHttpException( new MessageValue( "rest-missing-revision-html" ), 400 );
	}

	$vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
		$revision['html']['headers']['content-type'] ?? '' );
	if ( $vOriginal === null ) {
		throw new LocalizedHttpException( new MessageValue( "rest-missing-revision-html-content-type" ), 400 );
	}
	$attribs['envOptions']['inputContentVersion'] = $vOriginal;
	'@phan-var array<string,array|string> $attribs'; // @var array<string,array|string> $attribs

	// The metrics collector is optional (see the property declaration),
	// so only record the counter when one is available.
	if ( $this->metrics ) {
		$this->metrics->increment(
			'pb2pb.original.version.' . $attribs['envOptions']['inputContentVersion']
		);
	}

	if ( !empty( $opts['updates'] ) ) {
		// FIXME: Handling missing revisions uniformly for all update types
		// is not probably the right thing to do but probably okay for now.
		// This might need revisiting as we add newer types.
		$pageConfig = $this->tryToCreatePageConfig( $attribs, null, true );
		// If we're only updating parts of the original version, it should
		// satisfy the requested content version, since we'll be returning
		// that same one.
		// FIXME: Since this endpoint applies the acceptable middleware,
		// `getOutputContentVersion` is not what's been passed in, but what
		// can be produced.  Maybe that should be selectively applied so
		// that we can update older versions where it makes sense?
		// Uncommenting below implies that we can only update the latest
		// version, since caret semantics is applied in both directions.
		// if ( !Semver::satisfies(
		// 	$attribs['envOptions']['inputContentVersion'],
		// 	"^{$attribs['envOptions']['outputContentVersion']}"
		// ) ) {
		// 	throw new HttpException(
		// 		'We do not know how to do this conversion.', 415
		// 	);
		// }
		if ( !empty( $opts['updates']['redlinks'] ) ) {
			// Q(arlolra): Should redlinks be more complex than a bool?
			// See gwicke's proposal at T114413#2240381
			return $this->updateRedLinks( $pageConfig, $attribs, $revision );
		} elseif ( isset( $opts['updates']['variant'] ) ) {
			return $this->languageConversion( $pageConfig, $attribs, $revision );
		} else {
			throw new LocalizedHttpException( new MessageValue( "rest-unknown-parsoid-transformation" ), 400 );
		}
	}

	// TODO(arlolra): subbu has some sage advice in T114413#2365456 that
	// we should probably be more explicit about the pb2pb conversion
	// requested rather than this increasingly complex fallback logic.
	$downgrade = Parsoid::findDowngrade(
		$attribs['envOptions']['inputContentVersion'],
		$attribs['envOptions']['outputContentVersion']
	);
	if ( $downgrade ) {
		$pb = new PageBundle(
			$revision['html']['body'],
			$revision['data-parsoid']['body'] ?? null,
			$revision['data-mw']['body'] ?? null
		);
		$this->validatePb( $pb, $attribs['envOptions']['inputContentVersion'] );
		Parsoid::downgrade( $downgrade, $pb );

		if ( !empty( $attribs['body_only'] ) ) {
			// Reduce the downgraded document to the inner markup of its <body>.
			$doc = $this->parseHTML( $pb->html );
			$body = DOMCompat::getBody( $doc );
			$pb->html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );
		}

		$response = $this->getResponseFactory()->createJson( $pb->responseData() );
		ParsoidFormatHelper::setContentType(
			$response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $pb->version
		);
		return $response;
		// Ensure we only reuse from semantically similar content versions.
	} elseif ( Semver::satisfies( $attribs['envOptions']['outputContentVersion'],
		'^' . $attribs['envOptions']['inputContentVersion'] ) ) {
		$pageConfig = $this->tryToCreatePageConfig( $attribs );
		return $this->wt2html( $pageConfig, $attribs );
	} else {
		throw new LocalizedHttpException( new MessageValue( "rest-unsupported-profile-conversion" ), 415 );
	}
}
/**
 * Run Parsoid's 'redlinks' page bundle transform on the supplied revision
 * data, validate the result against the input content version, and return
 * it as a JSON page bundle response.
 *
 * @param PageConfig $pageConfig
 * @param array $attribs
 * @param array $revision
 * @return Response
 */
protected function updateRedLinks(
	PageConfig $pageConfig, array $attribs, array $revision
) {
	$parsoid = $this->newParsoid();
	$inputVersion = $attribs['envOptions']['inputContentVersion'];

	$inputBundle = new PageBundle(
		$revision['html']['body'],
		$revision['data-parsoid']['body'] ?? null,
		$revision['data-mw']['body'] ?? null,
		$inputVersion,
		$revision['html']['headers'] ?? null,
		$revision['contentmodel'] ?? null
	);

	$updatedBundle = $parsoid->pb2pb( $pageConfig, 'redlinks', $inputBundle, [] );
	$this->validatePb( $updatedBundle, $attribs['envOptions']['inputContentVersion'] );

	$jsonResponse = $this->getResponseFactory()->createJson( $updatedBundle->responseData() );
	ParsoidFormatHelper::setContentType(
		$jsonResponse, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $updatedBundle->version
	);

	return $jsonResponse;
}
/**
 * Convert the supplied page bundle to a language variant and return the
 * result as a JSON page bundle response.
 *
 * The target variant comes from the body's updates.variant.target, falling
 * back to the variant language derived from the request headers.
 *
 * @param PageConfig $pageConfig
 * @param array $attribs
 * @param array $revision
 * @return Response
 * @throws HttpException 400 when no target variant is given or the
 *   conversion is not supported
 */
protected function languageConversion(
	PageConfig $pageConfig, array $attribs, array $revision
) {
	$variantUpdate = $attribs['opts']['updates']['variant'] ?? [];
	$target = $variantUpdate['target'] ?? $attribs['envOptions']['htmlVariantLanguage'];
	$source = $variantUpdate['source'] ?? null;

	if ( !$target ) {
		throw new LocalizedHttpException( new MessageValue( "rest-target-variant-required" ), 400 );
	}

	$pageIdentity = $this->tryToCreatePageIdentity( $attribs );

	$inputBundle = new PageBundle(
		$revision['html']['body'],
		$revision['data-parsoid']['body'] ?? null,
		$revision['data-mw']['body'] ?? null,
		$attribs['envOptions']['inputContentVersion'],
		$revision['html']['headers'] ?? null,
		$revision['contentmodel'] ?? null
	);

	// XXX: DI should inject HtmlTransformFactory
	$transformFactory = MediaWikiServices::getInstance()->getHtmlTransformFactory();
	$converter = $transformFactory->getLanguageVariantConverter( $pageIdentity );
	$converter->setPageConfig( $pageConfig );

	// A Content-Language request header overrides the page language.
	$httpContentLanguage = $attribs['pagelanguage'] ?? null;
	if ( $httpContentLanguage ) {
		$converter->setPageLanguageOverride( $httpContentLanguage );
	}

	try {
		$convertedBundle = $converter->convertPageBundleVariant( $inputBundle, $target, $source );
	} catch ( InvalidArgumentException $invalid ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-unsupported-language-conversion", [ $source ?? '(unspecified)', $target ] ),
			400,
			[ 'reason' => $invalid->getMessage() ]
		);
	}

	$jsonResponse = $this->getResponseFactory()->createJson( $convertedBundle->responseData() );
	ParsoidFormatHelper::setContentType(
		$jsonResponse, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $convertedBundle->version
	);

	return $jsonResponse;
}
/**
* Handle the request and return the response. Left abstract here, so every
* concrete Parsoid handler has to provide its own implementation.
*
* @inheritDoc
*/
abstract public function execute(): Response;
/**
 * Check a PageBundle against the given content version and turn a failed
 * validation into a 400 error carrying the validator's message.
 *
 * @param PageBundle $pb
 * @param string $contentVersion
 * @throws HttpException
 */
private function validatePb( PageBundle $pb, string $contentVersion ): void {
	// Filled in by PageBundle::validate() on failure.
	$validationError = '';
	if ( $pb->validate( $contentVersion, $validationError ) ) {
		return;
	}

	$message = new MessageValue( "rest-page-bundle-validation-error", [ $validationError ] );
	throw new LocalizedHttpException( $message, 400 );
}
/**
 * Look up the proper page identity for the link target of a PageConfig.
 * Link targets the page store rejects are reported as a 400 error.
 *
 * @param PageConfig $page
 *
 * @return ProperPageIdentity
 * @throws HttpException
 */
private function pageConfigToPageIdentity( PageConfig $page ): ProperPageIdentity {
	$pageStore = MediaWikiServices::getInstance()->getPageStore();
	$title = $page->getLinkTarget();

	try {
		$identity = $pageStore->getPageForLink( $title );
	} catch ( MalformedTitleException | InvalidArgumentException $e ) {
		// Note that even some well-formed links are still invalid
		// parameters for getPageForLink(), e.g. interwiki links or special pages.
		throw new HttpException(
			"Bad title: $title", # uses LinkTarget::__toString()
			400
		);
	}

	return $identity;
}
}