includes/parser/Parsoid/LanguageVariantConverter.php
<?php
namespace MediaWiki\Parser\Parsoid;
use MediaWiki\Language\LanguageCode;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Languages\LanguageFactory;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
use MediaWiki\Rest\HttpException;
use MediaWiki\Rest\LocalizedHttpException;
use MediaWiki\Revision\RevisionAccessException;
use MediaWiki\Title\Title;
use MediaWiki\Title\TitleFactory;
use Wikimedia\Bcp47Code\Bcp47Code;
use Wikimedia\Bcp47Code\Bcp47CodeValue;
use Wikimedia\Message\MessageValue;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\PageBundle;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\Parsoid;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMUtils;
/**
* Performs language variant conversion on PageBundle and ParserOutput objects.
*
* Conversion is delegated to Parsoid when Parsoid implements conversion for
* the requested target variant. Otherwise, unless it has been disabled via
* disableFallbackLanguageConverter(), the core LanguageConverter is used.
*
* @since 1.40
* @unstable should be marked stable before 1.40 release
*/
class LanguageVariantConverter {
/** Creates a PageConfig on demand when none was supplied via setPageConfig(). */
private PageConfigFactory $pageConfigFactory;
/** PageConfig supplied via setPageConfig(), or created lazily and cached by getPageConfig(). */
private ?PageConfig $pageConfig = null;
/** Page identity passed to the constructor; used to create the PageConfig and the Title. */
private PageIdentity $pageIdentity;
/** Title derived from $pageIdentity; last-resort source of the page language. */
private Title $pageTitle;
/** Parsoid instance asked whether it implements the conversion, and used to perform it. */
private Parsoid $parsoid;
/** Used to check whether language conversion is enabled for the page language. */
private SiteConfig $siteConfig;
/** Provides the core LanguageConverter for a given base language. */
private LanguageConverterFactory $languageConverterFactory;
/** Used to resolve parent languages and internal language codes. */
private LanguageFactory $languageFactory;
/**
* Page language override from the Content-Language header.
*/
private ?Bcp47Code $pageLanguageOverride = null;
/** Whether the core LanguageConverter may be used when Parsoid does not implement the conversion. */
private bool $isFallbackLanguageConverterEnabled = true;
/**
 * @param PageIdentity $pageIdentity Page identity; also used to derive the Title
 * @param PageConfigFactory $pageConfigFactory Creates a PageConfig when none is set explicitly
 * @param Parsoid $parsoid
 * @param SiteConfig $siteConfig
 * @param TitleFactory $titleFactory Only used here, to build the Title for $pageIdentity
 * @param LanguageConverterFactory $languageConverterFactory
 * @param LanguageFactory $languageFactory
 */
public function __construct(
    PageIdentity $pageIdentity,
    PageConfigFactory $pageConfigFactory,
    Parsoid $parsoid,
    SiteConfig $siteConfig,
    TitleFactory $titleFactory,
    LanguageConverterFactory $languageConverterFactory,
    LanguageFactory $languageFactory
) {
    // Page-related state.
    $this->pageIdentity = $pageIdentity;
    $this->pageTitle = $titleFactory->newFromPageIdentity( $pageIdentity );

    // Parsoid-side collaborators.
    $this->parsoid = $parsoid;
    $this->siteConfig = $siteConfig;
    $this->pageConfigFactory = $pageConfigFactory;

    // Core language services.
    $this->languageFactory = $languageFactory;
    $this->languageConverterFactory = $languageConverterFactory;
}
/**
 * Supply the PageConfig to use for variant conversion.
 *
 * When this is never called, a PageConfig is created on demand the first
 * time one is needed.
 *
 * @param PageConfig $pageConfig
 * @return void
 */
public function setPageConfig( PageConfig $pageConfig ) {
    $this->pageConfig = $pageConfig;
}
/**
 * Force the page content language to the given value.
 *
 * Once set, the override takes precedence over every other source of the
 * page language when the language is determined.
 *
 * @param Bcp47Code $language
 * @return void
 */
public function setPageLanguageOverride( Bcp47Code $language ) {
    $this->pageLanguageOverride = $language;
}
/**
 * Convert the content of a PageBundle to the requested language variant.
 *
 * Parsoid performs the conversion when it implements it for the target
 * variant. Otherwise, unless the fallback has been disabled, the core
 * LanguageConverter is applied to the bundle's HTML and a new PageBundle is
 * built carrying the meta information Parsoid would normally add.
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return PageBundle The converted PageBundle, or the object passed in as
 *   $pageBundle if the conversion is not supported.
 * @throws HttpException
 */
public function convertPageBundleVariant(
    PageBundle $pageBundle,
    Bcp47Code $targetVariant,
    ?Bcp47Code $sourceVariant = null
): PageBundle {
    [ $pageLanguage, $effectiveSource ] =
        $this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

    // Languages without variant support: hand the content back untouched.
    if ( !$this->siteConfig->langConverterEnabledBcp47( $pageLanguage ) ) {
        return $pageBundle;
    }

    $pageConfig = $this->getPageConfig( $pageLanguage, $effectiveSource );

    // Preferred path: let Parsoid do the conversion itself.
    if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
        $options = [
            'variant' => [
                'source' => $effectiveSource,
                'target' => $targetVariant,
            ]
        ];
        return $this->parsoid->pb2pb( $pageConfig, 'variant', $pageBundle, $options );
    }

    // Parsoid cannot convert; without the fallback there is nothing more to do.
    if ( !$this->isFallbackLanguageConverterEnabled ) {
        return $pageBundle;
    }

    // LanguageConverter::hasVariant and LanguageConverter::convertTo
    // could take a string|Bcp47Code in the future, which would make the
    // conversion to an internal code below unnecessary.
    $coreConverter = $this->languageConverterFactory->getLanguageConverter(
        $this->languageFactory->getParentLanguage( $targetVariant )
    );
    $internalTargetCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();

    if ( $coreConverter->hasVariant( $internalTargetCode ) ) {
        // NOTE: convertTo() rather than convert(): the exact variant is known,
        // so no preferred variant has to be computed from a base language.
        // Also see T267067 for why convert() should be avoided.
        $html = $coreConverter->convertTo( $pageBundle->html, $internalTargetCode );
        $resultLanguage = $targetVariant;
    } else {
        // No conversion possible - original HTML, original language.
        $html = $pageBundle->html;
        $resultLanguage = $pageConfig->getPageLanguageBcp47();
    }

    // Leave a marker identifying which code path handled the request.
    $html = "<!-- Variant conversion performed using the core LanguageConverter -->" . $html;

    // NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo
    // Add meta information that Parsoid normally adds
    $headers = [
        'content-language' => $resultLanguage->toBcp47Code(),
        'vary' => [ 'Accept', 'Accept-Language' ]
    ];
    $metaDoc = DOMUtils::parseHTML( '' );
    $metaDoc->appendChild( $metaDoc->createElement( 'head' ) );
    DOMUtils::addHttpEquivHeaders( $metaDoc, $headers );
    $docElt = $metaDoc->documentElement;
    '@phan-var Element $docElt';
    $metaHtml = DOMCompat::getOuterHTML( $docElt );
    $html = preg_replace( "#</body>#", $metaHtml, $html . "</body>" );

    return new PageBundle( $html, [], [], $pageBundle->version, $headers );
}
/**
 * Convert the content of a ParserOutput to the requested language variant.
 *
 * The ParserOutput is turned into a PageBundle, converted with
 * convertPageBundleVariant(), and the result is turned back into a
 * ParserOutput, with the original ParserOutput passed along to that step.
 *
 * @param ParserOutput $parserOutput
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return ParserOutput
 */
public function convertParserOutputVariant(
    ParserOutput $parserOutput,
    Bcp47Code $targetVariant,
    ?Bcp47Code $sourceVariant = null
): ParserOutput {
    return PageBundleParserOutputConverter::parserOutputFromPageBundle(
        $this->convertPageBundleVariant(
            PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput ),
            $targetVariant,
            $sourceVariant
        ),
        $parserOutput
    );
}
/**
 * Turn off the core LanguageConverter fallback.
 *
 * After this call, a PageBundle is returned unmodified whenever Parsoid does
 * not implement conversion for the requested target variant.
 *
 * @return void
 */
public function disableFallbackLanguageConverter(): void {
    $this->isFallbackLanguageConverterEnabled = false;
}
/**
 * Return the PageConfig to use, creating and caching it on first use.
 *
 * A PageConfig supplied through setPageConfig(), or created by an earlier
 * call, is returned as is; the arguments are ignored in that case.
 *
 * @param Bcp47Code $pageLanguage Page language for a newly created PageConfig
 * @param ?Bcp47Code $sourceVariant Variant to set on a newly created PageConfig, if any
 *
 * @return PageConfig
 * @throws LocalizedHttpException with status 404 if creating the PageConfig
 *   raises a RevisionAccessException
 */
private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
    if ( $this->pageConfig === null ) {
        try {
            $this->pageConfig = $this->pageConfigFactory->create(
                $this->pageIdentity, null, null, null, $pageLanguage
            );
            if ( $sourceVariant !== null ) {
                $this->pageConfig->setVariantBcp47( $sourceVariant );
            }
        } catch ( RevisionAccessException $exception ) {
            // TODO: Throw a different exception, this class should not know
            // about HTTP status codes.
            throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
        }
    }

    return $this->pageConfig;
}
/**
 * Determine the page's language code, trying these sources in order:
 *
 * 1. The value set through ::setPageLanguageOverride(); this would have
 *    come from a Content-Language header.
 * 2. The content-language header stored in $pageBundle, which should be
 *    equivalent. Sources 1 and 2 are used when the title/article doesn't
 *    (yet) exist.
 * 3. $default, if given; this allows additional parameters to the request
 *    to be used as fallbacks.
 * 4. $this->pageConfig->getPageLanguageBcp47(), if a PageConfig is present.
 * 5. $this->pageTitle->getPageLanguage().
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code|null $default A default language, used after
 *   Content-Language but before PageConfig/Title lookup.
 *
 * @return Bcp47Code the page language; may be a variant.
 */
private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
    // An explicit override always wins.
    if ( $this->pageLanguageOverride !== null ) {
        return $this->pageLanguageOverride;
    }

    // Next, a language code carried by the page bundle itself. The HTTP
    // header holds a BCP-47 code, not a mediawiki-internal one.
    $headerCode = $pageBundle->headers[ 'content-language' ] ?? null;
    if ( $headerCode ) {
        return new Bcp47CodeValue( $headerCode );
    }

    // NOTE: The explicit default has to be tried *before* PageConfig, because
    // PageConfig's page language falls back to Title::getPageLanguage(); if
    // PageConfig were asked first, $default would never be used.
    // After that, PageConfig (when present) is preferred over the title, whose
    // language may come from the database or be derived from the title itself.
    return $default ?? (
        $this->pageConfig !== null
            ? $this->pageConfig->getPageLanguageBcp47()
            : $this->pageTitle->getPageLanguage()
    );
}
/**
 * Determine the codes of the base language and the source variant.
 *
 * The base language will be used to find the appropriate LanguageConverter.
 * It should never be a variant.
 *
 * The source variant will be used to instruct the LanguageConverter.
 * It should always be a variant (or null to trigger auto-detection of
 * the source variant).
 *
 * When the page language has no parent language, nothing can be a variant
 * of it, so the page language is returned together with a null source
 * variant.
 *
 * @param PageBundle $pageBundle
 * @param ?Bcp47Code $sourceLanguage
 *
 * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
 */
private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
    // Try to determine the language code associated with the content of the page.
    // The result may be a variant code.
    $baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

    // To find out if $baseLanguage is actually a variant, get the parent language and compare.
    $parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

    if ( $parentLang === null ) {
        // No parent language is known for the page language, so the source
        // cannot be validated as one of its variants. Return early instead of
        // dereferencing null below, and request auto-detection.
        return [ $baseLanguage, null ];
    }

    // If $parentLang is not the same language as $baseLanguage, this means that
    // $baseLanguage is a variant. In that case, set $sourceLanguage to that
    // variant (unless $sourceLanguage is already set), and set $baseLanguage
    // to the $parentLang
    if ( strcasecmp( $parentLang->toBcp47Code(), $baseLanguage->toBcp47Code() ) !== 0 ) {
        $sourceLanguage ??= $baseLanguage;
        $baseLanguage = $parentLang;
    }

    if ( $sourceLanguage !== null ) {
        $parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
        // If the source variant isn't actually a variant, trigger auto-detection
        $sourceIsVariant = (
            strcasecmp( $parentLang->toBcp47Code(), $sourceLanguage->toBcp47Code() ) !== 0 &&
            $parentConverter->hasVariant(
                LanguageCode::bcp47ToInternal( $sourceLanguage->toBcp47Code() )
            )
        );
        if ( !$sourceIsVariant ) {
            $sourceLanguage = null;
        }
    }

    return [ $baseLanguage, $sourceLanguage ];
}
}