includes/language/LanguageNameUtils.php
<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
namespace MediaWiki\Languages;
use BagOStuff;
use HashBagOStuff;
use InvalidArgumentException;
use LanguageCode;
use MediaWiki\Config\ServiceOptions;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\MainConfigNames;
use MediaWiki\Title\MediaWikiTitleCodec;
/**
* A service that provides utilities to do with language names and codes.
*
* See https://www.mediawiki.org/wiki/Special:MyLanguage/Localisation for more information.
*
* @since 1.34
* @ingroup Language
*/
class LanguageNameUtils {
/**
* Return autonyms in getLanguageName(s).
*/
public const AUTONYMS = null;
/**
* Return all known languages in getLanguageName(s).
*/
public const ALL = 'all';
/**
* Return in getLanguageName(s) only the languages that are defined by MediaWiki.
*/
public const DEFINED = 'mw';
/**
* Return in getLanguageName(s) only the languages for which we have at least some localisation.
*/
public const SUPPORTED = 'mwfile';
/** @var ServiceOptions */
private $options;
/**
* Cache for language names
* @var HashBagOStuff|null
*/
private $languageNameCache;
/**
* Cache for validity of language codes
* @var array
*/
private $validCodeCache = [];
/**
* @internal For use by ServiceWiring
*/
public const CONSTRUCTOR_OPTIONS = [
MainConfigNames::ExtraLanguageNames,
MainConfigNames::UsePigLatinVariant,
MainConfigNames::UseXssLanguage,
];
/** @var HookRunner */
private $hookRunner;
/**
* @param ServiceOptions $options
* @param HookContainer $hookContainer
*/
public function __construct( ServiceOptions $options, HookContainer $hookContainer ) {
$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );
$this->options = $options;
$this->hookRunner = new HookRunner( $hookContainer );
}
/**
* Checks whether any localisation is available for that language tag in MediaWiki
* (MessagesXx.php or xx.json exists).
*
* @param string $code Language tag (in lower case)
* @return bool Whether language is supported
*/
public function isSupportedLanguage( string $code ): bool {
if ( !$this->isValidBuiltInCode( $code ) ) {
return false;
}
if ( $code === 'qqq' ) {
// Special code for internal use, not supported even though there is a qqq.json
return false;
}
if (
$code === 'en-x-piglatin' &&
!$this->options->get( MainConfigNames::UsePigLatinVariant )
) {
// Suppress Pig Latin unless explicitly enabled.
return false;
}
return is_readable( $this->getMessagesFileName( $code ) ) ||
is_readable( $this->getJsonMessagesFileName( $code ) );
}
/**
* Returns true if a language code string is of a valid form, whether it exists.
* This includes codes which are used solely for customisation via the MediaWiki namespace.
*
* @param string $code
*
* @return bool False if the language code contains dangerous characters, e.g, HTML special
* characters or characters that are illegal in MediaWiki titles.
*/
public function isValidCode( string $code ): bool {
if ( !isset( $this->validCodeCache[$code] ) ) {
// People think language codes are HTML-safe, so enforce it. Ideally, we should only
// allow a-zA-Z0-9- but .+ and other chars are often used for {{int:}} hacks. See bugs
// T39564, T39587, T38938.
$this->validCodeCache[$code] =
// Protect against path traversal
strcspn( $code, ":/\\\000&<>'\"" ) === strlen( $code ) &&
!preg_match( MediaWikiTitleCodec::getTitleInvalidRegex(), $code ) &&
// libicu sets ULOC_FULLNAME_CAPACITY to 157; stay comfortably lower
strlen( $code ) <= 128;
}
return $this->validCodeCache[$code];
}
/**
* Returns true if a language code is of a valid form for the purposes of internal customisation
* of MediaWiki, via Messages*.php or *.json.
*
* @param string $code
* @return bool
*/
public function isValidBuiltInCode( string $code ): bool {
return (bool)preg_match( '/^[a-z0-9-]{2,}$/', $code );
}
/**
* Returns true if a language code is an IETF tag known to MediaWiki.
*
* @param string $tag
*
* @return bool
*/
public function isKnownLanguageTag( string $tag ): bool {
// Quick escape for invalid input to avoid exceptions down the line when code tries to
// process tags which are not valid at all.
if ( !$this->isValidBuiltInCode( $tag ) ) {
return false;
}
if ( isset( Data\Names::$names[$tag] ) || $this->getLanguageName( $tag, $tag ) !== '' ) {
return true;
}
return false;
}
/**
* Get an array of language names, indexed by code.
*
* @param null|string $inLanguage Code of language in which to return the names
* Use self::AUTONYMS for autonyms (native names)
* @param string $include One of:
* self::ALL All available languages
* self::DEFINED Only if the language is defined in MediaWiki or wgExtraLanguageNames
* (default)
* self::SUPPORTED Only if the language is in self::DEFINED *and* has a message file
* @return array Language code => language name (sorted by key)
*/
public function getLanguageNames( $inLanguage = self::AUTONYMS, $include = self::DEFINED ) {
if ( $inLanguage !== self::AUTONYMS ) {
$inLanguage = LanguageCode::replaceDeprecatedCodes( LanguageCode::bcp47ToInternal( $inLanguage ) );
}
$cacheKey = $inLanguage === self::AUTONYMS ? 'null' : $inLanguage;
$cacheKey .= ":$include";
if ( !$this->languageNameCache ) {
$this->languageNameCache = new HashBagOStuff( [ 'maxKeys' => 20 ] );
}
return $this->languageNameCache->getWithSetCallback(
$cacheKey,
BagOStuff::TTL_INDEFINITE,
function () use ( $inLanguage, $include ) {
return $this->getLanguageNamesUncached( $inLanguage, $include );
}
);
}
/**
* Uncached helper for getLanguageNames.
*
* @param null|string $inLanguage As getLanguageNames
* @param string $include As getLanguageNames
* @return array Language code => language name (sorted by key)
*/
private function getLanguageNamesUncached( $inLanguage, $include ) {
// If passed an invalid language code to use, fallback to en
if ( $inLanguage !== self::AUTONYMS && !$this->isValidCode( $inLanguage ) ) {
$inLanguage = 'en';
}
$names = [];
if ( $inLanguage !== self::AUTONYMS ) {
# TODO: also include for self::AUTONYMS, when this code is more efficient
// @phan-suppress-next-line PhanTypeMismatchArgumentNullable False positive
$this->hookRunner->onLanguageGetTranslatedLanguageNames( $names, $inLanguage );
}
$mwNames = $this->options->get( MainConfigNames::ExtraLanguageNames ) + Data\Names::$names;
if ( !$this->options->get( MainConfigNames::UsePigLatinVariant ) ) {
// Suppress Pig Latin unless explicitly enabled.
unset( $mwNames['en-x-piglatin'] );
}
if ( $this->options->get( MainConfigNames::UseXssLanguage ) ) {
$mwNames['x-xss'] = 'fake xss language (see $wgUseXssLanguage)';
}
foreach ( $mwNames as $mwCode => $mwName ) {
# - Prefer own MediaWiki native name when not using the hook
# - For other names just add if not added through the hook
if ( $mwCode === $inLanguage || !isset( $names[$mwCode] ) ) {
$names[$mwCode] = $mwName;
}
}
if ( $include === self::ALL ) {
ksort( $names );
return $names;
}
$returnMw = [];
$coreCodes = array_keys( $mwNames );
foreach ( $coreCodes as $coreCode ) {
$returnMw[$coreCode] = $names[$coreCode];
}
if ( $include === self::SUPPORTED ) {
$namesMwFile = [];
# We do this using a foreach over the codes instead of a directory loop so that messages
# files in extensions will work correctly.
foreach ( $returnMw as $code => $value ) {
if ( is_readable( $this->getMessagesFileName( $code ) ) ||
is_readable( $this->getJsonMessagesFileName( $code ) )
) {
$namesMwFile[$code] = $names[$code];
}
}
ksort( $namesMwFile );
return $namesMwFile;
}
ksort( $returnMw );
# self::DEFINED option; default if it's not one of the other two options
# (self::ALL/self::SUPPORTED)
return $returnMw;
}
/**
* @param string $code The code of the language for which to get the name
* @param null|string $inLanguage Code of language in which to return the name (self::AUTONYMS
* for autonyms)
* @param string $include See getLanguageNames(), except this function defaults to self::ALL instead of
* self::DEFINED
* @return string Language name or empty
*/
public function getLanguageName( $code, $inLanguage = self::AUTONYMS, $include = self::ALL ) {
$code = LanguageCode::replaceDeprecatedCodes( LanguageCode::bcp47ToInternal( $code ) );
$array = $this->getLanguageNames( $inLanguage, $include );
return $array[$code] ?? '';
}
/**
* Get the name of a file for a certain language code.
*
* @param string $prefix Prepend this to the filename
* @param string $code Language code
* @param string $suffix Append this to the filename
* @return string $prefix . $mangledCode . $suffix
*/
public function getFileName( $prefix, $code, $suffix = '.php' ) {
if ( !$this->isValidBuiltInCode( $code ) ) {
throw new InvalidArgumentException( "Invalid language code \"$code\"" );
}
return $prefix . str_replace( '-', '_', ucfirst( $code ) ) . $suffix;
}
/**
* @param string $code
* @return string
*/
public function getMessagesFileName( $code ) {
global $IP;
$file = $this->getFileName( "$IP/languages/messages/Messages", $code, '.php' );
$this->hookRunner->onLanguage__getMessagesFileName( $code, $file );
return $file;
}
/**
* @param string $code
* @return string
*/
public function getJsonMessagesFileName( $code ) {
global $IP;
if ( !$this->isValidBuiltInCode( $code ) ) {
throw new InvalidArgumentException( "Invalid language code \"$code\"" );
}
return "$IP/languages/i18n/$code.json";
}
}