src/Statistics/MessageGroupStats.php
<?php
declare( strict_types = 1 );
namespace MediaWiki\Extension\Translate\Statistics;
use AggregateMessageGroup;
use FileBasedMessageGroup;
use MediaWiki\Deferred\DeferredUpdates;
use MediaWiki\Extension\Translate\MessageGroupProcessing\MessageGroups;
use MediaWiki\Extension\Translate\MessageLoading\MessageCollection;
use MediaWiki\Extension\Translate\MessageLoading\MessageHandle;
use MediaWiki\Extension\Translate\Services;
use MediaWiki\Extension\Translate\Utilities\Utilities;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MessageGroup;
use stdClass;
use WANObjectCache;
use Wikimedia\Rdbms\Database;
use Wikimedia\Rdbms\IDatabase;
/**
* This class aims to provide an efficient mechanism for fetching translation completion stats.
* It abstracts MessageGroup statistics calculation and storing.
* You can access stats easily per language or per group.
* Stat array for each item is of format [ total, translated, fuzzy, proofread ].
* @author Wikia (trac.wikia-code.com/browser/wikia/trunk/extensions/wikia/TranslationStatistics)
* @author Niklas Laxström
* @license GPL-2.0-or-later
*
* @ingroup Stats MessageGroups
*/
class MessageGroupStats {
/** Name of the database table */
private const TABLE = 'translate_groupstats';
/** Cache key for storage of all language stats */
private const LANGUAGE_STATS_KEY = 'translate-all-language-stats';
public const TOTAL = 0; ///< Array index
public const TRANSLATED = 1; ///< Array index
public const FUZZY = 2; ///< Array index
public const PROOFREAD = 3; ///< Array index
/** If stats are not cached, do not attempt to calculate them on the fly */
public const FLAG_CACHE_ONLY = 1;
/** Ignore cached values. Useful for updating stale values. */
public const FLAG_NO_CACHE = 2;
/** Do not defer updates. Meant for jobs like RebuildMessageGroupStatsJob. */
public const FLAG_IMMEDIATE_WRITES = 4;
/** @var array[] */
private static array $updates = [];
/** @var string[]|null */
private static ?array $languages = null;
/**
 * Build the all-zero stats array. Prefer this over a literal, because the
 * number of elements may change.
 * @return int[] One zero for each of the TOTAL, TRANSLATED, FUZZY and PROOFREAD indexes.
 */
public static function getEmptyStats(): array {
	return array_fill( 0, 4, 0 );
}
/**
 * Build the stats array whose every element is null, which marks the stats as
 * incomplete or unknown.
 * @return null[]
 */
private static function getUnknownStats(): array {
	return array_fill( 0, 4, null );
}
/**
 * Check whether statistics are kept for the given language code.
 * @param string $languageCode
 * @return bool True when the code is one of those returned by self::getLanguages().
 */
private static function isValidLanguage( string $languageCode ): bool {
	// Compare strictly: with loose comparison PHP treats numeric-looking strings
	// such as "1e1" and "10" as equal, which is never wanted for language codes.
	return in_array( $languageCode, self::getLanguages(), true );
}
/**
 * Guard against code that asks for stats of missing or dynamic groups.
 * Calculating these numbers doesn't make sense for dynamic groups, and would
 * just throw an exception.
 * @param MessageGroup|null $group
 * @return bool True for an existing, non-dynamic group.
 */
private static function isValidMessageGroup( ?MessageGroup $group ): bool {
	if ( $group === null ) {
		return false;
	}
	return !MessageGroups::isDynamic( $group );
}
/**
 * Returns stats for the given group in the given language.
 *
 * Unknown stats (all nulls) are returned when the group is missing or dynamic,
 * or when the language is not supported.
 * @param string $groupId
 * @param string $languageCode
 * @param int $flags Combination of FLAG_* constants.
 * @return null[]|int[]
 */
public static function forItem( string $groupId, string $languageCode, int $flags = 0 ): array {
	$group = MessageGroups::getGroup( $groupId );
	$isSupported = self::isValidMessageGroup( $group ) && self::isValidLanguage( $languageCode );
	if ( !$isSupported ) {
		return self::getUnknownStats();
	}

	// Try the stored values first, and fall back to calculating them.
	$rows = self::selectRowsIdLang( [ $groupId ], [ $languageCode ], $flags );
	$known = self::extractResults( $rows, [ $groupId ] );
	if ( isset( $known[$groupId][$languageCode] ) ) {
		$result = $known[$groupId][$languageCode];
	} else {
		$result = self::forItemInternal( $known, $group, $languageCode, $flags );
	}

	self::queueUpdates( $flags );
	return $result;
}
/**
 * Returns stats for all groups in the given language.
 *
 * For an unsupported language every group maps to unknown stats (all nulls).
 * @param string $languageCode
 * @param int $flags Combination of FLAG_* constants.
 * @return array[] ( Group id => Stats )
 */
public static function forLanguage( string $languageCode, int $flags = 0 ): array {
	if ( !self::isValidLanguage( $languageCode ) ) {
		$groupIds = array_keys( MessageGroups::singleton()->getGroups() );
		return array_fill_keys( $groupIds, self::getUnknownStats() );
	}

	// The internal result is keyed by group and then by language; drop the language level.
	$perGroup = [];
	foreach ( self::forLanguageInternal( $languageCode, [], $flags ) as $groupId => $byLanguage ) {
		$perGroup[$groupId] = $byLanguage[$languageCode];
	}

	self::queueUpdates( $flags );
	return $perGroup;
}
/**
 * Returns stats for all languages in the given group.
 *
 * For a missing or dynamic group every language maps to unknown stats (all nulls).
 * @param string $groupId
 * @param int $flags Combination of FLAG_* constants.
 * @return array[] ( Language code => Stats )
 */
public static function forGroup( string $groupId, int $flags = 0 ): array {
	$group = MessageGroups::getGroup( $groupId );
	if ( self::isValidMessageGroup( $group ) ) {
		$all = self::forGroupInternal( $group, [], $flags );
		self::queueUpdates( $flags );
		return $all[$groupId];
	}

	return array_fill_keys( self::getLanguages(), self::getUnknownStats() );
}
/**
 * Recalculate stats for all groups associated with the message, in the
 * language of the message. Does nothing for unsupported languages.
 *
 * Hook: TranslateEventTranslationReview
 * @param MessageHandle $handle
 */
public static function clear( MessageHandle $handle ): void {
	$languageCode = $handle->getCode();
	if ( self::isValidLanguage( $languageCode ) ) {
		self::internalClearGroups(
			$languageCode,
			self::getSortedGroupsForClearing( $handle->getGroupIds() ),
			0
		);
	}
}
/**
 * Recalculate stats for the given group(s) in every supported language.
 *
 * @param string|string[] $id Message group ids.
 * @param int $flags Combination of FLAG_* constants.
 */
public static function clearGroup( $id, int $flags = 0 ): void {
	$groups = self::getSortedGroupsForClearing( (array)$id );
	// Handle a single language per iteration to keep memory usage down
	foreach ( self::getLanguages() as $languageCode ) {
		self::internalClearGroups( $languageCode, $groups, $flags );
	}
}
/**
 * Fetch aggregated statistics for all languages across groups. The stats are cached
 * in the WANObjectCache, and recalculated on the fly if the values are stale.
 * The numbers may lag behind the actual values due to extra and missing values.
 * @return array[] ( Language Code => Language Stats )
 */
public static function getApproximateLanguageStats(): array {
	$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

	// Invoked by the cache when the value has to be (re)generated
	$regenerate = function ( $oldValue, &$ttl, array &$setOpts ) {
		$setOpts += Database::getCacheSetOptions( Utilities::getSafeReadDB() );
		return self::getAllLanguageStats();
	};
	$options = [
		'checkKeys' => [ self::LANGUAGE_STATS_KEY ],
		'pcTTL' => $cache::TTL_PROC_SHORT,
	];

	return $cache->getWithSetCallback(
		self::LANGUAGE_STATS_KEY,
		WANObjectCache::TTL_INDEFINITE,
		$regenerate,
		$options
	);
}
/**
 * Sum the stored statistics of all groups per language, straight from the database.
 * Supported languages without any stored row get empty (all zero) stats.
 * @return array[] ( Language Code => Language Stats )
 */
private static function getAllLanguageStats(): array {
	$rows = Utilities::getSafeReadDB()->newSelectQueryBuilder()
		->table( self::TABLE )
		->select( [
			'tgs_lang',
			'tgs_translated' => 'SUM(tgs_translated)',
			'tgs_fuzzy' => 'SUM(tgs_fuzzy)',
			'tgs_total' => 'SUM(tgs_total)',
			'tgs_proofread' => 'SUM(tgs_proofread)'
		] )
		->groupBy( 'tgs_lang' )
		->caller( __METHOD__ )
		->fetchResultSet();

	$statsByLanguage = [];
	foreach ( $rows as $row ) {
		$statsByLanguage[ $row->tgs_lang ] = self::extractNumbers( $row );
	}

	// Append empty stats for the languages the query did not return
	foreach ( self::getLanguages() as $code ) {
		if ( !isset( $statsByLanguage[ $code ] ) ) {
			$statsByLanguage[ $code ] = self::getEmptyStats();
		}
	}

	return $statsByLanguage;
}
/**
 * Helper for clear and clearGroup that caches already loaded statistics.
 *
 * Recalculates each group in the given language, sharing one per-call stats
 * cache between the groups, and then flushes the queued rows.
 * @param string $code Language code.
 * @param MessageGroup[] $groups Groups to recalculate, in processing order.
 * @param int $flags Combination of FLAG_* constants.
 */
private static function internalClearGroups( string $code, array $groups, int $flags ): void {
	$stats = [];
	foreach ( $groups as $group ) {
		// $stats is modified by reference
		self::forItemInternal( $stats, $group, $code, $flags );
	}

	// Forward the caller's flags so that FLAG_IMMEDIATE_WRITES also applies to
	// this final flush, not only to the interim saves done by forItemInternal.
	self::queueUpdates( $flags );
}
/**
 * Resolve message group ids into groups, ordered for efficient clearing.
 *
 * To optimize performance, all non-aggregate groups have to be processed first.
 * Because aggregate groups are flattened (see self::expandAggregates), they can
 * be processed in any order and can use the cache, except for the aggregate
 * groups themselves.
 * @param string[] $ids
 * @return MessageGroup[] Non-aggregate groups first, then aggregate groups.
 */
private static function getSortedGroupsForClearing( array $ids ): array {
	// Ids that did not resolve to a group produce a falsy entry; drop those.
	$resolved = array_filter( array_map( [ MessageGroups::class, 'getGroup' ], $ids ) );

	$regular = [];
	$aggregates = [];
	foreach ( $resolved as $group ) {
		$groupId = $group->getId();
		if ( $group instanceof AggregateMessageGroup ) {
			$aggregates[$groupId] = $group;
			continue;
		}
		$regular[$groupId] = $group;
	}

	return array_merge( $regular, $aggregates );
}
/**
 * Get the sorted list of language codes that statistics are kept for.
 * The list is built once and then kept in a static property.
 * @return string[]
 */
public static function getLanguages(): array {
	if ( self::$languages !== null ) {
		return self::$languages;
	}

	$codes = array_keys( Utilities::getLanguageNames( 'en' ) );
	sort( $codes );
	self::$languages = $codes;

	return $codes;
}
/**
 * Use this to extract results returned from selectRowsIdLang. You must pass the
 * message group ids you want to retrieve. Rows for other groups are skipped.
 * @param iterable $res Database result object
 * @param string[] $ids List of message group ids
 * @param array[] $stats Optional array to append results to.
 * @return array[] ( Group id => ( Language code => Stats ) )
 */
private static function extractResults( iterable $res, array $ids, array $stats = [] ): array {
	// Rows carry the database form of the group id; build a lookup back to the real id
	$databaseIds = array_map( [ self::class, 'getDatabaseIdForGroupId' ], $ids );
	$realIdByDatabaseId = array_combine( $databaseIds, $ids );

	foreach ( $res as $row ) {
		$databaseId = $row->tgs_group;
		if ( isset( $realIdByDatabaseId[$databaseId] ) ) {
			$realId = $realIdByDatabaseId[$databaseId];
			$stats[$realId][$row->tgs_lang] = self::extractNumbers( $row );
		}
		// Otherwise the entry is stale and is ignored for now
		// TODO: Schedule for purge
	}

	return $stats;
}
/**
 * Convert a database row into a stats array indexed by the TOTAL, TRANSLATED,
 * FUZZY and PROOFREAD constants. Every value is cast to int.
 * @param stdClass $row Row with tgs_total, tgs_translated, tgs_fuzzy and tgs_proofread fields.
 * @return int[]
 */
private static function extractNumbers( stdClass $row ): array {
	$numbers = [];
	$numbers[self::TOTAL] = (int)$row->tgs_total;
	$numbers[self::TRANSLATED] = (int)$row->tgs_translated;
	$numbers[self::FUZZY] = (int)$row->tgs_fuzzy;
	$numbers[self::PROOFREAD] = (int)$row->tgs_proofread;

	return $numbers;
}
/**
 * Collect stats of every message group for one language: stored values are
 * loaded first, and whatever is missing is calculated.
 * @param string $languageCode
 * @param array[] $stats Already known stats to build upon.
 * @param int $flags Combination of FLAG_* constants.
 * @return array[] ( Group id => ( Language code => Stats ) )
 */
private static function forLanguageInternal( string $languageCode, array $stats, int $flags ): array {
	$groups = MessageGroups::singleton()->getGroups();

	$rows = self::selectRowsIdLang( null, [ $languageCode ], $flags );
	$stats = self::extractResults( $rows, array_keys( $groups ), $stats );

	foreach ( $groups as $id => $group ) {
		if ( !isset( $stats[$id][$languageCode] ) ) {
			$stats[$id][$languageCode] = self::forItemInternal( $stats, $group, $languageCode, $flags );
		}
	}

	return $stats;
}
/**
 * Recursively flatten an aggregate group into its non-aggregate member groups.
 * @param AggregateMessageGroup $agg
 * @return MessageGroup[] ( Group id => Group )
 */
private static function expandAggregates( AggregateMessageGroup $agg ): array {
	$leaves = [];
	foreach ( $agg->getGroups() as $member ) {
		if ( !( $member instanceof AggregateMessageGroup ) ) {
			$leaves[$member->getId()] = $member;
			continue;
		}
		// Array union: ids that are already present keep their current entry
		$leaves += self::expandAggregates( $member );
	}

	return $leaves;
}
/**
 * Collect stats of one message group for every supported language: stored
 * values are loaded first, and whatever is missing is calculated.
 * @param MessageGroup $group
 * @param array[] $stats Already known stats to build upon.
 * @param int $flags Combination of FLAG_* constants.
 * @return array[] ( Group id => ( Language code => Stats ) ), languages sorted by code.
 */
private static function forGroupInternal( MessageGroup $group, array $stats, int $flags ): array {
	$id = $group->getId();

	$rows = self::selectRowsIdLang( [ $id ], null, $flags );
	$stats = self::extractResults( $rows, [ $id ], $stats );

	// Calculate the languages that had no stored entry
	foreach ( self::getLanguages() as $code ) {
		if ( !isset( $stats[$id][$code] ) ) {
			$stats[$id][$code] = self::forItemInternal( $stats, $group, $code, $flags );
		}
	}

	// Entries added above were appended at the end; restore the order by language code
	foreach ( array_keys( $stats ) as $groupId ) {
		ksort( $stats[$groupId] );
	}

	return $stats;
}
/**
 * Fetch rows from the database. Use extractResults to process this value.
 *
 * With FLAG_NO_CACHE nothing is queried and an empty list is returned.
 * @param ?string[] $ids List of message group ids, or null for no restriction
 * @param ?string[] $codes List of language codes, or null for no restriction
 * @param int $flags Combination of FLAG_* constants.
 * @return iterable Database result object
 */
private static function selectRowsIdLang( ?array $ids, ?array $codes, int $flags ): iterable {
	if ( ( $flags & self::FLAG_NO_CACHE ) !== 0 ) {
		return [];
	}

	$where = [];
	if ( $ids !== null ) {
		// The table stores the database form of the group ids
		$where['tgs_group'] = array_map( [ self::class, 'getDatabaseIdForGroupId' ], $ids );
	}
	if ( $codes !== null ) {
		$where['tgs_lang'] = $codes;
	}

	return Utilities::getSafeReadDB()->newSelectQueryBuilder()
		->select( '*' )
		->from( self::TABLE )
		->where( $where )
		->caller( __METHOD__ )
		->fetchResultSet();
}
/**
 * Calculate stats for one group in one language and queue them for storing.
 *
 * With FLAG_CACHE_ONLY nothing is calculated and unknown stats are returned.
 * A value still waiting in the update queue is reused unless FLAG_NO_CACHE is set.
 * @param array[] &$stats Per-call cache of stats; calculated values are written into it.
 * @param MessageGroup $group
 * @param string $languageCode
 * @param int $flags Combination of FLAG_* constants.
 * @return null[]|int[]
 */
private static function forItemInternal(
	array &$stats,
	MessageGroup $group,
	string $languageCode,
	int $flags
): array {
	$id = $group->getId();

	if ( ( $flags & self::FLAG_CACHE_ONLY ) !== 0 ) {
		$unknown = self::getUnknownStats();
		$stats[$id][$languageCode] = $unknown;
		return $unknown;
	}

	// Stats may be requested repeatedly for a group before the values get written to the
	// database, so look at the queued updates first. This also avoids duplicate rows for
	// inserts. Ideally this check would happen before the database is queried for missing
	// values. The code is somewhat ugly, as it has to reverse engineer the values from the
	// row format.
	$databaseGroupId = self::getDatabaseIdForGroupId( $id );
	$uniqueKey = "$databaseGroupId|$languageCode";
	$mayUseQueued = ( $flags & self::FLAG_NO_CACHE ) === 0;
	$queued = self::$updates[$uniqueKey] ?? null;
	if ( $queued && $mayUseQueued ) {
		return [
			self::TOTAL => $queued['tgs_total'],
			self::TRANSLATED => $queued['tgs_translated'],
			self::FUZZY => $queued['tgs_fuzzy'],
			self::PROOFREAD => $queued['tgs_proofread'],
		];
	}

	$numbers = $group instanceof AggregateMessageGroup
		? self::calculateAggregateGroup( $stats, $group, $languageCode, $flags )
		: self::calculateGroup( $group, $languageCode );

	// Remember the value for subsequent forItemInternal calls
	$stats[$id][$languageCode] = $numbers;

	// Nulls are not stored in the database, they cause annoying warnings
	if ( $numbers[self::TOTAL] === null ) {
		return $numbers;
	}

	self::$updates[$uniqueKey] = [
		'tgs_group' => $databaseGroupId,
		'tgs_lang' => $languageCode,
		'tgs_total' => $numbers[self::TOTAL],
		'tgs_translated' => $numbers[self::TRANSLATED],
		'tgs_fuzzy' => $numbers[self::FUZZY],
		'tgs_proofread' => $numbers[self::PROOFREAD],
	];

	// Attempt an interim save during big and lengthy updates. This might not have
	// any effect, because writes to the database may be deferred.
	if ( count( self::$updates ) % 100 === 0 ) {
		self::queueUpdates( $flags );
	}

	return $numbers;
}
/**
 * Calculate stats of an aggregate group by summing up the stats of its
 * flattened, non-aggregate member groups.
 * @param array[] &$stats Per-call cache of stats; member stats are written into it.
 * @param AggregateMessageGroup $group
 * @param string $code Language code.
 * @param int $flags Combination of FLAG_* constants.
 * @return null[]|int[]
 */
private static function calculateAggregateGroup(
	array &$stats,
	AggregateMessageGroup $group,
	string $code,
	int $flags
): array {
	$members = self::expandAggregates( $group );

	// Performance: only query the members that are not in the per-call cache yet.
	$missingIds = [];
	foreach ( array_keys( $members ) as $index => $memberId ) {
		if ( !isset( $stats[$memberId][$code] ) ) {
			$missingIds[$index] = $memberId;
		}
	}
	if ( $missingIds !== [] ) {
		$rows = self::selectRowsIdLang( $missingIds, [ $code ], $flags );
		$stats = self::extractResults( $rows, $missingIds, $stats );
	}

	$metadata = Services::getInstance()->getMessageGroupMetadata();
	$sum = self::getEmptyStats();
	foreach ( $members as $memberId => $member ) {
		if ( !isset( $stats[$memberId][$code] ) ) {
			$stats[$memberId][$code] = self::forItemInternal( $stats, $member, $code, $flags );
		}

		// Discouraged groups may belong to another group, usually if there
		// is an aggregate group for all translatable pages. In that case the
		// statistics are calculated and stored, but not counted as part of
		// the aggregate group, so that the numbers in Special:LanguageStats
		// add up. The statistics for discouraged groups can still be viewed
		// through Special:MessageGroupStats.
		if ( $metadata->isExcluded( $memberId, $code ) ) {
			continue;
		}
		$sum = self::multiAdd( $sum, $stats[$memberId][$code] );
	}

	return $sum;
}
/**
 * Add two stats arrays together element by element.
 *
 * When the first element of either array is null (unknown stats), the result
 * is an array of nulls with as many elements as $a.
 * @param array $a
 * @param array $b
 * @return array
 */
public static function multiAdd( array $a, array $b ): array {
	$hasUnknown = $a[0] === null || $b[0] === null;
	if ( $hasUnknown ) {
		return array_fill( 0, count( $a ), null );
	}

	foreach ( array_keys( $a ) as $index ) {
		$a[$index] += $b[$index];
	}

	return $a;
}
/**
 * Calculate stats of a non-aggregate group in one language from its message collection.
 * @param MessageGroup $group
 * @param string $languageCode
 * @return int[] ( total, translated, fuzzy, proofread )
 */
private static function calculateGroup( MessageGroup $group, string $languageCode ): array {
	global $wgTranslateDocumentationLanguageCode;

	$collection = $group->initCollection( $languageCode );

	$isDocumentation = $languageCode === $wgTranslateDocumentationLanguageCode;
	if ( !$isDocumentation || !( $group instanceof FileBasedMessageGroup ) ) {
		return self::getStatsForCollection( $collection );
	}

	// Documentation language of a file based group: tell the collection which
	// message keys have a '.' comment in the cached TEMPLATE of the source language.
	$cache = $group->getMessageGroupCache( $group->getSourceLanguage() );
	if ( $cache->exists() ) {
		$inFile = [];
		foreach ( $cache->getExtra()['TEMPLATE'] ?? [] as $key => $data ) {
			if ( isset( $data['comments']['.'] ) ) {
				$inFile[$key] = '1';
			}
		}
		$collection->setInFile( $inFile );
	}

	return self::getStatsForCollection( $collection );
}
/**
 * Write the rows queued in self::$updates to the database table.
 *
 * Does nothing when there are no queued rows or when the wiki is in read-only
 * mode. With FLAG_IMMEDIATE_WRITES the write is executed right away, otherwise
 * it is registered as a deferred update.
 * @param int $flags Combination of FLAG_* constants; only FLAG_IMMEDIATE_WRITES is checked here.
 */
private static function queueUpdates( int $flags ): void {
$mwInstance = MediaWikiServices::getInstance();
if ( self::$updates === [] || $mwInstance->getReadOnlyMode()->isReadOnly() ) {
return;
}
$lb = $mwInstance->getDBLoadBalancer();
$dbw = $lb->getConnection( DB_PRIMARY ); // avoid connecting yet
// Captured now, so that the warning logged by the callback can include the callers
$callers = wfGetAllCallers( 50 );
$functionName = __METHOD__;
// Performs the actual write: replaces the rows keyed on (tgs_group, tgs_lang),
// empties the queue and touches the check key that getApproximateLanguageStats()
// passes in its 'checkKeys' option.
$callback = static function ( IDatabase $dbw, $method ) use ( $callers, $mwInstance ) {
// Maybe another deferred update already processed these
if ( self::$updates === [] ) {
return;
}
// This path should only be hit during web requests
if ( count( self::$updates ) > 100 ) {
$groups = array_unique( array_column( self::$updates, 'tgs_group' ) );
LoggerFactory::getInstance( 'Translate' )->warning(
"Huge translation update of {count} rows for group(s) {groups}",
[
'count' => count( self::$updates ),
'groups' => implode( ', ', $groups ),
'callers' => $callers,
]
);
}
$dbw->newReplaceQueryBuilder()
->replaceInto( self::TABLE )
->uniqueIndexFields( [ 'tgs_group', 'tgs_lang' ] )
->rows( array_values( self::$updates ) )
->caller( $method )
->execute();
self::$updates = [];
$mwInstance->getMainWANObjectCache()->touchCheckKey( self::LANGUAGE_STATS_KEY );
};
// Wraps the callback in a named lock, with a commit before and after it.
// When the lock cannot be acquired the rows simply stay in self::$updates.
// NOTE(review): if the callback throws, unlock() below is not reached; confirm
// whether the lock gets released some other way in that case.
$updateOp = static function () use ( $dbw, $functionName, $callback ) {
$lockName = 'MessageGroupStats:updates';
if ( !$dbw->lock( $lockName, $functionName, 1 ) ) {
return; // raced out
}
$dbw->commit( $functionName, 'flush' );
call_user_func( $callback, $dbw, $functionName );
$dbw->commit( $functionName, 'flush' );
$dbw->unlock( $lockName, $functionName );
};
if ( $flags & self::FLAG_IMMEDIATE_WRITES ) {
call_user_func( $updateOp );
} else {
DeferredUpdates::addCallableUpdate( $updateOp );
}
}
/**
 * Get the form of a message group id that is stored in the database.
 *
 * Ids of at most 72 bytes are used as is. Longer ids are shortened to their
 * first 50 bytes, followed by '||' and the first 20 hex characters of the
 * SHA-256 hash of the full id, which makes 72 bytes in total.
 * @param string $id Message group id
 * @return string
 */
public static function getDatabaseIdForGroupId( string $id ): string {
	// The column is 100 bytes long, but we don't need to use it all
	if ( strlen( $id ) > 72 ) {
		// Third argument false: hex-encoded digest instead of raw binary
		$digest = hash( 'sha256', $id, false );
		return substr( $id, 0, 50 ) . '||' . substr( $digest, 0, 20 );
	}

	return $id;
}
/**
 * Count the total, translated, fuzzy and proofread messages of a collection.
 *
 * The filters are applied one after another to the collection object passed
 * in, so their order matters and the collection is changed by this call.
 * @param MessageCollection $collection
 * @return int[] Indexed by the TOTAL, TRANSLATED, FUZZY and PROOFREAD constants.
 */
public static function getStatsForCollection( MessageCollection $collection ): array {
	$collection->filter( 'ignored' );
	$collection->filterUntranslatedOptional();
	// Number of real messages, the baseline for the counts below
	$totalCount = count( $collection );

	// Fuzzy messages are what the 'fuzzy' filter takes away
	$collection->filter( 'fuzzy' );
	$fuzzyCount = $totalCount - count( $collection );

	// Completed translations
	$collection->filter( 'hastranslation', false );
	$translatedCount = count( $collection );

	// Completed translations that have also been proofread
	$collection->filter( 'reviewer', false );
	$proofreadCount = count( $collection );

	$numbers = [];
	$numbers[self::TOTAL] = $totalCount;
	$numbers[self::TRANSLATED] = $translatedCount;
	$numbers[self::FUZZY] = $fuzzyCount;
	$numbers[self::PROOFREAD] = $proofreadCount;

	return $numbers;
}
}