wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/maintenance/updateUnits.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

namespace Wikibase\Repo\Maintenance;

use DataValues\DecimalMath;
use DataValues\DecimalValue;
use MediaWiki\Maintenance\Maintenance;
use MediaWiki\MediaWikiServices;
use MediaWiki\Sparql\SparqlClient;
use Wikibase\Lib\WikibaseSettings;
use Wikibase\Repo\WikibaseRepo;

$basePath =
    getenv( 'MW_INSTALL_PATH' ) !== false ? getenv( 'MW_INSTALL_PATH' ) : __DIR__ . '/../../../..';
require_once $basePath . '/maintenance/Maintenance.php';

/**
 * Generate the conversion table for units,
 * optionally filtered to units of a certain type.
 *
 * This script retrieves and emits the conversion data into coherent units.
 * For instance, the millimetre is equal to 1/1000 of the coherent unit metre
 * (also an SI base unit), while the ampere hour is equal to 3600 of the
 * coherent unit coulomb, the product of the SI base units ampere and second.
 *
 * Example usage:
 * mwscript extensions/WikidataBuildResources/extensions/Wikibase/repo/maintenance/updateUnits.php
 *   --wiki wikidatawiki --base-uri http://www.wikidata.org/entity/
 *   --unit-class Q1978718 > unitConversion.json
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class UpdateUnits extends Maintenance {

    /**
     * @var string
     */
    private $baseUri;

    /**
     * Length of the base URI.
     * Helper variable to speed up cutting it out.
     * @var int
     */
    private $baseLen;

    /**
     * @var SparqlClient
     */
    private $client;

    /**
     * Should we silence the error output for tests?
     * @var bool
     */
    public $silent;

    public function __construct() {
        parent::__construct();
        $this->addDescription( "Generate unit conversion table, optionally filtered by type." );

        $this->addOption( 'base-uri', 'Base URI for the data.', false, true );
        $this->addOption( 'unit-class', 'Class for units.', false, true );
        $this->addOption( 'format', 'Output format "json" (default) or "csv".', false, true );
        $this->addOption( 'sparql', 'SPARQL endpoint URL.', false, true );
        $this->addOption( 'check-usage', 'Check whether unit is in use?', false );
    }

    public function execute() {
        if ( !WikibaseSettings::isRepoEnabled() ) {
            $this->fatalError( "You need to have Wikibase enabled in order to use this maintenance script!" );
        }
        $format = $this->getOption( 'format', 'json' );
        $checkUsage = $this->hasOption( 'check-usage' );

        $repoSettings = WikibaseRepo::getSettings();
        $endPoint = $this->getOption( 'sparql',
            $repoSettings->getSetting( 'sparqlEndpoint' ) );
        if ( !$endPoint ) {
            $this->fatalError( 'SPARQL endpoint not defined' );
        }
        $this->setBaseUri( $this->getOption( 'base-uri', WikibaseRepo::getItemVocabularyBaseUri() ) );
        $this->client = new SparqlClient( $endPoint, MediaWikiServices::getInstance()->getHttpRequestFactory() );
        $this->client->appendUserAgent( __CLASS__ );

        $unitClass = $this->getOption( 'unit-class' );
        if ( $unitClass ) {
            $filter = "FILTER EXISTS { ?unit wdt:P31/wdt:P279* wd:$unitClass }\n";
        } else {
            $filter = '';
        }

        // Get units usage stats. We don't care about units
        // That have been used less than 10 times, for now
        if ( $checkUsage ) {
            $unitUsage = $this->getUnitUsage( 10 );
        } else {
            $unitUsage = null;
        }
        $coherentUnits = $this->getCoherentUnits( $filter );

        $convertUnits = [];
        $reconvert = [];

        if ( $checkUsage ) {
            $filter .= "FILTER EXISTS { [] wikibase:quantityUnit ?unit }\n";
        }

        $convertableUnits = $this->getConvertableUnits( $filter );
        foreach ( $convertableUnits as $unit ) {
            $converted =
                $this->convertUnit( $unit, $convertUnits, $coherentUnits, $unitUsage, $reconvert );
            if ( $converted ) {
                $unitName = substr( $unit['unit'], $this->baseLen );
                $convertUnits[$unitName] = $converted;
            }
        }

        $this->reduceUnits( $reconvert, $convertUnits );

        // Add coherent units
        foreach ( $coherentUnits as $coherentUnitName => $coherentUnitData ) {
            $convertUnits[$coherentUnitName] = [
                'factor' => "1",
                'unit' => $coherentUnitName,
                'label' => $coherentUnitData['unitLabel'],
                'siLabel' => $coherentUnitData['unitLabel'],
            ];
        }

        // Sort units by Q-id, as number, to have predictable order
        uksort( $convertUnits,
            function ( $x, $y ) {
                return (int)substr( $x, 1 ) - (int)substr( $y, 1 );
            }
        );

        switch ( strtolower( $format ) ) {
            case 'csv':
                echo $this->formatCSV( $convertUnits );
                break;
            case 'json':
                echo $this->formatJSON( $convertUnits );
                break;
            default:
                $this->fatalError( 'Invalid format' );
        }
    }

    /**
     * Reduce units that are not in term of coherent units into coherent units.
     * If some units are not reducible to coherent units, warnings are issued.
     * @param array $reconvert List of units to be reduced
     * @param array &$convertUnits List of unit conversion configs, will be modified if
     *                             it is possible to reduce the unit to coherent units.
     */
    private function reduceUnits( $reconvert, &$convertUnits ) {
        while ( $reconvert ) {
            $converted = false;
            foreach ( $reconvert as $name => $unit ) {
                $convertedUnit = $this->convertDerivedUnit( $unit, $convertUnits );
                if ( $convertedUnit ) {
                    $convertUnits[$name] = $convertedUnit;
                    unset( $reconvert[$name] );
                    $converted = true;
                }
            }
            // we didn't convert any on this step, no use to continue
            // This loop will converge since on each step we will reduce
            // the length of $reconvert until we can't do it anymore.
            if ( !$converted ) {
                break;
            }
        }

        if ( $reconvert ) {
            // still have unconverted units
            foreach ( $reconvert as $name => $unit ) {
                $this->error( "Weird conversion: {$unit['unit']} reduces to {$unit['siUnit']} which is not coherent!" );
            }
        }
    }

    /**
     * @param string $uri
     */
    public function setBaseUri( $uri ) {
        $this->baseUri = $uri;
        $this->baseLen = strlen( $uri );
    }

    /**
     * Convert unit that does not reduce to a coherent unit.
     *
     * @param string[] $unit
     * @param array[] $convertUnits List of units already converted
     *
     * @return string[]|null Converted data for the unit or null if no conversion possible.
     */
    public function convertDerivedUnit( $unit, $convertUnits ) {
        if ( isset( $convertUnits[$unit['siUnit']] ) ) {
            // we have conversion now
            $math = new DecimalMath();
            $newUnit = $convertUnits[$unit['siUnit']];
            $newFactor =
                $math->product( new DecimalValue( $unit['si'] ),
                    new DecimalValue( $newUnit['factor'] ) );
            return [
                'factor' => trim( $newFactor->getValue(), '+' ),
                'unit' => $newUnit['unit'],
                'label' => $unit['unitLabel'],
                'siLabel' => $newUnit['siLabel'],
            ];
        }
        return null;
    }

    /**
     * Create conversion data for a single unit.
     * @param string[] $unit Unit data
     * @param string[] $convertUnits Already converted data
     * @param array[] $coherentUnits Ultimate target units
     * @param string[]|null $unitUsage Unit usage data
     * @param string[][] &$reconvert Array collecting units that require re-conversion later,
     *                 due to their target unit not being coherent.
     * @return string[]|null Produces conversion data for the unit or null if not possible.
     */
    public function convertUnit( $unit, $convertUnits, $coherentUnits, $unitUsage, &$reconvert ) {
        $unit['unit'] = substr( $unit['unit'], $this->baseLen );
        $unit['siUnit'] = substr( $unit['siUnit'], $this->baseLen );

        if ( isset( $convertUnits[$unit['unit']] ) ) {
            // done already
            return null;
        }
        if ( $unit['unit'] == $unit['siUnit'] ) {
            // coherent unit
            if ( $unit['si'] != 1 ) {
                $this->error( "Weird unit: {$unit['unit']} is {$unit['si']} of itself!" );
                return null;
            }
            if ( !isset( $coherentUnits[$unit['siUnit']] ) ) {
                $this->error( "Weird unit: {$unit['unit']} is self-referring but not coherent!" );
                return null;
            }
        }

        if ( $unitUsage && !isset( $coherentUnits[$unit['unit']] ) && !isset( $unitUsage[$unit['unit']] ) ) {
            $this->error( "Low usage unit {$unit['unit']}, skipping..." );
            return null;
        }

        if ( !isset( $coherentUnits[$unit['siUnit']] ) ) {
            // target unit is not actually coherent
            $reconvert[$unit['unit']] = $unit;
        } else {
            return [
                'factor' => $unit['si'],
                'unit' => $unit['siUnit'],
                // These two are just for humans, not used by actual converter
                'label' => $unit['unitLabel'],
                'siLabel' => $unit['siUnitLabel'],
            ];
        }

        return null;
    }

    /**
     * Format units as JSON
     * @param array[] $convertUnits
     * @return string
     */
    private function formatJSON( array $convertUnits ) {
        return json_encode( $convertUnits, JSON_PRETTY_PRINT );
    }

    /**
     * Get units that are used at least $min times.
     * We don't care about units that have been used less than 10 times, for now.
     * Only top 200 will be returned (though so far we don't have that many).
     * @param int $min Minimal usage for the unit.
     * @return string[] Array of ['unit' => Q-id, 'c' => count]
     */
    private function getUnitUsage( $min ) {
        $usageQuery = <<<UQUERY
SELECT ?unit (COUNT(DISTINCT ?v) as ?c) WHERE {
  ?v wikibase:quantityUnit ?unit .
  ?s ?p ?v .
  FILTER(?unit != wd:Q199)
# Exclude currencies
  FILTER NOT EXISTS { ?unit wdt:P31+ wd:Q8142 }
} GROUP BY ?unit
  HAVING(?c >= $min)
  ORDER BY DESC(?c)
  LIMIT 200
UQUERY;
        $unitUsage = $this->getIDs( $usageQuery, 'unit' );
        $unitUsage = array_flip( $unitUsage );
        return $unitUsage;
    }

    /**
     * Get list of IDs from SPARQL.
     * @param string $sparql Query
     * @param string $item Variable name where IDs are stored
     * @return string[] List of entity ID strings
     */
    private function getIDs( $sparql, $item ) {
        $data = $this->client->query( $sparql );
        if ( $data ) {
            return array_map( function ( $row ) use ( $item ) {
                return str_replace( $this->baseUri, '', $row[$item] );
            }, $data );
        }
        return [];
    }

    /**
     * Get coherent units (those with a conversion factor of 1 to themselves).
     * @param string $filter Unit filter
     * @return array[]
     */
    private function getCoherentUnits( $filter ) {
        $query = <<<QUERY
SELECT ?unit ?unitLabel WHERE {
  ?unit wdt:P31 wd:Q69197847 .
  $filter
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
}
QUERY;
        $coherentUnitsData = $this->client->query( $query );
        '@phan-var array[] $coherentUnitsData';
        $coherentUnits = [];
        // arrange better lookup
        foreach ( $coherentUnitsData as $coherent ) {
            $item = substr( $coherent['unit'], $this->baseLen );
            $coherentUnits[$item] = $coherent;
        }
        return $coherentUnits;
    }

    /**
     * Retrieve the list of convertable units.
     * @param string $filter
     * @return array[]|false List of units that can be converted
     */
    private function getConvertableUnits( $filter ) {
        $unitsQuery = <<<QUERY
SELECT REDUCED ?unit ?si ?siUnit ?unitLabel ?siUnitLabel WHERE {
  ?unit wdt:P31 ?type .
  ?type wdt:P279* wd:Q47574 .
  # Not a currency
  FILTER (?type != wd:Q8142)
  # Not a cardinal number
  FILTER NOT EXISTS { ?unit wdt:P31 wd:Q163875 }
  $filter
  # Has conversion to SI Units
  ?unit p:P2370/psv:P2370 [ wikibase:quantityAmount ?si; wikibase:quantityUnit ?siUnit ] .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
  }
# Enable this to select only units that are actually used
}
QUERY;
        return $this->client->query( $unitsQuery );
    }

    /**
     * Format units as CSV
     * @param array[] $convertUnits
     * @return string
     */
    private function formatCSV( array $convertUnits ) {
        $str = '';
        foreach ( $convertUnits as $name => $data ) {
            $str .= "$name,{$data['unit']},{$data['factor']}\n";
        }
        return $str;
    }

    /**
     * @param string $err
     * @param int $die If > 0, go ahead and die out using this int as the code
     */
    protected function error( $err, $die = 0 ) {
        if ( !$this->silent ) {
            parent::error( $err, $die );
        } elseif ( $die > 0 ) {
            die( $die );
        }
    }

}

$maintClass = UpdateUnits::class;
require_once RUN_MAINTENANCE_IF_MAIN;