wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/includes/Rdf/RdfVocabulary.php

Summary

Maintainability
C
7 hrs
Test Coverage
<?php

namespace Wikibase\Repo\Rdf;

use DataValues\DataValue;
use InvalidArgumentException;
use MediaWiki\Language\LanguageCode;
use OutOfBoundsException;
use Wikibase\DataAccess\EntitySourceDefinitions;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\DataModel\Statement\Statement;
use Wikimedia\Assert\Assert;

/**
 * RDF vocabulary for use in mapping for wikibase data model.
 *
 * @license GPL-2.0-or-later
 */
class RdfVocabulary {

    // Change this when changing data format!
    public const FORMAT_VERSION = '1.0.0';
    private const ONTOLOGY_VERSION = '1.0';

    private const ONTOLOGY_BASE_URI = 'http://wikiba.se/ontology';
    public const NS_ONTOLOGY = 'wikibase'; // wikibase ontology (shared)

    // Nodes
    public const NS_ENTITY = ''; // concept uris
    public const NS_DATA = 'data'; // document uris
    public const NS_STATEMENT = 's'; // statement
    public const NS_REFERENCE = 'ref'; // reference
    public const NS_VALUE = 'v'; // value

    // Predicates
    public const NSP_DIRECT_CLAIM = 't'; // direct assertion entity -> value
    public const NSP_DIRECT_CLAIM_NORM = 'tn'; // direct assertion entity -> value, normalized
    public const NSP_CLAIM = 'p'; // entity -> statement
    public const NSP_CLAIM_STATEMENT = 'ps'; // statement -> simple value
    public const NSP_CLAIM_VALUE = 'psv'; // statement -> deep value
    public const NSP_CLAIM_VALUE_NORM = 'psn'; // statement -> deep value, normalized
    public const NSP_QUALIFIER = 'pq'; // statement -> qualifier
    public const NSP_QUALIFIER_VALUE = 'pqv'; // statement ->  qualifier deep value
    public const NSP_QUALIFIER_VALUE_NORM = 'pqn'; // statement ->  qualifier deep value, normalized
    public const NSP_REFERENCE = 'pr'; // reference -> simple value
    public const NSP_REFERENCE_VALUE = 'prv'; // reference -> deep value
    public const NSP_REFERENCE_VALUE_NORM = 'prn'; // reference -> deep value, normalized
    public const NSP_NOVALUE = 'no'; // novalue class

    // other prefixes
    public const NS_SKOS = 'skos'; // SKOS vocabulary
    public const NS_SCHEMA_ORG = 'schema'; // schema.org vocabulary
    public const NS_CC = 'cc'; // Creative Commons
    public const NS_GEO = 'geo'; // prefix for geolocations
    public const NS_PROV = 'prov'; // for provenance
    private const SKOS_URI = 'http://www.w3.org/2004/02/skos/core#';
    private const SCHEMA_ORG_URI = 'http://schema.org/';
    private const CC_URI = 'http://creativecommons.org/ns#';

    // External URIs
    //FIXME: get from config
    private const MEDIA_URI = 'http://commons.wikimedia.org/wiki/Special:FilePath/';
    //FIXME: get from config
    private const COMMONS_DATA_URI = 'http://commons.wikimedia.org/data/main/';
    private const GEO_URI = 'http://www.opengis.net/ont/geosparql#';
    private const PROV_URI = 'http://www.w3.org/ns/prov#';

    /**
     * URI for unit "1"
     * See: https://phabricator.wikimedia.org/T105432
     */
    public const ONE_ENTITY = 'http://www.wikidata.org/entity/Q199';
    // Ranks
    public const WIKIBASE_RANK_BEST = 'BestRank';

    public const RANK_MAP = [
        Statement::RANK_DEPRECATED => 'DeprecatedRank',
        Statement::RANK_NORMAL => 'NormalRank',
        Statement::RANK_PREFERRED => 'PreferredRank',
    ];

    /** @var string[] Value properties */
    public $claimToValue = [];
    /** @var string[] Value properties for normalized values */
    public $claimToValueNormalized = [];
    /** @var string[] Value properties for normalized values, including for direct claims */
    public $normalizedPropertyValueNamespace = [];

    /**
     * @var string[] Mapping of namespace names to URIs.
     */
    private $namespaces;

    /**
     * @var array Associative array mapping repository names to names specific to the particular repository
     * (ie. containing repository suffix).
     */
    public $entityNamespaceNames = [];

    /** @var string[] */
    public $dataNamespaceNames = [];

    /** @var string[][] */
    public $statementNamespaceNames = [];

    /**
     * @var array[] Associative array mapping repository names to maps, each mapping the "general" property
     * namespace name to the name specific to the particular repository (ie. containing repository suffix).
     */
    public $propertyNamespaceNames = [];

    /**
     * @var string[] Mapping of non-standard to canonical language codes.
     */
    private $canonicalLanguageCodes;

    /**
     * @var string[]
     */
    private $dataTypeUris;

    /**
     * @var string[]
     */
    private static $canonicalLanguageCodeCache = [];

    /**
     * Map of the configured page properties.
     * @var string[][]
     */
    private $pagePropertyDefs;

    /** @var string */
    private $licenseUrl;

    /** @var string[] */
    private $sourceNameByEntityType;

    /**
     * @param string[] $conceptUris Associative array mapping repository names to base URIs for entity concept URIs.
     * @param string[] $dataUris Associative array mapping source/repository names to base URIs for entity description URIs.
     * @param EntitySourceDefinitions $entitySourceDefinitions
     * @param string[] $rdfTurtleNodePrefixes
     * @param string[] $rdfTurtlePredicatePrefixes
     * @param string[] $canonicalLanguageCodes Mapping of non-standard to canonical language codes.
     * @param string[] $dataTypeUris Mapping of property data type IDs to their URIs,
     *                 if different from the default mapping.
     * @param string[][] $pagePropertyDefs Mapping of page props, e.g.:
     *                 pageProp => [ 'name' => wikibase predicate, 'type' => integer ]
     *                 All predicates will be prefixed with wikibase:
     * @param string $licenseUrl
     */
    public function __construct(
        array $conceptUris,
        array $dataUris,
        EntitySourceDefinitions $entitySourceDefinitions,
        array $rdfTurtleNodePrefixes,
        array $rdfTurtlePredicatePrefixes,
        array $canonicalLanguageCodes = [],
        array $dataTypeUris = [],
        array $pagePropertyDefs = [],
        string $licenseUrl = 'http://creativecommons.org/publicdomain/zero/1.0/'
    ) {
        Assert::parameterElementType( 'string', $conceptUris, '$conceptUris' );
        Assert::parameterElementType( 'string', $dataUris, '$dataUris' );

        Assert::parameter(
            array_keys( $conceptUris ) === array_keys( $dataUris ),
            '$dataUris',
            'must have values defined for all keys that $conceptUris'
        );

        $this->canonicalLanguageCodes = $canonicalLanguageCodes;
        $this->dataTypeUris = $dataTypeUris;
        $this->pagePropertyDefs = $pagePropertyDefs;

        $this->namespaces = [
            'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
            'xsd' => 'http://www.w3.org/2001/XMLSchema#',
            'owl' => 'http://www.w3.org/2002/07/owl#',
            self::NS_ONTOLOGY => self::ONTOLOGY_BASE_URI . "#",

            // external
            self::NS_SKOS => self::SKOS_URI,
            self::NS_SCHEMA_ORG => self::SCHEMA_ORG_URI,
            self::NS_CC => self::CC_URI,
            self::NS_GEO => self::GEO_URI,
            self::NS_PROV => self::PROV_URI,
        ];

        $propertyNamespaces = [
            self::NSP_CLAIM,
            self::NSP_CLAIM_STATEMENT,
            self::NSP_CLAIM_VALUE,
            self::NSP_CLAIM_VALUE_NORM,
            self::NSP_QUALIFIER,
            self::NSP_QUALIFIER_VALUE,
            self::NSP_QUALIFIER_VALUE_NORM,
            self::NSP_REFERENCE,
            self::NSP_REFERENCE_VALUE,
            self::NSP_REFERENCE_VALUE_NORM,
        ];
        $propertyNamespacesUseNodePrefix = [
            self::NSP_DIRECT_CLAIM,
            self::NSP_DIRECT_CLAIM_NORM,
            self::NSP_NOVALUE,
        ];

        foreach ( $conceptUris as $repositoryOrSourceName => $baseUri ) {
            $nodeNamespacePrefix = $rdfTurtleNodePrefixes[$repositoryOrSourceName];
            $predicateNamespacePrefix = $rdfTurtlePredicatePrefixes[$repositoryOrSourceName];

            $this->entityNamespaceNames[$repositoryOrSourceName] = $nodeNamespacePrefix . self::NS_ENTITY;
            $this->dataNamespaceNames[$repositoryOrSourceName] = $predicateNamespacePrefix . self::NS_DATA;
            $this->statementNamespaceNames[$repositoryOrSourceName] = [
                self::NS_STATEMENT => $predicateNamespacePrefix . self::NS_STATEMENT,
                self::NS_REFERENCE => $predicateNamespacePrefix . self::NS_REFERENCE,
                self::NS_VALUE => $predicateNamespacePrefix . self::NS_VALUE,
            ];

            $this->propertyNamespaceNames[$repositoryOrSourceName] = array_combine(
                $propertyNamespaces,
                array_map(
                    function ( $ns ) use ( $predicateNamespacePrefix ) {
                        return $predicateNamespacePrefix . $ns;
                    },
                    $propertyNamespaces
                )
            );
            $this->propertyNamespaceNames[$repositoryOrSourceName] += array_combine(
                $propertyNamespacesUseNodePrefix,
                array_map(
                    function ( $ns ) use ( $nodeNamespacePrefix ) {
                        return $nodeNamespacePrefix . $ns;
                    },
                    $propertyNamespacesUseNodePrefix
                )
            );

            $dataUri = $dataUris[$repositoryOrSourceName];
            $this->namespaces = array_merge(
                $this->namespaces,
                $this->getConceptNamespaces( $nodeNamespacePrefix, $predicateNamespacePrefix, $baseUri, $dataUri )
            );

            $this->claimToValue = array_merge(
                $this->claimToValue,
                [
                    $predicateNamespacePrefix . self::NSP_CLAIM_STATEMENT => $predicateNamespacePrefix . self::NSP_CLAIM_VALUE,
                    $predicateNamespacePrefix . self::NSP_QUALIFIER => $predicateNamespacePrefix . self::NSP_QUALIFIER_VALUE,
                    $predicateNamespacePrefix . self::NSP_REFERENCE => $predicateNamespacePrefix . self::NSP_REFERENCE_VALUE,
                ]
            );
            $this->claimToValueNormalized = array_merge(
                $this->claimToValueNormalized,
                [
                    $predicateNamespacePrefix . self::NSP_CLAIM_STATEMENT => $predicateNamespacePrefix . self::NSP_CLAIM_VALUE_NORM,
                    $predicateNamespacePrefix . self::NSP_QUALIFIER => $predicateNamespacePrefix . self::NSP_QUALIFIER_VALUE_NORM,
                    $predicateNamespacePrefix . self::NSP_REFERENCE => $predicateNamespacePrefix . self::NSP_REFERENCE_VALUE_NORM,
                ]
            );
            $this->normalizedPropertyValueNamespace = array_merge(
                $this->normalizedPropertyValueNamespace,
                [
                    $nodeNamespacePrefix . self::NSP_DIRECT_CLAIM => $nodeNamespacePrefix . self::NSP_DIRECT_CLAIM_NORM,
                    $predicateNamespacePrefix . self::NSP_CLAIM_STATEMENT => $predicateNamespacePrefix . self::NSP_CLAIM_VALUE_NORM,
                    $predicateNamespacePrefix . self::NSP_QUALIFIER => $predicateNamespacePrefix . self::NSP_QUALIFIER_VALUE_NORM,
                    $predicateNamespacePrefix . self::NSP_REFERENCE => $predicateNamespacePrefix . self::NSP_REFERENCE_VALUE_NORM,
                ]
            );
        }

        $this->sourceNameByEntityType = [];
        foreach ( $entitySourceDefinitions->getEntityTypeToDatabaseSourceMapping() as $entityType => $source ) {
            $this->sourceNameByEntityType[$entityType] = $source->getSourceName();
        }

        $this->licenseUrl = $licenseUrl;
    }

    /**
     * Generates mapping of concept namespaces (including the prefix) to URIs
     * using the given URI base.
     *
     * @param string $nodeNamespacePrefix
     * @param string $predicateNamespacePrefix
     * @param string $baseUri
     * @param string $dataUri
     * @return string[]
     */
    private function getConceptNamespaces( $nodeNamespacePrefix, $predicateNamespacePrefix, $baseUri, $dataUri ) {
        $topUri = $this->getConceptUriBase( $baseUri );

        $propUri = $topUri . 'prop/';

        return [
            $nodeNamespacePrefix . self::NS_ENTITY => $baseUri,
            $predicateNamespacePrefix . self::NS_DATA => $dataUri,
            $predicateNamespacePrefix . self::NS_STATEMENT => $baseUri . 'statement/',
            $predicateNamespacePrefix . self::NS_REFERENCE => $topUri . 'reference/',
            $predicateNamespacePrefix . self::NS_VALUE => $topUri . 'value/',
            // predicates
            $nodeNamespacePrefix . self::NSP_DIRECT_CLAIM => $propUri . 'direct/',
            $nodeNamespacePrefix . self::NSP_DIRECT_CLAIM_NORM => $propUri . 'direct-normalized/',
            $predicateNamespacePrefix . self::NSP_CLAIM => $propUri,
            $predicateNamespacePrefix . self::NSP_CLAIM_STATEMENT => $propUri . 'statement/',
            $predicateNamespacePrefix . self::NSP_CLAIM_VALUE => $propUri . 'statement/value/',
            $predicateNamespacePrefix . self::NSP_CLAIM_VALUE_NORM => $propUri . 'statement/value-normalized/',
            $predicateNamespacePrefix . self::NSP_QUALIFIER => $propUri . 'qualifier/',
            $predicateNamespacePrefix . self::NSP_QUALIFIER_VALUE => $propUri . 'qualifier/value/',
            $predicateNamespacePrefix . self::NSP_QUALIFIER_VALUE_NORM => $propUri . 'qualifier/value-normalized/',
            $predicateNamespacePrefix . self::NSP_REFERENCE => $propUri . 'reference/',
            $predicateNamespacePrefix . self::NSP_REFERENCE_VALUE => $propUri . 'reference/value/',
            $predicateNamespacePrefix . self::NSP_REFERENCE_VALUE_NORM => $propUri . 'reference/value-normalized/',
            $nodeNamespacePrefix . self::NSP_NOVALUE => $propUri . 'novalue/',
        ];
    }

    /**
     * Returns the base part of concept URIs
     *
     * @param string $baseUri
     * @return string
     */
    private function getConceptUriBase( $baseUri ) {
        if ( substr( $baseUri, -7 ) === 'entity/' ) {
            return substr( $baseUri, 0, -7 );
        }
        return $baseUri;
    }

    /**
     * Returns a map of namespace names (prefixes) to URIs
     *
     * @return string[]
     */
    public function getNamespaces() {
        return $this->namespaces;
    }

    /**
     * Returns the base URI for a given namespace (aka prefix).
     *
     * @param string $ns The namespace name
     *
     * @throws OutOfBoundsException if $ns is not a known namespace
     * @return string the URI for the given namespace
     */
    public function getNamespaceURI( $ns ) {
        if ( !isset( $this->namespaces[$ns] ) ) {
            throw new OutOfBoundsException();
        }

        return $this->namespaces[$ns];
    }

    /**
     * Returns a local name for the given entity using the given prefix.
     *
     * @param EntityId $entityId
     *
     * @return string
     */
    public function getEntityLName( EntityId $entityId ) {
        return $entityId->getSerialization();
    }

    /**
     * @param EntityId $entityId
     * @return string
     */
    public function getEntityRepositoryName( EntityId $entityId ) {
        return $this->sourceNameByEntityType[$entityId->getEntityType()];
    }

    /**
     * Returns a qname for the given statement using the given prefix.
     *
     * @param Statement $statement
     *
     * @return string
     */
    public function getStatementLName( Statement $statement ) {
        $guid = $statement->getGuid();
        if ( $guid === null ) {
            throw new InvalidArgumentException( 'Can only process statements that have a non-null GUID' );
        }
        return preg_replace( '/[^\w-]/', '-', $guid );
    }

    /**
     * Returns a qname for the given entity type.
     * For well known types, these qnames refer to classes from the Wikibase ontology.
     *
     * @param string $type
     *
     * @return string
     */
    public function getEntityTypeName( $type ) {
        return ucfirst( $type );
    }

    /**
     * Get Wikibase property data type Uri for ontology
     *
     * @param string $dataTypeId
     *
     * @return string
     */
    public function getDataTypeURI( string $dataTypeId ) {
        if ( !isset( $this->dataTypeUris[$dataTypeId] ) ) {
            // if the requested type has no URI in $this->dataTypeUris, add a generic one
            $name = preg_replace( '/\W+/', '', ucwords( strtr( $dataTypeId, '-', ' ' ) ) );
            $this->dataTypeUris[$dataTypeId] = $this->namespaces[self::NS_ONTOLOGY] . $name;
        }

        return $this->dataTypeUris[$dataTypeId];
    }

    /**
     * Get Wikibase value type name for ontology
     *
     * @param DataValue $val
     *
     * @return string
     */
    public function getValueTypeName( DataValue $val ) {
        return ucfirst( $val->getType() ) . 'Value';
    }

    /**
     * Create Commons URL from filename value
     *
     * @param string $file
     *
     * @return string
     */
    public function getMediaFileURI( $file ) {
        return self::MEDIA_URI . rawurlencode( $file );
    }

    /**
     * Create data entry point URL for geo shapes
     *
     * @param string $file
     *
     * @return string
     */
    public function getGeoShapeURI( $file ) {
        return self::COMMONS_DATA_URI . wfUrlencode( $file );
    }

    /**
     * Create data entry point URL for tabular data
     *
     * @param string $file
     *
     * @return string
     */
    public function getTabularDataURI( $file ) {
        return self::COMMONS_DATA_URI . wfUrlencode( $file );
    }

    /**
     * @param string $languageCode Any non-standard or canonical language code
     *
     * @return string Canonical language code
     */
    public function getCanonicalLanguageCode( $languageCode ) {
        // First we check the case since most languages will be cached very quickly
        if ( isset( self::$canonicalLanguageCodeCache[$languageCode] ) ) {
            return self::$canonicalLanguageCodeCache[$languageCode];
        }

        // Wikibase list goes first in case we want to override
        // Like "simple" goes to en-simple not en
        if ( isset( $this->canonicalLanguageCodes[$languageCode] ) ) {
            return $this->canonicalLanguageCodes[$languageCode];
        }

        self::$canonicalLanguageCodeCache[$languageCode] = LanguageCode::bcp47( $languageCode );
        return self::$canonicalLanguageCodeCache[$languageCode];
    }

    /**
     * Return current ontology version URI
     * @return string
     */
    public static function getOntologyURI() {
        return self::ONTOLOGY_BASE_URI . "-" . self::ONTOLOGY_VERSION . ".owl";
    }

    /**
     * Get the map of configured page properties
     * @return string[][]
     */
    public function getPagePropertyDefs() {
        return $this->pagePropertyDefs;
    }

    /**
     * @return string
     */
    public function getLicenseUrl() {
        return $this->licenseUrl;
    }

}