wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/includes/Rdf/RdfBuilder.php

Summary

Maintainability
C
1 day
Test Coverage
<?php

declare( strict_types = 1 );

namespace Wikibase\Repo\Rdf;

use SplQueue;
use Wikibase\DataModel\Entity\EntityDocument;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\DataModel\Entity\PropertyId;
use Wikibase\Lib\Store\EntityRevisionLookup;
use Wikibase\Repo\Content\EntityContentFactory;
use Wikimedia\Purtle\RdfWriter;

/**
 * RDF mapping for the Wikibase data model.
 *
 * Writes entities, entity stubs, redirects and per-entity metadata to an RdfWriter.
 * While doing so it records which entities were mentioned, so that they can later be
 * resolved (emitted as stubs or redirects) via resolveMentionedEntities().
 *
 * @license GPL-2.0-or-later
 */
class RdfBuilder implements EntityRdfBuilder, EntityStubRdfBuilder, EntityMentionListener {

    /**
     * A list of entities mentioned/touched to or by this builder.
     * The prefixed entity IDs are used as keys in the array, the value 'true'
     * is used to indicate that the entity has been resolved. If the value
     * is an EntityId, this indicates that the entity has not yet been resolved
     * (defined).
     *
     * @var (bool|EntityId)[]
     */
    private $entitiesResolved = [];

    /**
     * A queue of entities to output by this builder.
     * Filled by subEntityMentioned() and drained by addQueuedEntities().
     *
     * @var SplQueue<EntityDocument>
     */
    private $entitiesToOutput;

    /**
     * Bit mask of RdfProducer::PRODUCE_* flags selecting which aspects this builder
     * emits; tested by shouldProduce().
     *
     * @var int
     */
    private $produceWhat;

    /**
     * @var RdfWriter
     */
    private $writer;

    /**
     * @var RdfVocabulary
     */
    private $vocabulary;

    /**
     * Used to derive page properties from an entity, see addEntityPageProps().
     *
     * @var EntityContentFactory
     */
    private $entityContentFactory;

    /**
     * Entity-specific RDF builders to apply when building RDF for an entity,
     * looked up by entity type.
     *
     * @var EntityRdfBuilder[]
     */
    private $entityRdfBuilders;

    /**
     * Entity-specific stub RDF builders factory
     *
     * @var EntityStubRdfBuilderFactory
     */
    private $entityStubRdfBuilderFactory;

    /**
     * Entity-specific stub RDF builders to apply when building RDF for an entity,
     * looked up by entity type.
     *
     * @var EntityStubRdfBuilder[]
     */
    private $entityStubRdfBuilders;

    /**
     * Used to find out whether a mentioned entity exists, is a redirect, or is missing.
     *
     * @var EntityRevisionLookup
     */
    private $entityRevisionLookup;

    /**
     * @param RdfVocabulary $vocabulary
     * @param EntityRdfBuilderFactory $entityRdfBuilderFactory provides the per-type entity builders
     * @param int $flavor bit mask of RdfProducer::PRODUCE_* flags
     * @param RdfWriter $writer the writer all output goes to
     * @param DedupeBag $dedupeBag handed through to the per-type entity builders
     * @param EntityContentFactory $entityContentFactory
     * @param EntityStubRdfBuilderFactory $entityStubRdfBuilderFactory provides the per-type stub builders
     * @param EntityRevisionLookup $entityRevisionLookup
     */
    public function __construct(
        RdfVocabulary $vocabulary,
        EntityRdfBuilderFactory $entityRdfBuilderFactory,
        int $flavor,
        RdfWriter $writer,
        DedupeBag $dedupeBag,
        EntityContentFactory $entityContentFactory,
        EntityStubRdfBuilderFactory $entityStubRdfBuilderFactory,
        EntityRevisionLookup $entityRevisionLookup
    ) {
        $this->entitiesToOutput = new SplQueue();
        $this->vocabulary = $vocabulary;
        $this->writer = $writer;
        $this->produceWhat = $flavor;
        $this->entityContentFactory = $entityContentFactory;
        $this->entityStubRdfBuilderFactory = $entityStubRdfBuilderFactory;
        $this->entityRevisionLookup = $entityRevisionLookup;

        // This builder registers itself as the mention listener of the per-type
        // builders, so all fields above must be initialized before this call.
        $this->entityRdfBuilders = $entityRdfBuilderFactory->getEntityRdfBuilders(
            $flavor,
            $vocabulary,
            $writer,
            $this,
            $dedupeBag
        );

        $this->entityStubRdfBuilders = $this->entityStubRdfBuilderFactory->getEntityStubRdfBuilders(
            $vocabulary,
            $writer
        );
    }

    /**
     * Start writing RDF document: declares all namespace prefixes, then starts the writer.
     * Note that this builder does not have to finish it, it may be finished later.
     */
    public function startDocument(): void {
        foreach ( $this->getNamespaces() as $gname => $uri ) {
            $this->writer->prefix( $gname, $uri );
        }

        $this->writer->start();
    }

    /**
     * Finish writing the document
     * After that, nothing should ever be written into the document.
     */
    public function finishDocument(): void {
        $this->writer->finish();
    }

    /**
     * Returns the RDF generated by the builder, as drained from the writer.
     */
    public function getRDF(): string {
        return $this->writer->drain();
    }

    /**
     * Returns a map of namespace names to URIs
     */
    public function getNamespaces(): array {
        return $this->vocabulary->getNamespaces();
    }

    /**
     * Get map of page properties used by this builder
     *
     * @return string[][]
     */
    public function getPagePropertyDefs(): array {
        return $this->vocabulary->getPagePropertyDefs();
    }

    /**
     * Should we produce this aspect?
     *
     * @param int $what one of the RdfProducer::PRODUCE_* flags
     *
     * @return bool true if any bit of $what is set in this builder's flavor
     */
    private function shouldProduce( int $what ): bool {
        return ( $this->produceWhat & $what ) !== 0;
    }

    /**
     * Records the entity for later resolution, if resolved entities are to be produced.
     *
     * @see EntityMentionListener::entityReferenceMentioned
     */
    public function entityReferenceMentioned( EntityId $id ): void {
        if ( $this->shouldProduce( RdfProducer::PRODUCE_RESOLVED_ENTITIES ) ) {
            $this->entityToResolve( $id );
        }
    }

    /**
     * Records the property for later resolution, if properties are to be produced.
     *
     * @see EntityMentionListener::propertyMentioned
     */
    public function propertyMentioned( PropertyId $id ): void {
        if ( $this->shouldProduce( RdfProducer::PRODUCE_PROPERTIES ) ) {
            $this->entityToResolve( $id );
        }
    }

    /**
     * Queues the sub entity for output; it is written by the enclosing addEntity() call
     * once the entity currently being written is done.
     *
     * @see EntityMentionListener::subEntityMentioned
     */
    public function subEntityMentioned( EntityDocument $entity ): void {
        $this->entitiesToOutput->enqueue( $entity );
    }

    /**
     * Registers an entity as mentioned.
     * Will be recorded as unresolved
     * if it wasn't already marked as resolved.
     */
    private function entityToResolve( EntityId $entityId ): void {
        $prefixedId = $entityId->getSerialization();

        // Never downgrade an entry: a known ID keeps its state, resolved or not.
        if ( !isset( $this->entitiesResolved[$prefixedId] ) ) {
            $this->entitiesResolved[$prefixedId] = $entityId;
        }
    }

    /**
     * Registers an entity as resolved.
     */
    private function entityResolved( EntityId $entityId ): void {
        $prefixedId = $entityId->getSerialization();
        $this->entitiesResolved[$prefixedId] = true;
    }

    /**
     * Adds revision information about an entity's revision to the RDF graph.
     *
     * License and software version are only added when
     * RdfProducer::PRODUCE_VERSION_INFO is set.
     *
     * @todo extract into MetaDataRdfBuilder
     *
     * @param EntityId $entityId
     * @param int $revision
     * @param string $timestamp in TS_MW format
     */
    public function addEntityRevisionInfo( EntityId $entityId, int $revision, string $timestamp ): void {
        $timestamp = wfTimestamp( TS_ISO_8601, $timestamp );
        $entityLName = $this->vocabulary->getEntityLName( $entityId );
        $entityRepositoryName = $this->vocabulary->getEntityRepositoryName( $entityId );

        $this->writer->about( $this->vocabulary->dataNamespaceNames[$entityRepositoryName], $entityLName )
            ->a( RdfVocabulary::NS_SCHEMA_ORG, "Dataset" )
            ->say( RdfVocabulary::NS_SCHEMA_ORG, 'about' )
            ->is( $this->vocabulary->entityNamespaceNames[$entityRepositoryName], $entityLName );

        if ( $this->shouldProduce( RdfProducer::PRODUCE_VERSION_INFO ) ) {
            // Dumps don't need version/license info for each entity, since it is included in the dump header
            $this->writer
                ->say( RdfVocabulary::NS_CC, 'license' )->is( $this->vocabulary->getLicenseUrl() )
                ->say( RdfVocabulary::NS_SCHEMA_ORG, 'softwareVersion' )->value( RdfVocabulary::FORMAT_VERSION );
        }

        // @phan-suppress-next-line PhanTypeMismatchArgumentProbablyReal T302667
        $this->writer->say( RdfVocabulary::NS_SCHEMA_ORG, 'version' )->value( $revision, 'xsd', 'integer' )
            ->say( RdfVocabulary::NS_SCHEMA_ORG, 'dateModified' )->value( $timestamp, 'xsd', 'dateTime' );
    }

    /**
     * Add page props information.
     * To ensure consistent data, this recalculates the page props from the entity content;
     * it does not actually query the page_props table.
     *
     * Does nothing unless RdfProducer::PRODUCE_PAGE_PROPS is set, and only writes
     * properties for which the vocabulary defines a 'name'.
     */
    public function addEntityPageProps( EntityDocument $entity ): void {
        if ( !$this->shouldProduce( RdfProducer::PRODUCE_PAGE_PROPS ) ) {
            return;
        }
        $pagePropertyDefs = $this->getPagePropertyDefs();
        if ( !$pagePropertyDefs ) {
            return;
        }
        $content = $this->entityContentFactory->newFromEntity( $entity );
        $entityPageProperties = $content->getEntityPageProperties();
        if ( !$entityPageProperties ) {
            return;
        }

        $entityId = $entity->getId();
        $entityRepositoryName = $this->vocabulary->getEntityRepositoryName( $entityId );
        $entityLName = $this->vocabulary->getEntityLName( $entityId );

        foreach ( $entityPageProperties as $name => $value ) {
            if ( !isset( $pagePropertyDefs[$name]['name'] ) ) {
                // Not a page property the vocabulary knows how to express.
                continue;
            }

            if ( isset( $pagePropertyDefs[$name]['type'] ) ) {
                // Cast to the declared PHP type before handing the value to the writer.
                settype( $value, $pagePropertyDefs[$name]['type'] );
            }

            $this->writer->about( $this->vocabulary->dataNamespaceNames[$entityRepositoryName], $entityLName )
                ->say( RdfVocabulary::NS_ONTOLOGY, $pagePropertyDefs[$name]['name'] )
                ->value( $value );
        }
    }

    /**
     * Adds meta-information about an entity id (such as the ID and type) to the RDF graph.
     *
     * @todo extract into MetaDataRdfBuilder
     */
    private function addEntityMetaData( EntityId $entityId ): void {
        $entityLName = $this->vocabulary->getEntityLName( $entityId );
        $entityRepoName = $this->vocabulary->getEntityRepositoryName( $entityId );

        $this->writer->about(
            $this->vocabulary->entityNamespaceNames[$entityRepoName],
            $entityLName
        )
            ->a( RdfVocabulary::NS_ONTOLOGY, $this->vocabulary->getEntityTypeName( $entityId->getEntityType() ) );
    }

    /**
     * Add an entity to the RDF graph, including all supported structural components
     * of the entity and its sub entities.
     *
     * @param EntityDocument $entity the entity to output.
     */
    public function addEntity( EntityDocument $entity ): void {
        $this->addSingleEntity( $entity );
        $this->addQueuedEntities();
    }

    /**
     * Add a single entity to the RDF graph, including all supported structural components
     * of the entity, and mark it as resolved.
     *
     * @param EntityDocument $entity the entity to output.
     */
    private function addSingleEntity( EntityDocument $entity ): void {
        $this->addEntityMetaData( $entity->getId() );

        $type = $entity->getType();
        if ( !empty( $this->entityRdfBuilders[$type] ) ) {
            $this->entityRdfBuilders[$type]->addEntity( $entity );
        }

        $this->entityResolved( $entity->getId() );
    }

    /**
     * Add the RDF serialization of all entities in the entitiesToOutput queue
     */
    private function addQueuedEntities(): void {
        // Writing a queued entity may enqueue further sub entities; keep going until empty.
        while ( !$this->entitiesToOutput->isEmpty() ) {
            $this->addSingleEntity( $this->entitiesToOutput->dequeue() );
        }
    }

    /**
     * Add stubs for any entities that were previously mentioned (e.g. as properties
     * or data values).
     *
     * Each unresolved entity is looked up once: existing entities get a stub and are
     * marked as resolved, redirects are written via addEntityRedirect(), and
     * nonexistent entities are skipped.
     */
    public function resolveMentionedEntities(): void {
        // Keys of entities found to be nonexistent during this call. They stay
        // unresolved, but must not be looked up again by a later pass.
        $nonexistent = [];

        // If we encounter redirects, the redirect targets may need resolving as well.
        // They get added to $this->entitiesResolved while a pass is in progress, so
        // another pass is needed to process them. This terminates even for circular
        // redirect chains: every redirect source is marked as resolved, so it is
        // skipped the second time it is encountered.
        do {
            $hasRedirect = false;

            $this->markStubEntityDataForPrefetching( $this->entitiesResolved );

            // A by-value foreach walks a snapshot of the array; entries added by
            // addEntityRedirect() below are only seen by the next pass.
            foreach ( $this->entitiesResolved as $prefixedId => $id ) {
                // $id is true if the entity has already been resolved,
                // or an EntityId to resolve.
                if ( !( $id instanceof EntityId ) || isset( $nonexistent[$prefixedId] ) ) {
                    continue;
                }

                $lookupStatus = $this->entityRevisionLookup->getLatestRevisionId( $id )
                    ->onNonexistentEntity(
                        static function () {
                            return 'nonexistent';
                        }
                    )->onConcreteRevision(
                        static function () {
                            return 'concrete revision';
                        }
                    )->onRedirect(
                        static function ( $revisionId, $redirectsTo ) {
                            return $redirectsTo;
                        }
                    )->map();

                if ( $lookupStatus instanceof EntityId ) {
                    // NOTE: this may add more entries to the end of entitiesResolved;
                    $this->addEntityRedirect( $id, $lookupStatus );
                    $hasRedirect = true;
                } elseif ( $lookupStatus === 'concrete revision' ) {
                    $this->addEntityStub( $id );
                    // The stub defines the entity; do not emit it again on later passes or calls.
                    $this->entityResolved( $id );
                } else {
                    $nonexistent[$prefixedId] = true;
                }
            }
        } while ( $hasRedirect );
    }

    /**
     * Adds stub information for the given EntityId to the RDF graph.
     * Stub information means meta information, plus whatever the stub builder
     * registered for the entity's type adds.
     */
    public function addEntityStub( EntityId $entityId ): void {
        $this->markStubEntityDataForPrefetching( [ $entityId ] );
        $this->addEntityMetaData( $entityId );

        $type = $entityId->getEntityType();
        if ( !empty( $this->entityStubRdfBuilders[ $type ] ) ) {
            $this->entityStubRdfBuilders[ $type ]->addEntityStub( $entityId );
        }
    }

    /**
     * Declares $from to be an alias for $to, using the owl:sameAs relationship.
     *
     * Marks $from as resolved and, if RdfProducer::PRODUCE_RESOLVED_ENTITIES is set,
     * registers $to as an entity that still needs resolving.
     */
    public function addEntityRedirect( EntityId $from, EntityId $to ): void {
        $fromLName = $this->vocabulary->getEntityLName( $from );
        $fromRepoName = $this->vocabulary->getEntityRepositoryName( $from );
        $toLName = $this->vocabulary->getEntityLName( $to );
        $toRepoName = $this->vocabulary->getEntityRepositoryName( $to );

        $this->writer->about( $this->vocabulary->entityNamespaceNames[$fromRepoName], $fromLName )
            ->say( 'owl', 'sameAs' )
            ->is( $this->vocabulary->entityNamespaceNames[$toRepoName], $toLName );

        $this->entityResolved( $from );

        if ( $this->shouldProduce( RdfProducer::PRODUCE_RESOLVED_ENTITIES ) ) {
            $this->entityToResolve( $to );
        }
    }

    /**
     * Create header structure for the dump (this makes RdfProducer::PRODUCE_VERSION_INFO redundant)
     *
     * @param int $timestamp Timestamp (for testing), passed to wfTimestamp().
     *  NOTE(review): the default of 0 presumably makes wfTimestamp() use the
     *  current time - confirm.
     */
    public function addDumpHeader( int $timestamp = 0 ): void {
        // TODO: this should point to "this document"
        $this->writer->about( RdfVocabulary::NS_ONTOLOGY, 'Dump' )
            ->a( RdfVocabulary::NS_SCHEMA_ORG, "Dataset" )
            ->a( 'owl', 'Ontology' )
            ->say( RdfVocabulary::NS_CC, 'license' )->is( $this->vocabulary->getLicenseUrl() )
            ->say( RdfVocabulary::NS_SCHEMA_ORG, 'softwareVersion' )->value( RdfVocabulary::FORMAT_VERSION )
            ->say( RdfVocabulary::NS_SCHEMA_ORG, 'dateModified' )
            ->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
            ->say( 'owl', 'imports' )->is( RdfVocabulary::getOntologyURI() );
    }

    /**
     * Tells the stub builders that support prefetching which entities are about to be
     * stubbed.
     *
     * @param array $entityIds values that are not EntityId instances (such as the
     *  'true' markers in $this->entitiesResolved) are ignored
     */
    private function markStubEntityDataForPrefetching( array $entityIds ): void {
        foreach ( $entityIds as $id ) {
            if ( !( $id instanceof EntityId ) ) {
                continue;
            }

            $stubBuilder = $this->entityStubRdfBuilders[$id->getEntityType()] ?? null;
            if ( $stubBuilder instanceof PrefetchingEntityStubRdfBuilder ) {
                $stubBuilder->markForPrefetchingEntityStub( $id );
            }
        }
    }

}