classes/parsers/RDFIO_URIToWikiTitleConverter.php

Summary

Maintainability
B
6 hrs
Test Coverage
<?php

/**
 * Exception type declared alongside the RDFIOURIToTitleConverter class.
 * Judging by its name it is meant to signal that no wiki title could be
 * determined for a URI.
 * NOTE(review): nothing in this file throws or catches it - confirm whether
 * it is still used elsewhere before relying on it.
 */
class WikiTitleNotFoundException extends MWException { }

/**
 * Converter that takes an RDF URI and returns a suitable Wiki title for that URI
 * based on various strategies, which are tried one at a time, until a usable title 
 * is found.
 * @author samuel
 *
 */
class RDFIOURIToTitleConverter {

    /**
     * Triples handed to the constructor. Not read by this class itself; the
     * property converter subclass passes them on to its fallback converter.
     * @var mixed
     */
    protected $arc2Triples = null;

    /**
     * Resource index, read as array( subject URI => array( property URI => array( value, ... ) ) ).
     * @var array|null
     */
    protected $arc2ResourceIndex = null;

    /**
     * Namespace mappings, read as array( namespace URI => prefix ).
     * @var array|null
     */
    protected $arc2NSPrefixes = null;

    /**
     * Store wrapper used to look up titles that already exist for a URI.
     * @var RDFIOARC2StoreWrapper
     */
    protected $arc2Store = null;

    /**
     * @param mixed $arc2Triples Stored as is (see $arc2Triples)
     * @param array|null $arc2ResourceIndex Subject => property => values index
     * @param array|null $arc2NSPrefixes Namespace URI => prefix mappings
     */
    public function __construct( $arc2Triples, $arc2ResourceIndex, $arc2NSPrefixes ) {
        $this->arc2Store = new RDFIOARC2StoreWrapper();

        // Store parameters as class variables
        $this->arc2Triples = $arc2Triples;
        $this->arc2ResourceIndex = $arc2ResourceIndex;
        $this->arc2NSPrefixes = $arc2NSPrefixes;
    }

    /**
     * The main method, converting from URI:s to wiki titles.
     * Tries each conversion strategy in turn and returns the first result
     * that is still non-empty after URL-decoding and title cleaning.
     * NOTE: Properties are taken care of by a special method below!
     * @param string $uri
     * @return string|null $wikiTitle The title, or null when every strategy
     *         came up empty (only possible when $uri itself cleans to '').
     */
    public function convert( $uri ) {
        // Define the conversion functions to try, in
        // specified order (the first one first).
        // You'll find them defined further below in this file.
        $convStrategies = array(
            'getExistingTitleForURI',
            'applyGlobalSettingForPropertiesToUseAsWikiTitle',
            'parseBNode',
            'shortenURINamespaceToAliasInSourceRDF',
            'extractLocalPartFromURI',
            'useValueAsIs'
        );

        foreach ( $convStrategies as $currStrategy ) {
            $title = $this->$currStrategy( $uri );

            // null (e.g. nothing found in the store) or a non-scalar value is
            // "no result"; never hand those to the string functions below.
            if ( !is_scalar( $title ) ) {
                continue;
            }

            // If a part of the URL was used.
            // NOTE(review): this also decodes titles that came from the store
            // or from literal values, so '+' and '%xx' in those are rewritten
            // as well - confirm that this is intended.
            $title = urldecode( (string)$title );
            $title = $this->cleanPageTitle( $title );

            if ( $title !== '' ) {
                return $title;
            }
        }

        return null;
    }

    /////// CONVERSION STRATEGIES ///////

    /**
     * Strategy 1: Use existing title for URI
     * @param string $uri
     * @return mixed Whatever the store wrapper returns for the URI
     */
    public function getExistingTitleForURI( $uri ) {
        return $this->arc2Store->getWikiTitleByEquivalentURI( $uri );
    }

    /**
     * Strategy 2: Use configured properties to get the title.
     * Looks up $uri as a subject in the resource index and takes the first
     * value of a property listed in $rdfiogTitleProperties (which is set to
     * its defaults first if it is not defined).
     * When the subject carries several title properties, the one that comes
     * last in the subject's own property list wins.
     * NOTE(review): the order of $rdfiogTitleProperties is therefore not a
     * priority order - confirm whether it should be.
     * @param string $uri
     * @return mixed The title candidate, or '' if none was found
     */
    public function applyGlobalSettingForPropertiesToUseAsWikiTitle( $uri ) {
        global $rdfiogTitleProperties;

        if ( !$this->globalSettingForPropertiesToUseAsWikiTitleExists() ) {
            $this->setglobalSettingForPropertiesToUseAsWikiTitleToDefault();
        }

        $index = $this->arc2ResourceIndex;
        if ( !is_array( $index ) || !is_array( $rdfiogTitleProperties ) || !is_string( $uri ) ) {
            return '';
        }

        // Direct lookup of the subject instead of scanning the whole index
        // for every URI that gets converted.
        if ( !isset( $index[$uri] ) || !is_array( $index[$uri] ) ) {
            return '';
        }

        $title = '';
        foreach ( $index[$uri] as $prop => $obj ) {
            $hasValue = is_array( $obj ) && isset( $obj[0] );
            if ( $hasValue && in_array( $prop, $rdfiogTitleProperties, true ) ) {
                $title = $obj[0];
            }
        }

        return $title;
    }

    /**
     * Strategy 3: Check if $uri is a blank node (starts with '_:'), and if so,
     * build a title of the form 'Blank_node_<id>'.
     * @param string $uri
     * @return string The blank node title, or '' if $uri is not a blank node
     */
    public function parseBNode( $uri ) {
        if ( !is_string( $uri ) || strncmp( $uri, '_:', 2 ) !== 0 ) {
            return '';
        }

        $parts = explode( ':', $uri );

        // The first three characters of the identifier are dropped.
        // NOTE(review): presumably this strips a fixed 'arc' prefix that the
        // parser puts on generated blank node ids - confirm.
        return 'Blank_node_' . substr( $parts[1], 3 );
    }

    /**
     * Strategy 4: Abbreviate the namespace to its NS prefix as configured in
     * mappings in the parser (default ones, or provided as part of the
     * imported data), combined with the mappings in $rdfiogBaseURIs.
     * @param string $uri
     * @return string 'prefix:localpart', or '' if no namespace matched
     */
    public function shortenURINamespaceToAliasInSourceRDF( $uri ) {
        global $rdfiogBaseURIs;

        // The prefixes may never have been supplied (null); array_merge()
        // needs real arrays on both sides.
        $nsPrefixes = is_array( $this->arc2NSPrefixes ) ? $this->arc2NSPrefixes : array();

        // The same, but according to mappings from LocalSettings.php
        if ( is_array( $rdfiogBaseURIs ) ) {
            $nsPrefixes = array_merge( $nsPrefixes, $rdfiogBaseURIs );
        }

        return $this->abbreviateParserNSPrefixes( $uri, $nsPrefixes );
    }

    /**
     * Strategy 5: As a default, just try to get the local part of the URL
     * @param string $uri
     * @return string The local part as found by splitURI(), possibly ''
     */
    public function extractLocalPartFromURI( $uri ) {
        $parts = $this->splitURI( $uri );
        return (string)$parts[1];
    }

    /**
     * Strategy 6: Just use the value as is, as if it was a literal value
     * @param string $uri
     * @return string
     */
    public function useValueAsIs( $uri ) {
        return $uri;
    }

    /////// HELPER METHODS ///////

    /**
     * Just tell if $rdfiogTitleProperties is set or not.
     * @return bool
     */
    public function globalSettingForPropertiesToUseAsWikiTitleExists() {
        global $rdfiogTitleProperties;
        return isset( $rdfiogTitleProperties );
    }

    /**
     * Default settings for which RDF properties to use for getting
     * possible candidates for wiki page title names.
     * Overwrites the global $rdfiogTitleProperties.
     */
    public function setglobalSettingForPropertiesToUseAsWikiTitleToDefault() {
        global $rdfiogTitleProperties;
        $rdfiogTitleProperties = array(
            'http://semantic-mediawiki.org/swivt/1.0#page', // Suggestion for new property
            'http://www.w3.org/2000/01/rdf-schema#label',
            'http://purl.org/dc/terms/title',
            'http://purl.org/dc/elements/1.1/title',
            // The SKOS property is called prefLabel ...
            'http://www.w3.org/2004/02/skos/core#prefLabel',
            // ... the spelling below is kept only so that data relying on the
            // previous default keeps working.
            'http://www.w3.org/2004/02/skos/core#preferredLabel',
            'http://xmlns.com/foaf/0.1/name'
        );
    }

    /**
     * Use the namespaces from the RDF / SPARQL source, to shorten the URIs.
     * The first namespace (in array order) that $uri starts with is used.
     * @param string $uri
     * @param array $nsPrefixes array( namespace URI => prefix )
     * @return string 'prefix:localpart', or '' when no namespace matches or
     *         when the prefix or the local part would be empty
     */
    public function abbreviateParserNSPrefixes( $uri, $nsPrefixes ) {
        if ( !is_array( $nsPrefixes ) ) {
            return '';
        }

        foreach ( $nsPrefixes as $namespace => $prefix ) {
            // Non-string keys could never match the strict comparison below,
            // and an empty namespace would "match" every URI.
            if ( !is_string( $namespace ) || $namespace === '' ) {
                continue;
            }

            $nslength = strlen( $namespace );
            if ( substr( $uri, 0, $nslength ) !== $namespace ) {
                continue;
            }

            // substr() gives false rather than '' on older PHP versions when
            // the URI is exactly the namespace, hence the casts.
            $localpart = (string)substr( $uri, $nslength );
            $basepart = (string)$prefix;

            // Make sure both basepart and localpart contains anything before proceeding
            if ( $basepart === '' || $localpart === '' ) {
                return '';
            }

            return $basepart . ':' . $localpart;
        }

        return '';
    }


    /**
     * Customized version of the splitURI($uri) of the ARC2 library (http://arc.semsol.org)
     * Splits a URI into its base part and local part, and returns them as an
     * array of two strings. The local part is '' when no split point is found.
     * @param string $uri
     * @return array array( base part, local part )
     */
    public function splitURI( $uri ) {
        global $rdfiogBaseURIs;
        /* ADAPTED FROM ARC2 WITH SOME MODIFICATIONS
         * the following namespaces may lead to conflated URIs,
         * we have to set the split position manually
         */
        if ( strpos( $uri, 'www.w3.org' ) !== false ) {
            $specials = array(
                'http://www.w3.org/XML/1998/namespace',
                'http://www.w3.org/2005/Atom',
                'http://www.w3.org/1999/xhtml',
            );
            if ( is_array( $rdfiogBaseURIs ) ) {
                $specials = array_merge( $specials, $rdfiogBaseURIs );
            }
            foreach ( $specials as $ns ) {
                // strpos() must not be given an empty needle
                if ( !is_string( $ns ) || $ns === '' ) {
                    continue;
                }
                if ( strpos( $uri, $ns ) === 0 ) {
                    $localPart = (string)substr( $uri, strlen( $ns ) );
                    // Only split here if the rest does not itself start with
                    // a separator; otherwise fall through to auto-splitting.
                    if ( !preg_match( '/^[\/\#]/', $localPart ) ) {
                        return array( $ns, $localPart );
                    }
                }
            }
        }
        // auto-splitting after the last '#'
        if ( preg_match( '/^(.*[\#])([^\#]+)$/', $uri, $matches ) ) {
            return array( $matches[1], $matches[2] );
        }
        // auto-splitting after the last ':' when no '/' follows it, e.g. urn:foo:bar
        if ( preg_match( '/^(.*[\:])([^\:\/]+)$/', $uri, $matches ) ) {
            return array( $matches[1], $matches[2] );
        }
        // auto-splitting after the last '/'
        if ( preg_match( '/^(.*[\/])([^\/]+)$/', $uri, $matches ) ) {
            return array( $matches[1], $matches[2] );
        }
        return array( $uri, '' );
    }

    /**
     * Remove some characters that are not allowed in Wiki titles.
     * '[', ']', '{{' and '}}' are dropped and '#' becomes ':'. The
     * replacements are applied one after the other, in the order listed.
     * @param string $title
     * @return string $title
     */
    public function cleanPageTitle( $title ) {
        $replacements = array(
            '[' => '',
            ']' => '',
            '{{' => '',
            '}}' => '',
            '#' => ':',
        );

        // With array arguments str_replace() works through the pairs in
        // order, each one on the result of the previous one.
        return str_replace(
            array_keys( $replacements ),
            array_values( $replacements ),
            (string)$title
        );
    }
}

/**
 * Subclass of the more general RDFIOURIToTitleConverter.
 * For normal wiki pages.
 * The class body is empty, so all behaviour is inherited unchanged from
 * RDFIOURIToTitleConverter; the class only provides a distinct name for the
 * converter used for non-property pages.
 */
class RDFIOURIToWikiTitleConverter extends RDFIOURIToTitleConverter {}

/**
 * Subclass of the more general RDFIOURIToTitleConverter
 * For property pages (those where titles start with "Property:")  
 */
class RDFIOURIToPropertyTitleConverter extends RDFIOURIToTitleConverter {

    /**
     * Convert a property URI to a property title.
     * A title already known to the store for this URI (looked up with the
     * second argument set to true) takes precedence. If there is none, the
     * URI is run through a regular RDFIOURIToWikiTitleConverter built from
     * the same triples, resource index and namespace prefixes. Either way
     * the result is passed through cleanPageTitle() before it is returned.
     * @param string $propertyURI
     * @return string $propertyTitle
     */
    function convert( $propertyURI ) {
        $title = $this->arc2Store->getWikiTitleByEquivalentURI( $propertyURI, true );

        if ( $title == "" ) {
            // Nothing stored for this URI: fall back to the generic conversion
            $fallbackConverter = new RDFIOURIToWikiTitleConverter(
                $this->arc2Triples,
                $this->arc2ResourceIndex,
                $this->arc2NSPrefixes
            );
            $title = $fallbackConverter->convert( $propertyURI );
        }

        return $this->cleanPageTitle( $title );
    }
}