wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/includes/ValidatorBuilders.php

Summary

Maintainability
B
5 hrs
Test Coverage
<?php

declare( strict_types = 1 );

namespace Wikibase\Repo;

use DataValues\DataValue;
use DataValues\TimeValue;
use DataValues\UnboundedQuantityValue;
use MediaWiki\Site\MediaWikiPageNameNormalizer;
use ValueValidators\ValueValidator;
use Wikibase\DataModel\Entity\EntityIdParser;
use Wikibase\DataModel\Entity\EntityIdValue;
use Wikibase\DataModel\Entity\Item;
use Wikibase\DataModel\Entity\Property;
use Wikibase\DataModel\Services\Lookup\EntityLookup;
use Wikibase\Lib\ContentLanguages;
use Wikibase\Repo\Validators\AlternativeValidator;
use Wikibase\Repo\Validators\CommonsMediaExistsValidator;
use Wikibase\Repo\Validators\CompositeValidator;
use Wikibase\Repo\Validators\DataFieldValidator;
use Wikibase\Repo\Validators\DataValueValidator;
use Wikibase\Repo\Validators\EntityExistsValidator;
use Wikibase\Repo\Validators\EntityUriValidator;
use Wikibase\Repo\Validators\InterWikiLinkExistsValidator;
use Wikibase\Repo\Validators\MembershipValidator;
use Wikibase\Repo\Validators\NumberRangeValidator;
use Wikibase\Repo\Validators\NumberValidator;
use Wikibase\Repo\Validators\RegexValidator;
use Wikibase\Repo\Validators\StringLengthValidator;
use Wikibase\Repo\Validators\TimestampPrecisionValidator;
use Wikibase\Repo\Validators\TypeValidator;
use Wikibase\Repo\Validators\UrlSchemeValidators;
use Wikibase\Repo\Validators\UrlValidator;

/**
 * Defines validators for the basic well known data types supported by Wikibase.
 *
 * @warning: This is a low level factory for use by bootstrap code only!
 * Program logic should use an instance of DataTypeValidatorFactory.
 *
 * @license GPL-2.0-or-later
 * @author Daniel Kinzler
 */
class ValidatorBuilders {

    /**
     * @var EntityLookup Lookup passed to the EntityExistsValidator built for entity ID values.
     */
    private EntityLookup $entityLookup;

    /**
     * @var EntityIdParser Parser passed to the EntityUriValidator built for concept URIs.
     */
    private EntityIdParser $entityIdParser;

    /**
     * @var string[] URL schemes accepted by the validators from buildUrlValidators().
     */
    private array $urlSchemes;

    /**
     * @var string The base URI for the vocabulary to use for units (and in the
     * future, globes and calendars).
     */
    private string $itemVocabularyBaseUri;

    /**
     * @var string The base URI of Wikibase concepts, for use with the validators for time and
     * globe values. Our parsers for these data types currently have Wikidata URIs hardcoded, so
     * we need to hardcode the URI to check them against for now.
     *
     * @todo use a configurable vocabulary for calendars and reference globes, instead of
     * hardcoding wikidata. Then replace usages of $wikidataBaseUri with $itemVocabularyBaseUri.
     */
    private string $wikidataBaseUri = 'http://www.wikidata.org/entity/';

    /**
     * @var ContentLanguages Source of the language codes accepted for monolingual text values.
     */
    private ContentLanguages $contentLanguages;

    /**
     * @var CachingCommonsMediaFileNameLookup Lookup passed to the CommonsMediaExistsValidator.
     */
    private CachingCommonsMediaFileNameLookup $mediaFileNameLookup;

    /**
     * @var MediaWikiPageNameNormalizer Normalizer passed to the InterWikiLinkExistsValidator
     * built for geo shape and tabular data values.
     */
    private MediaWikiPageNameNormalizer $mediaWikiPageNameNormalizer;

    /**
     * @var string API URL passed to the InterWikiLinkExistsValidator for geo shape values.
     */
    private string $geoShapeStorageApiUrl;

    /**
     * @var string API URL passed to the InterWikiLinkExistsValidator for tabular data values.
     */
    private string $tabularDataStorageApiUrl;

    /**
     * @param EntityLookup $lookup
     * @param EntityIdParser $idParser
     * @param string[] $urlSchemes
     * @param string $itemVocabularyBaseUri The base URI for vocabulary concepts.
     * @param ContentLanguages $contentLanguages
     * @param CachingCommonsMediaFileNameLookup $cachingCommonsMediaFileNameLookup
     * @param MediaWikiPageNameNormalizer $mediaWikiPageNameNormalizer
     * @param string $geoShapeStorageApiUrl
     * @param string $tabularDataStorageApiUrl
     */
    public function __construct(
        EntityLookup $lookup,
        EntityIdParser $idParser,
        array $urlSchemes,
        string $itemVocabularyBaseUri,
        ContentLanguages $contentLanguages,
        CachingCommonsMediaFileNameLookup $cachingCommonsMediaFileNameLookup,
        MediaWikiPageNameNormalizer $mediaWikiPageNameNormalizer,
        string $geoShapeStorageApiUrl,
        string $tabularDataStorageApiUrl
    ) {
        $this->entityLookup = $lookup;
        $this->entityIdParser = $idParser;
        $this->urlSchemes = $urlSchemes;
        $this->itemVocabularyBaseUri = $itemVocabularyBaseUri;
        $this->contentLanguages = $contentLanguages;
        $this->mediaFileNameLookup = $cachingCommonsMediaFileNameLookup;
        $this->mediaWikiPageNameNormalizer = $mediaWikiPageNameNormalizer;
        $this->geoShapeStorageApiUrl = $geoShapeStorageApiUrl;
        $this->tabularDataStorageApiUrl = $tabularDataStorageApiUrl;
    }

    /**
     * Builds the validators for entity ID values that must refer to an item.
     *
     * @return ValueValidator[]
     */
    public function buildItemValidators(): array {
        return $this->getEntityValidators( Item::ENTITY_TYPE );
    }

    /**
     * Builds the validators for entity ID values that must refer to a property.
     *
     * @return ValueValidator[]
     */
    public function buildPropertyValidators(): array {
        return $this->getEntityValidators( Property::ENTITY_TYPE );
    }

    /**
     * Builds the validators for entity ID values without restricting the entity type.
     *
     * @return ValueValidator[]
     */
    public function buildEntityValidators(): array {
        return $this->getEntityValidators();
    }

    /**
     * Builds a type check for EntityIdValue followed by an existence check.
     *
     * @param string|null $entityType Entity type handed to EntityExistsValidator;
     * null when no specific type is requested.
     *
     * @return ValueValidator[]
     */
    public function getEntityValidators( ?string $entityType = null ): array {
        return [
            new TypeValidator( EntityIdValue::class ),
            new EntityExistsValidator( $this->entityLookup, $entityType ),
        ];
    }

    /**
     * Builds the checks shared by all plain string fields: type, length and illegal whitespace.
     *
     * @param int $maxLength Defaults to 400 characters. This was an arbitrary decision when it
     * turned out that 255 was too short for descriptions.
     *
     * @return ValueValidator[]
     */
    private function getCommonStringValidators( int $maxLength = 400 ): array {
        $validators = [];

        $validators[] = new TypeValidator( 'string' );
        //TODO: validate UTF8 (here and elsewhere)
        $validators[] = new StringLengthValidator( 1, $maxLength, 'mb_strlen' );
        // no leading/trailing whitespace, no tab or vertical whitespace, no line breaks.
        $validators[] = new RegexValidator(
            '/^\s|[\v\t]|\s$/u',
            true,
            'illegal-string-chars'
        );

        return $validators;
    }

    /**
     * Wraps validators for a data value's inner value into the list returned by most builders:
     * a type check for DataValue, followed by a DataValueValidator around a CompositeValidator
     * of the given validators.
     *
     * @param ValueValidator[] $validators
     *
     * @return ValueValidator[]
     */
    private function wrapInDataValueValidator( array $validators ): array {
        $topValidator = new DataValueValidator(
            new CompositeValidator( $validators ) //Note: each validator is fatal
        );

        return [ new TypeValidator( DataValue::class ), $topValidator ];
    }

    /**
     * Shared implementation of the geo shape and tabular data builders, which differ only in
     * the title pattern, the error code and the API URL used for the existence check.
     *
     * @param string $titlePattern Regular expression the page title has to match
     * @param string $errorCode Error code handed to the RegexValidator for the title pattern
     * @param string $storageApiUrl API URL handed to the InterWikiLinkExistsValidator
     * @param string $checkExistence Either 'checkExistence' or 'doNotCheckExistence'
     *
     * @return ValueValidator[]
     */
    private function buildDataPageTitleValidators(
        string $titlePattern,
        string $errorCode,
        string $storageApiUrl,
        string $checkExistence
    ): array {
        $validators = $this->getCommonStringValidators( 240 );
        $validators[] = new RegexValidator( $titlePattern, false, $errorCode );

        if ( $checkExistence === 'checkExistence' ) {
            $validators[] = new InterWikiLinkExistsValidator(
                $this->mediaWikiPageNameNormalizer,
                $storageApiUrl
            );
        }

        return $this->wrapInDataValueValidator( $validators );
    }

    /**
     * Builds the validators for media file name values.
     *
     * @param string $checkExistence Either 'checkExistence' or 'doNotCheckExistence'
     *
     * @return ValueValidator[]
     */
    public function buildMediaValidators( string $checkExistence = 'checkExistence' ): array {
        // oi_archive_name is max 255 bytes, which include a timestamp and an exclamation mark,
        // so restrict file name to 240 bytes (see UploadBase::getTitle).
        // NOTE(review): the common string validators pass 'mb_strlen' as the length measure,
        // which looks like a character count rather than a byte count - confirm the intent.
        $validators = $this->getCommonStringValidators( 240 );

        //The filename must contain an extension
        $validators[] = new RegexValidator(
            '/.*\.\w{2,}$/u',
            false,
            'check-file-type'
        );

        // Must contain a non-empty file name with no nasty characters (see documentation of
        // $wgLegalTitleChars as well as $wgIllegalFileChars)
        $validators[] = new RegexValidator(
            '/^[^#\/:[\\\\\]{|}]+$/u',
            false,
            'illegal-file-chars'
        );

        if ( $checkExistence === 'checkExistence' ) {
            $validators[] = new CommonsMediaExistsValidator( $this->mediaFileNameLookup );
        }

        return $this->wrapInDataValueValidator( $validators );
    }

    /**
     * Builds the validators for geo shape page title values.
     *
     * @param string $checkExistence Either 'checkExistence' or 'doNotCheckExistence'
     *
     * @return ValueValidator[]
     */
    public function buildGeoShapeValidators( string $checkExistence = 'checkExistence' ): array {
        //Don't forget to change message `wikibase-validator-illegal-geo-shape-title` modifying this
        // Check for 'Data:' prefix, '.map' extension and illegal characters
        return $this->buildDataPageTitleValidators(
            '/^Data:[^\\[\\]#\\\:{|}]+\.map$/u',
            'illegal-geo-shape-title',
            $this->geoShapeStorageApiUrl,
            $checkExistence
        );
    }

    /**
     * Builds the validators for tabular data page title values.
     *
     * @param string $checkExistence Either 'checkExistence' or 'doNotCheckExistence'
     *
     * @return ValueValidator[]
     */
    public function buildTabularDataValidators( string $checkExistence = 'checkExistence' ): array {
        // Check for 'Data:' prefix, '.tab' extension and illegal characters
        return $this->buildDataPageTitleValidators(
            '/^Data:[^\\[\\]#\\\:{|}]+\.tab$/u',
            'illegal-tabular-data-title',
            $this->tabularDataStorageApiUrl,
            $checkExistence
        );
    }

    /**
     * Builds the validators for plain string values.
     *
     * @param int $maxLength Maximum length handed to the common string validators
     *
     * @return ValueValidator[]
     */
    public function buildStringValidators( int $maxLength = 400 ): array {
        return $this->wrapInDataValueValidator(
            $this->getCommonStringValidators( $maxLength )
        );
    }

    /**
     * Builds the validators for monolingual text values, checking the 'text' and 'language'
     * fields separately.
     *
     * @param int $maxLength Defaults to 400 characters. This was an arbitrary decision and simply
     * copied the default of the common string validators.
     *
     * @return ValueValidator[]
     */
    public function buildMonolingualTextValidators( int $maxLength = 400 ): array {
        $validators = [];

        $validators[] = new DataFieldValidator(
            'text',
            new CompositeValidator( $this->getCommonStringValidators( $maxLength ) ) //Note: each validator is fatal
        );

        $validators[] = new DataFieldValidator(
            'language',
            new MembershipValidator( $this->contentLanguages->getLanguages(), 'not-a-language' )
        );

        return $this->wrapInDataValueValidator( $validators );
    }

    /**
     * Builds the validators for time values, checking the 'calendarmodel', 'time' and
     * 'precision' fields.
     *
     * @return ValueValidator[]
     */
    public function buildTimeValidators(): array {
        $validators = [];
        $validators[] = new TypeValidator( 'array' );

        // Expected to be a short IRI, see TimeFormatter and TimeParser.
        $urlValidator = $this->getEntityUriValidator( $this->wikidataBaseUri, 255, 'item' );
        //TODO: enforce well known calendar models from config

        $validators[] = new DataFieldValidator( 'calendarmodel', $urlValidator );

        // time string field
        $timeStringValidators = [];
        $timeStringValidators[] = new TypeValidator( 'string' );

        // down to the day
        $maxPrecision = TimeValue::PRECISION_DAY;
        $isoDataPattern = '/T00:00:00Z\z/';

        $timeStringValidators[] = new RegexValidator( $isoDataPattern );

        $validators[] = new DataFieldValidator(
            'time',
            new CompositeValidator( $timeStringValidators ) //Note: each validator is fatal
        );

        $precisionValidators = [];
        $precisionValidators[] = new TypeValidator( 'integer' );
        $precisionValidators[] = new NumberRangeValidator( TimeValue::PRECISION_YEAR1G, $maxPrecision );

        $validators[] = new DataFieldValidator(
            'precision',
            new CompositeValidator( $precisionValidators ) //Note: each validator is fatal
        );
        $validators[] = new TimestampPrecisionValidator();

        return $this->wrapInDataValueValidator( $validators );
    }

    /**
     * Builds the validators for globe coordinate values, checking the 'precision' and 'globe'
     * fields.
     *
     * @return ValueValidator[]
     */
    public function buildCoordinateValidators(): array {
        $validators = [];
        $validators[] = new TypeValidator( 'array' );

        // Expected to be a short IRI, see GlobeCoordinateValue and GlobeCoordinateParser.
        $urlValidator = $this->getEntityUriValidator( $this->wikidataBaseUri, 255, 'item' );
        //TODO: enforce well known reference globes from config

        $validators[] = new DataFieldValidator( 'precision', new NumberValidator() );

        $validators[] = new DataFieldValidator( 'globe', $urlValidator );

        return $this->wrapInDataValueValidator( $validators );
    }

    /**
     * Builds the checks for a raw URL string: type, length and URL scheme.
     *
     * @param string[] $urlSchemes List of URL schemes, e.g. 'http'
     * @param int $maxLength Maximum length in bytes; compare https://stackoverflow.com/a/417184
     * @return ValueValidator[]
     */
    private function getUrlValidators( array $urlSchemes, int $maxLength ): array {
        $validators = [];
        $validators[] = new TypeValidator( 'string' );
        $validators[] = new StringLengthValidator( 2, $maxLength );

        $urlValidatorsBuilder = new UrlSchemeValidators();
        $urlValidators = $urlValidatorsBuilder->getValidators( $urlSchemes );
        $validators[] = new UrlValidator( $urlValidators );

        return $validators;
    }

    /**
     * Builds a single validator for concept URIs: the http/https URL checks followed by an
     * EntityUriValidator for the given prefix and entity type.
     *
     * @param string $prefix Prefix handed to the EntityUriValidator
     * @param int $maxLength Maximum length handed to the URL validators
     * @param string|null $entityType Entity type handed to the EntityUriValidator
     *
     * @return ValueValidator
     */
    private function getEntityUriValidator(
        string $prefix,
        int $maxLength,
        ?string $entityType = null
    ): ValueValidator {
        $validators = $this->getUrlValidators( [ 'http', 'https' ], $maxLength );
        $validators[] = new EntityUriValidator( $this->entityIdParser, $prefix, $entityType );
        return new CompositeValidator( $validators );
    }

    /**
     * Builds the validators for URL values, using the URL schemes this builder was
     * constructed with.
     *
     * @param int $maxLength Maximum length handed to the URL validators
     * @return ValueValidator[]
     */
    public function buildUrlValidators( int $maxLength = 500 ): array {
        return $this->wrapInDataValueValidator(
            $this->getUrlValidators( $this->urlSchemes, $maxLength )
        );
    }

    /**
     * Builds the validators for quantity values, checking the 'unit' field.
     *
     * @return ValueValidator[]
     */
    public function buildQuantityValidators(): array {
        $validators = [];
        $validators[] = new TypeValidator( 'array' );

        // The "amount", "upperBound" and "lowerBound" fields are already validated by the
        // UnboundedQuantityValue/QuantityValue constructors.

        $unitValidators = new AlternativeValidator( [
            // NOTE: "1" is always considered legal for historical reasons,
            // since we use it to represent "unitless" quantities. We could also use
            // http://qudt.org/vocab/unit#Unitless or http://www.wikidata.org/entity/Q199
            new MembershipValidator( [ '1' ] ),
            $this->getEntityUriValidator( $this->itemVocabularyBaseUri, 255, 'item' ),
        ] );
        $validators[] = new DataFieldValidator( 'unit', $unitValidators );

        $topValidator = new DataValueValidator(
            new CompositeValidator( $validators ) //Note: each validator is fatal
        );

        // Unlike the other builders, the outer type check is for UnboundedQuantityValue
        // rather than the generic DataValue, so the shared wrapper is not used here.
        return [ new TypeValidator( UnboundedQuantityValue::class ), $topValidator ];
    }

}