wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/includes/LinkedData/EntityDataRequestHandler.php

Summary

Maintainability
D
2 days
Test Coverage
<?php

namespace Wikibase\Repo\LinkedData;

use HttpError;
use MediaWiki\Cache\HTMLCacheUpdater;
use MediaWiki\Output\OutputPage;
use MediaWiki\Request\WebRequest;
use MediaWiki\Request\WebResponse;
use Psr\Log\LoggerInterface;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\DataModel\Entity\EntityIdParser;
use Wikibase\DataModel\Entity\EntityIdParsingException;
use Wikibase\DataModel\Entity\EntityRedirect;
use Wikibase\DataModel\Services\Lookup\EntityRedirectLookup;
use Wikibase\DataModel\Services\Lookup\EntityRedirectLookupException;
use Wikibase\Lib\Store\BadRevisionException;
use Wikibase\Lib\Store\EntityRevision;
use Wikibase\Lib\Store\EntityRevisionLookup;
use Wikibase\Lib\Store\RedirectRevision;
use Wikibase\Lib\Store\RevisionedUnresolvedRedirectException;
use Wikibase\Lib\Store\StorageException;
use Wikibase\Lib\SubEntityTypesMapper;
use Wikibase\Repo\Rdf\UnknownFlavorException;
use Wikimedia\Http\HttpAcceptNegotiator;
use Wikimedia\Http\HttpAcceptParser;

/**
 * Request handler implementing a linked data interface for Wikibase entities.
 *
 * @license GPL-2.0-or-later
 * @author Daniel Kinzler
 * @author Thomas Pellissier Tanon
 * @author Anja Jentzsch < anja.jentzsch@wikimedia.de >
 */
class EntityDataRequestHandler {

    /**
     * Allowed smallest and biggest number of seconds for the "max-age=..." and "s-maxage=..." cache
     * control parameters. outputData() clamps both the "maxage" and "smaxage" request parameters
     * (and the configured default they fall back to) into this range.
     *
     * @todo Hard maximum could be configurable somehow.
     */
    private const MINIMUM_MAX_AGE = 0;
    private const MAXIMUM_MAX_AGE = 2678400; // 31 days

    /**
     * @var EntityDataSerializationService Produces the serialized payload and its content type
     *      in showData().
     */
    private $serializationService;

    /**
     * @var EntityDataUriManager Parses document names and builds document / cacheable URLs.
     */
    private $uriManager;

    /**
     * @var EntityIdParser Parses the raw "id" value in handleRequest().
     */
    private $entityIdParser;

    /**
     * @var EntityRevisionLookup Source of entity revisions, see getEntityRevision().
     */
    private $entityRevisionLookup;

    /**
     * @var EntityRedirectLookup Source of incoming redirects, see getIncomingRedirects().
     */
    private $entityRedirectLookup;

    /**
     * @var EntityDataFormatProvider Maps format names and MIME types; also tells which formats are RDF.
     */
    private $entityDataFormatProvider;

    /**
     * @var HTMLCacheUpdater Receives the URLs to purge in purgeWebCache().
     */
    private $htmlCacheUpdater;

    /**
     * @var LoggerInterface Receives debug messages about failed lookups.
     */
    private $logger;

    /**
     * @var string Format preferred by content negotiation when the request has no Accept header.
     */
    private $defaultFormat;

    /**
     * @var int Number of seconds to cache entity data. Used as the default for both the
     *      "maxage" and the "smaxage" request parameter in outputData().
     */
    private $maxAge;

    /**
     * @var bool Set by the constructor; not read anywhere else in this class.
     */
    private $useCdn;

    /**
     * @var string|null Value of the X-Frame-Options response header; no header is sent
     *      when this is null or the empty string.
     */
    private $frameOptionsHeader;

    /**
     * @var string[] Entity types for which RDF formats are rejected with a 406 in handleRequest().
     */
    private $entityTypesWithoutRdfOutput;

    /**
     * @var SubEntityTypesMapper Used by validateRedirectability() to find an entity type's parent type.
     */
    private $subEntityTypesMap;

    /**
     * Wires up the handler with its collaborators and its configuration values.
     *
     * @param EntityDataUriManager $uriManager
     * @param HTMLCacheUpdater $htmlCacheUpdater
     * @param EntityIdParser $entityIdParser
     * @param EntityRevisionLookup $entityRevisionLookup
     * @param EntityRedirectLookup $entityRedirectLookup
     * @param EntityDataSerializationService $serializationService
     * @param EntityDataFormatProvider $entityDataFormatProvider
     * @param LoggerInterface $logger
     * @param string[] $entityTypesWithoutRdfOutput Entity types that must not be served as RDF
     * @param string $defaultFormat The format as a file extension or MIME type.
     * @param int $maxAge number of seconds to cache entity data
     * @param bool $useCdn do we have web caches configured?
     * @param string|null $frameOptionsHeader for X-Frame-Options
     * @param SubEntityTypesMapper $subEntityTypesMap
     */
    public function __construct(
        EntityDataUriManager $uriManager,
        HTMLCacheUpdater $htmlCacheUpdater,
        EntityIdParser $entityIdParser,
        EntityRevisionLookup $entityRevisionLookup,
        EntityRedirectLookup $entityRedirectLookup,
        EntityDataSerializationService $serializationService,
        EntityDataFormatProvider $entityDataFormatProvider,
        LoggerInterface $logger,
        array $entityTypesWithoutRdfOutput,
        $defaultFormat,
        $maxAge,
        $useCdn,
        $frameOptionsHeader,
        SubEntityTypesMapper $subEntityTypesMap
    ) {
        // Lookups and parsing
        $this->entityIdParser = $entityIdParser;
        $this->entityRevisionLookup = $entityRevisionLookup;
        $this->entityRedirectLookup = $entityRedirectLookup;
        $this->subEntityTypesMap = $subEntityTypesMap;

        // URLs, formats and serialization
        $this->uriManager = $uriManager;
        $this->entityDataFormatProvider = $entityDataFormatProvider;
        $this->serializationService = $serializationService;

        // Infrastructure
        $this->htmlCacheUpdater = $htmlCacheUpdater;
        $this->logger = $logger;

        // Plain configuration values
        $this->entityTypesWithoutRdfOutput = $entityTypesWithoutRdfOutput;
        $this->defaultFormat = $defaultFormat;
        $this->maxAge = $maxAge;
        $this->useCdn = $useCdn;
        $this->frameOptionsHeader = $frameOptionsHeader;
    }

    /**
     * Tells whether the request carries enough information to be answered with entity data:
     * either a non-empty document name, or a non-empty "id" request parameter.
     *
     * This is a completeness check only. It says nothing about whether the request is valid
     * or whether handling it will succeed.
     *
     * @param string|null $doc Document name, e.g. Q5 or Q5.json or Q5:33.xml
     * @param WebRequest $request
     *
     * @return bool
     */
    public function canHandleRequest( $doc, WebRequest $request ) {
        $hasDocName = $doc !== null && $doc !== '';

        // The "id" parameter is only consulted when there is no document name.
        return $hasDocName || $request->getText( 'id', '' ) !== '';
    }

    /**
     * Main method for handling requests.
     *
     * Depending on the request this either redirects (content negotiation, non-canonical
     * document name, HTML format, forced redirect) or outputs the entity data via showData().
     *
     * @param null|string $doc Document name, e.g. Q5 or Q5.json or Q5:33.xml
     * @param WebRequest $request The request parameters. Known parameters are:
     *        - id: the entity ID (overrides the ID from the document name)
     *        - format: the format (overrides the format from the document name)
     *        - revision: the revision ID
     *        - redirect=force: always answer with a redirect to the document URL
     *        - action=purge: to purge cached data from (web) caches
     * @param OutputPage $output
     *
     * @note Instead of an output page, a WebResponse could be sufficient, but
     *        redirect logic is currently implemented in OutputPage.
     *
     * @throws HttpError 400 for a missing or unparsable ID or when no document URL can be built,
     *         406 when RDF is requested for an entity type without RDF output; further errors
     *         may come from the methods this delegates to.
     */
    public function handleRequest( $doc, WebRequest $request, OutputPage $output ) {
        // No matter what: The response is always public
        $output->getRequest()->response()->header( 'Access-Control-Allow-Origin: *' );

        $revision = 0;
        $id = '';
        $format = '';
        $hasDocName = $doc !== null && $doc !== '';

        if ( $hasDocName ) {
            [ $id, $format ] = $this->uriManager->parseDocName( $doc );
        }

        // Request parameters take precedence over what was parsed from the document name.
        $format = $request->getText( 'format', $format );
        $id = $request->getText( 'id', $id );
        $revision = $request->getInt( 'revision', $revision );
        $redirectMode = $request->getText( 'redirect' );
        //TODO: malformed revision IDs should trigger a code 400

        // If there is no ID, fail
        if ( $id === null || $id === '' ) {
            //TODO: different error message?
            throw new HttpError( 400, $output->msg( 'wikibase-entitydata-bad-id', $id ) );
        }

        try {
            $entityId = $this->entityIdParser->parse( $id );
        } catch ( EntityIdParsingException $ex ) {
            throw new HttpError( 400, $output->msg( 'wikibase-entitydata-bad-id', $id ) );
        }

        $entityType = $entityId->getEntityType();
        if ( $this->entityDataFormatProvider->isRdfFormat( $format ) &&
            in_array( $entityType, $this->entityTypesWithoutRdfOutput, true )
        ) {
            throw new HttpError( 406, $output->msg( 'wikibase-entitydata-rdf-not-available', $entityType ) );
        }

        //XXX: allow for logged in users only?
        if ( $request->getText( 'action' ) === 'purge' ) {
            $this->purgeWebCache( $entityId, $revision );
            //XXX: Now what? Proceed to show the data?
        }

        if ( $format === null || $format === '' ) {
            // if no format is given, apply content negotiation and return.
            $this->httpContentNegotiation( $request, $output, $entityId, $revision );
            return;
        }

        //NOTE: will trigger a 415 if the format is not supported
        $format = $this->getCanonicalFormat( $format );

        // if subpage syntax is used, always enforce the canonical form
        if ( $hasDocName && $doc !== $this->uriManager->getDocName( $entityId, $format ) ) {
            $this->redirectToDocUrl( $output, $entityId, $format, $revision, $id, 301 );
            return;
        }

        // if the format is HTML, redirect to the entity's wiki page
        if ( $format === 'html' ) {
            $this->redirectToDocUrl( $output, $entityId, 'html', $revision, $id, 303 );
            return;
        }

        // if redirection was forced, redirect
        if ( $redirectMode === 'force' ) {
            $this->redirectToDocUrl( $output, $entityId, $format, $revision, $id, 303 );
            return;
        }

        $this->showData( $request, $output, $format, $entityId, $revision );
    }

    /**
     * Sets a redirect to the document URL of the given entity, format and revision.
     *
     * @param OutputPage $output
     * @param EntityId $entityId
     * @param string $format Canonical format name
     * @param int $revision The revision ID (0 for current/unspecified)
     * @param string $id The raw ID from the request, only used for the error message
     * @param int $httpCode HTTP status code of the redirect
     *
     * @throws HttpError code 400 if the URI manager cannot build a URL
     */
    private function redirectToDocUrl(
        OutputPage $output,
        EntityId $entityId,
        $format,
        int $revision,
        string $id,
        int $httpCode
    ): void {
        $url = $this->uriManager->getDocUrl( $entityId, $format, $revision );
        if ( $url === null ) {
            throw new HttpError( 400, $output->msg( 'wikibase-entitydata-bad-id', $id ) );
        }

        $output->redirect( $url, $httpCode );
    }

    /**
     * Maps a format given as name, file extension or MIME type to its canonical format name.
     * The lookup is case-insensitive.
     *
     * @param string $format
     *
     * @return string
     * @throws HttpError code 415 if the format is not supported.
     */
    public function getCanonicalFormat( $format ) {
        $normalized = strtolower( $format );

        // HTML is always supported: it is served by the entity's wiki page, so the
        // format provider does not need to know about it.
        if ( in_array( $normalized, [ 'html', 'htm', 'text/html' ], true ) ) {
            return 'html';
        }

        $formatName = $this->entityDataFormatProvider->getFormatName( $normalized );
        if ( $formatName !== null ) {
            return $formatName;
        }

        throw new HttpError(
            415,
            wfMessage( 'wikibase-entitydata-unsupported-format', $normalized )
        );
    }

    /**
     * Purges the URLs under which data of the given entity may be cached from any HTTP caches.
     * Nothing is purged when the URI manager reports no potentially cached URLs.
     *
     * NOTE(review): whether this is a no-op without a configured CDN is decided by the
     * collaborators, not by this method — confirm against the URI manager / cache updater.
     *
     * @param EntityId $id The entity ID for which to purge all data.
     * @param int $revision The revision ID (0 for current/unspecified)
     */
    public function purgeWebCache( EntityId $id, int $revision ) {
        $cachedUrls = $this->uriManager->getPotentiallyCachedUrls( $id, $revision );
        if ( $cachedUrls === [] ) {
            return;
        }

        $this->htmlCacheUpdater->purgeUrls( $cachedUrls );
    }

    /**
     * Applies HTTP content negotiation.
     * If the negotiation is successful, this method will set the appropriate redirect
     * (303) in the OutputPage object and return. Otherwise, an HttpError is thrown.
     *
     * Without an Accept header everything is acceptable, with a preference for the
     * configured default format if that format and its MIME type are known.
     *
     * @param WebRequest $request
     * @param OutputPage $output
     * @param EntityId $id The ID of the entity to show
     * @param int $revision The desired revision
     *
     * @throws HttpError 406 if no supported MIME type is acceptable, 400 if no document URL
     *         can be built for the entity.
     */
    public function httpContentNegotiation( WebRequest $request, OutputPage $output, EntityId $id, $revision = 0 ) {
        $headers = $request->getAllHeaders();
        if ( isset( $headers['ACCEPT'] ) ) {
            $parser = new HttpAcceptParser();
            $accept = $parser->parseWeights( $headers['ACCEPT'] );
        } else {
            // anything goes
            $accept = [
                '*' => 0.1, // just to make extra sure
            ];

            // The configured default may be unknown to the format provider, in which case
            // there is no format name to look up a MIME type for.
            $defaultFormat = $this->entityDataFormatProvider->getFormatName( $this->defaultFormat );
            $defaultMime = $defaultFormat === null
                ? null
                : $this->entityDataFormatProvider->getMimeType( $defaultFormat );

            // prefer the default
            if ( is_string( $defaultMime ) && $defaultMime !== '' ) {
                $accept[$defaultMime] = 1;
            }
        }

        $mimeTypes = $this->entityDataFormatProvider->getSupportedMimeTypes();
        $mimeTypes[] = 'text/html'; // HTML is handled by the normal page URL

        $negotiator = new HttpAcceptNegotiator( $mimeTypes );
        $format = $negotiator->getBestSupportedKey( $accept, null );

        if ( $format === null ) {
            // The error lists the provider's MIME types only, i.e. without text/html.
            $mimeTypes = implode( ', ', $this->entityDataFormatProvider->getSupportedMimeTypes() );
            $msg = $output->msg( 'wikibase-entitydata-not-acceptable', $mimeTypes );
            throw new HttpError( 406, $msg );
        }

        $format = $this->getCanonicalFormat( $format );

        $url = $this->uriManager->getDocUrl( $id, $format, $revision );
        if ( $url === null ) {
            throw new HttpError( 400, $output->msg( 'wikibase-entitydata-bad-id', $id->getSerialization() ) );
        }
        $output->redirect( $url, 303 );
    }

    /**
     * Loads the requested Entity. Redirects are resolved if no specific revision
     * is requested or they are explicitly allowed by $allowRedirects.
     *
     * Failures of the lookup are logged at debug level and translated into HTTP errors:
     * - entity not found: 404
     * - the entity is a redirect that must not be resolved (see validateRedirectability()): 404
     * - a specific revision was requested, it is a redirect, and redirects are not allowed: 400
     * - bad revision: 404
     * - any other storage error: 500
     *
     * @param EntityId $id
     * @param int $revision The revision ID (use 0 for the current revision).
     * @param bool $allowRedirects Can we fetch redirects when revision is set?
     *
     * @return array list( EntityRevision, RedirectRevision|null ). The second element is only
     *         set when a redirect was followed; it then describes the redirect from $id.
     * @throws HttpError
     */
    private function getEntityRevision( EntityId $id, $revision, $allowRedirects = false ) {
        $prefixedId = $id->getSerialization();
        $redirectRevision = null;

        // NOTE(review): the order of the catch clauses below presumably matters if the
        // exception classes share a hierarchy — confirm before reordering.
        try {
            $entityRevision = $this->entityRevisionLookup->getEntityRevision( $id, $revision );

            if ( $entityRevision === null ) {
                $this->logger->debug(
                    '{method}: entity not found: {prefixedId}',
                    [
                        'method' => __METHOD__,
                        'prefixedId' => $prefixedId,
                    ]
                );

                $msg = wfMessage( 'wikibase-entitydata-not-found', $prefixedId );
                throw new HttpError( 404, $msg );
            }
        } catch ( RevisionedUnresolvedRedirectException $ex ) {
            // Throws a 404 if this redirect must not be followed.
            $this->validateRedirectability( $id, $ex->getRedirectTargetId() );

            // Remember which redirect was hit so the caller can report it.
            $redirectRevision = new RedirectRevision(
                new EntityRedirect( $id, $ex->getRedirectTargetId() ),
                $ex->getRevisionId(), $ex->getRevisionTimestamp()
            );

            if ( $revision === 0 || $allowRedirects ) {
                // If no specific revision is requested or redirects are explicitly allowed, resolve the redirect.
                // Always loads the target's current revision; a redirect revision returned by
                // the nested call is discarded.
                [ $entityRevision ] = $this->getEntityRevision( $ex->getRedirectTargetId(), 0 );
            } else {
                // The requested revision is a redirect
                $this->logger->debug(
                    '{method}: revision {revision} of {prefixedId} is a redirect: {exMsg}',
                    [
                        'method' => __METHOD__,
                        'revision' => $revision,
                        'prefixedId' => $prefixedId,
                        'exMsg' => strval( $ex ),
                    ]
                );

                $msg = wfMessage( 'wikibase-entitydata-bad-revision', $prefixedId, $revision );
                throw new HttpError( 400, $msg );
            }
        } catch ( BadRevisionException $ex ) {
            $this->logger->debug(
                '{method}: could not load revision {revision} or {prefixedId}: {exMsg}',
                [
                    'method' => __METHOD__,
                    'revision' => $revision,
                    'prefixedId' => $prefixedId,
                    'exMsg' => strval( $ex ),
                ]
            );

            // Same message key as for a redirect revision above, but reported as 404.
            $msg = wfMessage( 'wikibase-entitydata-bad-revision', $prefixedId, $revision );
            throw new HttpError( 404, $msg );
        } catch ( StorageException $ex ) {
            $this->logger->debug(
                '{method}: failed to load {prefixedId}: {exMsg} (revision {revision})',
                [
                    'method' => __METHOD__,
                    'prefixedId' => $prefixedId,
                    'exMsg' => strval( $ex ),
                    'revision' => $revision,
                ]
            );

            $msg = wfMessage( 'wikibase-entitydata-storage-error', $prefixedId, $revision );
            throw new HttpError( 500, $msg );
        }

        return [ $entityRevision, $redirectRevision ];
    }

    /**
     * Rejects redirects whose target has the entity type that is the parent type of the
     * redirecting entity's type; all other redirects pass.
     *
     * @param EntityId $id The entity that is a redirect
     * @param EntityId $redirectTargetId The entity it redirects to
     *
     * @throws HttpError code 404 if the redirect must not be resolved
     */
    private function validateRedirectability( EntityId $id, EntityId $redirectTargetId ): void {
        $parentType = $this->subEntityTypesMap->getParentEntityType( $id->getEntityType() );
        if ( $parentType !== $redirectTargetId->getEntityType() ) {
            return;
        }

        $msg = wfMessage(
            'wikibase-entitydata-unresolvable-sub-entity-redirect',
            $id->getSerialization(),
            $redirectTargetId->getSerialization()
        );
        throw new HttpError( 404, $msg );
    }

    /**
     * Looks up the IDs of the entities that redirect to the given entity.
     *
     * This is best effort: if the redirect lookup fails, the failure is logged at debug
     * level and an empty list is returned.
     *
     * @param EntityId $id
     *
     * @return EntityId[]
     */
    private function getIncomingRedirects( EntityId $id ) {
        try {
            $redirectIds = $this->entityRedirectLookup->getRedirectIds( $id );
        } catch ( EntityRedirectLookupException $lookupFailure ) {
            $logContext = [
                'method' => __METHOD__,
                'prefixedId' => $id->getSerialization(),
                'exMsg' => strval( $lookupFailure ),
            ];
            $this->logger->debug(
                '{method}: failed to load incoming redirects of {prefixedId}: {exMsg}',
                $logContext
            );

            $redirectIds = [];
        }

        return $redirectIds;
    }

    /**
     * Loads the entity, serializes it in the requested format and writes it to the response.
     *
     * Answers with a bare 304 instead if the request has an If-Modified-Since header and the
     * entity revision is not newer than that. The "flavor" request parameter is passed on to
     * the serialization service; the "dump" flavor additionally allows loading a redirect by
     * revision ID and suppresses incoming redirects.
     *
     * @param WebRequest $request
     * @param OutputPage $output
     * @param string $format The name (mime type of file extension) of the format to use
     * @param EntityId $id The entity ID
     * @param int $revision The revision ID (use 0 for the current revision).
     *
     * @throws HttpError
     */
    public function showData( WebRequest $request, OutputPage $output, $format, EntityId $id, $revision ) {
        $flavor = $request->getRawVal( 'flavor' );
        $isDump = $flavor === 'dump';

        /** @var EntityRevision $entityRevision */
        /** @var RedirectRevision $followedRedirectRevision */
        // A dump does not contain the content of the redirect target, so for dumps
        // redirects may be fetched by revision.
        [ $entityRevision, $followedRedirectRevision ] = $this->getEntityRevision( $id, $revision, $isDump );

        // handle If-Modified-Since
        $imsHeader = $request->getHeader( 'IF-MODIFIED-SINCE' );
        if ( $imsHeader !== false ) {
            $modifiedSince = wfTimestamp( TS_MW, $imsHeader );

            if ( $entityRevision->getTimestamp() <= $modifiedSince ) {
                $output->getRequest()->response()->header( 'Status: 304', true, 304 );
                $output->setArticleBodyOnly( true );
                return;
            }
        }

        // Incoming redirects are only included for the current revision outside of dump mode.
        // If a redirect was followed, they are those of the redirect target.
        $wantsIncomingRedirects = !$isDump && $revision <= 0;
        $incomingRedirects = $wantsIncomingRedirects
            ? $this->getIncomingRedirects( $entityRevision->getEntity()->getId() )
            : [];

        try {
            [ $data, $contentType ] = $this->serializationService->getSerializedData(
                $format,
                $entityRevision,
                $followedRedirectRevision,
                $incomingRedirects,
                $flavor
            );
        } catch ( UnknownFlavorException $e ) {
            $knownFlavors = $e->getKnownFlavors();
            $msg = $output->msg( 'wikibase-entitydata-bad-flavor' )
                ->plaintextParams( $e->getUnknownFlavor() )
                ->numParams( count( $knownFlavors ) )
                ->params( implode( '|', $knownFlavors ) );

            throw new HttpError( 400, $msg );
        }

        // From here on the response is written directly, bypassing the OutputPage.
        $output->disable();
        $response = $output->getRequest()->response();
        $this->outputData(
            $request,
            $id,
            $revision,
            $response,
            $data,
            $contentType,
            $entityRevision->getTimestamp()
        );
    }

    /**
     * Sends the HTTP response headers for the given entity data and prints the data.
     *
     * The response is only declared publicly cacheable if the full request URL is one of
     * the cacheable URLs the URI manager reports for the requested entity and revision.
     * The cache lifetimes come from the "maxage" and "smaxage" request parameters, default
     * to the configured max age, and are clamped to the allowed range.
     *
     * @param WebRequest $request
     * @param EntityId $requestId The original entity ID of the request
     * @param int $requestRevision The original revision ID of the request (0 for latest)
     * @param WebResponse $response
     * @param string $data The data to output
     * @param string $contentType The data's mime type
     * @param string $lastModified Timestamp for the Last-Modified header; no header if empty
     */
    public function outputData(
        WebRequest $request,
        EntityId $requestId,
        int $requestRevision,
        WebResponse $response,
        string $data,
        string $contentType,
        string $lastModified
    ) {
        // NOTE: similar code as in RawAction::onView, keep in sync.

        $maxAge = max(
            self::MINIMUM_MAX_AGE,
            min( self::MAXIMUM_MAX_AGE, $request->getInt( 'maxage', $this->maxAge ) )
        );
        $sMaxAge = max(
            self::MINIMUM_MAX_AGE,
            min( self::MAXIMUM_MAX_AGE, $request->getInt( 'smaxage', $this->maxAge ) )
        );

        $response->header( "Content-Type: {$contentType}; charset=UTF-8" );

        if ( $lastModified ) {
            $rfcTimestamp = wfTimestamp( TS_RFC2822, $lastModified );
            $response->header( 'Last-Modified: ' . $rfcTimestamp );
        }

        //Set X-Frame-Options API results (bug T41180)
        $frameOptions = $this->frameOptionsHeader;
        if ( $frameOptions !== null && $frameOptions !== '' ) {
            $response->header( 'X-Frame-Options: ' . $frameOptions );
        }

        $cacheableUrls = $this->uriManager->getCacheableUrls( $requestId, $requestRevision );
        $isCacheable = in_array( $request->getFullRequestURL(), $cacheableUrls );
        $cacheControl = $isCacheable
            ? "public, s-maxage={$sMaxAge}, max-age={$maxAge}"
            : 'private, no-cache, s-maxage=0';
        $response->header( 'Cache-Control: ' . $cacheControl );

        echo $data;

        // exit normally here, keeping all levels of output buffering.
    }

}