wikimedia/mediawiki-extensions-Wikibase

View on GitHub
repo/maintenance/DumpEntities.php

Summary

Maintainability
A
3 hrs
Test Coverage
<?php

namespace Wikibase\Repo\Maintenance;

use MediaWiki\Maintenance\Maintenance;
use MediaWiki\Registration\ExtensionRegistry;
use MediaWiki\Settings\SettingsBuilder;
use Onoi\MessageReporter\ObservableMessageReporter;
use Wikibase\DataModel\Services\EntityId\EntityIdPager;
use Wikibase\DataModel\Services\Lookup\EntityLookupException;
use Wikibase\Lib\Reporting\ExceptionHandler;
use Wikibase\Lib\Reporting\ReportingExceptionHandler;
use Wikibase\Lib\WikibaseSettings;
use Wikibase\Repo\Dumpers\DumpGenerator;
use Wikibase\Repo\IO\EntityIdReader;
use Wikibase\Repo\IO\LineReader;
use Wikibase\Repo\Store\Sql\SqlEntityIdPager;
use Wikibase\Repo\Store\Sql\SqlEntityIdPagerFactory;
use Wikibase\Repo\Store\Store;
use Wikibase\Repo\WikibaseRepo;
use Wikimedia\AtEase\AtEase;

$basePath = getenv( 'MW_INSTALL_PATH' ) !== false ? getenv( 'MW_INSTALL_PATH' ) : __DIR__ . '/../../../..';

require_once $basePath . '/maintenance/Maintenance.php';

/**
 * Maintenance script for generating a dump of entities in the repository.
 *
 * @license GPL-2.0-or-later
 * @author Daniel Kinzler
 */
abstract class DumpEntities extends Maintenance {

    /**
     * @var SqlEntityIdPagerFactory
     */
    private $sqlEntityIdPagerFactory;

    /**
     * @var bool|resource
     */
    private $logFileHandle = false;

    /** @var string[] */
    private $existingEntityTypes = [];

    /** @var string[] */
    private $entityTypesToExcludeFromOutput = [];

    public function __construct() {
        parent::__construct();

        $this->addDescription( 'Generate a ' . $this->getDumpType() . ' dump from entities in the repository.' );

        $this->addOption( 'list-file', "A file containing one entity ID per line.", false, true );
        $this->addOption(
            'entity-type',
            "Only dump this kind of entity, e.g. `item` or `property`. Can be given multiple times.",
            false,
            true,
            false,
            /* $multiOccurrence */ true
        );
        $this->addOption( 'sharding-factor', "The number of shards (must be >= 1)", false, true );
        $this->addOption( 'shard', "The shard to output (must be less than the sharding-factor)", false, true );
        $this->addOption( 'batch-size', "The number of entities per processing batch", false, true );
        $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false, true );
        $this->addOption( 'log', "Log file (default is stderr). Will be appended.", false, true );
        $this->addOption( 'quiet', "Disable progress reporting", false, false );
        $this->addOption( 'limit', "Limit how many entities are dumped.", false, true );
        $this->addOption( 'no-cache', "If this is set, don't try to read from an EntityRevisionCache.", false, false );
        $this->addOption(
            'first-page-id',
            'First page id to dump, use 1 to start with the first page. Use the reported last SqlEntityIdPager position + 1 ' .
                'to continue a previous run. Not compatible with --list-file.',
            false,
            true
        );
        $this->addOption(
            'last-page-id',
            'Page id of the last page to possibly include in the dump. Not compatible with --list-file.',
            false,
            true
        );
        $this->addOption(
            'ignore-missing',
            'Ignore missing IDs, do not report errors on them',
            false,
            false
        );
    }

    public function setDumpEntitiesServices(
        SqlEntityIdPagerFactory $sqlEntityIdPagerFactory,
        array $existingEntityTypes,
        array $entityTypesToExcludeFromOutput
    ) {
        $this->sqlEntityIdPagerFactory = $sqlEntityIdPagerFactory;
        $this->existingEntityTypes = $existingEntityTypes;
        $this->entityTypesToExcludeFromOutput = $entityTypesToExcludeFromOutput;
    }

    /**
     * Used in description of script at command-line
     * @return string
     */
    abstract protected function getDumpType(): string;

    /**
     * Create concrete dumper instance
     * @param resource $output
     * @return DumpGenerator
     */
    abstract protected function createDumper( $output );

    /**
     * Outputs a message vis the output() method.
     *
     * @see MessageReporter::logMessage()
     *
     * @param string $message
     */
    public function logMessage( $message ) {
        if ( $this->logFileHandle ) {
            fwrite( $this->logFileHandle, "$message\n" );
            fflush( $this->logFileHandle );
        } else {
            $this->output( "$message\n" );
        }
    }

    /**
     * Opens the given file for use by logMessage().
     *
     * @param string $file use "-" as a shortcut for "php://stdout"
     */
    private function openLogFile( $file ) {
        $this->closeLogFile();

        if ( $file === '-' ) {
            $file = 'php://stdout';
        }

        // wouldn't streams be nice...
        $this->logFileHandle = fopen( $file, 'a' );

        if ( !$this->logFileHandle ) {
            $this->fatalError( 'Failed to open log file: ' . $file );
        }
    }

    /**
     * Closes any currently open file opened with openLogFile().
     */
    private function closeLogFile() {
        if ( $this->logFileHandle
            && $this->logFileHandle !== STDERR
            && $this->logFileHandle !== STDOUT
        ) {
            fclose( $this->logFileHandle );
        }

        $this->logFileHandle = false;
    }

    /**
     * Do the actual work. All child classes will need to implement this
     */
    public function execute() {
        //TODO: more validation for options
        $shardingFactor = (int)$this->getOption( 'sharding-factor', 1 );
        $shard = (int)$this->getOption( 'shard', 0 );
        $batchSize = (int)$this->getOption( 'batch-size', 100 );
        $limit = (int)$this->getOption( 'limit', 0 );

        //TODO: Allow injection of an OutputStream for logging
        $this->openLogFile( $this->getOption( 'log', 'php://stderr' ) );

        $outFile = $this->getOption( 'output', 'php://stdout' );

        if ( $outFile === '-' ) {
            $outFile = 'php://stdout';
        }

        $output = fopen( $outFile, 'w' ); //TODO: Allow injection of an OutputStream

        if ( !$output ) {
            $this->fatalError( 'Failed to open ' . $outFile . '!' );
        }

        if ( $this->hasOption( 'list-file' ) ) {
            $this->logMessage( "Dumping entities listed in " . $this->getOption( 'list-file' ) );
        }

        $entityTypes = $this->getEntityTypes();
        if ( !$entityTypes ) {
            $this->logMessage( "No entity types to dump" );
            $this->closeLogFile();
            return;
        }

        $this->logMessage( 'Dumping entities of type ' . implode( ', ', $entityTypes ) );

        if ( $shardingFactor ) {
            $this->logMessage( "Dumping shard $shard/$shardingFactor" );
        }

        $dumper = $this->createDumper( $output );
        $dumper->setLimit( $limit );

        $progressReporter = new ObservableMessageReporter();
        $progressReporter->registerReporterCallback( [ $this, 'logMessage' ] );
        $dumper->setProgressReporter( $progressReporter );

        $ignored = $this->hasOption( 'ignore-missing' ) ?
            [ EntityLookupException::class ] :
            [];
        $exceptionReporter = new ReportingExceptionHandler( $progressReporter, $ignored );
        $dumper->setExceptionHandler( $exceptionReporter );

        //NOTE: we filter for $entityType twice: filtering in the DB is efficient,
        //      but filtering in the dumper is needed when working from a list file.
        $dumper->setShardingFilter( $shardingFactor, $shard );
        $dumper->setEntityTypesFilter( $entityTypes );
        $dumper->setBatchSize( $batchSize );

        $db = WikibaseRepo::getRepoDomainDbFactory()->newRepoDb();
        $dumper->setBatchCallback( [ $db, 'autoReconfigure' ] );

        $idStream = $this->makeIdStream( $entityTypes, $exceptionReporter );
        AtEase::suppressWarnings();
        $dumper->generateDump( $idStream );
        AtEase::restoreWarnings();

        if ( $idStream instanceof EntityIdReader ) {
            // close stream / free resources
            $idStream->dispose();
        }

        $this->closeLogFile();
    }

    /**
     * @inheritDoc
     */
    public function finalSetup( SettingsBuilder $settingsBuilder ) {
        parent::finalSetup( $settingsBuilder );

        if ( $this->hasOption( 'dbgroupdefault' ) ) {
            // A group was set via cli, so no need to set the default here
            return;
        }

        $settingsBuilder->registerHookHandlers( [ 'MediaWikiServices' => [ function() {
            global $wgDBDefaultGroup;
            if ( !ExtensionRegistry::getInstance()->isLoaded( 'WikibaseRepository' ) ) {
                // Something instantiates the MediaWikiServices before Wikibase
                // is loaded, nothing we can do here.
                wfWarn( self::class . ': Can not change default DB group.' );

                return;
            }

            // Don't use WikibaseRepo or MediaWikiServices here as this is run very early on, thus
            // the bootstrapping code is not ready yet (T202452).
            $settings = WikibaseSettings::getRepoSettings();
            $dumpDBDefaultGroup = $settings->getSetting( 'dumpDBDefaultGroup' );

            if ( $dumpDBDefaultGroup !== null ) {
                $wgDBDefaultGroup = $dumpDBDefaultGroup;
            }
        } ] ] );
    }

    /**
     * @return string[]
     */
    private function getEntityTypes(): array {
        $inputTypes = array_diff(
            $this->getOption( 'entity-type', $this->existingEntityTypes ),
            $this->entityTypesToExcludeFromOutput
        );

        $types = [];
        foreach ( $inputTypes as $type ) {
            if ( !in_array( $type, $this->existingEntityTypes ) ) {
                $this->logMessage( "Warning: Unknown entity type $type." );
                continue;
            }
            $types[] = $type;
        }

        return $types;
    }

    /**
     * @param string[] $entityTypes
     * @param ExceptionHandler|null $exceptionReporter
     *
     * @return EntityIdReader|SqlEntityIdPager a stream of EntityId objects
     */
    private function makeIdStream( array $entityTypes, ExceptionHandler $exceptionReporter = null ) {
        $listFile = $this->getOption( 'list-file' );

        if ( $listFile !== null ) {
            $stream = $this->makeIdFileStream( $listFile, $exceptionReporter );
        } else {
            $stream = $this->makeIdQueryStream( $entityTypes );
        }

        return $stream;
    }

    /**
     * Returns EntityIdPager::NO_REDIRECTS.
     *
     * @return mixed a EntityIdPager::XXX_REDIRECTS constant
     */
    protected function getRedirectMode() {
        return EntityIdPager::NO_REDIRECTS;
    }

    /**
     * Cache flag for use in Store::getEntityRevisionLookup.
     *
     * @return string One of Store::LOOKUP_CACHING_RETRIEVE_ONLY and Store::LOOKUP_CACHING_DISABLED
     */
    protected function getEntityRevisionLookupCacheMode() {
        if ( $this->getOption( 'no-cache', false ) ) {
            return Store::LOOKUP_CACHING_DISABLED;
        } else {
            return Store::LOOKUP_CACHING_RETRIEVE_ONLY;
        }
    }

    /**
     * @param string[] $entityTypes
     *
     * @return SqlEntityIdPager
     */
    private function makeIdQueryStream( array $entityTypes ) {
        $sqlEntityIdPager = $this->sqlEntityIdPagerFactory->newSqlEntityIdPager( $entityTypes, $this->getRedirectMode() );

        $firstPageId = $this->getOption( 'first-page-id', null );
        if ( $firstPageId ) {
            $sqlEntityIdPager->setPosition( intval( $firstPageId ) - 1 );
        }
        $lastPageId = $this->getOption( 'last-page-id', null );
        if ( $lastPageId ) {
            $sqlEntityIdPager->setCutoffPosition( intval( $lastPageId ) );
        }

        return $sqlEntityIdPager;
    }

    /**
     * @param string $listFile
     * @param ExceptionHandler|null $exceptionReporter
     *
     * @return EntityIdReader
     */
    private function makeIdFileStream( $listFile, ExceptionHandler $exceptionReporter = null ) {
        $input = fopen( $listFile, 'r' );

        if ( !$input ) {
            $this->fatalError( "Failed to open ID file: $listFile" );
        }

        $stream = new EntityIdReader( new LineReader( $input ), WikibaseRepo::getEntityIdParser() );
        $stream->setExceptionHandler( $exceptionReporter );

        return $stream;
    }

}