wikimedia/mediawiki-core

View on GitHub
maintenance/storage/trackBlobs.php

Summary

Maintainability
D
2 days
Test Coverage
<?php
/**
 * Adds blobs from a given external storage cluster to the blob_tracking table.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

use MediaWiki\Revision\SlotRecord;
use Wikimedia\Rdbms\DBConnectionError;
use Wikimedia\Rdbms\IExpression;
use Wikimedia\Rdbms\LikeValue;
use Wikimedia\Rdbms\OrExpressionGroup;

require_once __DIR__ . '/../Maintenance.php';

class TrackBlobs extends Maintenance {
    public $clusters;
    public $textClause;
    public $doBlobOrphans;
    public $trackedBlobs = [];

    public $batchSize = 1000;
    public $reportingInterval = 10;

    public function __construct() {
        parent::__construct();

        $this->addArg( 'cluster', 'cluster(s) to scan', true, true );

        $this->addDescription(
            'Adds blobs from a given ES cluster to the blob_tracking table. ' .
            'Automatically deletes the tracking table and starts from the start again when restarted.'
        );
    }

    public function execute() {
        $this->clusters = $this->parameters->getArgs();
        if ( extension_loaded( 'gmp' ) ) {
            $this->doBlobOrphans = true;
            foreach ( $this->clusters as $cluster ) {
                $this->trackedBlobs[$cluster] = gmp_init( 0 );
            }
        } else {
            echo "Warning: the gmp extension is needed to find orphan blobs\n";
        }

        $this->checkIntegrity();
        $this->initTrackingTable();
        $this->trackRevisions();
        $this->trackOrphanText();
        if ( $this->doBlobOrphans ) {
            $this->findOrphanBlobs();
        }
        $this->output( "All done.\n" );
    }

    private function checkIntegrity() {
        echo "Doing integrity check...\n";
        $dbr = $this->getReplicaDB();

        // Scan for HistoryBlobStub objects in the text table (T22757)

        $exists = (bool)$dbr->newSelectQueryBuilder()
            ->select( '1' )
            ->from( 'text' )
            ->where(
                'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
                'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'' )
            ->caller( __METHOD__ )->fetchField();

        if ( $exists ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
                "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
                "to fix this.\n";
            exit( 1 );
        }

        echo "Integrity check OK\n";
    }

    private function initTrackingTable() {
        $dbw = $this->getDB( DB_PRIMARY );
        if ( $dbw->tableExists( 'blob_tracking', __METHOD__ ) ) {
            $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_tracking' ), __METHOD__ );
            $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_orphans' ), __METHOD__ );
        }
        $dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
    }

    private function getTextClause() {
        if ( !$this->textClause ) {
            $dbr = $this->getReplicaDB();
            $conds = [];
            foreach ( $this->clusters as $cluster ) {
                $conds[] = $dbr->expr(
                    'old_text',
                    IExpression::LIKE,
                    new LikeValue( "DB://$cluster/", $dbr->anyString() )
                );
            }
            $this->textClause = new OrExpressionGroup( ...$conds );
        }

        return $this->textClause;
    }

    private function interpretPointer( $text ) {
        if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
            return false;
        }

        return [
            'cluster' => $m[1],
            'id' => intval( $m[2] ),
            'hash' => $m[3] ?? null
        ];
    }

    /**
     *  Scan the revision table for rows stored in the specified clusters
     */
    private function trackRevisions() {
        $dbw = $this->getPrimaryDB();
        $dbr = $this->getReplicaDB();

        $textClause = $this->getTextClause();
        $startId = 0;
        $endId = (int)$dbr->newSelectQueryBuilder()
            ->select( 'MAX(rev_id)' )
            ->from( 'revision' )
            ->caller( __METHOD__ )->fetchField();
        $batchesDone = 0;
        $rowsInserted = 0;

        echo "Finding revisions...\n";

        $conds = [
            $textClause,
            $dbr->expr(
                'old_flags',
                IExpression::LIKE,
                new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
            )
        ];
        $slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();

        $conds = array_merge( [
            'slot_role_id=' . $slotRoleStore->getId( SlotRecord::MAIN ),
            'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
        ], $conds );

        while ( true ) {
            $res = $dbr->newSelectQueryBuilder()
                ->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
                ->from( 'revision' )
                ->join( 'slots', null, 'rev_id=slot_revision_id' )
                ->join( 'content', null, 'content_id=slot_content_id' )
                ->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
                ->where( $dbr->expr( 'rev_id', '>', $startId ) )
                ->andWhere( $conds )
                ->orderBy( 'rev_id' )
                ->limit( $this->batchSize )
                ->caller( __METHOD__ )->fetchResultSet();
            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = [];
            foreach ( $res as $row ) {
                $startId = (int)$row->rev_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
                    continue;
                }
                if ( !in_array( $info['cluster'], $this->clusters ) ) {
                    echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
                    continue;
                }
                $insertBatch[] = [
                    'bt_page' => $row->rev_page,
                    'bt_rev_id' => $row->rev_id,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                ];
                if ( $this->doBlobOrphans ) {
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->newInsertQueryBuilder()
                ->insertInto( 'blob_tracking' )
                ->rows( $insertBatch )
                ->caller( __METHOD__ )->execute();
            $rowsInserted += count( $insertBatch );

            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                $this->waitForReplication();
            }
        }
        echo "Found $rowsInserted revisions\n";
    }

    /**
     * Scan the text table for orphan text
     * Orphan text here does not imply DB corruption -- deleted text tracked by the
     * archive table counts as orphan for our purposes.
     */
    private function trackOrphanText() {
        # Wait until the blob_tracking table is available in the replica DB
        $dbw = $this->getPrimaryDB();
        $dbr = $this->getReplicaDB();
        $this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [ 'timeout' => 100_000 ] );

        $textClause = $this->getTextClause();
        $startId = 0;
        $endId = (int)$dbr->newSelectQueryBuilder()
            ->select( 'MAX(old_id)' )
            ->from( 'text' )
            ->caller( __METHOD__ )->fetchField();
        $rowsInserted = 0;
        $batchesDone = 0;

        echo "Finding orphan text...\n";

        # Scan the text table for orphan text
        while ( true ) {
            $res = $dbr->newSelectQueryBuilder()
                ->select( [ 'old_id', 'old_flags', 'old_text' ] )
                ->from( 'text' )
                ->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
                ->where( [
                    $dbr->expr( 'old_id', '>', $startId ),
                    $textClause,
                    $dbr->expr(
                        'old_flags',
                        IExpression::LIKE,
                        new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
                    ),
                    'bt_text_id' => null,
                ] )
                ->orderBy( 'old_id' )
                ->limit( $this->batchSize )
                ->caller( __METHOD__ )->fetchResultSet();

            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = [];
            foreach ( $res as $row ) {
                $startId = (int)$row->old_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in old_id {$row->old_id}\n";
                    continue;
                }
                if ( !in_array( $info['cluster'], $this->clusters ) ) {
                    echo "Invalid cluster returned in SQL query\n";
                    continue;
                }

                $insertBatch[] = [
                    'bt_page' => 0,
                    'bt_rev_id' => 0,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                ];
                if ( $this->doBlobOrphans ) {
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->newInsertQueryBuilder()
                ->insertInto( 'blob_tracking' )
                ->rows( $insertBatch )
                ->caller( __METHOD__ )->execute();

            $rowsInserted += count( $insertBatch );
            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                $this->waitForReplication();
            }
        }
        echo "Found $rowsInserted orphan text rows\n";
    }

    /**
     * Scan the blobs table for rows not registered in blob_tracking (and thus not
     * registered in the text table).
     *
     * Orphan blobs are indicative of DB corruption. They are inaccessible and
     * should probably be deleted.
     */
    private function findOrphanBlobs() {
        if ( !extension_loaded( 'gmp' ) ) {
            echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

            return;
        }

        $dbw = $this->getPrimaryDB();
        $lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();

        foreach ( $this->clusters as $cluster ) {
            echo "Searching for orphan blobs in $cluster...\n";
            $lb = $lbFactory->getExternalLB( $cluster );
            try {
                $extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
            } catch ( DBConnectionError $e ) {
                if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
                    echo "No database on $cluster\n";
                } else {
                    echo "Error on $cluster: " . $e->getMessage() . "\n";
                }
                continue;
            }
            $table = $extDB->getLBInfo( 'blobs table' ) ?? 'blobs';
            if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
                echo "No blobs table on cluster $cluster\n";
                continue;
            }
            $startId = 0;
            $batchesDone = 0;
            $actualBlobs = gmp_init( 0 );
            $endId = (int)$extDB->newSelectQueryBuilder()
                ->select( 'MAX(blob_id)' )
                ->from( $table )
                ->caller( __METHOD__ )->fetchField();

            // Build a bitmap of actual blob rows
            while ( true ) {
                $res = $extDB->newSelectQueryBuilder()
                    ->select( [ 'blob_id' ] )
                    ->from( $table )
                    ->where( $extDB->expr( 'blob_id', '>', $startId ) )
                    ->orderBy( 'blob_id' )
                    ->limit( $this->batchSize )
                    ->caller( __METHOD__ )->fetchResultSet();

                if ( !$res->numRows() ) {
                    break;
                }

                foreach ( $res as $row ) {
                    gmp_setbit( $actualBlobs, $row->blob_id );
                    $startId = (int)$row->blob_id;
                }

                ++$batchesDone;
                if ( $batchesDone >= $this->reportingInterval ) {
                    $batchesDone = 0;
                    echo "$startId / $endId\n";
                }
            }

            // Find actual blobs that weren't tracked by the previous passes
            // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
            $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

            // Traverse the orphan list
            $insertBatch = [];
            $id = 0;
            $numOrphans = 0;
            while ( true ) {
                $id = gmp_scan1( $orphans, $id );
                if ( $id == -1 ) {
                    break;
                }
                $insertBatch[] = [
                    'bo_cluster' => $cluster,
                    'bo_blob_id' => $id
                ];
                if ( count( $insertBatch ) > $this->batchSize ) {
                    $dbw->newInsertQueryBuilder()
                        ->insertInto( 'blob_orphans' )
                        ->rows( $insertBatch )
                        ->caller( __METHOD__ )->execute();
                    $insertBatch = [];
                }

                ++$id;
                ++$numOrphans;
            }
            if ( $insertBatch ) {
                $dbw->newInsertQueryBuilder()
                    ->insertInto( 'blob_orphans' )
                    ->rows( $insertBatch )
                    ->caller( __METHOD__ )->execute();
            }
            echo "Found $numOrphans orphan(s) in $cluster\n";
        }
    }
}

$maintClass = TrackBlobs::class;
require_once RUN_MAINTENANCE_IF_MAIN;