maintenance/storage/trackBlobs.php
<?php
/**
* Adds blobs from a given external storage cluster to the blob_tracking table.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use MediaWiki\Revision\SlotRecord;
use Wikimedia\Rdbms\DBConnectionError;
use Wikimedia\Rdbms\IExpression;
use Wikimedia\Rdbms\LikeValue;
use Wikimedia\Rdbms\OrExpressionGroup;
require_once __DIR__ . '/../Maintenance.php';
class TrackBlobs extends Maintenance {
/** @var string[] Names of the clusters to scan; assigned from the command-line arguments in execute() */
public $clusters;
/** @var OrExpressionGroup|null Cached "old_text LIKE 'DB://<cluster>/%'" condition built lazily by getTextClause() */
public $textClause;
/** @var bool|null Set to true in execute() when the gmp extension is loaded; left unset otherwise */
public $doBlobOrphans;
/** @var array Map of cluster name => GMP number used as a bitfield, with one bit set per tracked blob ID */
public $trackedBlobs = [];
/** @var int Row limit of each SELECT batch; also used as the flush threshold for orphan inserts */
public $batchSize = 1000;
/** @var int Number of batches processed between two progress reports */
public $reportingInterval = 10;
/**
 * Declare the 'cluster' argument(s) and the help text of the script.
 */
public function __construct() {
	parent::__construct();

	$description = 'Adds blobs from a given ES cluster to the blob_tracking table. ' .
		'Automatically deletes the tracking table and starts from the start again when restarted.';

	// execute() reads every value given for this argument through getArgs().
	$this->addArg( 'cluster', 'cluster(s) to scan', true, true );
	$this->addDescription( $description );
}
/**
 * Run the whole tracking pipeline: integrity check, (re)creation of the
 * tracking tables, revision scan, orphan text scan and, when the gmp
 * extension is available, the orphan blob scan.
 */
public function execute() {
	$this->clusters = $this->parameters->getArgs();

	if ( !extension_loaded( 'gmp' ) ) {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";
	} else {
		$this->doBlobOrphans = true;
		// Each cluster gets its own bitfield; a fresh GMP object is created
		// per cluster so that setting a bit for one never affects another.
		foreach ( $this->clusters as $clusterName ) {
			$this->trackedBlobs[$clusterName] = gmp_init( 0 );
		}
	}

	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	if ( $this->doBlobOrphans ) {
		$this->findOrphanBlobs();
	}

	$this->output( "All done.\n" );
}
/**
 * Abort the script if the text table still contains HistoryBlobStub objects.
 *
 * Prints a diagnostic and terminates the process with exit status 1 when
 * such a row is found; otherwise reports success and returns.
 */
private function checkIntegrity() {
	echo "Doing integrity check...\n";

	// Scan for HistoryBlobStub objects in the text table (T22757)
	$stubCondition =
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'';

	$stubFound = (bool)$this->getReplicaDB()->newSelectQueryBuilder()
		->select( '1' )
		->from( 'text' )
		->where( $stubCondition )
		->caller( __METHOD__ )
		->fetchField();

	if ( !$stubFound ) {
		echo "Integrity check OK\n";
		return;
	}

	echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
		"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
		"to fix this.\n";
	exit( 1 );
}
/**
 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
 *
 * Each table is dropped independently, so a partially initialised schema
 * (only one of the two tables present) no longer makes the DROP or the
 * subsequent CREATE fail.
 */
private function initTrackingTable() {
	$dbw = $this->getDB( DB_PRIMARY );

	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		// dropTable() checks for existence itself and does nothing when the
		// table is absent.
		$dbw->dropTable( $table, __METHOD__ );
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
/**
 * Build (once) and return the condition that matches text rows whose
 * old_text starts with "DB://<cluster>/" for any of the requested clusters.
 *
 * @return OrExpressionGroup
 */
private function getTextClause() {
	if ( $this->textClause ) {
		return $this->textClause;
	}

	$dbr = $this->getReplicaDB();
	$likeExprs = [];
	foreach ( $this->clusters as $clusterName ) {
		$pattern = new LikeValue( "DB://$clusterName/", $dbr->anyString() );
		$likeExprs[] = $dbr->expr( 'old_text', IExpression::LIKE, $pattern );
	}

	$this->textClause = new OrExpressionGroup( ...$likeExprs );

	return $this->textClause;
}
/**
 * Split an external storage pointer of the form "DB://cluster/id" or
 * "DB://cluster/id/hash" into its parts.
 *
 * @param string $text Candidate pointer, as stored in old_text
 * @return array|false Array with keys 'cluster' (string), 'id' (int) and
 *  'hash' (hexadecimal string, or null when absent); false when $text is
 *  not a pointer of that form
 */
private function interpretPointer( $text ) {
	$matches = [];
	$isPointer = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $matches );
	if ( !$isPointer ) {
		return false;
	}

	[ , $cluster, $blobId ] = $matches;

	return [
		'cluster' => $cluster,
		'id' => (int)$blobId,
		// The hash group is left out of $matches when the pointer has no hash part
		'hash' => $matches[3] ?? null
	];
}
/**
 * Scan the revision table for rows stored in the specified clusters.
 *
 * Walks the main-slot revisions in ascending rev_id order, in batches of
 * $this->batchSize, and writes one blob_tracking row per revision whose text
 * row is an external pointer ("DB://...") into one of the requested clusters.
 * When orphan blob detection is enabled, the blob ID is also marked in the
 * per-cluster bitfield. Rows with an unparsable pointer or an unexpected
 * cluster are reported and skipped.
 */
private function trackRevisions() {
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getReplicaDB();
	$textClause = $this->getTextClause();
	$startId = 0;
	// Only used for the progress report
	$endId = (int)$dbr->newSelectQueryBuilder()
		->select( 'MAX(rev_id)' )
		->from( 'revision' )
		->caller( __METHOD__ )->fetchField();
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
	$conds = [
		'slot_role_id=' . $slotRoleStore->getId( SlotRecord::MAIN ),
		'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
		$textClause,
		$dbr->expr(
			'old_flags',
			IExpression::LIKE,
			new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
		),
	];

	while ( true ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
			->from( 'revision' )
			->join( 'slots', null, 'rev_id=slot_revision_id' )
			->join( 'content', null, 'content_id=slot_content_id' )
			->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
			->where( $dbr->expr( 'rev_id', '>', $startId ) )
			->andWhere( $conds )
			->orderBy( 'rev_id' )
			->limit( $this->batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rejected rows so the scan always progresses
			$startId = (int)$row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// Skip the write when every row of the batch was rejected, so that an
		// empty row set is never handed to the insert builder.
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_tracking' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			$this->waitForReplication();
		}
	}

	echo "Found $rowsInserted revisions\n";
}
/**
 * Scan the text table for orphan text.
 *
 * Orphan text here does not imply DB corruption -- deleted text tracked by the
 * archive table counts as orphan for our purposes.
 *
 * Walks the text table in ascending old_id order, in batches of
 * $this->batchSize, selecting external pointers into the requested clusters
 * that have no blob_tracking row yet, and records them with bt_page and
 * bt_rev_id set to 0. When orphan blob detection is enabled, the blob ID is
 * also marked in the per-cluster bitfield. Rows with an unparsable pointer or
 * an unexpected cluster are reported and skipped.
 */
private function trackOrphanText() {
	# Wait until the blob_tracking table is available in the replica DB
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getReplicaDB();
	$this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [ 'timeout' => 100_000 ] );

	$textClause = $this->getTextClause();
	$startId = 0;
	// Only used for the progress report
	$endId = (int)$dbr->newSelectQueryBuilder()
		->select( 'MAX(old_id)' )
		->from( 'text' )
		->caller( __METHOD__ )->fetchField();
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
			->where( [
				$dbr->expr( 'old_id', '>', $startId ),
				$textClause,
				$dbr->expr(
					'old_flags',
					IExpression::LIKE,
					new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
				),
				// No matching blob_tracking row: not yet tracked by trackRevisions()
				'bt_text_id' => null,
			] )
			->orderBy( 'old_id' )
			->limit( $this->batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rejected rows so the scan always progresses
			$startId = (int)$row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				// Name the offending cluster, as trackRevisions() does
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// Skip the write when every row of the batch was rejected, so that an
		// empty row set is never handed to the insert builder.
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_tracking' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			$this->waitForReplication();
		}
	}

	echo "Found $rowsInserted orphan text rows\n";
}
/**
 * Scan the blobs table for rows not registered in blob_tracking (and thus not
 * registered in the text table).
 *
 * Orphan blobs are indicative of DB corruption. They are inaccessible and
 * should probably be deleted.
 *
 * For each cluster, a bitfield of the blob IDs actually present is built and
 * compared with the bitfield of IDs collected by the earlier passes; every ID
 * present but not tracked is written to blob_orphans. Clusters that cannot be
 * reached or have no blobs table are reported and skipped.
 */
private function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
		return;
	}

	$dbw = $this->getPrimaryDB();
	$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = $lbFactory->getExternalLB( $cluster );
		try {
			$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
		} catch ( DBConnectionError $e ) {
			if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
				echo "No database on $cluster\n";
			} else {
				echo "Error on $cluster: " . $e->getMessage() . "\n";
			}
			continue;
		}
		$table = $extDB->getLBInfo( 'blobs table' ) ?? 'blobs';
		if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}
		$startId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		// Only used for the progress report
		$endId = (int)$extDB->newSelectQueryBuilder()
			->select( 'MAX(blob_id)' )
			->from( $table )
			->caller( __METHOD__ )->fetchField();

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->newSelectQueryBuilder()
				->select( [ 'blob_id' ] )
				->from( $table )
				->where( $extDB->expr( 'blob_id', '>', $startId ) )
				->orderBy( 'blob_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();
			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $row ) {
				// Convert once and use the integer both as bit index and as cursor
				$startId = (int)$row->blob_id;
				gmp_setbit( $actualBlobs, $startId );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

		// Traverse the orphan list
		$insertBatch = [];
		$id = 0;
		$numOrphans = 0;
		while ( true ) {
			// Index of the next set bit at or after $id, or -1 when none is left
			$id = gmp_scan1( $orphans, $id );
			if ( $id === -1 ) {
				break;
			}
			$insertBatch[] = [
				'bo_cluster' => $cluster,
				'bo_blob_id' => $id
			];
			// Flush as soon as the batch is full, so no insert exceeds batchSize rows
			if ( count( $insertBatch ) >= $this->batchSize ) {
				$dbw->newInsertQueryBuilder()
					->insertInto( 'blob_orphans' )
					->rows( $insertBatch )
					->caller( __METHOD__ )->execute();
				$insertBatch = [];
			}

			++$id;
			++$numOrphans;
		}
		// Write whatever is left over from the last, partial batch
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_orphans' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
}
// Name the class implementing this script, then include the runner.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably it
// executes $maintClass when this script is the entry point -- confirm in Maintenance.php.
$maintClass = TrackBlobs::class;
require_once RUN_MAINTENANCE_IF_MAIN;