includes/Storage/PageEditStash.php
<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
namespace MediaWiki\Storage;
use MediaWiki\Content\Content;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\WikiPageFactory;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\ParserOutputFlags;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Storage\Hook\ParserOutputStashForEditHook;
use MediaWiki\User\UserEditTracker;
use MediaWiki\User\UserFactory;
use MediaWiki\User\UserIdentity;
use Psr\Log\LoggerInterface;
use stdClass;
use Wikimedia\ObjectCache\BagOStuff;
use Wikimedia\Rdbms\IConnectionProvider;
use Wikimedia\ScopedCallback;
use Wikimedia\Stats\StatsFactory;
use WikiPage;
/**
* Manage the pre-emptive page parsing for edits to wiki pages.
*
* This is written to by ApiStashEdit, and consumed by ApiEditPage
* and EditPage (via PageUpdaterFactory and DerivedPageDataUpdater).
*
* See also mediawiki.action.edit/stash.js.
*
* @since 1.34
* @ingroup Page
*/
class PageEditStash {
/** @var BagOStuff Cache holding the stashed edit info, stashed input text and per-user recent key lists */
private $cache;
/** @var IConnectionProvider Source of the primary DB (named locks) and replica DB (recent changes lookup) */
private $dbProvider;
/** @var LoggerInterface */
private $logger;
/** @var StatsFactory Used for cache hit/miss counters and lock wait timings */
private $stats;
/** @var ParserOutputStashForEditHook */
private $hookRunner;
/** @var UserEditTracker Used to snapshot and compare the user's edit count */
private $userEditTracker;
/** @var UserFactory */
private $userFactory;
/** @var WikiPageFactory */
private $wikiPageFactory;
/** @var int One of the INITIATOR_* constants */
private $initiator;
// Result codes returned by parseAndCache():
// - ERROR_NONE: the parse is stashed (newly stored, or a fresh entry already existed)
public const ERROR_NONE = 'stashed';
// - ERROR_PARSE: no parser output was available to stash
public const ERROR_PARSE = 'error_parse';
// - ERROR_CACHE: writing the stash entry failed
public const ERROR_CACHE = 'error_cache';
// - ERROR_UNCACHEABLE: the computed TTL for the parser output was not positive
public const ERROR_UNCACHEABLE = 'uncacheable';
// - ERROR_BUSY: another request already holds the lock for the same stash key
public const ERROR_BUSY = 'busy';
// Stash entries no older than this many seconds are reused/accepted without further checks
public const PRESUME_FRESH_TTL_SEC = 30;
// Upper bound (seconds) on the TTL of stash entries; also the TTL of stashed input text
public const MAX_CACHE_TTL = 300; // 5 minutes
// Upper bound (seconds) on the TTL when the parser output has the USER_SIGNATURE flag
public const MAX_SIGNATURE_TTL = 60;
// Size of the per-user recent stash key list at which the oldest entry gets deleted
private const MAX_CACHE_RECENT = 2;
// checkCache() only consults the stash when the initiator is INITIATOR_USER
public const INITIATOR_USER = 1;
// The context is a CLI script or a job runner request; checkCache() always misses
public const INITIATOR_JOB_OR_CLI = 2;
/**
 * @param BagOStuff $cache
 * @param IConnectionProvider $dbProvider
 * @param LoggerInterface $logger
 * @param StatsFactory $stats
 * @param UserEditTracker $userEditTracker
 * @param UserFactory $userFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param HookContainer $hookContainer
 * @param int $initiator Class INITIATOR_* constant
 */
public function __construct(
	BagOStuff $cache,
	IConnectionProvider $dbProvider,
	LoggerInterface $logger,
	StatsFactory $stats,
	UserEditTracker $userEditTracker,
	UserFactory $userFactory,
	WikiPageFactory $wikiPageFactory,
	HookContainer $hookContainer,
	$initiator
) {
	// Request context
	$this->initiator = $initiator;

	// Storage and diagnostics
	$this->cache = $cache;
	$this->dbProvider = $dbProvider;
	$this->stats = $stats;
	$this->logger = $logger;

	// User and page services
	$this->userFactory = $userFactory;
	$this->userEditTracker = $userEditTracker;
	$this->wikiPageFactory = $wikiPageFactory;

	// Hooks are run through a HookRunner wrapping the given container
	$this->hookRunner = new HookRunner( $hookContainer );
}
/**
 * Parse the given edit content ahead of the actual save and stash the result.
 *
 * A fresh matching stash entry is reused instead of parsing again. The
 * ParserOutputStashForEdit hook runs in both cases.
 *
 * @param PageUpdater $pageUpdater (a WikiPage instance is also supported but deprecated)
 * @param Content $content Edit content
 * @param UserIdentity $user
 * @param string $summary Edit summary
 * @return string Class ERROR_* constant
 */
public function parseAndCache( $pageUpdater, Content $content, UserIdentity $user, string $summary ) {
	if ( $pageUpdater instanceof WikiPage ) {
		wfDeprecated( __METHOD__ . ' with WikiPage instance', '1.42' );
		$pageUpdater = $pageUpdater->newPageUpdater( $user );
	}

	$page = $pageUpdater->getPage();
	$stashKey = $this->getStashKey( $page, $this->getContentHash( $content ), $user );
	$caller = __METHOD__;

	// The lock lives on the primary DB so that the "save path", where this value might
	// actually be used to complete a page edit, can block on it cheaply. If the edit
	// submission request arrives before this stash request finishes, the submission
	// blocks until parsing is done. Here, however, there is little point in parsing the
	// same content/user/summary bundle twice, so do not wait for the lock at all.
	$dbw = $this->dbProvider->getPrimaryDatabase();
	$gotLock = $dbw->lock( $stashKey, $caller, 0 );
	if ( !$gotLock ) {
		// De-duplicate requests on the same key
		return self::ERROR_BUSY;
	}
	/** @noinspection PhpUnusedLocalVariableInspection */
	$unlocker = new ScopedCallback( static function () use ( $dbw, $stashKey, $caller ) {
		$dbw->unlock( $stashKey, $caller );
	} );

	$cutoffTime = time() - self::PRESUME_FRESH_TTL_SEC;

	// Reuse any freshly built matching edit stash cache
	$editInfo = $this->getStashValue( $stashKey );
	$alreadyCached = $editInfo
		&& (int)wfTimestamp( TS_UNIX, $editInfo->timestamp ) >= $cutoffTime;

	if ( !$alreadyCached ) {
		$pageUpdater->setContent( SlotRecord::MAIN, $content );
		// Applies the pre-save transform
		$update = $pageUpdater->prepareUpdate( EDIT_INTERNAL );
		// Causes the content to be parsed
		$output = $update->getCanonicalParserOutput();
		$output->setCacheTime( $update->getRevision()->getTimestamp() );

		// Emulate a cache value that kind of looks like a PreparedEdit, for use below
		$editInfo = new stdClass();
		$editInfo->pstContent = $update->getRawContent( SlotRecord::MAIN );
		$editInfo->output = $output;
		$editInfo->timestamp = $output->getCacheTime();
	}

	$logContext = [ 'cachekey' => $stashKey, 'title' => (string)$page ];

	if ( !$editInfo->output ) {
		return self::ERROR_PARSE;
	}

	// Let extensions add ParserOutput metadata or warm other caches
	$legacyUser = $this->userFactory->newFromUserIdentity( $user );
	$legacyPage = $this->wikiPageFactory->newFromTitle( $page );
	$this->hookRunner->onParserOutputStashForEdit(
		$legacyPage, $content, $editInfo->output, $summary, $legacyUser );

	if ( $alreadyCached ) {
		$this->logger->debug( "Parser output for key '{cachekey}' already cached.", $logContext );
		return self::ERROR_NONE;
	}

	$code = $this->storeStashValue(
		$stashKey,
		$editInfo->pstContent,
		$editInfo->output,
		$editInfo->timestamp,
		$user
	);

	if ( $code === true ) {
		$this->logger->debug( "Cached parser output for key '{cachekey}'.", $logContext );
		return self::ERROR_NONE;
	}

	if ( $code === 'uncacheable' ) {
		$this->logger->info(
			"Uncacheable parser output for key '{cachekey}' [{code}].",
			$logContext + [ 'code' => $code ]
		);
		return self::ERROR_UNCACHEABLE;
	}

	$this->logger->error(
		"Failed to cache parser output for key '{cachekey}'.",
		$logContext + [ 'code' => $code ]
	);
	return self::ERROR_CACHE;
}
/**
 * Check that a prepared edit is in cache and still up-to-date
 *
 * This method blocks if the prepared edit is already being rendered,
 * waiting until rendering finishes before doing final validity checks.
 *
 * The cache is rejected if template or file changes are detected.
 * Note that foreign template or file transclusions are not checked.
 *
 * The stash is never consulted (false is returned right away) when the request
 * was not POSTed, when this instance was not initiated by a user, or when the
 * user is a bot.
 *
 * This returns an object with the following fields:
 *   - pstContent: the Content after pre-save-transform
 *   - output: the ParserOutput instance
 *   - timestamp: the timestamp of the parse
 *   - edits: author edit count if they are logged in or NULL otherwise
 *
 * @param PageIdentity $page
 * @param Content $content
 * @param UserIdentity $user to get parser options from
 * @return stdClass|false Returns edit stash object or false on cache miss
 */
public function checkCache( PageIdentity $page, Content $content, UserIdentity $user ) {
	$legacyUser = $this->userFactory->newFromUserIdentity( $user );
	if (
		// The context is not an HTTP POST request
		!$legacyUser->getRequest()->wasPosted() ||
		// The context is a CLI script or a job runner HTTP POST request
		$this->initiator !== self::INITIATOR_USER ||
		// The editor account is a known bot
		$legacyUser->isBot()
	) {
		// Avoid wasted queries and statsd pollution
		return false;
	}

	$logger = $this->logger;

	$key = $this->getStashKey( $page, $this->getContentHash( $content ), $user );
	$logContext = [
		'key' => $key,
		'title' => (string)$page,
		'user' => $user->getName()
	];

	$editInfo = $this->getAndWaitForStashValue( $key );
	if ( !is_object( $editInfo ) || !$editInfo->output ) {
		$this->incrCacheReadStats( 'miss', 'no_stash', $content );
		if ( $this->recentStashEntryCount( $user ) > 0 ) {
			// The user has other recent stash entries, just not one for this content
			$logger->info( "Empty cache for key '{key}' but not for user.", $logContext );
		} else {
			$logger->debug( "Empty cache for key '{key}'.", $logContext );
		}

		return false;
	}

	$age = time() - (int)wfTimestamp( TS_UNIX, $editInfo->output->getCacheTime() );
	$logContext['age'] = $age;

	// Decide whether the entry may be used, and which message describes the hit
	if ( $age <= self::PRESUME_FRESH_TTL_SEC ) {
		// Assume nothing changed in this time
		$isCacheUsable = true;
		$hitMessage = "Timestamp-based cache hit for key '{key}'.";
	} elseif ( !$user->isRegistered() ) {
		// Usable if the logged-out user made no local upload/template edits in the meantime
		$isCacheUsable = $this->lastEditTime( $user ) < $editInfo->output->getCacheTime();
		$hitMessage = "Edit check based cache hit for key '{key}'.";
	} else {
		// Usable if the logged-in user made no local upload/template edits in the meantime
		$isCacheUsable =
			$editInfo->edits === $this->userEditTracker->getUserEditCount( $user );
		$hitMessage = "Edit count based cache hit for key '{key}'.";
	}

	if ( !$isCacheUsable ) {
		$this->incrCacheReadStats( 'miss', 'proven_stale', $content );
		$logger->info( "Stale cache for key '{key}' due to outside edits.", $logContext );

		return false;
	}

	$this->incrCacheReadStats( 'hit', 'presumed_fresh', $content );
	$logger->debug( $hitMessage, $logContext );

	if ( $editInfo->output->getOutputFlag( ParserOutputFlags::VARY_REVISION ) ) {
		// This can be used for the initial parse, e.g. for filters or doUserEditContent(),
		// but a second parse will be triggered in doEditUpdates() no matter what
		$logger->info(
			"Cache for key '{key}' has vary-revision; post-insertion parse inevitable.",
			$logContext
		);
	} else {
		$flagsMaybeReparse = [
			// Similar to the above if we didn't guess the ID correctly
			ParserOutputFlags::VARY_REVISION_ID,
			// Similar to the above if we didn't guess the timestamp correctly
			ParserOutputFlags::VARY_REVISION_TIMESTAMP,
			// Similar to the above if we didn't guess the content correctly
			ParserOutputFlags::VARY_REVISION_SHA1,
			// Similar to the above if we didn't guess page ID correctly
			ParserOutputFlags::VARY_PAGE_ID,
		];
		foreach ( $flagsMaybeReparse as $flag ) {
			if ( $editInfo->output->getOutputFlag( $flag ) ) {
				$logger->debug(
					"Cache for key '{key}' has $flag; post-insertion parse possible.",
					$logContext
				);
			}
		}
	}

	return $editInfo;
}
/**
 * Count one stash lookup outcome, labelled by result, reason and content model.
 *
 * @param string $result Either 'hit' or 'miss'
 * @param string $reason
 * @param Content $content
 */
private function incrCacheReadStats( $result, $reason, Content $content ) {
	// Name of the legacy statsd sub-key for each result
	$subtypeByResult = [ 'miss' => 'cache_misses', 'hit' => 'cache_hits' ];
	$subtype = $subtypeByResult[ $result ];
	$model = $content->getModel();

	$statsdKeys = [
		"editstash.$subtype.$reason",
		"editstash_by_model.$model.$subtype.$reason",
	];

	$this->stats->getCounter( "editstash_cache_checks_total" )
		->setLabel( 'reason', $reason )
		->setLabel( 'result', $result )
		->setLabel( 'model', $model )
		->copyToStatsdAt( $statsdKeys )
		->increment();
}
/**
 * Fetch the stash entry for a key, waiting for an in-progress parse if needed.
 *
 * If the entry is missing, this waits up to 30 seconds for the named lock on the
 * key (which parseAndCache() holds while it works) and then looks once more.
 * The time spent on this second attempt is recorded as a timing metric.
 *
 * @param string $key
 * @return bool|stdClass Stash info object, or false if there is none
 */
private function getAndWaitForStashValue( $key ) {
	$editInfo = $this->getStashValue( $key );
	if ( $editInfo ) {
		return $editInfo;
	}

	$start = microtime( true );
	// Block on any prior parsing so as to use its results and make use of
	// the time already spent parsing.
	$dbw = $this->dbProvider->getPrimaryDatabase();
	if ( $dbw->lock( $key, __METHOD__, 30 ) ) {
		try {
			$editInfo = $this->getStashValue( $key );
		} finally {
			// Always release the lock, even if reading the cache throws
			$dbw->unlock( $key, __METHOD__ );
		}
	}

	// The wait is reported in milliseconds
	$timeMs = 1000 * max( 0, microtime( true ) - $start );
	$this->stats->getTiming( 'editstash_lock_wait_seconds' )
		->copyToStatsdAt( 'editstash.lock_wait_time' )
		->observe( $timeMs );

	return $editInfo;
}
/**
 * Look up input text previously saved with stashInputText().
 *
 * @param string $textHash
 * @return string|bool Text or false if missing
 */
public function fetchInputText( $textHash ) {
	return $this->cache->get(
		$this->cache->makeKey( 'stashedit', 'text', $textHash )
	);
}
/**
 * Save input text under its hash for at most MAX_CACHE_TTL seconds.
 *
 * @param string $text
 * @param string $textHash
 * @return bool Success
 */
public function stashInputText( $text, $textHash ) {
	$ttl = self::MAX_CACHE_TTL;
	$flags = BagOStuff::WRITE_ALLOW_SEGMENTS;
	$textKey = $this->cache->makeKey( 'stashedit', 'text', $textHash );

	return $this->cache->set( $textKey, $text, $ttl, $flags );
}
/**
 * Get the newest recent changes timestamp recorded for the user's name.
 *
 * @param UserIdentity $user
 * @return string|null TS_MW timestamp or null
 */
private function lastEditTime( UserIdentity $user ) {
	$dbr = $this->dbProvider->getReplicaDatabase();

	$newestTimestamp = $dbr->newSelectQueryBuilder()
		->select( 'MAX(rc_timestamp)' )
		->from( 'recentchanges' )
		->join( 'actor', null, 'actor_id=rc_actor' )
		->where( [ 'actor_name' => $user->getName() ] )
		->caller( __METHOD__ )
		->fetchField();

	return wfTimestampOrNull( TS_MW, $newestTimestamp );
}
/**
 * Get hash of the content, factoring in model/format
 *
 * The hash covers the model, the default format, and the content
 * serialized in that default format, joined by newlines.
 *
 * @param Content $content
 * @return string SHA-1 hex digest
 */
private function getContentHash( Content $content ) {
	$parts = [];
	$parts[] = $content->getModel();
	$parts[] = $content->getDefaultFormat();
	$parts[] = $content->serialize( $content->getDefaultFormat() );

	return sha1( implode( "\n", $parts ) );
}
/**
 * Get the temporary prepared edit stash key for a user
 *
 * This key can be used for caching prepared edits provided:
 *   - a) The $user was used for PST options
 *   - b) The parser output was made from the PST using canonical matching options
 *
 * @param PageIdentity $page
 * @param string $contentHash Result of getContentHash()
 * @param UserIdentity $user User to get parser options from
 * @return string
 */
private function getStashKey( PageIdentity $page, $contentHash, UserIdentity $user ) {
	// Account for the page, identified by namespace and DB key
	$pageHash = md5( $page->getNamespace() . "\n" . $page->getDBkey() );
	// Account for user name related variables like signatures
	$userHash = md5( $user->getId() . "\n" . $user->getName() );

	// $contentHash accounts for the edit model/text
	return $this->cache->makeKey( 'stashedit-info-v2', $pageHash, $contentHash, $userHash );
}
/**
 * Read and decode the stash entry stored under a key.
 *
 * @param string $key
 * @return stdClass|bool Object map (pstContent,output,outputID,timestamp,edits) or false
 */
private function getStashValue( $key ) {
	return $this->unserializeStashInfo( $this->cache->get( $key ) );
}
/**
 * Build a value to store in memcached based on the PST content and parser output
 *
 * This makes a simple version of WikiPage::prepareContentForEdit() as stash info
 *
 * @param string $key
 * @param Content $pstContent Pre-Save transformed content
 * @param ParserOutput $parserOutput
 * @param string $timestamp TS_MW
 * @param UserIdentity $user
 * @return string|bool True on success, or the error code 'uncacheable' or 'store_error'
 */
private function storeStashValue(
	$key,
	Content $pstContent,
	ParserOutput $parserOutput,
	$timestamp,
	UserIdentity $user
) {
	// If an item is renewed, mind the cache TTL determined by config and parser functions.
	$now = time();
	$age = $now - (int)wfTimestamp( TS_UNIX, $parserOutput->getCacheTime() );

	// Put an upper limit on the TTL to avoid extreme template/file staleness.
	$ttlLimits = [ $parserOutput->getCacheExpiry() - $age, self::MAX_CACHE_TTL ];
	if ( $parserOutput->getOutputFlag( ParserOutputFlags::USER_SIGNATURE ) ) {
		// Avoid extremely stale user signature timestamps (T84843)
		$ttlLimits[] = self::MAX_SIGNATURE_TTL;
	}
	$ttl = min( $ttlLimits );

	if ( $ttl <= 0 ) {
		// Low TTL due to a tag, magic word, or signature?
		return 'uncacheable';
	}

	// Store only what is actually needed (T204742)
	$stashInfo = new stdClass();
	$stashInfo->pstContent = $pstContent;
	$stashInfo->output = $parserOutput;
	$stashInfo->timestamp = $timestamp;
	$stashInfo->edits = $this->userEditTracker->getUserEditCount( $user );

	$serial = $this->serializeStashInfo( $stashInfo );
	if ( $serial === false ) {
		return 'store_error';
	}

	$ok = $this->cache->set( $key, $serial, $ttl, BagOStuff::WRITE_ALLOW_SEGMENTS );
	if ( !$ok ) {
		return 'store_error';
	}

	// These blobs can waste slots in low cardinality memcached slabs
	$this->pruneExcessStashedEntries( $user, $key );

	return true;
}
/**
 * Record a newly stored stash key for the user and evict the oldest entry
 * once the user's list of recent keys is full.
 *
 * The key being recorded is never the one evicted: if it is already in the
 * list (the same content was stashed again), it is moved to the newest
 * position instead of being listed twice.
 *
 * @param UserIdentity $user
 * @param string $newKey Stash key that was just written
 */
private function pruneExcessStashedEntries( UserIdentity $user, $newKey ) {
	$key = $this->cache->makeKey( 'stash-edit-recent', sha1( $user->getName() ) );

	$keyList = $this->cache->get( $key );
	if ( !is_array( $keyList ) ) {
		// Cache miss, or an unexpected value that cannot be used as a list
		$keyList = [];
	}

	// Drop earlier mentions of $newKey so that the shift below cannot pick it
	// and delete the value that was stored just before this call.
	$keyList = array_values( array_diff( $keyList, [ $newKey ] ) );

	if ( count( $keyList ) >= self::MAX_CACHE_RECENT ) {
		$oldestKey = array_shift( $keyList );
		$this->cache->delete( $oldestKey, BagOStuff::WRITE_ALLOW_SEGMENTS );
	}

	$keyList[] = $newKey;
	$this->cache->set( $key, $keyList, 2 * self::MAX_CACHE_TTL );
}
/**
 * Count the stash keys currently listed as recent for the user's name.
 *
 * @param UserIdentity $user
 * @return int
 */
private function recentStashEntryCount( UserIdentity $user ) {
	$listKey = $this->cache->makeKey( 'stash-edit-recent', sha1( $user->getName() ) );
	$recentKeys = $this->cache->get( $listKey );

	// A missing (or otherwise empty) list counts as zero entries
	return $recentKeys ? count( $recentKeys ) : 0;
}
/**
 * Encode a stash info object into the string form stored in the cache.
 *
 * @param stdClass $stashInfo
 * @return string PHP-serialized representation
 */
private function serializeStashInfo( stdClass $stashInfo ) {
	// @todo: use JSON with ParserOutput and Content
	$serial = serialize( $stashInfo );

	return $serial;
}
/**
 * Decode a value read from the cache back into a stash info object.
 *
 * Anything that is not a serialized object carrying a ParserOutput in its
 * "output" field is treated as a cache miss.
 *
 * @param mixed $serial Value as returned by the cache (false on a miss)
 * @return stdClass|false Stash info object, or false if unusable
 */
private function unserializeStashInfo( $serial ) {
	if ( !is_string( $serial ) ) {
		return false;
	}

	// @todo: use JSON with ParserOutput and Content
	// NOTE(review): unserialize() can instantiate arbitrary classes, so this is
	// only safe while the cache backend is trusted - confirm, or move to JSON.
	$stashInfo = unserialize( $serial );

	if (
		is_object( $stashInfo ) &&
		// Guard the property read so malformed values do not raise warnings
		isset( $stashInfo->output ) &&
		$stashInfo->output instanceof ParserOutput
	) {
		return $stashInfo;
	}

	return false;
}
}