wikimedia/mediawiki-core

View on GitHub
includes/parser/ParserCache.php

Summary

Maintainability
D
3 days
Test Coverage
<?php
/**
 * Cache for outputs of the PHP parser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Cache Parser
 */

use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Json\JsonCodec;
use MediaWiki\Page\PageRecord;
use MediaWiki\Page\WikiPageFactory;
use MediaWiki\Parser\ParserCacheFilter;
use MediaWiki\Parser\ParserCacheMetadata;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Title\TitleFactory;
use Psr\Log\LoggerInterface;
use Wikimedia\Stats\StatsFactory;
use Wikimedia\UUID\GlobalIdGenerator;

/**
 * Cache for ParserOutput objects corresponding to the latest page revisions.
 *
 * The ParserCache is a two-tiered cache backed by BagOStuff which supports
 * varying the stored content on the values of ParserOptions used during
 * a page parse.
 *
 * First tier is keyed by the page ID and stores ParserCacheMetadata, which
 * contains information about cache expiration and the list of ParserOptions
 * used during the parse of the page. For example, if only 'dateformat' and
 * 'userlang' options were accessed by the parser when producing output for the
 * page, array [ 'dateformat', 'userlang' ] will be stored in the metadata cache.
 * This means none of the other existing options had any effect on the output.
 *
 * The second tier of the cache contains ParserOutput objects. The key for the
 * second tier is constructed from the page ID and values of those ParserOptions
 * used during a page parse which affected the output. Upon cache lookup, the list
 * of used option names is retrieved from tier 1 cache, and only the values of
 * those options are hashed together with the page ID to produce a key, while
 * the rest of the options are ignored. Following the example above where
 * only [ 'dateformat', 'userlang' ] options changed the parser output for a
 * page, the key will look like 'page_id!dateformat=default:userlang=ru'.
 * Thus any cache lookup with dateformat=default and userlang=ru will hit the
 * same cache entry regardless of the values of the rest of the options, since they
 * were not accessed during a parse and thus did not change the output.
 *
 * @see ParserOutput::recordOption()
 * @see ParserOutput::getUsedOptions()
 * @see ParserOptions::allCacheVaryingOptions()
 * @ingroup Cache Parser
 */
class ParserCache {
    /**
     * Constants for self::getKey()
     * @since 1.30
     * @since 1.36 the constants were made public
     */

    /** Use only current data */
    public const USE_CURRENT_ONLY = 0;

    /** Use expired data if current data is unavailable */
    public const USE_EXPIRED = 1;

    /** Use expired data or data from different revisions if current data is unavailable */
    public const USE_OUTDATED = 2;

    /**
     * Use expired data and data from different revisions, and if all else
     * fails vary on all variable options
     */
    private const USE_ANYTHING = 3;

    /** @var string The name of this ParserCache. Used as a root of the cache key. */
    private $name;

    /** @var BagOStuff */
    private $cache;

    /**
     * Anything cached prior to this is invalidated
     *
     * @var string
     */
    private $cacheEpoch;

    /** @var HookRunner */
    private $hookRunner;

    /** @var JsonCodec */
    private $jsonCodec;

    /** @var StatsFactory */
    private $stats;

    /** @var LoggerInterface */
    private $logger;

    /** @var TitleFactory */
    private $titleFactory;

    /** @var WikiPageFactory */
    private $wikiPageFactory;

    private ?ParserCacheFilter $filter = null;

    private GlobalIdGenerator $globalIdGenerator;

    /**
     * @var BagOStuff small in-process cache to store metadata.
     * It's needed multiple times during the request, for example
     * to build a PoolWorkArticleView key, and then to fetch the
     * actual ParserCache entry.
     */
    private $metadataProcCache;

    /**
     * Setup a cache pathway with a given back-end storage mechanism.
     *
     * This class use an invalidation strategy that is compatible with
     * MultiWriteBagOStuff in async replication mode.
     *
     * @param string $name
     * @param BagOStuff $cache
     * @param string $cacheEpoch Anything before this timestamp is invalidated
     * @param HookContainer $hookContainer
     * @param JsonCodec $jsonCodec
     * @param StatsFactory $stats
     * @param LoggerInterface $logger
     * @param TitleFactory $titleFactory
     * @param WikiPageFactory $wikiPageFactory
     * @param GlobalIdGenerator $globalIdGenerator
     */
    public function __construct(
        string $name,
        BagOStuff $cache,
        string $cacheEpoch,
        HookContainer $hookContainer,
        JsonCodec $jsonCodec,
        StatsFactory $stats,
        LoggerInterface $logger,
        TitleFactory $titleFactory,
        WikiPageFactory $wikiPageFactory,
        GlobalIdGenerator $globalIdGenerator
    ) {
        $this->name = $name;
        $this->cache = $cache;
        $this->cacheEpoch = $cacheEpoch;
        $this->hookRunner = new HookRunner( $hookContainer );
        $this->jsonCodec = $jsonCodec;
        $this->stats = $stats;
        $this->logger = $logger;
        $this->titleFactory = $titleFactory;
        $this->wikiPageFactory = $wikiPageFactory;
        $this->globalIdGenerator = $globalIdGenerator;
        $this->metadataProcCache = new HashBagOStuff( [ 'maxKeys' => 2 ] );
    }

    /**
     * @since 1.41
     * @param ParserCacheFilter $filter
     */
    public function setFilter( ParserCacheFilter $filter ): void {
        $this->filter = $filter;
    }

    /**
     * @param PageRecord $page
     * @since 1.28
     */
    public function deleteOptionsKey( PageRecord $page ) {
        $page->assertWiki( PageRecord::LOCAL );
        $key = $this->makeMetadataKey( $page );
        $this->metadataProcCache->delete( $key );
        $this->cache->delete( $key );
    }

    /**
     * Retrieve the ParserOutput from ParserCache, even if it's outdated.
     * @param PageRecord $page
     * @param ParserOptions $popts
     * @return ParserOutput|false
     */
    public function getDirty( PageRecord $page, $popts ) {
        $page->assertWiki( PageRecord::LOCAL );
        $value = $this->get( $page, $popts, true );
        return is_object( $value ) ? $value : false;
    }

    /**
     * @param PageRecord $page
     * @return string
     */
    private function getContentModelFromPage( PageRecord $page ) {
        $wikiPage = $this->wikiPageFactory->newFromTitle( $page );
        return str_replace( '.', '_', $wikiPage->getContentModel() );
    }

    /**
     * @param PageRecord $page
     * @param string $status
     * @param string|null $reason
     */
    private function incrementStats( PageRecord $page, $status, $reason = null ) {
        $contentModel = $this->getContentModelFromPage( $page );
        $metricSuffix = $reason ? "{$status}_{$reason}" : $status;

        $this->stats->getCounter( 'ParserCache_operation_total' )
            ->setLabel( 'name', $this->name )
            ->setLabel( 'contentModel', $contentModel )
            ->setLabel( 'status', $status )
            ->setLabel( 'reason', $reason ?: 'n/a' )
            ->copyToStatsdAt( "{$this->name}.{$contentModel}.{$metricSuffix}" )
            ->increment();
    }

    /**
     * @param PageRecord $page
     * @param string $renderReason
     */
    private function incrementRenderReasonStats( PageRecord $page, $renderReason ) {
        $contentModel = $this->getContentModelFromPage( $page );
        $renderReason = preg_replace( '/\W+/', '_', $renderReason );

        $this->stats->getCounter( 'ParserCache_render_total' )
            ->setLabel( 'name', $this->name )
            ->setLabel( 'contentModel', $contentModel )
            ->setLabel( 'reason', $renderReason )
            ->copyToStatsdAt( "{$this->name}.{$contentModel}.reason.{$renderReason}" )
            ->increment();
    }

    /**
     * Returns the ParserCache metadata about the given page
     * considering the given options.
     *
     * @note Which parser options influence the cache key
     * is controlled via ParserOutput::recordOption() or
     * ParserOptions::addExtraKey().
     *
     * @param PageRecord $page
     * @param int $staleConstraint one of the self::USE_ constants
     * @return ParserCacheMetadata|null
     * @since 1.36
     */
    public function getMetadata(
        PageRecord $page,
        int $staleConstraint = self::USE_ANYTHING
    ): ?ParserCacheMetadata {
        $page->assertWiki( PageRecord::LOCAL );

        $pageKey = $this->makeMetadataKey( $page );
        $metadata = $this->metadataProcCache->get( $pageKey );
        if ( !$metadata ) {
            $metadata = $this->cache->get(
                $pageKey,
                BagOStuff::READ_VERIFIED
            );
        }

        if ( $metadata === false ) {
            $this->incrementStats( $page, 'miss', 'absent_metadata' );
            $this->logger->debug( 'ParserOutput metadata cache miss', [ 'name' => $this->name ] );
            return null;
        }

        // NOTE: If the value wasn't serialized to JSON when being stored,
        //       we may already have a ParserOutput object here. This used
        //       to be the default behavior before 1.36. We need to retain
        //       support so we can handle cached objects after an update
        //       from an earlier revision.
        // NOTE: Support for reading string values from the cache must be
        //       deployed a while before starting to write JSON to the cache,
        //       in case we have to revert either change.
        if ( is_string( $metadata ) ) {
            $metadata = $this->restoreFromJson( $metadata, $pageKey, CacheTime::class );
        }

        if ( !$metadata instanceof CacheTime ) {
            $this->incrementStats( $page, 'miss', 'unserialize' );
            return null;
        }

        if ( $this->checkExpired( $metadata, $page, $staleConstraint, 'metadata' ) ) {
            return null;
        }

        if ( $this->checkOutdated( $metadata, $page, $staleConstraint, 'metadata' ) ) {
            return null;
        }

        $this->logger->debug( 'Parser cache options found', [ 'name' => $this->name ] );
        return $metadata;
    }

    /**
     * @param PageRecord $page
     * @return string
     */
    private function makeMetadataKey( PageRecord $page ): string {
        return $this->cache->makeKey( $this->name, 'idoptions', $page->getId( PageRecord::LOCAL ) );
    }

    /**
     * Get a key that will be used by the ParserCache to store the content
     * for a given page considering the given options and the array of
     * used options.
     *
     * @warning The exact format of the key is considered internal and is subject
     * to change, thus should not be used as storage or long-term caching key.
     * This is intended to be used for logging or keying something transient.
     *
     * @param PageRecord $page
     * @param ParserOptions $options
     * @param array|null $usedOptions Defaults to all cache varying options.
     * @return string
     * @internal
     * @since 1.36
     */
    public function makeParserOutputKey(
        PageRecord $page,
        ParserOptions $options,
        array $usedOptions = null
    ): string {
        $usedOptions ??= ParserOptions::allCacheVaryingOptions();
        // idhash seem to mean 'page id' + 'rendering hash' (r3710)
        $pageid = $page->getId( PageRecord::LOCAL );
        $title = $this->titleFactory->newFromPageIdentity( $page );
        $hash = $options->optionsHash( $usedOptions, $title );
        // Before T263581 ParserCache was split between normal page views
        // and action=parse. -0 is left in the key to avoid invalidating the entire
        // cache when removing the cache split.
        return $this->cache->makeKey( $this->name, 'idhash', "{$pageid}-0!{$hash}" );
    }

    /**
     * Retrieve the ParserOutput from ParserCache.
     * false if not found or outdated.
     *
     * @param PageRecord $page
     * @param ParserOptions $popts
     * @param bool $useOutdated (default false)
     *
     * @return ParserOutput|false
     */
    public function get( PageRecord $page, $popts, $useOutdated = false ) {
        $page->assertWiki( PageRecord::LOCAL );

        if ( !$page->exists() ) {
            $this->incrementStats( $page, 'miss', 'nonexistent' );
            return false;
        }

        if ( $page->isRedirect() ) {
            // It's a redirect now
            $this->incrementStats( $page, 'miss', 'redirect' );
            return false;
        }

        $staleConstraint = $useOutdated ? self::USE_OUTDATED : self::USE_CURRENT_ONLY;
        $parserOutputMetadata = $this->getMetadata( $page, $staleConstraint );
        if ( !$parserOutputMetadata ) {
            return false;
        }

        if ( !$popts->isSafeToCache( $parserOutputMetadata->getUsedOptions() ) ) {
            $this->incrementStats( $page, 'miss', 'unsafe' );
            return false;
        }

        $parserOutputKey = $this->makeParserOutputKey(
            $page,
            $popts,
            $parserOutputMetadata->getUsedOptions()
        );

        $value = $this->cache->get( $parserOutputKey, BagOStuff::READ_VERIFIED );
        if ( $value === false ) {
            $this->incrementStats( $page, 'miss', 'absent' );
            $this->logger->debug( 'ParserOutput cache miss', [ 'name' => $this->name ] );
            return false;
        }

        // NOTE: If the value wasn't serialized to JSON when being stored,
        //       we may already have a ParserOutput object here. This used
        //       to be the default behavior before 1.36. We need to retain
        //       support so we can handle cached objects after an update
        //       from an earlier revision.
        // NOTE: Support for reading string values from the cache must be
        //       deployed a while before starting to write JSON to the cache,
        //       in case we have to revert either change.
        if ( is_string( $value ) ) {
            $value = $this->restoreFromJson( $value, $parserOutputKey, ParserOutput::class );
        }

        if ( !$value instanceof ParserOutput ) {
            $this->incrementStats( $page, 'miss', 'unserialize' );
            return false;
        }

        if ( $this->checkExpired( $value, $page, $staleConstraint, 'output' ) ) {
            return false;
        }

        if ( $this->checkOutdated( $value, $page, $staleConstraint, 'output' ) ) {
            return false;
        }

        $wikiPage = $this->wikiPageFactory->newFromTitle( $page );
        if ( $this->hookRunner->onRejectParserCacheValue( $value, $wikiPage, $popts ) === false ) {
            $this->incrementStats( $page, 'miss', 'rejected' );
            $this->logger->debug( 'key valid, but rejected by RejectParserCacheValue hook handler',
                [ 'name' => $this->name ] );
            return false;
        }

        $this->logger->debug( 'ParserOutput cache found', [ 'name' => $this->name ] );
        $this->incrementStats( $page, 'hit' );
        return $value;
    }

    /**
     * @param ParserOutput $parserOutput
     * @param PageRecord $page
     * @param ParserOptions $popts
     * @param string|null $cacheTime TS_MW timestamp when the cache was generated
     * @param int|null $revId Revision ID that was parsed
     */
    public function save(
        ParserOutput $parserOutput,
        PageRecord $page,
        $popts,
        $cacheTime = null,
        $revId = null
    ) {
        $page->assertWiki( PageRecord::LOCAL );
        // T350538: Eventually we'll warn if the $cacheTime and $revId
        // parameters are non-null here, since we *should* be getting
        // them from the ParserOutput.
        if ( $revId !== null && $revId !== $parserOutput->getCacheRevisionId() ) {
            $this->logger->warning(
                'Inconsistent revision ID',
                [
                    'name' => $this->name,
                    'reason' => $popts->getRenderReason(),
                    'revid1' => $revId,
                    'revid2' => $parserOutput->getCacheRevisionId(),
                ]
            );
        }

        if ( !$parserOutput->hasText() ) {
            throw new InvalidArgumentException( 'Attempt to cache a ParserOutput with no text set!' );
        }

        $expire = $parserOutput->getCacheExpiry();

        if ( !$popts->isSafeToCache( $parserOutput->getUsedOptions() ) ) {
            $this->logger->debug(
                'Parser options are not safe to cache and has not been saved',
                [ 'name' => $this->name ]
            );
            $this->incrementStats( $page, 'save', 'unsafe' );
            return;
        }

        if ( $expire <= 0 ) {
            $this->logger->debug(
                'Parser output was marked as uncacheable and has not been saved',
                [ 'name' => $this->name ]
            );
            $this->incrementStats( $page, 'save', 'uncacheable' );
            return;
        }

        if ( $this->filter && !$this->filter->shouldCache( $parserOutput, $page, $popts ) ) {
            $this->logger->debug(
                'Parser output was filtered and has not been saved',
                [ 'name' => $this->name ]
            );
            $this->incrementStats( $page, 'save', 'filtered' );

            // TODO: In this case, we still want to cache in RevisionOutputCache (T350669).
            return;
        }

        if ( $this->cache instanceof EmptyBagOStuff ) {
            return;
        }

        // Ensure cache properties are set in the ParserOutput
        // T350538: These should be turned into assertions that the
        // properties are already present.
        if ( $cacheTime ) {
            $parserOutput->setCacheTime( $cacheTime );
        } else {
            if ( !$parserOutput->hasCacheTime() ) {
                $this->logger->warning(
                    'No cache time set',
                    [
                        'name' => $this->name,
                        'reason' => $popts->getRenderReason(),
                    ]
                );
            }
            $cacheTime = $parserOutput->getCacheTime();
        }

        if ( $revId ) {
            $parserOutput->setCacheRevisionId( $revId );
        } elseif ( $parserOutput->getCacheRevisionId() ) {
            $revId = $parserOutput->getCacheRevisionId();
        } else {
            $revId = $page->getLatest( PageRecord::LOCAL );
            $parserOutput->setCacheRevisionId( $revId );
        }
        if ( !$revId ) {
            $this->logger->warning(
                'Parser output cannot be saved if the revision ID is not known',
                [ 'name' => $this->name ]
            );
            $this->incrementStats( $page, 'save', 'norevid' );
            return;
        }

        if ( !$parserOutput->getRenderId() ) {
            $this->logger->warning(
                'Parser output missing render ID',
                [
                    'name' => $this->name,
                    'reason' => $popts->getRenderReason(),
                ]
            );
            $parserOutput->setRenderId( $this->globalIdGenerator->newUUIDv1() );
        }

        // Transfer cache properties to the cache metadata
        $metadata = new CacheTime;
        $metadata->recordOptions( $parserOutput->getUsedOptions() );
        $metadata->updateCacheExpiry( $expire );
        $metadata->setCacheTime( $cacheTime );
        $metadata->setCacheRevisionId( $revId );

        $parserOutputKey = $this->makeParserOutputKey(
            $page,
            $popts,
            $metadata->getUsedOptions()
        );

        $msg = "Saved in parser cache with key $parserOutputKey" .
            " and timestamp $cacheTime" .
            " and revision id $revId.";

        $reason = $popts->getRenderReason();
        $msg .= " Rendering was triggered because: $reason";

        $parserOutput->addCacheMessage( $msg );

        $pageKey = $this->makeMetadataKey( $page );

        $parserOutputData = $this->convertForCache( $parserOutput, $parserOutputKey );
        $metadataData = $this->convertForCache( $metadata, $pageKey );

        if ( !$parserOutputData || !$metadataData ) {
            $this->logger->warning(
                'Parser output failed to serialize and was not saved',
                [ 'name' => $this->name ]
            );
            $this->incrementStats( $page, 'save', 'nonserializable' );
            return;
        }

        // Save the parser output
        $this->cache->set(
            $parserOutputKey,
            $parserOutputData,
            $expire,
            BagOStuff::WRITE_ALLOW_SEGMENTS
        );

        // ...and its pointer to the local cache.
        $this->metadataProcCache->set( $pageKey, $metadataData, $expire );
        // ...and to the global cache.
        $this->cache->set( $pageKey, $metadataData, $expire );

        $title = $this->titleFactory->newFromPageIdentity( $page );
        $this->hookRunner->onParserCacheSaveComplete( $this, $parserOutput, $title, $popts, $revId );

        $this->logger->debug( 'Saved in parser cache', [
            'name' => $this->name,
            'key' => $parserOutputKey,
            'cache_time' => $cacheTime,
            'rev_id' => $revId
        ] );
        $this->incrementStats( $page, 'save', 'success' );
        $this->incrementRenderReasonStats( $page, $popts->getRenderReason() );
    }

    /**
     * Get the backend BagOStuff instance that
     * powers the parser cache
     *
     * @since 1.30
     * @internal
     * @return BagOStuff
     */
    public function getCacheStorage() {
        return $this->cache;
    }

    /**
     * Check if $entry expired for $page given the $staleConstraint
     * when fetching from $cacheTier.
     * @param CacheTime $entry
     * @param PageRecord $page
     * @param int $staleConstraint One of USE_* constants.
     * @param string $cacheTier
     * @return bool
     */
    private function checkExpired(
        CacheTime $entry,
        PageRecord $page,
        int $staleConstraint,
        string $cacheTier
    ): bool {
        if ( $staleConstraint < self::USE_EXPIRED && $entry->expired( $page->getTouched() ) ) {
            $this->incrementStats( $page, 'miss', 'expired' );
            $this->logger->debug( "{$cacheTier} key expired", [
                'name' => $this->name,
                'touched' => $page->getTouched(),
                'epoch' => $this->cacheEpoch,
                'cache_time' => $entry->getCacheTime()
            ] );
            return true;
        }
        return false;
    }

    /**
     * Check if $entry belongs to the latest revision of $page
     * given $staleConstraint when fetched from $cacheTier.
     * @param CacheTime $entry
     * @param PageRecord $page
     * @param int $staleConstraint One of USE_* constants.
     * @param string $cacheTier
     * @return bool
     */
    private function checkOutdated(
        CacheTime $entry,
        PageRecord $page,
        int $staleConstraint,
        string $cacheTier
    ): bool {
        $latestRevId = $page->getLatest( PageRecord::LOCAL );
        if ( $staleConstraint < self::USE_OUTDATED && $entry->isDifferentRevision( $latestRevId ) ) {
            $this->incrementStats( $page, 'miss', 'revid' );
            $this->logger->debug( "{$cacheTier} key is for an old revision", [
                'name' => $this->name,
                'rev_id' => $latestRevId,
                'cached_rev_id' => $entry->getCacheRevisionId()
            ] );
            return true;
        }
        return false;
    }

    /**
     * @param string $jsonData
     * @param string $key
     * @param string $expectedClass
     * @return CacheTime|ParserOutput|null
     */
    private function restoreFromJson( string $jsonData, string $key, string $expectedClass ) {
        try {
            /** @var CacheTime $obj */
            $obj = $this->jsonCodec->unserialize( $jsonData, $expectedClass );
            return $obj;
        } catch ( JsonException $e ) {
            $this->logger->error( "Unable to unserialize JSON", [
                'name' => $this->name,
                'cache_key' => $key,
                'ex_message' => $e->getMessage()
            ] );
            return null;
        } catch ( Exception $e ) {
            $this->logger->error( "Unexpected failure during cache load", [
                'name' => $this->name,
                'cache_key' => $key,
                'ex_message' => $e->getMessage()
            ] );
            return null;
        }
    }

    /**
     * @param CacheTime $obj
     * @param string $key
     * @return string|null
     */
    protected function convertForCache( CacheTime $obj, string $key ) {
        try {
            return $this->jsonCodec->serialize( $obj );
        } catch ( JsonException $e ) {
            // Try to collect some additional debugging information, but
            // wrap this in a try block to ensure we don't make the problem
            // worse.
            try {
                $details = $this->jsonCodec->detectNonSerializableData( $obj, true );
            } catch ( \Throwable $t ) {
                $details = $t->getMessage();
            }
            $this->logger->error( "Unable to serialize JSON", [
                'name' => $this->name,
                'cache_key' => $key,
                'ex_message' => $e->getMessage(),
                'details' => $details,
                'trace' => $e->getTraceAsString(),
            ] );
            return null;
        }
    }
}