wikimedia/mediawiki-core

View on GitHub
includes/content/WikiTextStructure.php

Summary

Maintainability
A
0 mins
Test Coverage
<?php

use HtmlFormatter\HtmlFormatter;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Sanitizer;

/**
 * Class allowing to explore the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and derives from it the list of headings, the text
 * before the first heading ("opening text"), the main text and the auxiliary
 * text. The text parts are extracted lazily on first access and then kept on
 * the instance.
 */
class WikiTextStructure {

    /** Text before the first heading; null when there is none (or nothing was extracted) */
    private ?string $openingText = null;
    /** Main text with excluded and auxiliary elements removed; null until extraction ran */
    private ?string $allText = null;
    /** @var string[] Stripped text of every auxiliary element removed from the main text */
    private array $auxText = [];
    private ParserOutput $parserOutput;

    /**
     * Selectors to elements that are excluded entirely from search
     */
    private const EXCLUDED_ELEMENT_SELECTORS = [
        // "it looks like you don't have javascript enabled..." – do not need to index
        'audio', 'video',
        // CSS stylesheets aren't content
        'style',
        // The [1] for references from Cite
        'sup.reference',
        // The ↑ next to references in the references section from Cite
        '.mw-cite-backlink',
        // Headings are already indexed in their own field.
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        // Collapsed fields are hidden by default, so we don't want them showing up.
        '.autocollapse',
        // Content explicitly decided to be not searchable by editors such
        // as custom navigation templates.
        '.navigation-not-searchable',
        // User-facing interface code prompting the user to act from WikibaseMediaInfo
        '.wbmi-entityview-emptyCaption',
    ];

    /**
     * Selectors to elements that are considered auxiliary to the article text for search
     */
    private const AUXILIARY_ELEMENT_SELECTORS = [
        // Thumbnail captions aren't really part of the text proper
        '.thumbcaption',
        'figcaption',
        // Neither are tables
        'table',
        // Common style for "See also:".
        '.rellink',
        // Common style for calling out helpful links at the top of the article.
        '.dablink',
        // New class users can use to mark stuff as auxiliary to searches.
        '.searchaux',
    ];

    /**
     * @param ParserOutput $parserOutput Parser output whose structure is explored
     */
    public function __construct( ParserOutput $parserOutput ) {
        $this->parserOutput = $parserOutput;
    }

    /**
     * Gets headings from the page.
     *
     * First strip out things that look like references.  We can't use HTML filtering because
     * the references come back as <sup> tags without a class.  To keep from breaking stuff like
     *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
     * we don't remove the whole <sup> tag.
     *
     * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
     * I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
     *
     * So we only strip things that look like <sup> tags wrapping a reference. And since the data
     * looks like:
     *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
     * we can not really use HtmlFormatter as we have no suitable selector.
     *
     * @return string[] Tag-free heading texts, minus the ignored headings; empty when the
     *  parser output has no TOC data
     */
    public function headings() {
        $headings = [];
        $tocData = $this->parserOutput->getTOCData();
        if ( $tocData === null ) {
            return $headings;
        }
        $ignoredHeadings = $this->getIgnoredHeadings();
        foreach ( $tocData->getSections() as $section ) {
            $heading = $section->line;

            // Some wikis wrap the brackets in a span:
            // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
            $heading = preg_replace( '/<\/?span>/', '', $heading );
            // Normalize [] so the following regexp would work. These are literal
            // strings, so a plain string replacement is enough.
            $heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
            $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

            // Strip tags from the heading or else we'll display them (escaped) in search results
            $heading = trim( Sanitizer::stripAllTags( $heading ) );

            // Note that we don't take the level of the heading into account - all headings are equal.
            // Except the ones we ignore. The comparison is strict so that numeric-looking
            // headings (e.g. "1e3" vs "1000") are not treated as equal.
            if ( !in_array( $heading, $ignoredHeadings, true ) ) {
                $headings[] = $heading;
            }
        }
        return $headings;
    }

    /**
     * Parse a message content into an array. This function is generally used to
     * parse settings stored as i18n messages (see search-ignored-headings).
     *
     * Each line is one entry; everything from a "#" to the end of the line is a
     * comment, surrounding whitespace is trimmed and empty lines are dropped.
     * The keys of the returned array are the original line indexes.
     *
     * @param string $message
     * @return string[]
     */
    public static function parseSettingsInMessage( $message ) {
        $lines = explode( "\n", $message );
        // Remove comments
        $lines = preg_replace( '/#.*$/', '', $lines );
        // Remove extra spaces
        $lines = array_map( 'trim', $lines );
        // Remove empty lines only: a bare array_filter() would also drop a line
        // consisting of "0", which is a legitimate setting value.
        return array_filter(
            $lines,
            static fn ( $line ) => $line !== ''
        );
    }

    /**
     * Gets a list of heading to ignore.
     *
     * The list is read from the 'search-ignored-headings' message (falling back to
     * 'cirrussearch-ignored-headings' when the former is blank) in the content language.
     * The result is kept in a function-level static variable, so it is computed at most
     * once and shared by every call for the lifetime of the process.
     *
     * @return string[]
     */
    private function getIgnoredHeadings(): array {
        static $ignoredHeadings = null;
        if ( $ignoredHeadings === null ) {
            $ignoredHeadings = [];
            $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
            if ( $source->isBlank() ) {
                // Try the old version too, just in case
                $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
            }
            if ( !$source->isDisabled() ) {
                // After parsing, every remaining line is a heading to ignore.
                $ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
            }
        }
        return $ignoredHeadings;
    }

    /**
     * Extract parts of the text - opening, main and auxiliary.
     *
     * Does nothing when the parts were already extracted (the main text is used
     * as the "already done" marker).
     */
    private function extractWikitextParts(): void {
        if ( $this->allText !== null ) {
            return;
        }
        $text = $this->parserOutput->getRawText();
        if ( $text === '' ) {
            $this->allText = "";
            // empty text - nothing to seek here
            return;
        }

        $this->openingText = $this->extractTextBeforeFirstHeading( $text );

        $formatter = new HtmlFormatter( $text );

        // Strip elements from the page that we never want in the search text.
        $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
        $formatter->filterContent();

        // Strip elements from the page that are auxiliary text.  These will still be
        // searched, but matches will be ranked lower and non-auxiliary matches will be
        // preferred in highlighting.
        $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
        $auxiliaryElements = $formatter->filterContent();
        $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
        foreach ( $auxiliaryElements as $auxiliaryElement ) {
            $this->auxText[] =
                trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
        }
    }

    /**
     * Get text before first heading.
     *
     * @param string $text HTML to look into
     * @return string|null Tag-free text preceding the first <h1>-<h6> element, or null when
     *  there is no heading at all or no text is left in front of it after filtering
     */
    private function extractTextBeforeFirstHeading( $text ): ?string {
        $matches = [];
        if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
            // There isn't a first heading, so we interpret this as the article
            // being entirely without heading.
            return null;
        }
        // Keep only what precedes the byte offset of the first heading tag.
        $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
        // Compare against the empty string explicitly: a truthiness check would
        // also discard a text consisting of just "0".
        if ( $text === '' ) {
            // There isn't any text before the first heading, so we declare there isn't
            // a first heading.
            return null;
        }

        $formatter = new HtmlFormatter( $text );
        $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
        $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
        $formatter->filterContent();
        $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

        if ( $text === '' ) {
            // There isn't any text after filtering before the first heading, so we declare
            // that there isn't a first heading.
            return null;
        }

        return $text;
    }

    /**
     * Get the text found before the first heading.
     *
     * @return string|null
     */
    public function getOpeningText() {
        $this->extractWikitextParts();
        return $this->openingText;
    }

    /**
     * Get the main text, i.e. the text without excluded and auxiliary elements.
     *
     * @return string
     */
    public function getMainText() {
        $this->extractWikitextParts();
        return $this->allText;
    }

    /**
     * Get the text of the elements considered auxiliary to the main text.
     *
     * @return string[]
     */
    public function getAuxiliaryText() {
        $this->extractWikitextParts();
        return $this->auxText;
    }

    /**
     * Get the "defaultsort" property
     * @return string|null The page property value, with a false lookup result mapped to null
     */
    public function getDefaultSort() {
        $sort = $this->parserOutput->getPageProperty( 'defaultsort' );
        if ( $sort === false ) {
            return null;
        }
        return $sort;
    }
}