wikimedia/mediawiki-core

View on GitHub
includes/site/MediaWikiPageNameNormalizer.php

Summary

Maintainability
C
1 day
Test Coverage
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

namespace MediaWiki\Site;

use FormatJson;
use InvalidArgumentException;
use MediaWiki\Http\HttpRequestFactory;
use MediaWiki\MediaWikiServices;
use UtfNormal\Validator;

/**
 * Service for normalizing a page name via a MediaWiki action API.
 *
 * @since 1.27
 * @author John Erling Blad < jeblad@gmail.com >
 * @author Daniel Kinzler
 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
 * @author Marius Hoch
 */
class MediaWikiPageNameNormalizer {

    public const FOLLOW_REDIRECT = 1;
    public const NOFOLLOW_REDIRECT = 2;

    /**
     * @var HttpRequestFactory
     */
    private $httpRequestFactory;

    /**
     * @param HttpRequestFactory|null $httpRequestFactory
     */
    public function __construct( $httpRequestFactory = null ) {
        if ( !$httpRequestFactory instanceof HttpRequestFactory ) {
            $httpRequestFactory = MediaWikiServices::getInstance()->getHttpRequestFactory();
        }
        $this->httpRequestFactory = $httpRequestFactory;
    }

    /**
     * Returns the normalized form of the given page title, using the
     * normalization rules of the given site. If $followRedirect is set to self::FOLLOW_REDIRECT (default)
     * and the given title is a redirect, the redirect will be resolved and
     * the redirect target is returned.
     * Only titles of existing pages will be returned.
     *
     * @note This actually makes an API request to the remote site, so beware
     *   that this function is slow and depends on an external service.
     *
     * @see Site::normalizePageName
     *
     * @since 1.27
     * @since 1.37 Added $followRedirect
     *
     * @param string $pageName
     * @param string $apiUrl
     * @param int $followRedirect either self::FOLLOW_REDIRECT or self::NOFOLLOW_REDIRECT
     *
     * @return string|false The normalized form of the title,
     * or false to indicate an invalid title, a missing page,
     * or some other kind of error.
     */
    public function normalizePageName( string $pageName, $apiUrl, $followRedirect = self::FOLLOW_REDIRECT ) {
        if ( $followRedirect === self::FOLLOW_REDIRECT ) {
            $redirects = true;
        } elseif ( $followRedirect === self::NOFOLLOW_REDIRECT ) {
            $redirects = false;
        } else {
            throw new InvalidArgumentException( '$followRedirect is not properly set: ' . $followRedirect );
        }

        // Go on call the external site

        // Make sure the string is normalized into NFC (due to T42017)
        // but do nothing to the whitespaces, that should work appropriately.
        // @see https://phabricator.wikimedia.org/T42017
        $pageName = Validator::cleanUp( $pageName );

        // Build the args for the specific call
        $args = [
            'action' => 'query',
            'prop' => 'info',
            'redirects' => $redirects,
            'converttitles' => true,
            'format' => 'json',
            'titles' => $pageName,
            // @todo options for maxlag and maxage
            // Note that maxlag will lead to a long delay before a reply is made,
            // but that maxage can avoid the extreme delay. On the other hand
            // maxage could be nice to use anyhow as it stops unnecessary requests.
            // Also consider smaxage if maxage is used.
        ];

        $url = wfAppendQuery( $apiUrl, $args );

        // Go on call the external site
        // @todo we need a good way to specify a timeout here.
        $ret = $this->httpRequestFactory->get( $url, [], __METHOD__ );

        if ( $ret === null ) {
            wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
            return false;
        }

        $data = FormatJson::decode( $ret, true );

        if ( !is_array( $data ) ) {
            wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
            return false;
        }

        $page = static::extractPageRecord( $data, $pageName );

        if ( isset( $page['missing'] ) ) {
            wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
                . $ret );
            return false;
        }

        if ( isset( $page['invalid'] ) ) {
            wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
                . $ret );
            return false;
        }

        if ( !isset( $page['title'] ) ) {
            wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
            return false;
        }

        return $page['title'];
    }

    /**
     * Get normalization record for a given page title from an API response.
     *
     * @param array $externalData A reply from the API on a external server.
     * @param string $pageTitle Identifies the page at the external site, needing normalization.
     *
     * @return array|bool A 'page' structure representing the page identified by $pageTitle.
     */
    private static function extractPageRecord( $externalData, $pageTitle ) {
        // If there is a special case with only one returned page
        // we can cheat, and only return
        // the single page in the "pages" substructure.
        if ( isset( $externalData['query']['pages'] ) ) {
            $pages = array_values( $externalData['query']['pages'] );
            if ( count( $pages ) === 1 ) {
                return $pages[0];
            }
        }
        // This is only used during internal testing, as it is assumed
        // a more optimal (and lossfree) storage.
        // Make initial checks and return if prerequisites are not meet.
        if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
            return false;
        }
        // Loop over the tree different named structures, that otherwise are similar
        $structs = [
            'normalized' => 'from',
            'converted' => 'from',
            'redirects' => 'from',
            'pages' => 'title'
        ];
        foreach ( $structs as $listId => $fieldId ) {
            // Check if the substructure exist at all.
            if ( !isset( $externalData['query'][$listId] ) ) {
                continue;
            }
            // Filter the substructure down to what we actually are using.
            $collectedHits = array_filter(
                array_values( $externalData['query'][$listId] ),
                static function ( $a ) use ( $fieldId, $pageTitle ) {
                    return $a[$fieldId] === $pageTitle;
                }
            );
            // If still looping over normalization, conversion or redirects,
            // then we need to keep the new page title for later rounds.
            if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
                switch ( count( $collectedHits ) ) {
                    case 0:
                        break;
                    case 1:
                        $pageTitle = $collectedHits[0]['to'];
                        break;
                    default:
                        return false;
                }
            } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
                // If on the pages structure we should prepare for returning.

                switch ( count( $collectedHits ) ) {
                    case 0:
                        return false;
                    case 1:
                        return array_shift( $collectedHits );
                    default:
                        return false;
                }
            }
        }
        // should never be here
        return false;
    }

}