wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/BaseInterwikiResolver.php

Summary

Maintainability
C
7 hrs
Test Coverage
<?php

namespace CirrusSearch;

use MediaWiki\Interwiki\InterwikiLookup;
use MediaWiki\Logger\LoggerFactory;
use Wikimedia\Http\MultiHttpClient;
use Wikimedia\ObjectCache\WANObjectCache;

/**
 * Base InterwikiResolver class.
 *
 * Subclasses only need to provide the full interwiki matrix by implementing
 * loadMatrix(); this base class keeps the result on the instance and answers
 * every lookup from it. It also knows how to turn a set of interwiki prefixes
 * into SearchConfig objects, either by asking the remote wikis for their
 * config through the `cirrus-config-dump` API or by building a minimal
 * fallback config locally.
 */
abstract class BaseInterwikiResolver implements InterwikiResolver {
    /** TTL handed to WANObjectCache::getWithSetCallback() for fetched configs */
    private const CONFIG_CACHE_TTL = 600;

    /** @var array[]|null full IW matrix (@see loadMatrix()), null until first use */
    private ?array $matrix = null;

    /** @var SearchConfig main wiki config */
    protected SearchConfig $config;

    /** @var bool use cirrus config dump API (from CirrusSearchFetchConfigFromApi) */
    private bool $useConfigDumpApi;

    /**
     * @var MultiHttpClient http client to fetch config of other wikis
     */
    private MultiHttpClient $httpClient;

    /**
     * @var InterwikiLookup
     */
    protected InterwikiLookup $interwikiLookup;

    /**
     * @var WANObjectCache
     */
    protected WANObjectCache $wanCache;

    /**
     * @param SearchConfig $config
     * @param MultiHttpClient $client http client to fetch cirrus config
     * @param WANObjectCache $wanCache Cache object for caching repeated requests
     * @param InterwikiLookup $iwLookup
     */
    public function __construct(
        SearchConfig $config,
        MultiHttpClient $client,
        WANObjectCache $wanCache,
        InterwikiLookup $iwLookup
    ) {
        $this->config = $config;
        // The flag is only ever used for its truthiness, normalise it once here.
        $this->useConfigDumpApi = (bool)$this->config->get( 'CirrusSearchFetchConfigFromApi' );
        $this->httpClient = $client;
        $this->interwikiLookup = $iwLookup;
        $this->wanCache = $wanCache;
    }

    /**
     * @return string[] the `sister_projects` entry of the matrix (sister wikis
     *  indexed by interwiki prefix), or an empty array when the matrix has none
     */
    public function getSisterProjectPrefixes() {
        $matrix = $this->getMatrix();
        return $matrix['sister_projects'] ?? [];
    }

    /**
     * @return SearchConfig[] configs of sister project indexed by interwiki prefix
     */
    public function getSisterProjectConfigs() {
        $prefixes = $this->getSisterProjectPrefixes();
        return $this->loadConfigFromAPI( $prefixes, [], [ $this, 'minimalSearchConfig' ] );
    }

    /**
     * @param string $wikiId
     * @return string|null the interwiki prefix registered for this wiki in the
     *  `prefixes_by_wiki` entry of the matrix, null when unknown
     */
    public function getInterwikiPrefix( $wikiId ) {
        $matrix = $this->getMatrix();
        return $matrix['prefixes_by_wiki'][$wikiId] ?? null;
    }

    /**
     * @param string $lang
     * @return string[] a two elements array [ wiki, interwiki prefix ], or an
     *  empty array when no wiki of the same project is known for this language
     */
    public function getSameProjectWikiByLang( $lang ) {
        $matrix = $this->getMatrix();
        // Most of the time the language is equal to the interwiki prefix.
        // But it's not always the case, use the language_map to identify the interwiki prefix first.
        $prefix = $matrix['language_map'][$lang] ?? $lang;
        if ( !isset( $matrix['cross_language'][$prefix] ) ) {
            return [];
        }
        return [ $matrix['cross_language'][$prefix], $prefix ];
    }

    /**
     * @param string $lang
     * @return SearchConfig[] single element array: [ interwiki => SearchConfig ],
     *  or an empty array when getSameProjectWikiByLang() finds nothing
     */
    public function getSameProjectConfigByLang( $lang ) {
        $wikiAndPrefix = $this->getSameProjectWikiByLang( $lang );
        if ( !$wikiAndPrefix ) {
            return [];
        }
        [ $wiki, $prefix ] = $wikiAndPrefix;
        return $this->loadConfigFromAPI(
            [ $prefix => $wiki ],
            [],
            [ $this, 'minimalSearchConfig' ] );
    }

    /**
     * Return the interwiki matrix, calling loadMatrix() while nothing has been
     * stored on this instance yet.
     *
     * @return array[]
     */
    private function getMatrix() {
        if ( $this->matrix === null ) {
            $this->matrix = $this->loadMatrix();
        }
        return $this->matrix;
    }

    /**
     * Load the interwiki matrix information
     * The returned array must include the following keys:
     * - sister_projects: an array with the list of sister wikis indexed by
     *   interwiki prefix
     * - cross_language: an array with the list of wikis running the same
     *   project/site indexed by interwiki prefix
     * - language_map: an array with the list of interwiki prefixes where
     *   the language code of the wiki does not match the prefix
     * - prefixes_by_wiki: an array with the list of interwiki indexed
     *   by wikiID
     *
     * The result of this method is stored in the current InterwikiResolver instance
     * so it can be called only once per request.
     *
     * @return array[]
     */
    abstract protected function loadMatrix();

    /**
     * Build one SearchConfig per requested wiki.
     *
     * When the config dump API is enabled, the wikis that expose a usable API
     * endpoint are queried (through the WAN cache) and their dumped config is
     * wrapped in a HashSearchConfig. Every other wiki, including those whose
     * request failed, gets the config produced by $fallbackConfig.
     *
     * @param string[] $wikis wikis indexed by interwiki prefix
     * @param string[] $hashConfigFlags constructor flags for SearchConfig
     * @param callable $fallbackConfig function to load the config if the
     * api is not usable or if a failure occurs
     * @return SearchConfig[] config indexed by iw prefix
     */
    private function loadConfigFromAPI( $wikis, array $hashConfigFlags, $fallbackConfig ): array {
        // Checked once up front: without the API there is no point in
        // resolving any interwiki entry.
        $endpoints = $this->useConfigDumpApi ? $this->buildConfigDumpEndpoints( $wikis ) : [];

        $configs = [];
        if ( $endpoints ) {
            // Sort the prefixes so the cache key does not depend on the
            // order in which the wikis were requested.
            $prefixes = array_keys( $endpoints );
            sort( $prefixes );
            $configs = $this->wanCache->getWithSetCallback(
                $this->wanCache->makeKey( 'cirrussearch-load-iw-config', implode( '-', $prefixes ) ),
                self::CONFIG_CACHE_TTL,
                function () use ( $endpoints ) {
                    return $this->sendConfigDumpRequest( $endpoints );
                }
            );
        }

        $retValue = [];
        foreach ( $wikis as $prefix => $wiki ) {
            $config = $configs[$prefix] ?? null;
            // Only an array can be turned into a HashSearchConfig; anything
            // else (missing entry, unexpected cached value) uses the fallback.
            if ( is_array( $config ) ) {
                $config['_wikiID'] = $wiki;
                $retValue[$prefix] = new HashSearchConfig(
                    $config,
                    array_merge( $hashConfigFlags, [ HashSearchConfig::FLAG_INHERIT ] )
                );
            } else {
                $retValue[$prefix] = $fallbackConfig( $wiki, $hashConfigFlags );
            }
        }
        return $retValue;
    }

    /**
     * Resolve the API endpoint of every wiki that can be queried.
     *
     * A wiki is skipped when its prefix is unknown to the interwiki lookup,
     * when the interwiki entry is not flagged as local, or when no API url can
     * be determined for it.
     *
     * @param string[] $wikis wikis indexed by interwiki prefix
     * @return array[] list of arrays containing 'url' and 'wiki', indexed by iw prefix
     */
    private function buildConfigDumpEndpoints( $wikis ): array {
        $endpoints = [];
        foreach ( $wikis as $prefix => $wiki ) {
            $iw = $this->interwikiLookup->fetch( $prefix );
            if ( !$iw || !$iw->isLocal() ) {
                continue;
            }
            // Prefer the API url declared on the interwiki entry, otherwise
            // guess it from the entry's page url.
            $api = $iw->getAPI() ?: $this->deriveApiUrl( $iw->getURL() );
            if ( $api === null ) {
                continue;
            }
            $endpoints[$prefix] = [ 'url' => $api, 'wiki' => $wiki ];
        }
        return $endpoints;
    }

    /**
     * Guess an API url from an interwiki url by keeping its scheme (http when
     * absent), host and port and appending `/w/api.php`.
     *
     * @param string $url interwiki url
     * @return string|null the guessed API url, null when $url has no host
     */
    private function deriveApiUrl( $url ): ?string {
        $parts = parse_url( $url );
        // parse_url() yields false on malformed urls, isset() covers that too.
        if ( !isset( $parts['host'] ) ) {
            return null;
        }
        $port = isset( $parts['port'] ) ? ':' . $parts['port'] : '';
        return ( $parts['scheme'] ?? 'http' ) . '://' . $parts['host'] . $port . '/w/api.php';
    }

    /**
     * Fetch the config of several wikis with a single multi request to their
     * `cirrus-config-dump` API. Every failure is logged as a warning on the
     * CirrusSearch channel and the wiki is left out of the result.
     *
     * @param array[] $endpoints list of arrays containing 'url' and 'wiki', indexed by iw prefix
     * @return array[] list of array containing extracted config vars, failed wikis
     * are not returned.
     */
    private function sendConfigDumpRequest( $endpoints ): array {
        if ( !$endpoints ) {
            return [];
        }

        $reqs = [];
        foreach ( $endpoints as $prefix => $info ) {
            $reqs[$prefix] = [
                'method' => 'GET',
                'url' => $info['url'],
                'query' => [
                    'action' => 'cirrus-config-dump',
                    'format' => 'json',
                    'formatversion' => '2',
                ]
            ];
        }

        $logger = LoggerFactory::getInstance( 'CirrusSearch' );
        // All failures share the same message prefix and the wiki/url context.
        $warn = static function ( array $endpoint, string $reason, array $context ) use ( $logger ): void {
            $logger->warning(
                'Failed to fetch config for {wiki} at {url}: ' . $reason,
                [ 'wiki' => $endpoint['wiki'], 'url' => $endpoint['url'] ] + $context
            );
        };

        $configs = [];
        foreach ( $this->httpClient->runMulti( $reqs ) as $prefix => $response ) {
            $endpoint = $endpoints[$prefix];
            $result = $response['response'];

            if ( $result['code'] !== 200 ) {
                $warn( $endpoint, 'http status {httpstatus} : {clienterror}', [
                    'httpstatus' => $result['code'],
                    'clienterror' => $result['error'],
                ] );
                continue;
            }

            $data = json_decode( $result['body'], true );
            $jsonError = json_last_error();
            if ( $jsonError !== JSON_ERROR_NONE ) {
                $warn( $endpoint, 'json error code {jsonerrorcode}', [
                    'jsonerrorcode' => $jsonError,
                ] );
                continue;
            }

            // A body such as `"text"`, `42` or `null` is valid JSON but not a
            // config dump; keeping it would break the array accesses below
            // and the config construction in loadConfigFromAPI().
            if ( !is_array( $data ) ) {
                $warn( $endpoint, 'unexpected response type {responsetype}', [
                    'responsetype' => gettype( $data ),
                ] );
                continue;
            }

            if ( isset( $data['error'] ) ) {
                $warn( $endpoint, '{apierrormessage}', [
                    'apierrormessage' => $data['error'],
                ] );
                continue;
            }

            // API warnings are not config variables, drop them.
            unset( $data['warnings'] );
            $configs[$prefix] = $data;
        }
        return $configs;
    }

    /**
     * Minimal config needed to run a search on a target wiki
     * living on the same cluster as the host wiki
     *
     * The wiki id is used both as `_wikiID` and as `CirrusSearchIndexBaseName`,
     * and HashSearchConfig::FLAG_INHERIT is always part of the flags.
     *
     * @param string $wiki
     * @param string[] $hashConfigFlags constructor flags for HashSearchConfig
     * @return SearchConfig
     */
    protected function minimalSearchConfig( $wiki, array $hashConfigFlags ) {
        return new HashSearchConfig(
            [
                '_wikiID' => $wiki,
                'CirrusSearchIndexBaseName' => $wiki,
            ],
            array_merge( [ HashSearchConfig::FLAG_INHERIT ], $hashConfigFlags )
        );
    }
}