wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Util.php

Summary

Maintainability
D
1 day
Test Coverage
<?php

namespace CirrusSearch;

use MediaWiki\Context\RequestContext;
use MediaWiki\Language\Language;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\PoolCounter\PoolCounterWorkViaCallback;
use MediaWiki\Request\WebRequest;
use MediaWiki\Status\Status;
use MediaWiki\Title\Title;
use MediaWiki\User\UserIdentity;
use MediaWiki\WikiMap\WikiMap;
use Wikimedia\Assert\Assert;
use Wikimedia\IPUtils;
use Wikimedia\Stats\StatsFactory;

/**
 * Random utility functions that don't have a better home
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class Util {
    /**
     * Cache getDefaultBoostTemplates()
     *
     * @var array|null boost templates
     */
    private static $defaultBoostTemplates = null;

    /**
     * @var string|null Id identifying this php execution
     */
    private static $executionId;

    /**
     * Return the namespace name of a title in display form, i.e. with every
     * underscore replaced by a space. Gender variation, where applicable, is
     * whatever Title::getNsText() provides.
     *
     * @param Title $title The page title whose namespace text is wanted
     * @return string|false The namespace text; a non-string value returned by
     *  Title::getNsText() is passed through unchanged
     */
    public static function getNamespaceText( Title $title ) {
        $nsText = $title->getNsText();
        if ( !is_string( $nsText ) ) {
            return $nsText;
        }
        return str_replace( '_', ' ', $nsText );
    }

    /**
     * Report one pool counter timing to the stats factory, labelled by pool
     * type and outcome, and mirrored to the statsd key kept for backward
     * compatibility.
     *
     * @param string $type The pool counter type, such as CirrusSearch-Search.
     *  Everything up to and including the first '-' is dropped before use.
     * @param bool $isSuccess If the pool counter gave a success, or failed the request
     * @param float $observation The value handed to the timing metric
     * @return void
     */
    private static function recordPoolStats( string $type, bool $isSuccess, float $observation ): void {
        // Keep only what follows the first dash, when there is one.
        $dash = strpos( $type, '-' );
        $shortType = $dash === false ? $type : substr( $type, $dash + 1 );

        if ( $isSuccess ) {
            $status = "success";
            $statsdSuffix = 'successMs';
        } else {
            $status = "failure";
            $statsdSuffix = 'failureMs';
        }

        self::getStatsFactory()
            ->getTiming( "pool_counter_seconds" )
            ->setLabel( "type", $shortType )
            ->setLabel( "status", $status )
            ->copyToStatsdAt( "CirrusSearch.poolCounter.$shortType.$statsdSuffix" )
            ->observe( $observation );
    }

    /**
     * Decorate a callback so that, each time it is invoked, the time elapsed
     * since $startPoolWork is recorded through recordPoolStats() before the
     * callback itself runs.
     *
     * @param float $startPoolWork The time this pool request started, from microtime( true )
     * @param string $type The pool counter type, such as CirrusSearch-Search
     * @param bool $isSuccess If the pool counter gave a success, or failed the request
     * @param callable $callback The function to wrap
     * @return callable A closure that records the stats, then forwards all of
     *  its arguments to $callback and returns its result
     */
    private static function wrapWithPoolStats( $startPoolWork,
        $type,
        $isSuccess,
        callable $callback
    ) {
        return function () use ( $startPoolWork, $type, $isSuccess, $callback ) {
            $forwarded = func_get_args();

            // microtime() yields seconds; the recorded value is scaled by 1000.
            $elapsed = 1000 * ( microtime( true ) - $startPoolWork );
            self::recordPoolStats( $type, $isSuccess, $elapsed );

            return $callback( ...$forwarded );
        };
    }

    /**
     * Run a callback under the pool counter, using the single call pattern
     * that Cirrus always uses instead of the complex pool counter interface.
     *
     * @param string $type same as type parameter on PoolCounter::factory
     * @param UserIdentity|null $user When falsy the pool counter is bypassed
     *  and $workCallback is invoked directly.
     * @param callable $workCallback callback when pool counter is acquired.  Called with
     *  no parameters.
     * @param string|null $busyErrorMsg The i18n key to return when the queue
     *  is full, or null to use the default.
     * @return mixed The value of $workCallback() when the pool counter is
     *  bypassed, otherwise the result of PoolCounterWorkViaCallback::execute()
     */
    public static function doPoolCounterWork( $type, $user, $workCallback, $busyErrorMsg = null ) {
        global $wgCirrusSearchPoolCounterKey;

        if ( !$user ) {
            // We don't want to even use the pool counter if there isn't a user.
            // Note that anonymous users are still users, this is most likely
            // maintenance scripts.
            // @todo Maintenance scripts and jobs should already override
            // poolcounters as necessary, can this be removed?
            return $workCallback();
        }

        // By default the pool counter allows you to lock the same key with
        // multiple types.  That might be useful but it isn't how Cirrus thinks.
        // Instead, all keys are scoped to their type.
        $key = "$type:$wgCirrusSearchPoolCounterKey";

        $onError = static function ( Status $status ) use ( $key, $busyErrorMsg ) {
            $messages = $status->getMessages();
            $errorKey = $messages[0]->getKey();

            LoggerFactory::getInstance( 'CirrusSearch' )->warning(
                "Pool error on {key}:  {error}",
                [ 'key' => $key, 'error' => $errorKey ]
            );

            // A full queue gets its own (overridable) message, anything else
            // is reported as a generic backend error.
            $fatalKey = 'cirrussearch-backend-error';
            if ( $errorKey === 'pool-queuefull' ) {
                $fatalKey = $busyErrorMsg ?: 'cirrussearch-too-busy-error';
            }
            return Status::newFatal( $fatalKey );
        };

        // wrap some stats collection on the success/failure handlers
        $startPoolWork = microtime( true );
        $callbacks = [
            'doWork' => self::wrapWithPoolStats( $startPoolWork, $type, true, $workCallback ),
            'error' => self::wrapWithPoolStats( $startPoolWork, $type, false, $onError ),
        ];

        $work = new PoolCounterWorkViaCallback( $type, $key, $callbacks );
        return $work->execute();
    }

    /**
     * Convert a numeric string to a float, dividing by 100 when the string
     * contains a '%' sign anywhere.
     *
     * @param string $str
     * @return float
     */
    public static function parsePotentialPercent( $str ) {
        $value = floatval( $str );
        $isPercent = strpos( $str, '%' ) !== false;
        return $isPercent ? $value / 100 : $value;
    }

    /**
     * Parse a message content into an array. This function is generally used to
     * parse settings stored as i18n messages (see cirrussearch-boost-templates).
     *
     * Each line has its '#' comment removed and is trimmed; lines that are
     * empty afterwards are dropped. The returned array keeps the original
     * line indexes as keys, so it may have gaps.
     *
     * @param string $message
     * @return string[]
     */
    public static function parseSettingsInMessage( $message ) {
        $lines = explode( "\n", $message );
        $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
        $lines = array_map( 'trim', $lines );          // Remove extra spaces
        // Remove empty lines. Compare against '' explicitly: the default
        // array_filter() callback would also discard a line containing "0".
        return array_filter( $lines, static function ( $line ) {
            return $line !== '';
        } );
    }

    /**
     * Overwrite $dest with the boolean interpretation (via wfStringToBool())
     * of the request parameter $name. $dest is left untouched when the
     * parameter is absent.
     *
     * @param mixed &$dest
     * @param WebRequest $request
     * @param string $name
     */
    public static function overrideYesNo( &$dest, $request, $name ) {
        $raw = $request->getVal( $name );
        if ( $raw === null ) {
            return;
        }
        $dest = wfStringToBool( $raw );
    }

    /**
     * Overwrite $dest with the request parameter $name when that parameter is
     * numeric and respects $limit: it must be <= $limit when $upperLimit is
     * true, or >= $limit when $upperLimit is false. Without a limit any
     * numeric value is accepted. The value is stored as returned by
     * $request->getVal(), without conversion.
     *
     * @param mixed &$dest
     * @param WebRequest $request
     * @param string $name
     * @param int|null $limit
     * @param bool $upperLimit
     */
    public static function overrideNumeric( &$dest, $request, $name, $limit = null, $upperLimit = true ) {
        $val = $request->getVal( $name );
        if ( $val === null || !is_numeric( $val ) ) {
            return;
        }

        if ( $limit === null ) {
            $dest = $val;
            return;
        }

        $withinLimit = $upperLimit ? $val <= $limit : $val >= $limit;
        if ( $withinLimit ) {
            $dest = $val;
        }
    }

    /**
     * Get boost templates configured in messages.
     *
     * The on-wiki message takes precedence over the
     * CirrusSearchBoostTemplates config value, unless
     * CirrusSearchIgnoreOnWikiBoostTemplates is set or the on-wiki value is
     * empty (or unknown, for a non-local config).
     *
     * @param SearchConfig|null $config Search config requesting the templates;
     *  when null the 'CirrusSearch' config is built from the config factory.
     * @return float[]
     */
    public static function getDefaultBoostTemplates( ?SearchConfig $config = null ) {
        $config ??= MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' );

        $fromConfig = $config->get( 'CirrusSearchBoostTemplates' );
        if ( $config->get( 'CirrusSearchIgnoreOnWikiBoostTemplates' ) ) {
            // on wiki messages disabled, we can return this config
            // directly
            return $fromConfig;
        }

        $fromMessage = self::getOnWikiBoostTemplates( $config );
        if ( !$fromMessage ) {
            // the onwiki config is empty (or unknown for non-local
            // config), we can fallback to templates from config
            return $fromConfig;
        }
        return $fromMessage;
    }

    /**
     * Load boost templates configured on wiki through the system message
     * 'cirrussearch-boost-templates'.
     *
     * For the local wiki the message is parsed, stored in the WAN cache and
     * memoized in a static property. For any other wiki only the WAN cache is
     * consulted. An empty array therefore means either that nothing is
     * configured on wiki or, for a non-local wiki, that the value is unknown.
     *
     * @param SearchConfig $config
     * @return float[] indexed by template name
     */
    private static function getOnWikiBoostTemplates( SearchConfig $config ) {
        $wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
        $wikiId = $config->getWikiId();
        $cacheKey = $wanCache->makeGlobalKey( 'cirrussearch-boost-templates', $wikiId );

        if ( $wikiId != WikiMap::getCurrentWikiId() ) {
            // Boost templates of another wiki: use them only if they happen
            // to be cached already, otherwise report them as unknown.
            $cached = $wanCache->get( $cacheKey );
            return is_array( $cached ) ? $cached : [];
        }

        // Local wiki. The static cache is never set with non-local wiki data.
        if ( self::$defaultBoostTemplates !== null ) {
            return self::$defaultBoostTemplates;
        }

        $loadFromMessage = static function () {
            $source = wfMessage( 'cirrussearch-boost-templates' )->inContentLanguage();
            if ( $source->isDisabled() ) {
                return [];
            }
            $lines = self::parseSettingsInMessage( $source->plain() );
            // Now parse the templates
            return Query\BoostTemplatesFeature::parseBoostTemplates( implode( ' ', $lines ) );
        };

        self::$defaultBoostTemplates = $wanCache->getWithSetCallback( $cacheKey, 600, $loadFromMessage );
        return self::$defaultBoostTemplates;
    }

    /**
     * Strip question marks from queries, according to the defined stripping
     * level, defined by $wgCirrusSearchStripQuestionMarks. Strip all ?s, those
     * at word breaks, or only string-final. Ignore queries that are all
     * punctuation or use insource/intitle regex syntax. Don't remove escaped
     * \?s, but unescape them.
     *
     * Any other stripping level returns the term untouched (escaped question
     * marks included).
     *
     * @param string $term
     * @param string $strippingLevel Either "all", "break", or "final"
     * @return string modified term, based on strippingLevel
     */
    public static function stripQuestionMarks( $term, $strippingLevel ) {
        if ( strpos( $term, 'insource:/' ) !== false ||
            strpos( $term, 'intitle:/' ) !== false ||
            preg_match( '/^[\p{P}\p{Z}]+$/u', $term )
        ) {
            return $term;
        }

        // FIXME: get rid of negative lookbehinds on (?<!\\\\)
        // it may improperly transform \\? into \? instead of \\ and destroy properly escaped \
        if ( $strippingLevel === 'final' ) {
            // strip only query-final question marks that are not escaped
            $stripped = preg_replace( '/((?<!\\\\)\?|\s)+$/', '', $term );
        } elseif ( $strippingLevel === 'break' ) {
            // strip question marks at word boundaries. The u modifier makes
            // \PL test whole code points; without it single bytes are tested
            // and the lead byte of some multi-byte letters (e.g. 0xD7, used by
            // Hebrew) is taken for a non-letter.
            $stripped = preg_replace( '/(?<!\\\\)\?+(\PL|$)/u', '$1', $term );
            if ( $stripped === null ) {
                // Most likely invalid UTF-8 input: fall back to byte-wise matching.
                $stripped = preg_replace( '/(?<!\\\\)\?+(\PL|$)/', '$1', $term );
            }
        } elseif ( $strippingLevel === 'all' ) {
            // strip all unescaped question marks
            $stripped = preg_replace( '/(?<!\\\\)\?+/', ' ', $term );
        } else {
            return $term;
        }

        // Whatever the level, escaped question marks are unescaped afterwards.
        return preg_replace( '/\\\\\?/', '?', $stripped );
    }

    /**
     * Identifies a specific execution of php. That might be one web
     * request, or multiple jobs run in the same executor. An execution id
     * is valid over a brief timespan, perhaps a minute or two for some jobs.
     *
     * The id is drawn from mt_rand() on first use and then reused until
     * resetExecutionId() is called.
     *
     * @return string unique identifier
     */
    public static function getExecutionId() {
        self::$executionId ??= (string)mt_rand();
        return self::$executionId;
    }

    /**
     * Forget the memoized execution id so that the next getExecutionId()
     * call generates a new one. Unit tests only.
     */
    public static function resetExecutionId() {
        self::$executionId = null;
    }

    /**
     * Get a token that (hopefully) uniquely identifies this search. It will be
     * added to the search result page js config vars, and put into the url with
     * history.replaceState(). This means click through's from supported browsers
     * will record this token as part of the referrer.
     *
     * The token is generated once and then reused by every later call in
     * the same php execution.
     *
     * @return string
     */
    public static function getRequestSetToken() {
        static $token;
        if ( $token !== null ) {
            return $token;
        }

        // random UID, 70B tokens have a collision probability of 4*10^-16
        // so should work for marking unique queries.
        $uuid = MediaWikiServices::getInstance()->getGlobalIdGenerator()->newUUIDv4();

        // make it a little shorter by using straight base36: first pick the
        // groups of the UUID, skipping the separator between them.
        $hex = implode( '', [
            substr( $uuid, 0, 8 ),
            substr( $uuid, 9, 4 ),
            substr( $uuid, 14, 4 ),
            substr( $uuid, 19, 4 ),
            substr( $uuid, 24 ),
        ] );
        $token = \Wikimedia\base_convert( $hex, 16, 36 );

        return $token;
    }

    /**
     * Build a token identifying the source of the main request: the md5 of
     * $extraData, the request IP, and the X-Forwarded-For and User-Agent
     * headers, joined with ':'.
     *
     * @param string $extraData Extra information to mix into the hash
     * @return string A token that identifies the source of the request
     */
    public static function generateIdentToken( $extraData = '' ) {
        $request = RequestContext::getMain()->getRequest();

        try {
            $ip = $request->getIP();
        } catch ( \MWException $e ) {
            // No ip, probably running cli?
            $ip = 'unknown';
        }

        $parts = [
            $extraData,
            $ip,
            $request->getHeader( 'X-Forwarded-For' ),
            $request->getHeader( 'User-Agent' ),
        ];
        return md5( implode( ':', $parts ) );
    }

    /**
     * Classify the current execution from the php SAPI and the MediaWiki
     * entry point. The SAPI is checked first, so a cli run is always 'cli'.
     *
     * @return string The context the request is in. Either cli, api, web or misc.
     */
    public static function getExecutionContext() {
        if ( PHP_SAPI === 'cli' ) {
            return 'cli';
        }
        if ( MW_ENTRY_POINT == 'api' ) {
            return 'api';
        }
        if ( MW_ENTRY_POINT == 'index' ) {
            return 'web';
        }
        return 'misc';
    }

    /**
     * Identify a namespace by attempting some unicode folding techniques.
     * 2 methods supported:
     * - naive: case folding + naive accents removal (only some combined accents are removed)
     * - utr30: (slow to load) case folding + strong accent squashing based on the withdrawn UTR30 specs
     * all methods will apply something similar to near space flattener.
     *
     * The transliterator of each method is created on first use and kept in
     * a static variable. Any other $method value leaves no transliterator
     * available and fails the assertion below.
     *
     * @param string $namespace name of the namespace to identify
     * @param string $method either naive or utr30
     * @param Language|null $language Language providing the namespace names;
     *  defaults to the content language.
     * @return int|false The namespace id, or false when the folded name is
     *  empty or matches no namespace of $language
     */
    public static function identifyNamespace( $namespace, $method = 'naive', ?Language $language = null ) {
        static $naive = null;
        static $utr30 = null;

        $normalizer = null;
        if ( $method === 'naive' ) {
            if ( $naive === null ) {
                $naive = \Transliterator::createFromRules(
                    '::NFD;::Upper;::Lower;::[:Nonspacing Mark:] Remove;::NFC;[\_\-\'\u2019\u02BC]>\u0020;'
                );
            }
            $normalizer = $naive;
        } elseif ( $method === 'utr30' ) {
            if ( $utr30 === null ) {
                $utr30 = \Transliterator::createFromRules(
                    file_get_contents( __DIR__ . '/../data/utr30.txt' )
                );
            }
            $normalizer = $utr30;
        }

        Assert::postcondition( $normalizer !== null,
            'Failed to load Transliterator with method ' . $method );
        $namespace = $normalizer->transliterate( $namespace );
        if ( $namespace === '' ) {
            return false;
        }
        $language ??= MediaWikiServices::getInstance()->getContentLanguage();
        // Fold every known namespace name the same way and compare.
        foreach ( $language->getNamespaceIds() as $candidate => $nsId ) {
            if ( $normalizer->transliterate( $candidate ) === $namespace ) {
                return $nsId;
            }
        }

        return false;
    }

    /**
     * Stricter replacement for PHP's empty(): 0 and false are NOT empty.
     *
     * Only these values count as empty:
     * - null (or undefined)
     * - the empty string
     * - an array without elements
     * - an object whose array cast has no elements
     *
     * @param mixed $v
     * @return bool
     */
    public static function isEmpty( $v ) {
        if ( $v === null ) {
            return true;
        }
        if ( is_string( $v ) ) {
            return $v === '';
        }
        if ( is_array( $v ) ) {
            return count( $v ) === 0;
        }
        if ( is_object( $v ) ) {
            return count( (array)$v ) === 0;
        }
        return false;
    }

    /**
     * Helper function to conditionally set a key in a dest array only if it
     * is defined in a source array.  This is just to help DRY up what would
     * otherwise be a long series of
     * if ( isset($sourceArray[$key] )) { $destArray[$key] = $sourceArray[$key] }
     * statements.  This also supports using a different key in the dest array,
     * as well as mapping the value when assigning to $destArray.
     *
     * Usage:
     * $arr1 = ['KEY1' => '123'];
     * $arr2 = [];
     *
     * setIfDefined($arr1, 'KEY1', $arr2, 'key1', 'intval');
     * // $arr2['key1'] is now set to 123 (integer value)
     *
     * setIfDefined($arr1, 'KEY2', $arr2);
     * // $arr2 stays the same, because $arr1 does not have 'KEY2' defined.
     *
     * @param array $sourceArray the array from which to look for $sourceKey
     * @param string $sourceKey the key to look for in $sourceArray
     * @param array &$destArray by reference destination array in which to set value if defined
     * @param string|null $destKey optional, key to use instead of $sourceKey in $destArray.
     *                 null or an empty string means "use $sourceKey".
     * @param callable|null $mapFn optional, If set, this will be called on the value before setting it.
     * @param bool $checkEmpty If false, emptiness of result after $mapFn is called will not be
     *                 checked before setting on $destArray.  If true, it will, using Util::isEmpty.
     *                 Default: true
     * @return array $destArray after the (possible) assignment
     */
    public static function setIfDefined(
        array $sourceArray,
        $sourceKey,
        array &$destArray,
        $destKey = null,
        $mapFn = null,
        $checkEmpty = true
    ) {
        if ( array_key_exists( $sourceKey, $sourceArray ) ) {
            $val = $sourceArray[$sourceKey];
            if ( $mapFn !== null ) {
                $val = $mapFn( $val );
            }
            // Only set in $destArray if we are not checking emptiness,
            // or if we are and the $val is not empty.
            if ( !$checkEmpty || !self::isEmpty( $val ) ) {
                // Explicit comparison rather than ?: so that a falsy but
                // legitimate key such as '0' is honoured.
                $key = ( $destKey === null || $destKey === '' ) ? $sourceKey : $destKey;
                $destArray[$key] = $val;
            }
        }
        return $destArray;
    }

    /**
     * Fetch the StatsFactory service scoped to this extension.
     *
     * @return StatsFactory prefixed with the "CirrusSearch" component
     */
    public static function getStatsFactory(): StatsFactory {
        $statsFactory = MediaWikiServices::getInstance()->getStatsFactory();
        return $statsFactory->withComponent( "CirrusSearch" );
    }

    /**
     * Guess whether a request comes from automation rather than from an
     * interactive user. A request is flagged when one of its headers matches
     * a regex from CirrusSearchAutomationHeaderRegexes, or when its ip falls
     * in one of the ranges of CirrusSearchAutomationCIDRs.
     *
     * @param SearchConfig $config Configuration of the check
     * @param string $ip The address to check against, ipv4 or ipv6.
     * @param string[] $headers Map from http header name to value. All names must be uppercased.
     * @return bool True when the parameters appear to be a non-interactive use case.
     */
    public static function looksLikeAutomation( SearchConfig $config, string $ip, array $headers ): bool {
        // Is there an http header that can be matched with regex to flag automation,
        // such as the user-agent or a flag applied by some infrastructure?
        $automationHeaders = $config->get( 'CirrusSearchAutomationHeaderRegexes' ) ?? [];
        foreach ( $automationHeaders as $name => $pattern ) {
            // Configured names are uppercased to match the keys of $headers.
            $name = strtoupper( $name );
            if ( !isset( $headers[$name] ) ) {
                continue;
            }
            $ret = preg_match( $pattern, $headers[$name] );
            if ( $ret === 1 ) {
                return true;
            } elseif ( $ret === false ) {
                // A broken pattern is reported but does not stop the check.
                LoggerFactory::getInstance( 'CirrusSearch' )->warning(
                    "Invalid regex provided for header `$name` in `CirrusSearchAutomationHeaderRegexes`." );
            }
        }

        // Does the ip address fall into a subnet known for automation?
        // An unset value is treated as "no ranges", like the header regexes above.
        $ranges = $config->get( 'CirrusSearchAutomationCIDRs' ) ?? [];
        if ( IPUtils::isInRanges( $ip, $ranges ) ) {
            return true;
        }

        // Default assumption that requests are interactive
        return false;
    }

    /**
     * If we're supposed to create raw result, create and return it,
     * or output it and finish.
     *
     * When the debug options request dump-and-die, the result is echoed
     * (formatted by ExplainPrinter if an explain format is set, as JSON
     * otherwise) with a matching Content-type header, and the script exits.
     * Otherwise $result is handed back as received.
     *
     * @param mixed $result Search result data
     * @param WebRequest $request Request context
     * @param CirrusDebugOptions $debugOptions
     * @return mixed $result, unchanged, when the script is not terminated
     */
    public static function processSearchRawReturn( $result, WebRequest $request,
                                                   CirrusDebugOptions $debugOptions ) {
        $header = null;
        $output = null;

        $explainFormat = $debugOptions->getCirrusExplainFormat();
        if ( $explainFormat !== null ) {
            $printer = new ExplainPrinter( $explainFormat );
            $output = $printer->format( $result );
            $header = 'Content-type: text/html; charset=UTF-8';
        }

        // Dumping should always be requested, except in the case of the test
        // suite which wants the actual objects returned.
        if ( !$debugOptions->isDumpAndDie() ) {
            return $result;
        }

        if ( $output === null ) {
            // No explain output: fall back to a JSON dump of the result.
            $header = 'Content-type: application/json; charset=UTF-8';
            $output = $result === null ? '{}' : json_encode( $result, JSON_PRETTY_PRINT );
        }

        // When dumping the query we skip _everything_ but echoing the query.
        RequestContext::getMain()->getOutput()->disable();
        // @phan-suppress-next-line PhanTypeMismatchArgumentNullable $header can't be null here
        $request->response()->header( $header );
        echo $output;
        exit();
    }
}