wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/RequestLogger.php

Summary

Maintainability
D
1 day
Test Coverage
<?php

namespace CirrusSearch;

use ISearchResultSet;
use MediaWiki\Context\RequestContext;
use MediaWiki\Deferred\DeferredUpdates;
use MediaWiki\Json\FormatJson;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\User\User;
use MediaWiki\User\UserIdentity;
use MediaWiki\WikiMap\WikiMap;

/**
 * Handles logging information about requests made to various destinations,
 * such as monolog, EventBus and statsd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class RequestLogger {
    /**
     * @const int max number of results to store in cirrussearch-request logs (per request)
     */
    private const LOG_MAX_RESULTS = 50;

    // If a title hit was given to the user, but the hit was not
    // obtained via ElasticSearch, we won't really know where it came from.
    // We still want to log the fact that the request set generated a hit
    // for this title.  When this happens, the hit index field value will be this.
    private const UNKNOWN_HIT_INDEX = '_UNKNOWN_';

    /**
     * @var RequestLog[] Set of requests made
     */
    private $logs = [];

    /**
     * @var string[] Result page ids that were returned to user
     */
    private $resultTitleStrings = [];

    /**
     * @var string[][] Extra payload for the logs, indexed first by the log index
     *  in self::$logs, and second by the payload item name.
     */
    private $extraPayload = [];

    /**
     * @return bool True when logs have been generated during the current
     *  php execution.
     */
    public function hasQueryLogs() {
        return (bool)$this->logs;
    }

    /**
     * @param float $percentage A value between 0 and 1 indiciating the
     *  percentage of calls that should return true.
     * @return bool True on $percentage calls to this method
     */
    private function sample( $percentage ) {
        if ( $percentage <= 0 ) {
            return false;
        }
        if ( $percentage >= 1 ) {
            return true;
        }
        $rand = random_int( 0, PHP_INT_MAX ) / PHP_INT_MAX;
        return $rand <= $percentage;
    }

    /**
     * Summarizes all the requests made in this process and reports
     * them along with the test they belong to.
     */
    private function reportLogs() {
        if ( $this->logs ) {

            // Build the mediawiki/search/requestset event and log it to the (json+EventBus)
            // cirrussearch-request channel.
            LoggerFactory::getInstance( 'cirrussearch-request' )->debug(
                '', $this->buildCirrusSearchRequestEvent()
            );

            // reset logs
            $this->logs = [];
        }
    }

    /**
     * @param RequestLog $log The log about a network request to be added
     * @param User|null $user The user performing the request, or null
     *  for actions that don't have a user (such as index updates).
     * @param int|null $slowMillis The threshold in ms after which the request
     *  will be considered slow.
     * @return array A map of information about the performed request, suitible
     *  for use as a psr-3 log context.
     */
    public function addRequest( RequestLog $log, UserIdentity $user = null, $slowMillis = null ) {
        global $wgCirrusSearchLogElasticRequests;

        // @todo Is this necessary here? Check on what uses the response value
        $finalContext = $log->getLogVariables() + [
            'source' => Util::getExecutionContext(),
            'executor' => Util::getExecutionId(),
            'identity' => Util::generateIdentToken(),
        ];
        if ( $wgCirrusSearchLogElasticRequests ) {
            $this->logs[] = $log;
            if ( count( $this->logs ) === 1 ) {
                DeferredUpdates::addCallableUpdate( function () {
                    $this->reportLogs();
                } );
            }

            $logMessage = $this->buildLogMessage( $log, $finalContext );
            LoggerFactory::getInstance( 'CirrusSearchRequests' )->debug( $logMessage, $finalContext );
            if ( $slowMillis && $log->getTookMs() >= $slowMillis ) {
                if ( $user !== null ) {
                    $finalContext['user'] = $user->getName();
                    $logMessage .= ' for {user}';
                }
                LoggerFactory::getInstance( 'CirrusSearchSlowRequests' )->info( $logMessage, $finalContext );
            }
        }

        return $finalContext;
    }

    /**
     * @param string $key
     * @param string $value
     */
    public function appendLastLogPayload( $key, $value ) {
        $idx = count( $this->logs ) - 1;
        if ( isset( $this->logs[$idx] ) ) {
            $this->extraPayload[$idx][$key] = $value;
        }
    }

    /**
     * Report the types of queries that were issued
     * within the current request.
     *
     * @return string[]
     */
    public function getQueryTypesUsed() {
        $types = [];
        foreach ( $this->logs as $log ) {
            $types[] = $log->getQueryType();
        }
        return array_unique( $types );
    }

    /**
     * This is set externally because we don't have complete control, from the
     * SearchEngine interface, of what is actually sent to the user. Instead hooks
     * receive the final results that will be sent to the user and set them here.
     *
     * Accepts two result sets because some places (Special:Search) perform multiple
     * searches. This can be called multiple times, but only that last call wins. For
     * API's that is correct, for Special:Search a hook catches the final results and
     * sets them here.
     *
     * @param ISearchResultSet[] $matches
     */
    public function setResultPages( array $matches ) {
        $titleStrings = [];
        foreach ( $matches as $resultSet ) {
            if ( $resultSet !== null ) {
                $titleStrings = array_merge( $titleStrings, $this->extractTitleStrings( $resultSet ) );
            }
        }
        $this->resultTitleStrings = $titleStrings;
    }

    /**
     * Builds and ships a log context of an object that conforms to the
     * JSONSchema  mediawiki/search/requestset event schema.
     *
     * @return array mediawiki/search/requestset event object
     */
    private function buildCirrusSearchRequestEvent() {
        global $wgRequest, $wgServerName;
        $webrequest = $wgRequest;

        // for the moment RequestLog::getRequests() is still created in the
        // old format to serve the old log formats, so here we transform the
        // context into the new mediawiki/cirrussearch/request event format.
        // At some point the context should just be created in the correct format.
        $elasticSearchRequests = [];

        // If false, then at least one of the Elasticsearch request responses
        // returned was not from cache
        $allRequestsCached = true;

        // While building the elasticSearchRequests object, collect
        // a list of all hits from all Elasticsearch requests.
        $allHits = [];

        // Iterate over each individual Elasticsearch request
        foreach ( $this->logs as $idx => $elasticSearchRequestLog ) {
            foreach ( $elasticSearchRequestLog->getRequests() as $requestContext ) {
                // Build an entry in the elasticSearchRequests array from each logged
                // Elasticsearch request.
                $requestEntry = [];

                Util::setIfDefined( $requestContext, 'query', $requestEntry, 'query', 'strval' );
                Util::setIfDefined( $requestContext, 'queryType', $requestEntry, 'query_type', 'strval' );
                Util::setIfDefined(
                    $requestContext, 'index', $requestEntry, 'indices',
                    // Use list for search indices, not csv string.
                    static function ( $v ) {
                        return explode( ',', $v );
                    }
                );
                Util::setIfDefined(
                    $requestContext, 'namespaces', $requestEntry, 'namespaces',
                    // Make sure namespace values are all integers.
                    static function ( $v ) {
                        if ( !$v ) {
                            return $v;
                        } else {
                            return array_values( array_map( 'intval', $v ) );
                        }
                    }
                );
                Util::setIfDefined( $requestContext, 'tookMs', $requestEntry, 'request_time_ms', 'intval' );
                Util::setIfDefined( $requestContext, 'elasticTookMs', $requestEntry, 'search_time_ms', 'intval' );
                Util::setIfDefined( $requestContext, 'limit', $requestEntry, 'limit', 'intval' );
                Util::setIfDefined( $requestContext, 'hitsTotal', $requestEntry, 'hits_total', 'intval' );
                Util::setIfDefined( $requestContext, 'hitsReturned', $requestEntry, 'hits_returned', 'intval' );
                Util::setIfDefined( $requestContext, 'hitsOffset', $requestEntry, 'hits_offset', 'intval' );
                Util::setIfDefined( $requestContext, 'suggestion', $requestEntry, 'suggestion', 'strval' );

                // suggestion_requested is true if suggestionRequested or if any suggestion is present.
                $requestEntry['suggestion_requested'] =
                    (bool)( $requestContext['suggestionRequested'] ?? isset( $requestContext['suggestion'] ) );

                Util::setIfDefined( $requestContext, 'maxScore', $requestEntry, 'max_score', 'floatval' );

                if ( isset( $this->extraPayload[$idx] ) ) {
                    // fields in extraPayload keys must be explicitly
                    // set here if we want to include them in the event.
                    Util::setIfDefined(
                        $this->extraPayload[$idx], 'langdetect', $requestEntry, 'langdetect', 'strval'
                    );
                }

                Util::setIfDefined( $requestContext, 'syntax', $requestEntry, 'syntax' );

                // If response was servied from cache.
                $requestEntry['cached'] = (bool)$elasticSearchRequestLog->isCachedResponse();
                // $wereAllResponsesCached will be used later for
                // deteriming if all responses for the request set were cached.
                if ( !$requestEntry['cached'] ) {
                    $allRequestsCached = false;
                }

                Util::setIfDefined(
                    $requestContext, 'hits', $requestEntry, 'hits',
                    function ( $v ) {
                        return $this->encodeHits( $v );
                    }
                );
                if ( isset( $requestEntry['hits'] ) ) {
                    $allHits = array_merge( $allHits, $requestEntry['hits'] );
                }

                $elasticSearchRequests[] = $requestEntry;
            }
        }

        // Reindex allHits by page titles. It's maybe not perfect, but it's
        // hopefully a "close enough" representation of where our final result
        // set came from. maybe :(
        $allHitsByTitle = [];
        foreach ( $allHits as $hit ) {
            '@phan-var array $hit';
            $allHitsByTitle[$hit['page_title']] = $hit;
        }
        $resultHits = [];
        // $this->resultTitleStrings give us title hits that were actually provided to the user.
        // Build a subset top level list of hits from each sub request's list of hits.
        $resultTitleStrings = array_slice( $this->resultTitleStrings, 0, self::LOG_MAX_RESULTS );
        foreach ( $resultTitleStrings as $titleString ) {
            // $allHitsByTitle[$titleString] will have the hit. If there isn't a hit
            // for this title, then this title 'hit' did not come from a ElasticSearch request,
            // so log a 'fake' hit for it.  In this case, the index will be self::UNKNOWN_HIT_INDEX.
            $hit = $allHitsByTitle[$titleString] ?? [
                'page_title' => (string)$titleString,
                'index' => self::UNKNOWN_HIT_INDEX,
            ];
            $resultHits[] = $hit;
        }

        $gen = MediaWikiServices::getInstance()->getGlobalIdGenerator();
        $requestEvent = [
            // This schema can be found in the mediawiki/event-schemas repository.
            // The $schema URI here should be updated if we increment schema versions.
            '$schema' => '/mediawiki/cirrussearch/request/0.0.1',
            'meta' => [
                'request_id' => $webrequest->getRequestId(),
                'id' => $gen->newUUIDv4(),
                'dt' => wfTimestamp( TS_ISO_8601 ),
                'domain' => $wgServerName,
                'stream' => 'mediawiki.cirrussearch-request',
            ],
            'http' => [
                'method' => $webrequest->getMethod(),
                'client_ip' => $webrequest->getIP(),
                'has_cookies' => $webrequest->getHeader( 'Cookie' ) ? true : false,
            ],
            'database' => WikiMap::getCurrentWikiId(),
            'mediawiki_host' => gethostname(),
            'search_id' => Util::getRequestSetToken(),
            'source' => Util::getExecutionContext(),
            'identity' => Util::generateIdentToken(),
            'request_time_ms' => $this->getPhpRequestTookMs(),
            'all_elasticsearch_requests_cached' => $allRequestsCached,
        ];

        $webRequestValues = $webrequest->getValues();
        if ( $webRequestValues ) {
            $requestEvent['params'] = [];
            // Make sure all params are string keys and values
            foreach ( $webRequestValues as $k => $v ) {
                if ( !is_scalar( $v ) ) {
                    // This is potentially a multi-dimensional array. JSON is
                    // perhaps not the best format, but this gives a good
                    // guarantee about always returning a string, and
                    // faithfully represents the variety of shapes request
                    // parameters can be parsed into.
                    $v = FormatJson::encode( $v );
                }
                $k = $webrequest->normalizeUnicode( $k );
                $requestEvent['params'][(string)$k] = (string)$v;
            }
        }

        // Don't set these fields if there is no data.
        if ( $resultHits ) {
            $requestEvent['hits'] = $resultHits;
        }
        if ( $elasticSearchRequests ) {
            $requestEvent['elasticsearch_requests'] = $elasticSearchRequests;
        }
        // Should always be true, but don't accidently instantiate user testing if somehow
        // it wasn't done previously.
        if ( UserTestingStatus::hasInstance() ) {
            $ut = UserTestingStatus::getInstance();
            if ( $ut->isActive() ) {
                // Wrap into an array. UserTesting does not support multiple
                // concurrent tests in one request, but the schema was written
                // when that was a possibility.
                $requestEvent['backend_user_tests'] = [ $ut->getTrigger() ];
            }
        }

        // If in webrequests, log these request headers in http.headers.
        $httpRequestHeadersToLog = [ 'accept-language', 'referer', 'user-agent' ];
        foreach ( $httpRequestHeadersToLog as $header ) {
            if ( $webrequest->getHeader( $header ) ) {
                $requestEvent['http']['request_headers'][$header] =
                    (string)$webrequest->normalizeUnicode( $webrequest->getHeader( $header ) );
            }
        }

        return $requestEvent;
    }

    /**
     * @param ISearchResultSet $matches
     * @return string[]
     */
    private function extractTitleStrings( ISearchResultSet $matches ) {
        $strings = [];
        foreach ( $matches as $result ) {
            $strings[] = (string)$result->getTitle();
        }

        return $strings;
    }

    /**
     * @param RequestLog $log The request to build a log message about
     * @param array $context Request specific log variables from RequestLog::getLogVariables()
     * @return string a PSR-3 compliant message describing $context
     */
    private function buildLogMessage( RequestLog $log, array $context ) {
        $message = $log->getDescription();
        $message .= " against {index} took {tookMs} millis";
        if ( isset( $context['elasticTookMs'] ) ) {
            $message .= " and {elasticTookMs} Elasticsearch millis";
            if ( isset( $context['elasticTook2PassMs'] ) && $context['elasticTook2PassMs'] >= 0 ) {
                $message .= " (with 2nd pass: {elasticTook2PassMs} ms)";
            }
        }
        if ( isset( $context['hitsTotal'] ) ) {
            $message .= ". Found {hitsTotal} total results";
            $message .= " and returned {hitsReturned} of them starting at {hitsOffset}";
        }
        if ( isset( $context['namespaces'] ) ) {
            $namespaces = implode( ', ', $context['namespaces'] );
            $message .= " within these namespaces: $namespaces";
        }
        if ( isset( $context['suggestion'] ) && strlen( $context['suggestion'] ) > 0 ) {
            $message .= " and suggested '{suggestion}'";
        }
        $message .= ". Requested via {source} for {identity} by executor {executor}";

        return $message;
    }

    /**
     * Enforce all type constraints on the hits field and limit
     * the number of results to the maximum specified.
     *
     * @param array[] $hits
     * @return array[]
     */
    private function encodeHits( array $hits ) {
        $formattedHits = [];
        foreach ( array_slice( $hits, 0, self::LOG_MAX_RESULTS )  as $hit ) {
            $formattedHit = [];
            Util::setIfDefined( $hit, 'title', $formattedHit, 'page_title', 'strval' );
            Util::setIfDefined( $hit, 'pageId', $formattedHit, 'page_id', 'intval' );
            Util::setIfDefined( $hit, 'index', $formattedHit, 'index', 'strval' );
            Util::setIfDefined( $hit, 'score', $formattedHit, 'score', 'floatval' );
            Util::setIfDefined( $hit, 'profileName', $formattedHit, 'profile_name', 'strval' );
            $formattedHits[] = $formattedHit;
        }
        return $formattedHits;
    }

    /**
     * Note that this is only accurate for hhvm and php-fpm
     * since they close the request to the user before running
     * deferred updates.
     *
     * @return int The number of ms the php request took
     */
    private function getPhpRequestTookMs() {
        $timing = RequestContext::getMain()->getTiming();
        $startMark = $timing->getEntryByName( 'requestStart' );
        $endMark  = $timing->getEntryByName( 'requestShutdown' );
        if ( $startMark && $endMark ) {
            // should always work, but Timing can return null so
            // fallbacks are provided.
            $tookS = $endMark['startTime'] - $startMark['startTime'];
        } elseif ( isset( $_SERVER['REQUEST_TIME_FLOAT'] ) ) {
            // php >= 5.4
            $tookS = microtime( true ) - $_SERVER['REQUEST_TIME_FLOAT'];
        } else {
            // php 5.3
            $tookS = microtime( true ) - $_SERVER['REQUEST_TIME'];
        }

        return intval( 1000 * $tookS );
    }
}