maintenance/RunSearch.php
<?php
namespace CirrusSearch\Maintenance;
use CirrusSearch\CirrusDebugOptions;
use CirrusSearch\CirrusSearch;
use CirrusSearch\HashSearchConfig;
use CirrusSearch\Search\CirrusSearchResultSet;
use CirrusSearch\SearchConfig;
use MediaWiki\Maintenance\OrderedStreamingForkController;
use MediaWiki\MediaWikiServices;
use MediaWiki\Settings\SettingsBuilder;
use MediaWiki\Status\Status;
use PageArchive;
use SearchSuggestionSet;
use Wikimedia\Rdbms\IResultWrapper;
/**
* Run search queries provided on stdin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
// Locate the MediaWiki installation: use MW_INSTALL_PATH when the environment
// provides it, otherwise fall back to the directory three levels above this one.
$IP = getenv( 'MW_INSTALL_PATH' );
$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;

// Load the core maintenance base first, then the CirrusSearch-specific one.
require_once "$IP/maintenance/Maintenance.php";
require_once __DIR__ . '/../includes/Maintenance/Maintenance.php';
/**
 * Maintenance script that runs the search queries it receives on stdin and
 * reports each outcome as a JSON string.
 *
 * Queries are handed to consume() by an OrderedStreamingForkController that is
 * wired to STDIN/STDOUT in execute(). Configuration overrides supplied through
 * --options are applied during finalSetup().
 */
class RunSearch extends Maintenance {
	/**
	 * @var string Base name of the indexes to query; assigned in execute().
	 */
	protected $indexBaseName;

	/**
	 * Declare the description and the command line options of this script.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Run one or more searches against the specified cluster. ' .
			'search queries are read from stdin.' );
		$this->addOption( 'baseName', 'What basename to use for all indexes, ' .
			'defaults to wiki id', false, true );
		$this->addOption( 'type', 'What type of search to run, prefix, suggest, archive or full_text. ' .
			'defaults to full_text.', false, true );
		$this->addOption( 'options', 'A JSON object mapping from global variable to ' .
			'its test value', false, true );
		$this->addOption( 'fork', 'Fork multiple processes to run queries from. ' .
			'defaults to false.', false, true );
		$this->addOption( 'decode', 'urldecode() queries before running them', false, false );
		$this->addOption( 'explain', 'Include lucene explanation in the results', false, false );
		$this->addOption( 'limit', 'Set the max number of results returned by query (defaults to 10)', false, true );
		$this->addOption( 'i-know-what-im-doing', 'Allow setting unknown options from --options', false, false );
	}

	/**
	 * Run the parent's final setup and then apply the --options overrides.
	 *
	 * @param SettingsBuilder $settingsBuilder
	 */
	public function finalSetup( SettingsBuilder $settingsBuilder ) {
		parent::finalSetup( $settingsBuilder );
		$this->applyGlobals();
	}

	/**
	 * Resolve the index base name and start the controller that passes the
	 * input from STDIN through consume() and on to STDOUT.
	 *
	 * @return bool Always true
	 */
	public function execute() {
		$this->disablePoolCountersAndLogging();
		$this->indexBaseName = $this->getOption(
			'baseName',
			$this->getSearchConfig()->get( SearchConfig::INDEX_BASE_NAME )
		);

		// Without --fork the option is the boolean default. ctype_digit() must only be
		// given strings (other types are deprecated as of PHP 8.1), so check the type
		// first. Anything that is not a string of digits is passed on as 0.
		$forks = $this->getOption( 'fork', false );
		$forks = ( is_string( $forks ) && ctype_digit( $forks ) ) ? (int)$forks : 0;

		$controller = new OrderedStreamingForkController( $forks, [ $this, 'consume' ], STDIN, STDOUT );
		$controller->start();

		return true;
	}

	/**
	 * To keep life sane this shouldn't be able to set completely arbitrary configuration, only
	 * the options that change search ranking. CirrusSearch has so many variables that enumerating
	 * them and maintaining extra lists of them would be a tedious process. Instead every key of
	 * the "config" section of extension.json, prefixed with "wg", is considered changeable.
	 *
	 * @return array<string,true> Changeable global variables represented as the keys for an array, for
	 *  use with isset().
	 * @throws \RuntimeException When extension.json cannot be read or decoded into an array
	 */
	private function loadChangeableConfigVars(): array {
		// WARNING: The autoloader isn't available yet, you can't use any mw/cirrus classes
		$raw = file_get_contents( __DIR__ . '/../extension.json' );
		$config = $raw === false ? null : json_decode( $raw, true );
		if ( !is_array( $config ) ) {
			throw new \RuntimeException( 'Could not load extension.json for gathering the '
				. 'list of changeable config vars' );
		}

		$changeable = [];
		// A missing "config" section simply means nothing is changeable.
		foreach ( $config['config'] ?? [] as $key => $_ ) {
			$changeable['wg' . $key] = true;
		}

		return $changeable;
	}

	/**
	 * Applies global variables provided as the options CLI argument
	 * to override current settings.
	 *
	 * The argument is a JSON object, optionally base64 encoded and prefixed with
	 * "B64://". Keys containing a "." are treated as a path into an existing global
	 * array; all other keys name a global variable directly. Malformed input aborts
	 * the script instead of silently running with unmodified settings.
	 */
	protected function applyGlobals() {
		$optionsData = $this->getOption( 'options', 'false' );
		if ( str_starts_with( $optionsData, 'B64://' ) ) {
			$optionsData = base64_decode( substr( $optionsData, strlen( 'B64://' ) ) );
			if ( $optionsData === false ) {
				$this->fatalError( "\nERROR: --options could not be base64 decoded\n" );
			}
		}
		if ( trim( $optionsData ) === '' ) {
			// An empty value carries no overrides.
			return;
		}

		$options = json_decode( $optionsData, true );
		if ( $options === null && json_last_error() !== JSON_ERROR_NONE ) {
			$this->fatalError( "\nERROR: --options is not valid JSON: " . json_last_error_msg() . "\n" );
		}
		if ( !$options ) {
			// Covers the 'false' default as well as an empty object.
			return;
		}
		if ( !is_array( $options ) ) {
			$this->fatalError( "\nERROR: --options must be a JSON object\n" );
		}

		// Only read extension.json once we know there is something to validate.
		$changeable = $this->loadChangeableConfigVars();

		// NOTE: This function is called from finalSetup() because config changes applied
		// later are too late to make it into various structures created on
		// initialization. This is particularly a problem with wikidata integration. Or at
		// least it was in Sept 2018. See ce3cf5fc52e4fade6e35fa38093180ae7397fee2.
		// Unfortunately, as of March 2020, default values from extension.json are *not*
		// available when Maintenance::finalSetup is called. This means you can only modify
		// explicitly configured values, anything that still has default values cannot be
		// changed.
		$forceChange = $this->getOption( 'i-know-what-im-doing', false );
		foreach ( $options as $key => $value ) {
			// JSON lists decode to integer keys; normalise so the string helpers
			// and the typed parameter below always receive a string.
			$key = (string)$key;
			if ( str_contains( $key, '.' ) ) {
				$this->changeGlobalKeyPath( $key, $value, $changeable );
			} elseif ( $forceChange || isset( $changeable[$key] ) ) {
				// This is different from the keypath case above in that this can set
				// variables that haven't been loaded yet. In particular at this point
				// in the MW load process explicitly configured variables are
				// available, but defaults from extension.json have not yet been
				// loaded.
				$GLOBALS[$key] = $value;
			} else {
				$this->fatalError( "\nERROR: $key is not a globally changeable variable\n" );
			}
		}
	}

	/**
	 * Navigate a key path to change a global variable.
	 *
	 * The first path element names the global and must be changeable; every
	 * following element must already exist as an array key at its level. The code
	 * after each check relies on fatalError() not returning.
	 *
	 * @param string $key the path, elements separated by "."
	 * @param mixed $value what we want to set it to
	 * @param array<string,true> $changeable the changeable variables
	 */
	private function changeGlobalKeyPath( string $key, $value, array $changeable ): void {
		$segments = explode( '.', $key );
		$globalName = array_shift( $segments );
		if ( !isset( $changeable[$globalName] ) ) {
			$this->fatalError( "\nERROR: $key is not a globally changeable variable\n" );
		}

		// Walk down by reference so the final assignment lands inside the global itself.
		$cur =& $GLOBALS[$globalName];
		foreach ( $segments as $segment ) {
			if ( !is_array( $cur ) || !array_key_exists( $segment, $cur ) ) {
				$this->fatalError( "\nERROR: $key is not a valid global variable path\n" );
			}
			$cur =& $cur[$segment];
		}
		$cur = $value;
	}

	/**
	 * Transform the search request into a JSON string representing the
	 * search result.
	 *
	 * The output always contains the (possibly url-decoded) query. On success the
	 * extracted result data is added, otherwise an "error" entry holds the status
	 * message.
	 *
	 * @param string $query
	 * @return string JSON object
	 * @throws \RuntimeException When the search returns a value of an unexpected type
	 */
	public function consume( $query ) {
		if ( $this->getOption( 'decode' ) ) {
			$query = urldecode( $query );
		}
		$data = [ 'query' => $query ];

		$status = $this->searchFor( $query );
		if ( $status->isOK() ) {
			$value = $status->getValue();
			if ( $value instanceof IResultWrapper ) {
				// Archive search results
				$data += $this->processArchiveResult( $value );
			} elseif ( $value instanceof CirrusSearchResultSet ) {
				$data += $this->processResultSet( $value, $query );
			} elseif ( $value instanceof SearchSuggestionSet ) {
				// these are suggestion results
				$data += $this->processSuggestionSet( $value );
			} else {
				throw new \RuntimeException(
					'Unknown result type: '
					. get_debug_type( $value )
				);
			}
		} else {
			$data['error'] = $status->getMessage()->text();
		}

		$json = json_encode( $data );
		if ( $json === false ) {
			// Encoding fails on e.g. invalid UTF-8. Report that as an error object
			// rather than returning false from a method documented to return JSON.
			$json = json_encode(
				[
					'query' => $query,
					'error' => 'Could not encode result as JSON: ' . json_last_error_msg(),
				],
				JSON_INVALID_UTF8_SUBSTITUTE
			);
		}

		return $json;
	}

	/**
	 * Extract data from a search result set.
	 *
	 * @param CirrusSearchResultSet $value
	 * @param string $query Not used by this implementation
	 * @return array With keys "totalHits" and "rows"
	 */
	protected function processResultSet( CirrusSearchResultSet $value, $query ) {
		// these are prefix or full text results
		$rows = [];
		foreach ( $value as $result ) {
			/** @var \CirrusSearch\Search\CirrusSearchResult $result */
			$row = [
				// use getDocId() rather than asking the title to allow this script
				// to work when a production index has been imported to a test es instance
				'docId' => $result->getDocId(),
				'title' => $result->getTitle()->getPrefixedText(),
				'score' => $result->getScore(),
				'snippets' => [
					'text' => $result->getTextSnippet(),
					'title' => $result->getTitleSnippet(),
					'redirect' => $result->getRedirectSnippet(),
					'section' => $result->getSectionSnippet(),
					'category' => $result->getCategorySnippet(),
				],
				'explanation' => $result->getExplanation(),
				'extra' => $result->getExtensionData(),
			];

			// Prefer the file attached to the result, otherwise look one up by title.
			$img = $result->getFile() ?: MediaWikiServices::getInstance()->getRepoGroup()
				->findFile( $result->getTitle() );
			if ( $img ) {
				$thumb = $img->transform( [ 'width' => 120, 'height' => 120 ] );
				if ( $thumb ) {
					$row['thumb_url'] = $thumb->getUrl();
				}
			}

			$rows[] = $row;
		}

		return [
			'totalHits' => $value->getTotalHits(),
			'rows' => $rows,
		];
	}

	/**
	 * Extract data from a search suggestions set.
	 *
	 * @param SearchSuggestionSet $value
	 * @return array With keys "totalHits" and "rows"
	 */
	protected function processSuggestionSet( SearchSuggestionSet $value ) {
		$rows = [];
		foreach ( $value->getSuggestions() as $suggestion ) {
			$rows[] = [
				'pageId' => $suggestion->getSuggestedTitleID(),
				'title' => $suggestion->getSuggestedTitle()->getPrefixedText(),
				'snippets' => [],
			];
		}

		return [
			'totalHits' => $value->getSize(),
			'rows' => $rows,
		];
	}

	/**
	 * Extract data from archive search results.
	 *
	 * @param IResultWrapper $value
	 * @return array With keys "totalHits" and "rows"
	 */
	protected function processArchiveResult( IResultWrapper $value ) {
		$rows = [];
		foreach ( $value as $row ) {
			$rows[] = [
				'title' => $row->ar_title,
				'namespace' => $row->ar_namespace,
				'count' => $row->count,
			];
		}

		return [
			'totalHits' => $value->numRows(),
			'rows' => $rows,
		];
	}

	/**
	 * Search for term in the archive.
	 *
	 * @param string $query
	 * @return Status<IResultWrapper>
	 */
	protected function searchArchive( $query ) {
		$result = PageArchive::listPagesBySearch( $query );
		return Status::newGood( $result );
	}

	/**
	 * Transform the search request into a Status object representing the
	 * search result. Varies based on CLI input argument `type`.
	 *
	 * @param string $query
	 * @return Status<CirrusSearchResultSet|SearchSuggestionSet|IResultWrapper>
	 */
	protected function searchFor( $query ) {
		$searchType = $this->getOption( 'type', 'full_text' );
		if ( $searchType === 'archive' ) {
			// Archive has its own engine so go directly there
			return $this->searchArchive( $query );
		}

		$limit = $this->getOption( 'limit', 10 );
		$options = CirrusDebugOptions::forRelevanceTesting(
			$this->getOption( 'explain', false ) ? 'raw' : null
		);
		// Override only the index base name; FLAG_INHERIT is passed for the rest.
		$config = new HashSearchConfig(
			[ SearchConfig::INDEX_BASE_NAME => $this->indexBaseName ],
			[ HashSearchConfig::FLAG_INHERIT ]
		);
		$engine = new CirrusSearch( $config, $options );
		// Keep the keys of the entries whose value matches true.
		$namespaces = array_keys( $engine->getConfig()->get( 'NamespacesToBeSearchedDefault' ), true );
		$engine->setNamespaces( $namespaces );
		$engine->setConnection( $this->getConnection() );
		$engine->setLimitOffset( $limit );

		switch ( $searchType ) {
			case 'full_text':
				// @todo pass through $this->getConnection() ?
				return $this->wrapInStatus( $engine->searchText( $query ) );
			case 'prefix':
				$titles = $engine->defaultPrefixSearch( $query );
				return Status::newGood( SearchSuggestionSet::fromTitles( $titles ) );
			case 'suggest':
				return $this->wrapInStatus( $engine->completionSearch( $query ) );
			default:
				$this->fatalError( "\nERROR: Unknown search type $searchType\n" );
		}
	}

	/**
	 * Return the value unchanged when it already is a Status, otherwise wrap it
	 * in a good Status.
	 *
	 * @param mixed $result
	 * @return Status
	 */
	private function wrapInStatus( $result ): Status {
		return $result instanceof Status ? $result : Status::newGood( $result );
	}
}
// Name the class that implements this maintenance script.
$maintClass = RunSearch::class;
// RUN_MAINTENANCE_IF_MAIN is defined outside this file (presumably by the core
// Maintenance.php required above); the file it names is expected to run
// $maintClass when this script is the entry point — TODO confirm against core.
require_once RUN_MAINTENANCE_IF_MAIN;