wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Search/TextIndexField.php

Summary

Maintainability
C
7 hrs
Test Coverage
<?php

namespace CirrusSearch\Search;

use CirrusSearch\CirrusSearch;
use CirrusSearch\Maintenance\MappingConfigBuilder;
use CirrusSearch\Profile\SearchProfileService;
use CirrusSearch\SearchConfig;
use SearchEngine;
use SearchIndexField;

/**
 * Index field representing full text content.
 *
 * Builds a `text` mapping made of a root field, a `plain` subfield and a set
 * of optional extra subfields (word prefix, case folding, trigrams and any
 * definition handed to the constructor).
 * @package CirrusSearch
 */
class TextIndexField extends CirrusIndexField {
    /**
     * Distance that lucene places between multiple values of the same field.
     * Set pretty high to prevent accidental phrase queries between those values.
     */
    public const POSITION_INCREMENT_GAP = 10;

    /* Bit field parameters for string fields.
     *   ENABLE_NORMS: Enable norms on the field.  Good for text you search against but useless
     *     for fields that don't get involved in the score.
     *   COPY_TO_SUGGEST: Copy the contents of this field to the suggest field for "Did you mean".
     *   SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up highlighting.  This is important for
     *     long strings or fields with many values.
     *   SUPPORT_REGEX: If the wikimedia-extra plugin is available add a trigram
     *     index to speed up search.
     *   STRING_FIELD_MASK: Bits of $this->flags that initFlags() keeps untouched; every bit
     *     above the mask is replaced by the text options listed above.
     */
    public const ENABLE_NORMS = 0x1000000;
    // FIXME: when exactly we want to disable norms for text fields?
    public const COPY_TO_SUGGEST = 0x2000000;
    public const SPEED_UP_HIGHLIGHTING = 0x4000000;
    public const SUPPORT_REGEX = 0x8000000;
    public const STRING_FIELD_MASK = 0xFFFFFF;

    /**
     * Extra subfield definitions, each one ends up as an entry of the
     * `fields` mapping (see getMapping()).
     * @var array
     */
    protected $extra;
    /**
     * Text options for this field, null until setTextOptions() is called in
     * which case getTextOptions() computes the defaults.
     * @var int|null
     */
    private $textOptions;

    /**
     * Name of the type in Elastic
     * @var string
     */
    protected $typeName = 'text';

    /**
     * Are trigrams useful?
     * True only when the 'regex' element of CirrusSearchWikimediaExtraPlugin
     * lists 'build'.
     * @var bool
     */
    protected $allowTrigrams = false;

    /**
     * @param string $name name of the field
     * @param string $type type of the field, passed through to the parent constructor
     * @param SearchConfig $config
     * @param array $extra extra subfield definitions; each definition must carry
     *  either a 'fieldName' or an 'analyzer' key, used to name the subfield
     */
    public function __construct( $name, $type, SearchConfig $config, $extra = [] ) {
        parent::__construct( $name, $type, $config );

        $this->extra = $extra;

        // Read the setting once. The is_array() guard keeps a misconfigured
        // scalar from blowing up in in_array(), and the strict comparison makes
        // sure non-string entries of the list can never loosely match 'build'.
        $regexConfig = $config->getElement( 'CirrusSearchWikimediaExtraPlugin', 'regex' );
        if ( is_array( $regexConfig ) && in_array( 'build', $regexConfig, true ) ) {
            $this->allowTrigrams = true;
        }
    }

    /**
     * Set text options for this field if non-default
     * @param int $options bit field made of the ENABLE_NORMS, COPY_TO_SUGGEST,
     *  SPEED_UP_HIGHLIGHTING and SUPPORT_REGEX constants
     * @return self
     */
    public function setTextOptions( $options ) {
        $this->textOptions = $options;
        return $this;
    }

    /**
     * Get text options for this field
     *
     * Options given to setTextOptions() are returned as is. Otherwise the
     * defaults are norms and highlighting speed-up, adjusted by the phrase
     * suggester configuration and the field flags.
     * @param int $mappingFlags
     * @return int
     */
    protected function getTextOptions( $mappingFlags ) {
        if ( $this->textOptions !== null ) {
            return $this->textOptions;
        }
        $options = self::ENABLE_NORMS | self::SPEED_UP_HIGHLIGHTING;
        if ( $this->config->get( 'CirrusSearchEnablePhraseSuggest' ) &&
            ( $mappingFlags & MappingConfigBuilder::PHRASE_SUGGEST_USE_TEXT ) &&
            !$this->checkFlag( SearchIndexField::FLAG_SCORING )
        ) {
            // SCORING fields are not copied since this info is already in other fields
            $options |= self::COPY_TO_SUGGEST;
        }
        if ( $this->checkFlag( SearchIndexField::FLAG_NO_HIGHLIGHT ) ) {
            // Disable the highlighting speed-up if asked to
            $options &= ~self::SPEED_UP_HIGHLIGHTING;
        }
        return $options;
    }

    /**
     * Build the mapping of this field and of its subfields.
     * @param SearchEngine $engine
     * @return array
     * @throws \LogicException when $engine is not a CirrusSearch engine
     */
    public function getMapping( SearchEngine $engine ) {
        if ( !( $engine instanceof CirrusSearch ) ) {
            throw new \LogicException( "Cannot map CirrusSearch fields for another engine." );
        }
        // Must run before any checkFlag() on the text option bits below
        $this->initFlags();
        /**
         * @var CirrusSearch $engine
         */
        $field = parent::getMapping( $engine );

        if ( $this->config->get( 'CirrusSearchEnablePhraseSuggest' ) &&
             $this->checkFlag( self::COPY_TO_SUGGEST )
        ) {
            $field[ 'copy_to' ] = [ 'suggest' ];
        }

        if ( $this->checkFlag( self::FLAG_NO_INDEX ) ) {
            // no need to configure further a not-indexed field
            return $field;
        }

        // Work on a copy so that repeated calls do not pile up definitions
        // in $this->extra
        $extra = $this->extra;

        if ( $this->mappingFlags & MappingConfigBuilder::PREFIX_START_WITH_ANY ) {
            $extra[] = [
                'analyzer' => 'word_prefix',
                'search_analyzer' => 'plain_search',
                'index_options' => 'docs'
            ];
        }
        if ( $this->checkFlag( SearchIndexField::FLAG_CASEFOLD ) ) {
            $extra[] = [
                'analyzer' => 'lowercase_keyword',
                'norms' => false,
                'index_options' => 'docs',
                // TODO: Re-enable in ES 5.2 with keyword type and s/analyzer/normalizer/
                // 'ignore_above' => KeywordIndexField::KEYWORD_IGNORE_ABOVE,
            ];
        }

        if ( $this->allowTrigrams && $this->checkFlag( self::SUPPORT_REGEX ) ) {
            $extra[] = [
                'norms' => false,
                'type' => 'text',
                'analyzer' => 'trigram',
                'index_options' => 'docs',
            ];
        }

        // multi_field is dead in 1.0 so we do this which actually looks less gnarly.
        // The array union keeps any key already set by the parent mapping.
        $field += [
            'analyzer' => 'text',
            'search_analyzer' => 'text_search',
            'position_increment_gap' => self::POSITION_INCREMENT_GAP,
            'similarity' => self::getSimilarity( $this->config, $this->name ),
            'fields' => [
                'plain' => [
                    'type' => 'text',
                    'analyzer' => 'plain',
                    'search_analyzer' => 'plain_search',
                    'position_increment_gap' => self::POSITION_INCREMENT_GAP,
                    'similarity' => self::getSimilarity( $this->config, $this->name, 'plain' ),
                ],
            ]
        ];

        // When norms are not enabled they are switched off on the root field
        // and on every subfield.
        $disableNorms = !$this->checkFlag( self::ENABLE_NORMS );
        $noNorms = [ 'norms' => false ];
        if ( $disableNorms ) {
            $field = array_merge( $field, $noNorms );
            $field[ 'fields' ][ 'plain' ] = array_merge( $field[ 'fields' ][ 'plain' ], $noNorms );
        }
        foreach ( $extra as $extraField ) {
            // The subfield is named after 'fieldName' when given, after its
            // analyzer otherwise. 'fieldName' is not a mapping option so it
            // is dropped from the definition.
            $extraName = $extraField[ 'fieldName' ] ?? $extraField[ 'analyzer' ];
            unset( $extraField[ 'fieldName' ] );

            // Values of the definition win over the defaults
            $subField = array_merge( [
                'similarity' => self::getSimilarity( $this->config, $this->name, $extraName ),
                'type' => 'text',
            ], $extraField );

            if ( $disableNorms ) {
                $subField = array_merge( $subField, $noNorms );
            }
            $field[ 'fields' ][ $extraName ] = $subField;
        }
        $this->configureHighlighting( $field,
            [ 'plain', 'prefix', 'prefix_asciifolding', 'near_match', 'near_match_asciifolding' ] );
        return $field;
    }

    /**
     * Adapt the field options according to the highlighter used
     *
     * With OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER set, offsets are stored in
     * the index but only for fields flagged SPEED_UP_HIGHLIGHTING. Without
     * it term vectors are always turned on. Subfields missing from the
     * mapping are skipped.
     * @param mixed[] &$field the mapping options being built
     * @param string[] $subFields list of subfields to configure
     * @param bool $rootField configure the root field (defaults to true)
     */
    protected function configureHighlighting( array &$field, array $subFields, $rootField = true ) {
        if ( $this->mappingFlags & MappingConfigBuilder::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER ) {
            if ( $this->checkFlag( self::SPEED_UP_HIGHLIGHTING ) ) {
                if ( $rootField ) {
                    $field[ 'index_options' ] = 'offsets';
                }
                foreach ( $subFields as $fieldName ) {
                    if ( isset( $field[ 'fields' ][ $fieldName ] ) ) {
                        $field[ 'fields' ][ $fieldName ][ 'index_options' ] = 'offsets';
                    }
                }
            }
        } else {
            // We use the FVH on all fields so turn on term vectors
            if ( $rootField ) {
                $field[ 'term_vector' ] = 'with_positions_offsets';
            }
            foreach ( $subFields as $fieldName ) {
                if ( isset( $field[ 'fields' ][ $fieldName ] ) ) {
                    $field[ 'fields' ][ $fieldName ][ 'term_vector' ] = 'with_positions_offsets';
                }
            }
        }
    }

    /**
     * Init the field flags
     *
     * Keeps the bits covered by STRING_FIELD_MASK and replaces everything
     * above with the text options of this field.
     */
    protected function initFlags() {
        $this->flags =
            ( $this->flags & self::STRING_FIELD_MASK ) | $this->getTextOptions( $this->mappingFlags );
    }

    /**
     * Get the field similarity
     *
     * Lookup order in the 'fields' section of the similarity profile:
     * "$field.$analyzer" (when an analyzer is given), then $field, then
     * '__default__'.
     * @param SearchConfig $config
     * @param string $field
     * @param string|null $analyzer
     * @return string
     * @throws \RuntimeException when the profile has no entry matching the field
     */
    public static function getSimilarity( SearchConfig $config, $field, $analyzer = null ) {
        $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
        $fieldSimilarity = $similarity['fields'][$field] ?? $similarity['fields']['__default__'] ?? null;
        if ( $analyzer !== null && isset( $similarity['fields']["$field.$analyzer"] ) ) {
            // The most specific entry wins
            $fieldSimilarity = $similarity['fields']["$field.$analyzer"];
        }
        if ( $fieldSimilarity === null ) {
            throw new \RuntimeException( "Invalid similarity profile, unable to infer the similarity for " .
                "the field $field, (defining a __default__ field might solve the issue)" );
        }
        return $fieldSimilarity;
    }
}