wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/BuildDocument/Completion/NaiveSubphrasesSuggestionsBuilder.php

Summary

Maintainability
A
35 mins
Test Coverage
<?php

namespace CirrusSearch\BuildDocument\Completion;

use UnexpectedValueException;

/**
 * Simple SuggestionsBuilder that munges the title into a list of
 * "subphrases" suggestions.
 *
 * Subphrases are only generated for titles; redirects are not yet supported.
 * The generated suggestions are written to a set of extra fields:
 * 'suggest-subphrases' is used by default, and subclasses can change the list
 * by overriding getExtraFields().
 */
class NaiveSubphrasesSuggestionsBuilder implements ExtraSuggestionsBuilder {
    /** @const string name of the input document field holding the page language */
    private const LANG_FIELD = 'language';

    /** @const int default maximum number of subphrases generated per title */
    private const MAX_SUBPHRASES = 10;

    /** @const string builder type that splits on '/' only */
    public const SUBPAGE_TYPE = 'subpage';

    /** @const string builder type that splits on '/' and whitespace */
    public const STARTS_WITH_ANY_WORDS_TYPE = 'anywords';

    /**
     * @const string[] regex char ranges indexed by builder type. Each value is
     * inserted verbatim between square brackets in a '/'-delimited pattern,
     * which is why the slash is escaped.
     */
    private const RANGES_BY_TYPE = [
        self::SUBPAGE_TYPE => '\/',
        self::STARTS_WITH_ANY_WORDS_TYPE => '\/\s',
    ];

    /** @var int maximum number of subphrases generated per title */
    private $maxSubPhrases;

    /**
     * @var string regex character range, this value must be a valid char
     * range and will be used to build a regular expression like
     * '[' . $charRange . ']'
     */
    private $charRange;

    /**
     * @param string $charRange character range used to split subphrases
     * @param int $maxSubPhrases defaults to MAX_SUBPHRASES
     */
    public function __construct( $charRange, $maxSubPhrases = self::MAX_SUBPHRASES ) {
        $this->charRange = $charRange;
        $this->maxSubPhrases = $maxSubPhrases;
    }

    /**
     * Build an instance from a configuration array.
     *
     * Recognized keys:
     *  - 'type' (required): SUBPAGE_TYPE or STARTS_WITH_ANY_WORDS_TYPE
     *  - 'limit' (optional): maximum number of subphrases, defaults to
     *    MAX_SUBPHRASES
     *
     * @param array $config
     * @return self
     * @throws UnexpectedValueException when 'type' is missing or unsupported
     */
    public static function create( array $config ) {
        // Read the type defensively: a missing or non-string value must end up
        // in the exception below rather than in a PHP warning or type error.
        $type = $config['type'] ?? null;
        if ( !is_string( $type ) || !isset( self::RANGES_BY_TYPE[$type] ) ) {
            $label = is_string( $type ) ? $type : '(' . gettype( $type ) . ')';
            throw new UnexpectedValueException( "Unsupported NaiveSubphrasesSuggestionsBuilder type " .
                $label );
        }
        $limit = $config['limit'] ?? self::MAX_SUBPHRASES;
        return new self( self::RANGES_BY_TYPE[$type], $limit );
    }

    /**
     * Get the char range used by this builder
     * to split and generate subphrase suggestions
     * @return string a valid regex char range that will be inserted inside
     * square brackets.
     */
    protected function getCharRange() {
        return $this->charRange;
    }

    /**
     * List of suggestion fields where the subphrase suggestions
     * will be added.
     * @return string[]
     */
    protected function getExtraFields() {
        return [ 'suggest-subphrases' ];
    }

    /**
     * @inheritDoc
     */
    public function getRequiredFields() {
        // This builder needs the language field
        // to exclude subpages generated by the translate
        // extension
        return [ self::LANG_FIELD ];
    }

    /**
     * Copy the 'suggest' field of $suggestDoc into every extra field, with
     * its 'input' replaced by the subphrases generated from the title.
     * Nothing is written when the title yields no subphrase.
     *
     * @param mixed[] $inputDoc source document; 'title' and, optionally,
     *  the language field are read from it
     * @param string $suggestType (title or redirect)
     * @param int $score
     * @param \Elastica\Document $suggestDoc suggestion document to enrich
     * @param int $targetNamespace
     */
    public function build( array $inputDoc, $suggestType, $score, \Elastica\Document $suggestDoc, $targetNamespace ) {
        if ( $suggestType === SuggestBuilder::REDIRECT_SUGGESTION ) {
            // It's unclear how to support redirects here.
            // It seems hard to retrieve the best redirect if
            // we destroy it with this builder. We would have to
            // add a special code at search time and apply the
            // same splitting strategy on retrieved redirects.
            return;
        }

        if ( !isset( $inputDoc['title'] ) ) {
            // Without a title there is nothing to split.
            return;
        }

        $language = $inputDoc[self::LANG_FIELD] ?? "";

        $subPages = $this->tokenize( $inputDoc['title'], $language );
        if ( $subPages ) {
            $suggest = $suggestDoc->get( 'suggest' );
            $suggest['input'] = $subPages;
            foreach ( $this->getExtraFields() as $field ) {
                $suggestDoc->set( $field, $suggest );
            }
        }
    }

    /**
     * Split a translated page title into an array
     * with the title at offset 0 and the language
     * subpage at offset 1.
     *
     * e.g. splitTranslatedPage("Hello/en", "en")
     *  - will output [ "Hello", "/en" ]
     * e.g. splitTranslatedPage("Hello/test", "en")
     *  - will output [ "Hello/test", "" ]
     *
     * @param string $title
     * @param string $language
     * @return string[]
     */
    public function splitTranslatedPage( $title, $language ) {
        $langSubPage = '/' . $language;
        $suffixLen = strlen( $langSubPage );
        // The suffix must be strictly shorter than the title so that a title
        // made only of the language subpage is left untouched.
        if ( $suffixLen < strlen( $title ) &&
            substr_compare( $title, $langSubPage, -$suffixLen ) === 0
        ) {
            return [ substr( $title, 0, -$suffixLen ), $langSubPage ];
        }
        return [ $title, "" ];
    }

    /**
     * Tokenize the input $title by generating phrases suited
     * for completion search.
     * e.g. with a builder of type STARTS_WITH_ANY_WORDS_TYPE:
     * $title = "Hello Beautiful World/en";
     * $builder->tokenize( $title, "en" );
     * will generate the following array:
     *   [ "Beautiful World/en", "World/en" ]
     *
     * @param string $title
     * @param string $language
     * @return string[] tokenized phrasal suggestions, empty when the title
     *  cannot be split
     */
    public function tokenize( $title, $language ) {
        [ $title, $langSubPage ] = $this->splitTranslatedPage( $title, $language );

        $cr = $this->getCharRange();
        // One more piece than maxSubPhrases is requested because the first
        // piece is dropped below.
        $matches = preg_split( "/[$cr]+/", $title, $this->maxSubPhrases + 1,
            PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_NO_EMPTY );
        if ( $matches === false ) {
            // preg_split() failed (e.g. invalid char range): no suggestions.
            return [];
        }
        // Remove the first one because it's the whole title
        array_shift( $matches );
        $subphrases = [];
        foreach ( $matches as $m ) {
            // $m[1] is the byte offset of the piece: each subphrase runs from
            // that offset to the end of the title, language subpage re-added.
            $subphrases[] = substr( $title, (int)$m[1] ) . $langSubPage;
        }
        return $subphrases;
    }
}