wikimedia/mediawiki-extensions-CirrusSearch

View on GitHub
includes/Query/ArticleTopicFeature.php

Summary

Maintainability
A
1 hr
Test Coverage
<?php

namespace CirrusSearch\Query;

use CirrusSearch\Search\SearchContext;
use CirrusSearch\WarningCollector;
use CirrusSearch\Wikimedia\WeightedTagsHooks;
use Elastica\Query\DisMax;
use Elastica\Query\Term;
use MediaWiki\Message\Message;

/**
 * Finds pages based on how well they match a given topic, based on scores provided by the
 * (Wikimedia-specific) articletopic ORES model.
 *
 * Handles the "articletopic" and "drafttopic" keywords. The keyword value is a
 * pipe-separated list of topic names (the keys of self::TERMS_TO_LABELS); each name is
 * translated to its ORES label and looked up as an exact term in the weighted tags field.
 * @package CirrusSearch\Wikimedia
 * @see WeightedTagsHooks
 * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Articletopic
 */
class ArticleTopicFeature extends SimpleKeywordFeature {
    /** Tag prefix used when searching with the "articletopic" keyword. */
    public const ARTICLE_TOPIC_TAG_PREFIX = 'classification.ores.articletopic';
    /** Tag prefix used when searching with the "drafttopic" keyword. */
    public const DRAFT_TOPIC_TAG_PREFIX = 'classification.ores.drafttopic';

    /** Supported search keyword => tag prefix that keyword searches under. */
    private const PREFIX_PER_KEYWORD = [
        'articletopic' => self::ARTICLE_TOPIC_TAG_PREFIX,
        'drafttopic' => self::DRAFT_TOPIC_TAG_PREFIX
    ];

    /**
     * Topic name as accepted in a search query => ORES label used in the weighted tags field.
     * Labels must stay unique: getTopicScores() flips this map to translate labels back.
     */
    public const TERMS_TO_LABELS = [
        'biography' => 'Culture.Biography.Biography*',
        'women' => 'Culture.Biography.Women',
        'food-and-drink' => 'Culture.Food and drink',
        'internet-culture' => 'Culture.Internet culture',
        'linguistics' => 'Culture.Linguistics',
        'literature' => 'Culture.Literature',
        'books' => 'Culture.Media.Books',
        'entertainment' => 'Culture.Media.Entertainment',
        'films' => 'Culture.Media.Films',
        'media' => 'Culture.Media.Media*',
        'music' => 'Culture.Media.Music',
        'radio' => 'Culture.Media.Radio',
        'software' => 'Culture.Media.Software',
        'television' => 'Culture.Media.Television',
        'video-games' => 'Culture.Media.Video games',
        'performing-arts' => 'Culture.Performing arts',
        'philosophy-and-religion' => 'Culture.Philosophy and religion',
        'sports' => 'Culture.Sports',
        'architecture' => 'Culture.Visual arts.Architecture',
        'comics-and-anime' => 'Culture.Visual arts.Comics and Anime',
        'fashion' => 'Culture.Visual arts.Fashion',
        'visual-arts' => 'Culture.Visual arts.Visual arts*',
        'geographical' => 'Geography.Geographical',
        'africa' => 'Geography.Regions.Africa.Africa*',
        'central-africa' => 'Geography.Regions.Africa.Central Africa',
        'eastern-africa' => 'Geography.Regions.Africa.Eastern Africa',
        'northern-africa' => 'Geography.Regions.Africa.Northern Africa',
        'southern-africa' => 'Geography.Regions.Africa.Southern Africa',
        'western-africa' => 'Geography.Regions.Africa.Western Africa',
        'central-america' => 'Geography.Regions.Americas.Central America',
        'north-america' => 'Geography.Regions.Americas.North America',
        'south-america' => 'Geography.Regions.Americas.South America',
        'asia' => 'Geography.Regions.Asia.Asia*',
        'central-asia' => 'Geography.Regions.Asia.Central Asia',
        'east-asia' => 'Geography.Regions.Asia.East Asia',
        'north-asia' => 'Geography.Regions.Asia.North Asia',
        'south-asia' => 'Geography.Regions.Asia.South Asia',
        'southeast-asia' => 'Geography.Regions.Asia.Southeast Asia',
        'west-asia' => 'Geography.Regions.Asia.West Asia',
        'eastern-europe' => 'Geography.Regions.Europe.Eastern Europe',
        'europe' => 'Geography.Regions.Europe.Europe*',
        'northern-europe' => 'Geography.Regions.Europe.Northern Europe',
        'southern-europe' => 'Geography.Regions.Europe.Southern Europe',
        'western-europe' => 'Geography.Regions.Europe.Western Europe',
        'oceania' => 'Geography.Regions.Oceania',
        'business-and-economics' => 'History and Society.Business and economics',
        'education' => 'History and Society.Education',
        'history' => 'History and Society.History',
        'military-and-warfare' => 'History and Society.Military and warfare',
        'politics-and-government' => 'History and Society.Politics and government',
        'society' => 'History and Society.Society',
        'transportation' => 'History and Society.Transportation',
        'biology' => 'STEM.Biology',
        'chemistry' => 'STEM.Chemistry',
        'computing' => 'STEM.Computing',
        'earth-and-environment' => 'STEM.Earth and environment',
        'engineering' => 'STEM.Engineering',
        'libraries-and-information' => 'STEM.Libraries & Information',
        'mathematics' => 'STEM.Mathematics',
        'medicine-and-health' => 'STEM.Medicine & Health',
        'physics' => 'STEM.Physics',
        'stem' => 'STEM.STEM*',
        'space' => 'STEM.Space',
        'technology' => 'STEM.Technology',
    ];

    /**
     * Helper method for turning raw ORES score data (as stored in the Cirrus document) into
     * search terms, for analytics/debugging.
     *
     * Each entry is expected to look like "<ORES label>|<score scaled by 1000>". Entries
     * without a "|" separator, and entries whose label is not a value of
     * self::TERMS_TO_LABELS, are skipped instead of raising undefined offset/index
     * warnings and polluting the result with an empty-string key.
     * @param array $rawTopicData The unprefixed content of the document's weighted_tags field
     * @return array corresponding search term => ORES score (rounded to three decimals)
     */
    public static function getTopicScores( array $rawTopicData ): array {
        $labelsToTerms = array_flip( self::TERMS_TO_LABELS );
        $topicScores = [];
        foreach ( $rawTopicData as $rawTopic ) {
            $parts = explode( '|', $rawTopic );
            if ( count( $parts ) < 2 ) {
                // No score attached; nothing meaningful to report for this entry.
                continue;
            }
            [ $oresLabel, $scaledScore ] = $parts;
            if ( !isset( $labelsToTerms[$oresLabel] ) ) {
                // Label has no corresponding search term in TERMS_TO_LABELS.
                continue;
            }
            // Stored scores are integers scaled by 1000; convert back to the 0..1 range.
            $topicScores[$labelsToTerms[$oresLabel]] = (int)$scaledScore / 1000;
        }
        return $topicScores;
    }

    /**
     * Splits the keyword value on "|" and translates every known topic name to its ORES
     * label. Unknown names are reported through a single
     * 'cirrussearch-articletopic-invalid-topic' warning and otherwise ignored.
     *
     * Both the returned topics and the names passed to the warning are plain,
     * sequentially indexed lists in input order.
     * @inheritDoc
     * @phan-return array{topics:string[],tag_prefix:string}
     */
    public function parseValue(
        $key, $value, $quotedValue, $valueDelimiter, $suffix, WarningCollector $warningCollector
    ) {
        $validTopics = [];
        $invalidTopics = [];
        foreach ( explode( '|', $value ) as $topic ) {
            if ( isset( self::TERMS_TO_LABELS[$topic] ) ) {
                $validTopics[] = self::TERMS_TO_LABELS[$topic];
            } else {
                $invalidTopics[] = $topic;
            }
        }

        if ( $invalidTopics ) {
            $warningCollector->addWarning( 'cirrussearch-articletopic-invalid-topic',
                Message::listParam( $invalidTopics, 'comma' ), count( $invalidTopics ) );
        }
        return [ 'topics' => $validTopics, 'tag_prefix' => self::PREFIX_PER_KEYWORD[$key] ];
    }

    /** @inheritDoc */
    protected function getKeywords() {
        return array_keys( self::PREFIX_PER_KEYWORD );
    }

    /** @inheritDoc */
    protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
        // The search context is also passed as the collector for invalid-topic warnings.
        $parsed = $this->parseValue( $key, $value, $quotedValue, '', '', $context );
        $topics = $parsed['topics'];
        $tagPrefix = $parsed['tag_prefix'];
        if ( $topics === [] ) {
            // Not a single recognised topic was given: flag the search as unable to match.
            $context->setResultsPossible( false );
            return [ null, true ];
        }

        // One exact term clause per requested topic ("<prefix>/<label>"), combined in a
        // DisMax query.
        $query = new DisMax();
        foreach ( $topics as $topic ) {
            $topicQuery = new Term();
            $topicQuery->setTerm( WeightedTagsHooks::FIELD_NAME, $tagPrefix . '/' . $topic );
            $query->addQuery( $topicQuery );
        }

        if ( $negated ) {
            // Hand the query back to the caller rather than registering it on the context;
            // presumably the parent class applies the negation - confirm in SimpleKeywordFeature.
            return [ $query, false ];
        }

        $context->addNonTextQuery( $query );
        return [ null, false ];
    }

}