wikimedia/mediawiki-extensions-Translate

View on GitHub
src/TranslatorInterface/EntitySearch.php

Summary

Maintainability
C
7 hrs
Test Coverage
<?php
declare( strict_types = 1 );

namespace MediaWiki\Extension\Translate\TranslatorInterface;

use AggregateMessageGroup;
use Collation;
use MediaWiki\Extension\Translate\MessageBundleTranslation\MessageBundleMessageGroup;
use MediaWiki\Extension\Translate\MessageGroupProcessing\MessageGroups;
use MediaWiki\Extension\Translate\MessageLoading\MessageIndex;
use MediaWiki\Title\MalformedTitleException;
use MediaWiki\Title\NamespaceInfo;
use MediaWiki\Title\TitleFormatter;
use MediaWiki\Title\TitleParser;
use SplMinHeap;
use WANObjectCache;
use Wikimedia\LightweightObjectStore\ExpirationAwareness;
use WikiPageMessageGroup;

/**
 * Service for searching message groups and message keys.
 * @author Niklas Laxström
 * @license GPL-2.0-or-later
 * @since 2021.10
 */
class EntitySearch {
    private const FIELD_DELIMITER = "\x7F";
    private const ROW_DELIMITER = "\n";

    private WANObjectCache $cache;
    private Collation $collation;
    private MessageGroups $messageGroupFactory;
    private NamespaceInfo $namespaceInfo;
    private MessageIndex $messageIndex;
    private TitleParser $titleParser;
    private TitleFormatter $titleFormatter;
    private const TYPE_AGGREGATE = AggregateMessageGroup::class;
    private const TYPE_MESSAGE_BUNDLE = MessageBundleMessageGroup::class;
    private const TYPE_WIKIPAGE = WikiPageMessageGroup::class;
    private const TYPE_OTHERS = 'o';
    private const TYPE_MAPPING = [
        self::TYPE_AGGREGATE => [ 'aggregate-groups', 'a' ],
        self::TYPE_MESSAGE_BUNDLE => [ 'message-bundles', 'm' ],
        self::TYPE_WIKIPAGE => [ 'translatable-pages', 'w' ]
    ];
    /** @var array<string,string> */
    private array $mappedTypes;

    public function __construct(
        WANObjectCache $cache,
        Collation $collation,
        MessageGroups $messageGroupFactory,
        NamespaceInfo $namespaceInfo,
        MessageIndex $messageIndex,
        TitleParser $titleParser,
        TitleFormatter $titleFormatter
    ) {
        $this->cache = $cache;
        $this->collation = $collation;
        $this->messageGroupFactory = $messageGroupFactory;
        $this->namespaceInfo = $namespaceInfo;
        $this->messageIndex = $messageIndex;
        $this->titleParser = $titleParser;
        $this->titleFormatter = $titleFormatter;
        $this->mappedTypes = $this->getGroupTypes();
    }

    public function searchStaticMessageGroups( string $query, int $maxResults, array $types = [] ): array {
        $cache = $this->cache;
        // None of the static groups currently use language-dependent labels. This
        // may need revisiting later and splitting the cache by language.
        $key = $cache->makeKey( 'Translate', 'EntitySearch', 'static-groups', '-v2' );
        $haystack = $cache->getWithSetCallback(
            $key,
            ExpirationAwareness::TTL_WEEK,
            function (): string {
                return $this->getStaticMessageGroupsHaystack();
            },
            [
                // Calling touchCheckKey() on this key purges the cache
                'checkKeys' => [ $this->messageGroupFactory->getCacheKey() ],
                // Avoid querying cache servers multiple times in a web request
                'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
            ]
        );

        // Algorithm: Construct one big string with one entity per line. Then run
        // preg_match_all twice over it, first to collect prefix match (to show them
        // first), then to match words if more results are needed.
        $results = [];

        $delimiter = self::FIELD_DELIMITER;
        $anything = "[^$delimiter\n]";
        $query = preg_quote( $query, '/' );

        $groupTypeFilter = [];
        foreach ( $types as $filter ) {
            $groupTypeFilter[] = $this->mappedTypes[$filter] ?? '';
        }
        $groupTypeFilter = count( $groupTypeFilter )
            ? ( '[' . implode( $groupTypeFilter ) . ']' )
            : $anything;

        // Prefix match
        $pattern = "/^$groupTypeFilter$delimiter($query$anything*)$delimiter($anything+)$/miu";
        preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );
        foreach ( $matches as [ , $label, $groupId ] ) {
            // Index by $groupId to avoid duplicates from the prefix match and the word match
            $results[$groupId] = [
                'label' => $label,
                'group' => $groupId,
            ];

            if ( count( $results ) >= $maxResults ) {
                return array_values( $results );
            }
        }

        // Word match
        $pattern = "/^$groupTypeFilter$delimiter($anything*\b$query$anything*)$delimiter($anything+)$/miu";
        preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );
        foreach ( $matches as [ , $label, $groupId ] ) {
            $results[$groupId] = [
                'label' => $label,
                'group' => $groupId,
            ];

            if ( count( $results ) >= $maxResults ) {
                return array_values( $results );
            }
        }

        return array_values( $results );
    }

    /** Search message prefixes. Results are collapsed into prefix patterns when possible. */
    public function searchMessages( string $query, int $maxResults ): array {
        // Optimized based on requirements:
        // * "Natural" sorting of results
        // * No need to show which message group things belong to
        // * Match at any point in the message
        // * Return full keys of prefixes that match multiple messages

        // Algorithm: Construct one big string (haystack) with one entity per line.
        // Then run preg_match_all over it. Because we will have many more matches
        // than search results, this may be more efficient than calling preg_match
        // repeatedly in a loop.  On the other hand, it can use a lot of memory to
        // construct the array for all the matches.
        $haystack = $this->getMessagesHaystack();
        $rowDelimiter = self::ROW_DELIMITER;
        $anything = "[^$rowDelimiter]";
        $query = preg_quote( $query, '/' );

        // Word match
        $pattern = "/^($anything*\b$query)$anything*$/miu";
        preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );

        $results = [];
        $previousPrefixMatch = null;
        foreach ( $matches as [ $full, $prefixMatch ] ) {
            // This is a bit tricky. If we are at the maximum results, continue processing
            // until the prefix changes, to get an accurate count
            if ( count( $results ) >= $maxResults && $previousPrefixMatch !== $prefixMatch ) {
                break;
            }

            if ( $full === $prefixMatch ) {
                $results[$full] = [ $full, 1, true, $full ];
            } else {
                if ( !isset( $results["$prefixMatch*"] ) ) {
                    $results["$prefixMatch*"] = [ "$prefixMatch*", 0, false, $full ];
                }
                $results["$prefixMatch*"][1]++;
            }
            $previousPrefixMatch = $prefixMatch;
        }

        // Convert partial matches with single results to full match
        foreach ( $results as $index => [ $label, $count, $isExact, $full ] ) {
            if ( $count === 1 && !$isExact ) {
                $results[$index][0] = $full;
            }
        }

        // Drop unnecessary fields, pretty format title
        foreach ( $results as &$value ) {
            try {
                $title = $this->titleParser->parseTitle( $value[0] );
                $label = $this->titleFormatter->getPrefixedText( $title );
            } catch ( MalformedTitleException $e ) {
                $label = $value[0];
            }
            $value = [
                'pattern' => $label,
                'count' => $value[1]
            ];
        }

        return array_values( $results );
    }

    /** Match messages matching a pattern. '*' is the wildcard for anything. */
    public function matchMessages( string $query ): array {
        $haystack = $this->getMessagesHaystack();
        $rowDelimiter = self::ROW_DELIMITER;
        $anything = "[^$rowDelimiter]*";

        // Need something that's not affected by preg_quote and cannot be guessed
        $placeholder = bin2hex( random_bytes( 16 ) );
        $query = str_replace( '*', $placeholder, $query );
        $query = preg_quote( $query, '/' );
        $query = str_replace( $placeholder, $anything, $query );

        $pattern = "/^$query/miu";
        preg_match_all( $pattern, $haystack, $matches, PREG_SET_ORDER );
        return array_column( $matches, 0 );
    }

    private function getStaticMessageGroupsHaystack(): string {
        $groups = $this->messageGroupFactory->getGroups();
        $data = new SplMinHeap();
        foreach ( $groups as $group ) {
            $label = $group->getLabel();
            // Ensure there are no special chars that will break matching
            $label = strtr( $label, [ self::FIELD_DELIMITER => '', self::ROW_DELIMITER => '' ] );
            $sortKey = $this->collation->getSortKey( $label );
            // It is unlikely that different groups have the same label (or sort key),
            // but it's possible, so cannot use a hashmap here.
            $groupType = get_class( $group );
            $type = self::TYPE_MAPPING[$groupType][1] ?? self::TYPE_OTHERS;
            $data->insert( [ $sortKey, $label, $group->getId(), $type ] );
        }

        $haystack = '';
        foreach ( $data as [ , $label, $groupId, $type ] ) {
            $haystack
                .= $type
                . self::FIELD_DELIMITER
                . $label
                . self::FIELD_DELIMITER
                . $groupId
                . self::ROW_DELIMITER;
        }
        return $haystack;
    }

    private function getMessageHaystackUncached(): string {
        $namespaceMap = [];
        $data = new SplMinHeap();
        $keys = $this->messageIndex->getKeys();
        foreach ( $keys as $key ) {
            // Normalize "_" to " " so that \b in regexp matches words separated by underscores
            $key = strtr( $key, [ '_' => ' ' ] );

            [ $namespaceId, $label ] = explode( ':', $key, 2 );
            if ( !isset( $namespaceMap[$namespaceId] ) ) {
                $namespaceMap[$namespaceId] = $this->namespaceInfo->getCanonicalName( (int)$namespaceId );
            }
            $label = $namespaceMap[$namespaceId] . ":$label";

            // Ensure there are no special chars that will break matching
            $label = strtr( $label, [ self::ROW_DELIMITER => '' ] );
            $sortKey = $this->collation->getSortKey( $label );
            $data->insert( [ $sortKey, $label ] );
        }

        $haystack = '';
        foreach ( $data as [ , $label ] ) {
            $haystack .= $label . self::ROW_DELIMITER;
        }

        return $haystack;
    }

    private function getMessagesHaystack(): string {
        $cache = $this->cache;
        $key = $cache->makeKey( 'Translate', 'EntitySearch', 'messages' );
        return $cache->getWithSetCallback(
            $key,
            ExpirationAwareness::TTL_WEEK,
            function (): string {
                // This can get rather large. On translatewiki.net it is multiple megabytes
                // uncompressed. With compression (assumed to happen implicitly in the
                // caching layer) it's under a megabyte.
                return $this->getMessageHaystackUncached();
            },
            [
                // Calling touchCheckKey() on this key purges the cache
                'checkKeys' => [ $this->messageIndex->getStatusCacheKey() ],
                // Avoid querying cache servers multiple times in a web request
                'pcTTL' => ExpirationAwareness::TTL_PROC_LONG
            ]
        );
    }

    /** @return array<string,string> */
    public function getGroupTypes(): array {
        return array_column( self::TYPE_MAPPING, 1, 0 );
    }
}