wikimedia/mediawiki-core

View on GitHub
includes/language/converters/MniConverter.php

Summary

Maintainability
C
1 day
Test Coverage
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file MniConverter.php
 * @author Nokib Sarkar
 * @author Haoreima
 */
/**
 * Meitei specific converter routines.
 *
 * Transliterates Meitei text ('mni') into Bengali script ('mni-beng') one
 * character at a time, using a static mapping table plus a handful of
 * context-sensitive rules (see mteiToBengali()).
 *
 * @ingroup Languages
 */
class MniConverter extends LanguageConverterSpecific {
    private const O = 'ꯑ';
    private const OO = 'ꯑꯣ';
    private const U = 'ꯎ';
    private const YA = 'ꯌ';
    private const Y_ = 'য';
    private const WA = 'ꯋ';
    private const BA = 'ꯕ';
    private const NA_ = 'ꯟ';
    private const NA = 'ꯅ';
    private const DIACRITIC_AA = 'ꯥ';
    private const HALANTA = '꯭';
    private const SKIP = '';
    private const PERIOD = '꯫';
    private const PA_ = 'ꯞ';

    /** Vowel signs that may follow ꯑ; see CONJUGATE_WITH_O for the combined forms. */
    private const DIACRITICS_WITH_O = [
        'ꯣ' => 'ো',
        'ꯤ' => 'ী',
        'ꯥ' => 'া',
        'ꯦ' => 'ে',
        'ꯧ' => 'ৌ',
        'ꯩ' => 'ৈ',
        'ꯪ' => 'ং',
    ];

    /** ꯑ followed by a vowel sign, rendered as a single independent Bengali vowel. */
    private const CONJUGATE_WITH_O = [
        'ꯑꯣ' => 'ও',
        'ꯑꯤ' => 'ঈ',
        'ꯑꯥ' => 'আ',
        'ꯑꯦ' => 'এ',
        'ꯑꯧ' => 'ঔ',
        'ꯑꯩ' => 'ঐ',
        'ꯑꯪ' => 'অং',
    ];

    /** Consonants before which ꯟ keeps its default (hasanta) rendering. */
    private const NOT_WEIRD_AFTER_NA_ = [ 'ꯇ', 'ꯊ', 'ꯗ', 'ꯙ', 'ꯟ', 'ꯕ', 'ꯌ', 'ꯁ' ];

    private const NUMERALS = [
        '꯰' => '০',
        '꯱' => '১',
        '꯲' => '২',
        '꯳' => '৩',
        '꯴' => '৪',
        '꯵' => '৫',
        '꯶' => '৬',
        '꯷' => '৭',
        '꯸' => '৮',
        '꯹' => '৯',
    ];

    /**
     * Letters whose default Bengali rendering already ends in a hasanta (্)
     * or uses a vowel-less sign (ং, ৎ).
     */
    private const HALANTA_CONSONANTS = [
        'ꯟ' => 'ন্',
        'ꯛ' => 'ক্',
        'ꯝ' => 'ম্',
        'ꯡ' => 'ং',
        'ꯜ' => 'ল্',
        'ꯠ' => 'ৎ',
        'ꯞ' => 'প্',
    ];

    /** Same keys as HALANTA_CONSONANTS, rendered without the trailing hasanta. */
    private const HALANTA_CONSONANTS_TO_NORMAL = [
        'ꯟ' => 'ন',
        'ꯛ' => 'ক',
        'ꯝ' => 'ম',
        'ꯡ' => 'ং',
        'ꯜ' => 'ল',
        'ꯠ' => 'ৎ',
        'ꯞ' => 'প',
    ];

    /** Matches a character that separates words (whitespace, punctuation, some symbols). */
    private const NON_WORD_CHARACTER_PATTERN = "/[\s꯫\p{P}<>=\-\|$+^~]+?/u";

    private const CONSONANTS = self::HALANTA_CONSONANTS + [
        'ꯀ' => 'ক',
        'ꯈ' => 'খ',
        'ꯒ' => 'গ',
        'ꯘ' => 'ঘ',
        'ꯉ' => 'ঙ',
        'ꯆ' => 'চ',
        'ꯖ' => 'জ',
        'ꯓ' => 'ঝ',
        'ꯇ' => 'ত',
        'ꯊ' => 'থ',
        'ꯗ' => 'দ',
        'ꯙ' => 'ধ',
        'ꯅ' => 'ন',
        'ꯄ' => 'প',
        'ꯐ' => 'ফ',
        'ꯕ' => 'ব',
        'ꯚ' => 'ভ',
        'ꯃ' => 'ম',
        'ꯌ' => 'য়',
        'ꯔ' => 'র',
        'ꯂ' => 'ল',
        'ꯋ' => 'ৱ',
        'ꫩ' => 'শ',
        'ꫪ' => 'ষ',
        'ꯁ' => 'স',
        'ꯍ' => 'হ',
    ];

    private const VOWELS = [
        'ꯑ' => 'অ',
        'ꯏ' => 'ই',
        'ꯎ' => 'উ',
        'ꯢ' => 'ই',
        'ꯨ' => 'ু',
    ];

    private const MTEI_TO_BENG_MAP_EXTRA = [
        '꯫' => '।',
        '꯭' => '্',
    ];

    /** Context-free fallback table; the array union keeps the first definition of a key. */
    private const MTEI_TO_BENG_MAP =
        self::VOWELS +
        self::DIACRITICS_WITH_O +
        self::CONJUGATE_WITH_O +
        self::CONSONANTS +
        self::NUMERALS +
        self::MTEI_TO_BENG_MAP_EXTRA;

    /**
     * Whether the character at $position starts a word, i.e. it is the first
     * character of the text or directly follows a non-word character.
     *
     * The lookup is done on the character array rather than on the raw string:
     * string offsets in PHP address bytes, and every Meitei character occupies
     * several bytes in UTF-8, so a character index must not be used as a
     * string offset.
     *
     * @param int $position Index into $chars
     * @param string[] $chars Text split into characters
     * @return bool
     */
    private function isBeginning( int $position, array $chars ): bool {
        return $position === 0 || $this->isEndOfWord( $chars[$position - 1] );
    }

    /**
     * Whether $char terminates a word: the Meitei period or any character
     * matched by NON_WORD_CHARACTER_PATTERN.
     *
     * @param string $char A single character
     * @return bool
     */
    private function isEndOfWord( $char ) {
        return $char === self::PERIOD ||
            preg_match( self::NON_WORD_CHARACTER_PATTERN, $char ) === 1;
    }

    /**
     * Whether the character at $index is the last one of its word, i.e. it is
     * the last character of the text or is followed by a word terminator.
     *
     * @param string[] $chars Text split into characters
     * @param int $index Index into $chars
     * @return bool
     */
    private function isLastInWord( array $chars, int $index ): bool {
        return !isset( $chars[$index + 1] ) || $this->isEndOfWord( $chars[$index + 1] );
    }

    /**
     * Lazily transliterates $text, yielding one Bengali fragment per input
     * character (or per consumed pair of characters). Rules are tried in
     * order; the first matching rule wins, and characters without a mapping
     * are passed through unchanged.
     *
     * @param string $text
     * @return Generator Yields strings
     */
    private function mteiToBengali( $text ) {
        $chars = mb_str_split( $text );
        $l = count( $chars );
        for ( $i = 0; $i < $l; $i++ ) {
            $char = $chars[$i];
            $prev = $i > 0 ? $chars[$i - 1] : null;
            $next = $i + 1 < $l ? $chars[$i + 1] : null;

            if (
                $char === self::O &&
                $next !== null &&
                array_key_exists( $next, self::DIACRITICS_WITH_O )
            ) {
                /**
                 * We have only 3 true vowels,
                 * ꯑ(a), ꯏ(i), ꯎ (u)
                 * Others are just extension from "a" by mixing with diacritics
                 */
                yield self::CONJUGATE_WITH_O[$char . $next];
                // The diacritic was consumed together with ꯑ.
                $i++;
            } elseif (
                $char === self::HALANTA &&
                $prev !== null &&
                array_key_exists( $prev, self::HALANTA_CONSONANTS )
            ) {
                // The preceding consonant was already rendered with its own
                // hasanta (or vowel-less sign), so an explicit one is dropped.
                yield self::SKIP;
            } elseif (
                array_key_exists( $char, self::HALANTA_CONSONANTS ) &&
                $this->isLastInWord( $chars, $i )
            ) {
                // No trailing hasanta on the last character of a word.
                yield self::HALANTA_CONSONANTS_TO_NORMAL[$char];
            } elseif ( $char === self::YA && $prev === self::HALANTA ) {
                // Directly after a hasanta, ꯌ is written য rather than the default য়.
                yield self::Y_;
            } elseif (
                $char === self::WA &&
                $i >= 2 &&
                $prev === self::HALANTA &&
                array_key_exists( $chars[$i - 2], self::CONSONANTS )
            ) {
                // consonant + hasanta + ꯋ: write ব instead of the default ৱ.
                yield self::CONSONANTS[self::BA];
            } elseif ( $char === self::PA_ && $next === 'ꯀ' ) {
                // do not conjugate with halanta if it's followed by "ক"
                yield self::HALANTA_CONSONANTS_TO_NORMAL[$char];
            } elseif (
                $char === self::NA_ &&
                $next !== null &&
                !in_array( $next, self::NOT_WEIRD_AFTER_NA_, true ) &&
                array_key_exists( $next, self::CONSONANTS )
            ) {
                /**
                 * ন্ followed by a consonant other than the ones listed in
                 * NOT_WEIRD_AFTER_NA_ would look weird, so plain ন
                 * (without hasanta) is written instead.
                 */
                yield self::MTEI_TO_BENG_MAP[self::NA];
            } elseif ( $char === self::U && !$this->isBeginning( $i, $chars ) ) {
                // উ/ঊ in the middle of words are often replaced by ও
                yield self::MTEI_TO_BENG_MAP[self::OO];
            } elseif (
                !array_key_exists( $char, self::HALANTA_CONSONANTS ) &&
                array_key_exists( $char, self::CONSONANTS ) &&
                $this->isLastInWord( $chars, $i )
            ) {
                // Consonants without halantas should end with diacritics of aa sound everytime.
                yield self::MTEI_TO_BENG_MAP[$char] . self::MTEI_TO_BENG_MAP[self::DIACRITIC_AA];
            } else {
                /**
                 * NOTE(review): a rule meant to write য় instead of ঈ in the
                 * middle of words ("হায়বা" rather than "হাঈবা") used to sit
                 * before the previous branch. It compared whole characters
                 * against single bytes of a multibyte constant and was
                 * shadowed by the first rule, so it never fired for valid
                 * UTF-8 input. It is left out to keep the output unchanged;
                 * re-introduce it deliberately if that spelling is wanted.
                 */
                yield (
                    array_key_exists( $char, self::MTEI_TO_BENG_MAP ) ?
                    self::MTEI_TO_BENG_MAP[$char] : $char
                );
            }
        }
    }

    /**
     * Transliterates Meitei text into Bengali script.
     *
     * @param string $text
     * @return string
     */
    public function transliterate( $text ) {
        return implode( '', iterator_to_array( $this->mteiToBengali( $text ), false ) );
    }

    /**
     * @return string Language code this converter is registered for
     */
    public function getMainCode(): string {
        return 'mni';
    }

    /**
     * @return string[] Variant codes handled by this converter
     */
    public function getLanguageVariants(): array {
        return [ 'mni', 'mni-beng' ];
    }

    /**
     * @return string[] Map of variant code to its fallback variant code
     */
    public function getVariantsFallbacks(): array {
        return [
            'mni-beng' => 'mni'
        ];
    }

    /**
     * Both variants start with an empty replacement table; the conversion
     * itself is done algorithmically in translate().
     *
     * @return ReplacementArray[] Map of variant code to its table
     */
    protected function loadDefaultTables(): array {
        return [
            'mni' => new ReplacementArray(),
            'mni-beng' => new ReplacementArray(),
        ];
    }

    /**
     * Converts text to the requested variant. Text is transliterated into
     * Bengali script for 'mni-beng' and returned unchanged for any other
     * variant.
     *
     * @param string $text
     * @param string $toVariant
     * @return string
     */
    public function translate( $text, $toVariant ) {
        if ( $toVariant === 'mni-beng' ) {
            return $this->transliterate( $text );
        }
        return $text;
    }
}