zedseven/Romanization.NET

View on GitHub
Romanization/Languages/Korean/RevisedRomanization.cs

Summary

Maintainability
B
6 hrs
Test Coverage
A
97%
using Romanization.Internal;
using System;
using System.Collections.Generic;
using System.Diagnostics.Contracts;
using System.Linq;
using System.Text;

// ReSharper disable CheckNamespace
// ReSharper disable CommentTypo
// ReSharper disable IdentifierTypo
// ReSharper disable StringLiteralTypo
// ReSharper disable InconsistentNaming

namespace Romanization
{
    public static partial class Korean
    {
        /// <summary>
        /// The Revised Romanization of Korean system.<br />
        /// For more information, visit:
        /// <a href='https://en.wikipedia.org/wiki/Revised_Romanization_of_Korean'>https://en.wikipedia.org/wiki/Revised_Romanization_of_Korean</a>
        /// </summary>
        public sealed class RevisedRomanization : IRomanizationSystem
        {
            /// <inheritdoc />
            // This system always reports itself as a phonemic transcription.
            public SystemType Type => SystemType.PhonemicTranscription;

            /// <summary>
            /// Whether to insert a hyphen ('-') between syllables in non-required
            /// spots. This can help to distinguish between ambiguous words: <c>가을</c> → ga-eul (fall; autumn) vs.
            /// <c>개울</c> → gae-ul (stream).
            /// </summary>
            /// <remarks>
            /// NOTE(review): nothing in this class assigns or reads this field, so it keeps its default value of
            /// <c>false</c>. Hyphenation is controlled solely by the <c>hyphenateSyllables</c> argument of
            /// <c>Process</c>.
            /// </remarks>
            public readonly bool HyphenateSyllables;

            /// <summary>
            /// A romanization fragment (in its aspirated and non-aspirated spellings) together with the
            /// information needed to place an optional syllable-separating hyphen in it.
            /// </summary>
            private readonly struct HyphenString
            {
                /// <summary>The fragment itself, without any optional hyphen applied.</summary>
                public readonly AspirationString BaseString;

                /// <summary>
                /// Position at which the optional hyphen is inserted, or a negative value when no position
                /// was given.
                /// </summary>
                public readonly int HyphenIndex;

                // True when the optional hyphen goes at the very end of the fragment: no explicit index was
                // given and the aspirated spelling does not already contain a hyphen.
                private readonly bool AppendHyphen;

                /// <summary>
                /// Creates a fragment with an optional hyphen position.
                /// </summary>
                /// <param name="baseString">The fragment to wrap.</param>
                /// <param name="hyphenIndex">Index to insert the hyphen at; <c>-1</c> (the default) means
                /// "no explicit position".</param>
                public HyphenString(AspirationString baseString, int hyphenIndex = -1)
                {
                    BaseString = baseString;
                    HyphenIndex = hyphenIndex;
                    AppendHyphen = !(hyphenIndex > -1 || baseString.AspiratedString.Contains('-'));
                }

                public static implicit operator HyphenString(string s)
                {
                    return new HyphenString(s);
                }

                public static implicit operator HyphenString(AspirationString s)
                {
                    return new HyphenString(s);
                }

                public static implicit operator HyphenString(ValueTuple<AspirationString, int> s)
                {
                    return new HyphenString(s.Item1, s.Item2);
                }

                public static implicit operator HyphenString(Tuple<AspirationString, int> s)
                {
                    return new HyphenString(s.Item1, s.Item2);
                }

                /// <summary>
                /// Renders the fragment with its optional hyphen applied.
                /// </summary>
                /// <param name="aspirated">Selects <c>BaseString.AspiratedString</c> when true and
                /// <c>BaseString.NonAspiratedString</c> when false.</param>
                /// <returns>The selected spelling with the hyphen inserted at <see cref="HyphenIndex"/>, or
                /// appended at the end when no index was given and the fragment has no hyphen of its own.</returns>
                public string ToString(bool aspirated)
                {
                    var chosen = aspirated
                        ? BaseString.AspiratedString
                        : BaseString.NonAspiratedString;

                    if (HyphenIndex > -1)
                        return chosen.Substring(0, HyphenIndex) + "-" + chosen.Substring(HyphenIndex);

                    return string.Concat(chosen, AppendHyphen ? "-" : "");
                }

                /// <summary>Renders the aspirated spelling with the optional hyphen applied.</summary>
                public override string ToString()
                {
                    return ToString(true);
                }
            }

            // System-Specific Constants
            // All four tables are filled in by the constructor.

            // Vowel jamo -> Latin spelling.
            private readonly Dictionary<char, string> HangeulVowelRomanizations = new Dictionary<char, string>();
            // Consonant jamo in the initial position of a syllable block -> Latin spelling.
            private readonly Dictionary<char, string> HangeulConsonantInitialRomanizations = new Dictionary<char, string>();
            // Consonant jamo in the final position of a syllable block -> Latin spelling.
            private readonly Dictionary<char, string> HangeulConsonantFinalRomanizations = new Dictionary<char, string>();
            // (final jamo of one block, initial jamo of the next block) -> spelling that replaces both jamo.
            private readonly Dictionary<(char, char), HyphenString> HangeulConsonantCombinationRomanizations = new Dictionary<(char, char), HyphenString>();

            /// <summary>
            /// Instantiates a copy of the system to process romanizations.
            /// </summary>
            /// <remarks>
            /// Fills the vowel, initial-consonant, final-consonant and final+initial combination tables used
            /// when romanizing text.
            /// </remarks>
            public RevisedRomanization()
            {
                #region Romanization Chart

                // Sourced from https://en.wikipedia.org/wiki/Revised_Romanization_of_Korean#Transcription_rules

                // Vowels
                HangeulVowelRomanizations['ㅏ'] = "a";
                HangeulVowelRomanizations['ㅐ'] = "ae";
                HangeulVowelRomanizations['ㅑ'] = "ya";
                HangeulVowelRomanizations['ㅒ'] = "yae";
                HangeulVowelRomanizations['ㅓ'] = "eo";
                HangeulVowelRomanizations['ㅔ'] = "e";
                HangeulVowelRomanizations['ㅕ'] = "yeo";
                HangeulVowelRomanizations['ㅖ'] = "ye";
                HangeulVowelRomanizations['ㅗ'] = "o";
                HangeulVowelRomanizations['ㅘ'] = "wa";
                HangeulVowelRomanizations['ㅙ'] = "wae";
                HangeulVowelRomanizations['ㅚ'] = "oe";
                HangeulVowelRomanizations['ㅛ'] = "yo";
                HangeulVowelRomanizations['ㅜ'] = "u";
                HangeulVowelRomanizations['ㅝ'] = "wo";
                HangeulVowelRomanizations['ㅞ'] = "we";
                HangeulVowelRomanizations['ㅟ'] = "wi";
                HangeulVowelRomanizations['ㅠ'] = "yu";
                HangeulVowelRomanizations['ㅡ'] = "eu";
                HangeulVowelRomanizations['ㅢ'] = "ui";
                HangeulVowelRomanizations['ㅣ'] = "i";

                // Consonants in initial positions
                HangeulConsonantInitialRomanizations['ㄱ'] = "g";
                HangeulConsonantInitialRomanizations['ㄲ'] = "kk";
                HangeulConsonantInitialRomanizations['ㄴ'] = "n";
                HangeulConsonantInitialRomanizations['ㄷ'] = "d";
                HangeulConsonantInitialRomanizations['ㄸ'] = "tt";
                HangeulConsonantInitialRomanizations['ㄹ'] = "r";
                HangeulConsonantInitialRomanizations['ㅁ'] = "m";
                HangeulConsonantInitialRomanizations['ㅂ'] = "b";
                HangeulConsonantInitialRomanizations['ㅃ'] = "pp";
                HangeulConsonantInitialRomanizations['ㅅ'] = "s";
                HangeulConsonantInitialRomanizations['ㅆ'] = "ss";
                HangeulConsonantInitialRomanizations['ㅇ'] = "";
                HangeulConsonantInitialRomanizations['ㅈ'] = "j";
                HangeulConsonantInitialRomanizations['ㅉ'] = "jj";
                HangeulConsonantInitialRomanizations['ㅊ'] = "ch";
                HangeulConsonantInitialRomanizations['ㅋ'] = "k";
                HangeulConsonantInitialRomanizations['ㅌ'] = "t";
                HangeulConsonantInitialRomanizations['ㅍ'] = "p";
                HangeulConsonantInitialRomanizations['ㅎ'] = "h";

                // Consonants in final positions
                HangeulConsonantFinalRomanizations['ㄱ'] = "k";
                HangeulConsonantFinalRomanizations['ㄲ'] = "k";
                HangeulConsonantFinalRomanizations['ㄴ'] = "n";
                HangeulConsonantFinalRomanizations['ㄷ'] = "t";
                HangeulConsonantFinalRomanizations['ㄹ'] = "l";
                HangeulConsonantFinalRomanizations['ㅁ'] = "m";
                HangeulConsonantFinalRomanizations['ㅂ'] = "p";
                HangeulConsonantFinalRomanizations['ㅅ'] = "t";
                HangeulConsonantFinalRomanizations['ㅆ'] = "t";
                HangeulConsonantFinalRomanizations['ㅇ'] = "ng";
                HangeulConsonantFinalRomanizations['ㅈ'] = "t";
                HangeulConsonantFinalRomanizations['ㅊ'] = "t";
                HangeulConsonantFinalRomanizations['ㅋ'] = "k";
                HangeulConsonantFinalRomanizations['ㅌ'] = "t";
                HangeulConsonantFinalRomanizations['ㅍ'] = "p";
                HangeulConsonantFinalRomanizations['ㅎ'] = "t";

                // Special cases of combinations of an ending from one block and the beginning of a new one.
                // A plain string is the spelling for both jamo together; a (string, index) pair additionally
                // gives the position of the optional syllable hyphen; a ((a, b), index) pair supplies two
                // alternative spellings (see the trailing comments).
                HangeulConsonantCombinationRomanizations[('ㄱ', 'ㅇ')] = "g";
                HangeulConsonantCombinationRomanizations[('ㄱ', 'ㄴ')] = ("ngn", 2);
                HangeulConsonantCombinationRomanizations[('ㄱ', 'ㄹ')] = ("ngn", 2);
                HangeulConsonantCombinationRomanizations[('ㄱ', 'ㅁ')] = ("ngm", 2);
                HangeulConsonantCombinationRomanizations[('ㄱ', 'ㄱ')] = "k-k";
                HangeulConsonantCombinationRomanizations[('ㄱ', 'ㅎ')] = (("kh", "k"), -1); // kh,k
                // Tense finals are not part of the Wikipedia chart, but when the next syllable starts with
                // the silent ㅇ the final carries over and keeps its full sound (밖에 → bakke, 있어 → isseo).
                // Without these entries the neutralized final spelling ("k" / "t") would be used instead.
                HangeulConsonantCombinationRomanizations[('ㄲ', 'ㅇ')] = "kk";
                HangeulConsonantCombinationRomanizations[('ㄴ', 'ㄱ')] = "n-g";
                HangeulConsonantCombinationRomanizations[('ㄴ', 'ㄹ')] = ("ll", 1); // ll,nn
                HangeulConsonantCombinationRomanizations[('ㄷ', 'ㅇ')] = "d"; // d,j
                HangeulConsonantCombinationRomanizations[('ㄷ', 'ㄴ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㄷ', 'ㄹ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㄷ', 'ㅁ')] = ("nm", 1);
                HangeulConsonantCombinationRomanizations[('ㄷ', 'ㅌ')] = "t-t";
                HangeulConsonantCombinationRomanizations[('ㄷ', 'ㅎ')] = (("th", "t"), -1); // th,t,ch
                HangeulConsonantCombinationRomanizations[('ㄹ', 'ㅇ')] = "r";
                HangeulConsonantCombinationRomanizations[('ㄹ', 'ㄴ')] = ("ll", 1); // ll,nn
                HangeulConsonantCombinationRomanizations[('ㄹ', 'ㄹ')] = ("ll", 1);
                HangeulConsonantCombinationRomanizations[('ㅁ', 'ㄹ')] = ("mn", 1);
                HangeulConsonantCombinationRomanizations[('ㅂ', 'ㅇ')] = "b";
                HangeulConsonantCombinationRomanizations[('ㅂ', 'ㄴ')] = ("mn", 1);
                HangeulConsonantCombinationRomanizations[('ㅂ', 'ㄹ')] = ("mn", 1);
                HangeulConsonantCombinationRomanizations[('ㅂ', 'ㅁ')] = ("mm", 1);
                HangeulConsonantCombinationRomanizations[('ㅂ', 'ㅍ')] = "p-p";
                HangeulConsonantCombinationRomanizations[('ㅂ', 'ㅎ')] = (("ph", "p"), -1); // ph,p
                HangeulConsonantCombinationRomanizations[('ㅅ', 'ㅇ')] = "s";
                HangeulConsonantCombinationRomanizations[('ㅅ', 'ㄴ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅅ', 'ㄹ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅅ', 'ㅁ')] = ("nm", 1);
                HangeulConsonantCombinationRomanizations[('ㅅ', 'ㅌ')] = "t-t";
                HangeulConsonantCombinationRomanizations[('ㅆ', 'ㅇ')] = "ss";
                HangeulConsonantCombinationRomanizations[('ㅇ', 'ㅇ')] = "ng-";
                HangeulConsonantCombinationRomanizations[('ㅇ', 'ㄹ')] = ("ngn", 2);
                HangeulConsonantCombinationRomanizations[('ㅈ', 'ㅇ')] = "j";
                HangeulConsonantCombinationRomanizations[('ㅈ', 'ㄴ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅈ', 'ㄹ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅈ', 'ㅁ')] = ("nm", 1);
                HangeulConsonantCombinationRomanizations[('ㅈ', 'ㅌ')] = "t-t";
                HangeulConsonantCombinationRomanizations[('ㅈ', 'ㅎ')] = (("th", "t"), -1); // th,t,ch
                HangeulConsonantCombinationRomanizations[('ㅊ', 'ㅇ')] = "ch";
                HangeulConsonantCombinationRomanizations[('ㅊ', 'ㄴ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅊ', 'ㄹ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅊ', 'ㅁ')] = ("nm", 1);
                HangeulConsonantCombinationRomanizations[('ㅊ', 'ㅌ')] = "t-t";
                HangeulConsonantCombinationRomanizations[('ㅊ', 'ㅎ')] = (("th", "t"), -1); // th,t,ch
                HangeulConsonantCombinationRomanizations[('ㅌ', 'ㄴ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅌ', 'ㄹ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅌ', 'ㅁ')] = ("nm", 1);
                HangeulConsonantCombinationRomanizations[('ㅌ', 'ㅌ')] = "t-t";
                HangeulConsonantCombinationRomanizations[('ㅌ', 'ㅎ')] = (("th", "t"), -1); // th,t,ch
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㅇ')] = "h";
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㄱ')] = "k";
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㄷ')] = "t";
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㄴ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㄹ')] = ("nn", 1);
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㅁ')] = ("nm", 1);
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㅂ')] = "p";
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㅅ')] = ("hs", 1);
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㅈ')] = "ch";
                HangeulConsonantCombinationRomanizations[('ㅎ', 'ㅎ')] = "t";

                #endregion
            }

            /// <summary>
            /// Performs romanization on the given text, according to the Revised Romanization of Korean system.
            /// </summary>
            /// <param name="text">The text to romanize.</param>
            /// <param name="givenName">Whether or not the text to romanize is a given name, since Korean names are
            /// often romanized without consideration for special Jamo combinations. When true, the
            /// final+initial combination table is skipped and every jamo is romanized on its own.</param>
            /// <param name="noun">Whether or not the text to romanize is a noun. This only affects the spelling
            /// chosen when ㅎ follows a final ㄱ, ㄷ or ㅂ.</param>
            /// <param name="hyphenateSyllables">Whether to insert a hyphen ('-') between syllables in non-required
            /// spots. This can help to distinguish between ambiguous words: <c>가을</c> → ga-eul (fall; autumn) vs.
            /// <c>개울</c> → gae-ul (stream).</param>
            /// <returns>A romanized version of the text, leaving unrecognized characters untouched. Note that all
            /// romanized text will be lowercase.</returns>
            [Pure]
            public string Process(string text, bool givenName, bool noun = false, bool hyphenateSyllables = false)
            {
                text = text
                    // Replace common alternate characters
                    .ReplaceCommonAlternates()
                    // Insert spaces at boundaries between Latin characters and Korean ones
                    .SeparateLanguageBoundaries();

                // Decompose all syllable blocks in text into their component Jamo; characters that are not
                // syllable blocks are carried through as single entries.
                List<PlacementChar> jamoList = text.SelectMany(c =>
                {
                    SyllableBlock b = DecomposeSyllableBlock(c);
                    return b != null ? b.FlattenToArray() : new[] { (PlacementChar)c };
                })
                    .ToList();

                // Use the component Jamo to build the romanization
                StringBuilder romanizedText = new StringBuilder();
                for (int i = 0; i < jamoList.Count; i++)
                {
                    // True only when another entry follows and it is the initial jamo of a syllable block.
                    // Both optional hyphenation and the final+initial combination rules apply solely at such
                    // block-to-block boundaries.
                    bool nextIsInitial = i + 1 < jamoList.Count &&
                                         jamoList[i + 1].Placement == PlacementChar.Placements.Initial;

                    switch (jamoList[i].Placement)
                    {
                        case PlacementChar.Placements.NotApplicable:
                            // Not part of a syllable block: copy through unchanged.
                            romanizedText.Append(jamoList[i]);
                            break;

                        case PlacementChar.Placements.Initial:
                            romanizedText.Append(HangeulConsonantInitialRomanizations[jamoList[i]]);
                            break;

                        case PlacementChar.Placements.Medial:
                            romanizedText.Append(HangeulVowelRomanizations[jamoList[i]]);

                            // Two-jamo syllable hyphenation
                            if (hyphenateSyllables && nextIsInitial)
                                romanizedText.Append('-');

                            break;

                        case PlacementChar.Placements.Final:
                            // The combination table describes a final jamo meeting the *initial* jamo of the
                            // next block. Requiring nextIsInitial keeps a stray, standalone jamo character that
                            // happens to follow a syllable from being matched as a combination and then
                            // swallowed by the i++ below.
                            if (!givenName && nextIsInitial)
                            {
                                (char, char) key = (jamoList[i], jamoList[i + 1]);
                                if (HangeulConsonantCombinationRomanizations.TryGetValue(key, out HyphenString specialCaseRomanization))
                                {
                                    // Official rule: when ㄱ, ㄷ or ㅂ meets ㅎ, the merged sound is written
                                    // (잡혀 → japyeo), except in nouns, where the ㅎ is spelled out
                                    // (묵호 → Mukho, 집현전 → Jiphyeonjeon). The English wording "aspirated sounds
                                    // are not reflected in nouns" describes exactly that: the "h" stays visible.
                                    // The table stores both spellings for these pairs, e.g. ("kh", "k"); nouns
                                    // (and every other combination) take AspiratedString, non-nouns take
                                    // NonAspiratedString.
                                    // NOTE(review): this matches the rule only if AspirationString maps such a
                                    // pair as (AspiratedString, NonAspiratedString) - confirm against that type.
                                    bool useNonAspirated = !noun && jamoList[i + 1] == 'ㅎ' &&
                                        (jamoList[i] == 'ㄱ' || jamoList[i] == 'ㄷ' || jamoList[i] == 'ㅂ');

                                    if (useNonAspirated)
                                        romanizedText.Append(hyphenateSyllables
                                            ? specialCaseRomanization.ToString(false)
                                            : specialCaseRomanization.BaseString.NonAspiratedString);
                                    else
                                        romanizedText.Append(hyphenateSyllables
                                            ? specialCaseRomanization.ToString(true)
                                            : specialCaseRomanization.BaseString.AspiratedString);

                                    // The combination covers the next block's initial jamo as well, so skip it.
                                    i++;
                                    break;
                                }
                            }

                            // NOTE(review): this is an indexer lookup, so a final jamo missing from the table
                            // (e.g. a compound final such as ㄳ, if DecomposeSyllableBlock emits those) would
                            // throw KeyNotFoundException - confirm what the decomposition produces.
                            romanizedText.Append(HangeulConsonantFinalRomanizations[jamoList[i]]);

                            // Three-Jamo syllable hyphenation
                            if (hyphenateSyllables && nextIsInitial)
                                romanizedText.Append('-');

                            break;
                    }
                }

                return romanizedText.ToString();
            }

            /// <summary>
            /// Romanizes the given text according to the Revised Romanization of Korean system, using the
            /// default options: the text is treated as neither a given name nor a noun, and no optional
            /// syllable hyphens are inserted.
            /// </summary>
            /// <param name="text">The text to romanize.</param>
            /// <returns>A romanized version of the text, leaving unrecognized characters untouched. Note that all
            /// romanized text will be lowercase.</returns>
            [Pure]
            public string Process(string text)
            {
                return Process(text, givenName: false, noun: false, hyphenateSyllables: false);
            }
        }
    }
}