ubcsanskrit/sanscript.rb

View on GitHub
lib/sanscript/transliterate/schemes.rb

Summary

Maintainability
A
2 hrs
Test Coverage
# frozen_string_literal: true

module Sanscript
  # Static data tables for transliteration: the built-in scheme definitions
  # (`@schemes`), three initially empty sets of scheme names, and the table of
  # alternate spellings (`@all_alternates`). Everything here is stored in
  # instance variables of the `Transliterate` module object itself.
  module Transliterate
    #  Schemes
    #  =======
    #  Schemes are of two kinds: "Brahmic" and "roman." "Brahmic" schemes
    #  describe abugida scripts found in India. "Roman" schemes describe
    #  manufactured alphabets that are meant to describe or encode Brahmi
    #  scripts. Abugidas and alphabets are processed by separate algorithms
    #  because of the unique difficulties involved with each.
    #
    #  Brahmic consonants are stated without a virama. Roman consonants are
    #  stated without the vowel 'a'.
    #
    #  (Since "abugida" is not a well-known term, Sanscript uses "Brahmic"
    #  and "roman" for clarity.)
    #
    #  Layout
    #  ------
    #  `@schemes` maps a scheme name (Symbol) to a Hash of group name (Symbol)
    #  to an Array of Strings. Most groups are written as one string split on
    #  single whitespace characters, so two or more consecutive spaces produce
    #  empty-string placeholders. The placeholders keep a group's entries at
    #  the same index as the matching entries of the other schemes (compare
    #  the `vowels` rows). Do not "tidy" the spacing inside these strings.
    #
    #  NOTE(review): `String#split` without a limit drops trailing empty
    #  fields, so strings that end in whitespace (e.g. the `other` rows of
    #  bengali/gurmukhi/oriya, `other_marks` of tamil/velthuis) yield arrays
    #  without their trailing placeholders. Presumably the consumer tolerates
    #  the shorter arrays -- confirm against the code that reads `@schemes`.
    #
    @schemes = {

      # Bengali
      # -------
      # 'va' and 'ba' are both rendered as ব.
      #
      bengali: {
        vowels: "অ আ ই ঈ উ ঊ ঋ ৠ ঌ ৡ  এ ঐ  ও ঔ".split(/\s/),
        vowel_marks: "া ি ী ু ূ ৃ ৄ ৢ ৣ  ে ৈ  ো ৌ".split(/\s/),
        other_marks: "ং ঃ ঁ".split(/\s/),
        virama: ["্"],
        consonants: "ক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণ ত থ দ ধ ন " \
                    "প ফ ব ভ ম য র ল ব শ ষ স হ ळ ক্ষ জ্ঞ".split(/\s/),
        symbols: "০ ১ ২ ৩ ৪ ৫ ৬ ৭ ৮ ৯ ॐ ঽ । ॥".split(/\s/),
        other: "    ড ঢ  য ".split(/\s/),
      },

      # Devanagari
      # ----------
      # The most comprehensive and unambiguous Brahmic script listed.
      #
      devanagari: {
        # "Independent" forms of the vowels. These are used whenever the
        # vowel does not immediately follow a consonant.
        vowels: "अ आ इ ई उ ऊ ऋ ॠ ऌ ॡ ऎ ए ऐ ऒ ओ औ".split(/\s/),

        # "Dependent" forms of the vowels. These are used whenever the
        # vowel immediately follows a consonant. If a letter is not
        # listed in `vowels`, it should not be listed here.
        vowel_marks: "ा ि ी ु ू ृ ॄ ॢ ॣ ॆ े ै ॊ ो ौ".split(/\s/),

        # Miscellaneous marks, all of which are used in Sanskrit.
        other_marks: "ं ः ँ".split(/\s/),

        # In syllabic scripts like Devanagari, consonants have an inherent
        # vowel that must be suppressed explicitly. We do so by putting a
        # virama after the consonant.
        virama: ["्"],

        # Various Sanskrit consonants and consonant clusters. Every token
        # here has an explicit vowel. Thus "क" is "ka" instead of "k".
        consonants: "क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न " \
                    "प फ ब भ म य र ल व श ष स ह ळ क्ष ज्ञ".split(/\s/),

        # Numbers and punctuation
        symbols: "० १ २ ३ ४ ५ ६ ७ ८ ९ ॐ ऽ । ॥".split(/\s/),

        # Zero-width joiner. This is used to separate a consonant cluster
        # and avoid a complex ligature.
        zwj: ["\u200D"],

        # Dummy consonant. This is used in ITRANS to prevent certain types
        # of parser ambiguity. Thus "barau" -> बरौ but "bara_u" -> बरउ.
        # Devanagari has no counterpart, hence the empty string.
        skip: [""],

        # Vedic accent. Udatta and anudatta.
        # (%W rather than %w so that the \u escapes are interpreted.)
        accent: %W[\u0951 \u0952],

        # Accent combined with anusvara and visarga. For compatibility
        # with ITRANS, which allows the reverse of these four.
        combo_accent: "ः॑ ः॒ ं॑ ं॒".split(/\s/),

        candra: ["ॅ"],

        # Non-Sanskrit consonants
        other: "क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ऱ".split(/\s/),
      },

      # Gujarati
      # --------
      # Sanskrit-complete.
      #
      gujarati: {
        vowels: "અ આ ઇ ઈ ઉ ઊ ઋ ૠ ઌ ૡ  એ ઐ  ઓ ઔ".split(/\s/),
        vowel_marks: "ા િ ી ુ ૂ ૃ ૄ ૢ ૣ  ે ૈ  ો ૌ".split(/\s/),
        other_marks: "ં ઃ ઁ".split(/\s/),
        virama: ["્"],
        consonants: "ક ખ ગ ઘ ઙ ચ છ જ ઝ ઞ ટ ઠ ડ ઢ ણ ત થ દ ધ ન " \
                    "પ ફ બ ભ મ ય ર લ વ શ ષ સ હ ળ ક્ષ જ્ઞ".split(/\s/),
        symbols: "૦ ૧ ૨ ૩ ૪ ૫ ૬ ૭ ૮ ૯ ૐ ઽ ૤ ૥".split(/\s/),
        candra: ["ૅ"],
      },

      # Gurmukhi
      # --------
      # Missing R/RR/lR/lRR
      #
      gurmukhi: {
        vowels: "ਅ ਆ ਇ ਈ ਉ ਊ      ਏ ਐ  ਓ ਔ".split(/\s/),
        vowel_marks: "ਾ ਿ ੀ ੁ ੂ      ੇ ੈ  ੋ ੌ".split(/\s/),
        other_marks: "ਂ ਃ ਁ".split(/\s/),
        virama: ["੍"],
        consonants: "ਕ ਖ ਗ ਘ ਙ ਚ ਛ ਜ ਝ ਞ ਟ ਠ ਡ ਢ ਣ ਤ ਥ ਦ ਧ ਨ " \
                    "ਪ ਫ ਬ ਭ ਮ ਯ ਰ ਲ ਵ ਸ਼ ਸ਼ ਸ ਹ ਲ਼ ਕ੍ਸ਼ ਜ੍ਞ".split(/\s/),
        symbols: "੦ ੧ ੨ ੩ ੪ ੫ ੬ ੭ ੮ ੯ ॐ ऽ । ॥".split(/\s/),
        other: " ਖ ਗ ਜ ਡ  ਫ  ".split(/\s/),
      },

      # Kannada
      # -------
      # Sanskrit-complete.
      #
      kannada: {
        vowels: "ಅ ಆ ಇ ಈ ಉ ಊ ಋ ೠ ಌ ೡ ಎ ಏ ಐ ಒ ಓ ಔ".split(/\s/),
        vowel_marks: "ಾ ಿ ೀ ು ೂ ೃ ೄ ೢ ೣ ೆ ೇ ೈ ೊ ೋ ೌ".split(/\s/),
        other_marks: "ಂ ಃ ँ".split(/\s/),
        virama: ["್"],
        consonants: "ಕ ಖ ಗ ಘ ಙ ಚ ಛ ಜ ಝ ಞ ಟ ಠ ಡ ಢ ಣ ತ ಥ ದ ಧ ನ " \
                    "ಪ ಫ ಬ ಭ ಮ ಯ ರ ಲ ವ ಶ ಷ ಸ ಹ ಳ ಕ್ಷ ಜ್ಞ".split(/\s/),
        symbols: "೦ ೧ ೨ ೩ ೪ ೫ ೬ ೭ ೮ ೯ ಓಂ ಽ । ॥".split(/\s/),
        other: "      ಫ  ಱ".split(/\s/),
      },

      # Malayalam
      # ---------
      # Sanskrit-complete.
      #
      malayalam: {
        vowels: "അ ആ ഇ ഈ ഉ ഊ ഋ ൠ ഌ ൡ എ ഏ ഐ ഒ ഓ ഔ".split(/\s/),
        vowel_marks: "ാ ി ീ ു ൂ ൃ ൄ ൢ ൣ െ േ ൈ ൊ ോ ൌ".split(/\s/),
        other_marks: "ം ഃ ँ".split(/\s/),
        virama: ["്"],
        consonants: "ക ഖ ഗ ഘ ങ ച ഛ ജ ഝ ഞ ട ഠ ഡ ഢ ണ ത ഥ ദ ധ ന " \
                    "പ ഫ ബ ഭ മ യ ര ല വ ശ ഷ സ ഹ ള ക്ഷ ജ്ഞ".split(/\s/),
        symbols: "൦ ൧ ൨ ൩ ൪ ൫ ൬ ൭ ൮ ൯ ഓം ഽ । ॥".split(/\s/),
        other: "        റ".split(/\s/),
      },

      # Oriya
      # -----
      # Sanskrit-complete.
      #
      oriya: {
        vowels: "ଅ ଆ ଇ ଈ ଉ ଊ ଋ ୠ ଌ ୡ  ଏ ଐ  ଓ ଔ".split(/\s/),
        vowel_marks: "ା ି ୀ ୁ ୂ ୃ ୄ ୢ ୣ  େ ୈ  ୋ ୌ".split(/\s/),
        other_marks: "ଂ ଃ ଁ".split(/\s/),
        virama: ["୍"],
        consonants: "କ ଖ ଗ ଘ ଙ ଚ ଛ ଜ ଝ ଞ ଟ ଠ ଡ ଢ ଣ ତ ଥ ଦ ଧ ନ " \
                    "ପ ଫ ବ ଭ ମ ଯ ର ଲ ଵ ଶ ଷ ସ ହ ଳ କ୍ଷ ଜ୍ଞ".split(/\s/),
        symbols: "୦ ୧ ୨ ୩ ୪ ୫ ୬ ୭ ୮ ୯ ଓଂ ଽ । ॥".split(/\s/),
        other: "    ଡ ଢ  ଯ ".split(/\s/),
      },

      # Tamil
      # -----
      # Missing R/RR/lR/lRR vowel marks and voice/aspiration distinctions.
      # The most incomplete of the Sanskrit schemes here.
      #
      tamil: {
        vowels: "அ ஆ இ ஈ உ ஊ     எ ஏ ஐ ஒ ஓ ஔ".split(/\s/),
        vowel_marks: "ா ி ீ ு ூ     ெ ே ை ொ ோ ௌ".split(/\s/),
        other_marks: "ஂ ஃ ".split(/\s/),
        virama: ["்"],
        # The same letter is repeated where Tamil does not distinguish
        # voicing/aspiration (e.g. க stands in for ka, kha, ga and gha).
        consonants: "க க க க ங ச ச ஜ ச ஞ ட ட ட ட ண த த த த ந " \
                    "ப ப ப ப ம ய ர ல வ ஶ ஷ ஸ ஹ ள க்ஷ ஜ்ஞ".split(/\s/),
        symbols: "௦ ௧ ௨ ௩ ௪ ௫ ௬ ௭ ௮ ௯ ௐ ऽ । ॥".split(/\s/),
        other: "        ற".split(/\s/),
      },

      # Telugu
      # ------
      # Sanskrit-complete.
      #
      telugu: {
        vowels: "అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఎ ఏ ఐ ఒ ఓ ఔ".split(/\s/),
        vowel_marks: "ా ి ీ ు ూ ృ ౄ ౢ ౣ ె ే ై ొ ో ౌ".split(/\s/),
        other_marks: "ం ః ఁ".split(/\s/),
        virama: ["్"],
        consonants: "క ఖ గ ఘ ఙ చ ఛ జ ఝ ఞ ట ఠ డ ఢ ణ త థ ద ధ న " \
                    "ప ఫ బ భ మ య ర ల వ శ ష స హ ళ క్ష జ్ఞ".split(/\s/),
        symbols: "౦ ౧ ౨ ౩ ౪ ౫ ౬ ౭ ౮ ౯ ఓం ఽ । ॥".split(/\s/),
        other: "        ఱ".split(/\s/),
      },

      # International Alphabet of Sanskrit Transliteration
      # --------------------------------------------------
      # The most "professional" Sanskrit romanization scheme.
      #
      # Like the other roman schemes below, it has no `vowel_marks` group
      # and its virama is the empty string.
      #
      iast: {
        vowels: "a ā i ī u ū ṛ ṝ ḷ ḹ  e ai  o au".split(/\s/),
        other_marks: ["ṃ", "ḥ", "~"],
        virama: [""],
        consonants: "k kh g gh ṅ c ch j jh ñ ṭ ṭh ḍ ḍh ṇ t th d dh n " \
                    "p ph b bh m y r l v ś ṣ s h ḻ kṣ jñ".split(/\s/),
        symbols: "0 1 2 3 4 5 6 7 8 9 oṃ ' | ||".split(/\s/),
      },

      # ITRANS
      # ------
      # One of the first romanization schemes -- and one of the most
      # complicated. For alternate forms, see the `@all_alternates` table
      # at the end of this module.
      #
      # '_' is a "null" letter, which allows adjacent vowels.
      #
      itrans: {
        vowels: "a A i I u U RRi RRI LLi LLI  e ai  o au".split(/\s/),
        other_marks: ["M", "H", ".N"],
        virama: [""],
        consonants: "k kh g gh ~N ch Ch j jh ~n T Th D Dh N t th d dh n " \
                    "p ph b bh m y r l v sh Sh s h L kSh j~n".split(/\s/),
        symbols: "0 1 2 3 4 5 6 7 8 9 OM .a | ||".split(/\s/),
        candra: [".c"],
        zwj: ["{}"],
        skip: ["_"],
        # Each accent token is a literal backslash followed by ' or _
        # (the backslash is doubled only because of Ruby string escaping).
        accent: ["\\'", "\\_"],
        combo_accent: "\\'H \\_H \\'M \\_M".split(/\s/),
        other: "q K G z .D .Dh f Y R".split(/\s/),
      },

      # Harvard-Kyoto
      # -------------
      # A simple 1:1 mapping.
      #
      hk: {
        vowels: "a A i I u U R RR lR lRR  e ai  o au".split(/\s/),
        other_marks: "M H ~".split(/\s/),
        virama: [""],
        consonants: "k kh g gh G c ch j jh J T Th D Dh N t th d dh n " \
                    "p ph b bh m y r l v z S s h L kS jJ".split(/\s/),
        symbols: "0 1 2 3 4 5 6 7 8 9 OM ' | ||".split(/\s/),
      },

      # National Library at Kolkata
      # ---------------------------
      # Apart from using "ē" and "ō" instead of "e" and "o", this scheme is
      # identical to IAST. ṝ, ḷ, and ḹ are not part of the scheme proper.
      #
      # There is no `kolkata` entry in this hash.
      # NOTE(review): the original comment said "This is defined further
      # below", but nothing in this file defines it; presumably it is
      # derived from `iast` elsewhere -- confirm.
      #

      # Sanskrit Library Phonetic Basic
      # -------------------------------
      # With one ASCII letter per phoneme, this is the tersest transliteration
      # scheme in use today and is especially suited to computer processing.
      #
      slp1: {
        vowels: "a A i I u U f F x X  e E  o O".split(/\s/),
        other_marks: "M H ~".split(/\s/),
        virama: [""],
        consonants: "k K g G N c C j J Y w W q Q R t T d D n " \
                    "p P b B m y r l v S z s h L kz jY".split(/\s/),
        symbols: "0 1 2 3 4 5 6 7 8 9 oM ' | ||".split(/\s/),
      },

      # Velthuis
      # --------
      # A case-insensitive Sanskrit encoding.
      #
      velthuis: {
        vowels: "a aa i ii u uu .r .rr .li .ll  e ai  o au".split(/\s/),
        other_marks: ".m .h ".split(/\s/),
        virama: [""],
        consonants: "k kh g gh \"n c ch j jh ~n .t .th .d .dh .n t th d dh n " \
                    "p ph b bh m y r l v ~s .s s h L k.s j~n".split(/\s/),
        symbols: "0 1 2 3 4 5 6 7 8 9 o.m ' | ||".split(/\s/),
      },

      # WX
      # --
      # As terse as SLP1.
      #
      wx: {
        vowels: "a A i I u U q Q L   e E  o O".split(/\s/),
        other_marks: "M H z".split(/\s/),
        virama: [""],
        consonants: "k K g G f c C j J F t T d D N w W x X n " \
                    "p P b B m y r l v S R s h  kR jF".split(/\s/),
        symbols: "0 1 2 3 4 5 6 7 8 9 oM ' | ||".split(/\s/),
      },
    }

    # The three sets below start out empty; nothing in this file adds to
    # them, so they are presumably populated by code outside this file.
    # NOTE(review): `Set` is referenced without a `require "set"` in this
    # file; presumably it is loaded by the file that requires this one (or
    # autoloaded by the Ruby version in use) -- confirm.

    # Set of names of Roman schemes
    @roman_schemes = Set.new

    # Set of names of Brahmic schemes
    @brahmic_schemes = Set.new

    # Set of names of all schemes
    @scheme_names = Set.new

    # Map of alternate encodings.
    #
    # Keyed by scheme name, then by a token of that scheme as listed in
    # `@schemes`; each value is the list of alternate spellings for that
    # token. Only `itrans` has alternates here.
    @all_alternates = {
      itrans: {
        "A" => ["aa"],
        "I" => %w[ii ee],
        "U" => %w[uu oo],
        "RRi" => ["R^i"],
        "RRI" => ["R^I"],
        "LLi" => ["L^i"],
        "LLI" => ["L^I"],
        "M" => [".m", ".n"],
        "~N" => ["N^"],
        "ch" => ["c"],
        "Ch" => %w[C chh],
        "~n" => ["JN"],
        "v" => ["w"],
        "Sh" => %w[S shh],
        "kSh" => %w[kS x],
        "j~n" => %w[GY dny],
        "OM" => ["AUM"],
        # Accent tokens: a literal backslash followed by the accent letter.
        "\\_" => ["\\`"],
        "\\_H" => ["\\`H"],
        "\\'M" => ["\\'.m", "\\'.n"],
        "\\_M" => "\\_.m \\_.n \\`M \\`.m \\`.n".split(/\s/),
        ".a" => ["~"],
        "|" => ["."],
        "||" => [".."],
        "z" => ["J"],
      },
    }
  end
end