# lib/sanscript/transliterate.rb
# frozen_string_literal: true
require "sanscript/transliterate/schemes"
module Sanscript
# Sanskrit transliteration module.
# Derived from Sanscript (https://github.com/sanskrit/sanscript.js), which is
# released under the MIT and GPL Licenses.
#
# "Sanscript is a Sanskrit transliteration library. Currently, it supports
# other Indian languages only incidentally."
module Transliterate
# Read-only, module-level accessors for the scheme registries.
# Apart from @defaults, the backing instance variables are not initialized in
# the code visible here (presumably the required schemes file does that).
class << self
# @return [Set<Symbol>] the names of all supported schemes
#   NOTE(review): typed as Set because the registration methods call `#add`
#   on this collection, which Array does not provide; confirm at its
#   initialization site.
attr_reader :scheme_names
# @return [Set<Symbol>] the names of all Brahmic schemes
#   NOTE(review): same `#add` usage as `scheme_names`; confirm the type.
attr_reader :brahmic_schemes
# @return [Set<Symbol>] the names of all roman schemes
#   NOTE(review): same `#add` usage as `scheme_names`; confirm the type.
attr_reader :roman_schemes
# @return [Hash{Symbol => Hash}] the data for all schemes, keyed by scheme name
attr_reader :schemes
# @return [Hash{Symbol => Hash}] the alternate-character data, keyed by scheme
#   name; each entry is looked up by source token when a map is built
attr_reader :all_alternates
# @return [Hash] the default transliteration options
attr_reader :defaults
end
# Module-wide option defaults. `transliterate` merges the caller's options
# over this Hash on every call. It is not frozen and is reachable through the
# `defaults` reader.
@defaults = {
skip_sgml: false,
syncope: false,
}
# Memoized conversion maps built by `make_map`, one per from/to scheme pair.
@cache = {}
# Methods defined with `def` below this point become module functions:
# callable on the module itself and private when the module is mixed in.
module_function
# rubocop:disable AbcSize, CyclomaticComplexity, MethodLength, PerceivedComplexity
# Tell whether a scheme name is registered as a Brahmic scheme.
#
# @param name [#to_sym] the scheme name (Symbol or String)
# @return [Boolean] true when the name is in the Brahmic registry
def brahmic_scheme?(name)
  scheme_name = name.to_sym
  @brahmic_schemes.include?(scheme_name)
end
# Tell whether a scheme name is registered as a roman scheme.
#
# @param name [#to_sym] the scheme name (Symbol or String)
# @return [Boolean] true when the name is in the roman registry
def roman_scheme?(name)
  scheme_name = name.to_sym
  @roman_schemes.include?(scheme_name)
end
# Add a Brahmic scheme to Sanscript.
#
# Schemes are of two types: "Brahmic" and "roman". Brahmic consonants
# have an inherent vowel sound, but roman consonants do not. This is the
# main difference between these two types of scheme.
#
# A scheme definition is a Hash that maps a group name to a
# list of characters. For illustration, see `transliterate/schemes.rb`.
#
# You can use whatever group names you like, but for the best results,
# you should use the same group names that Sanscript does.
#
# Registering a name that already exists replaces the stored data. The name
# is then treated as Brahmic only, and every memoized conversion map is
# discarded so later calls are rebuilt from the new data.
#
# @param name [#to_sym] the scheme name
# @param scheme [Hash] the scheme data, constructed as described above; the
#   caller's object is copied, never frozen or modified
# @return [Hash] the frozen scheme data as it exists inside the module
def add_brahmic_scheme(name, scheme)
  name = name.to_sym
  scheme = scheme.deep_dup
  @schemes[name] = IceNine.deep_freeze(scheme)
  # A scheme has exactly one type; drop a previous roman registration so map
  # building does not keep treating this name as roman.
  @roman_schemes.delete(name)
  @brahmic_schemes.add(name)
  @scheme_names.add(name)
  # Maps built from the previous definition of this name are now stale.
  @cache.clear
  scheme
end
# Add a roman scheme to Sanscript.
#
# When the scheme has no :vowel_marks group, one is derived from :vowels by
# dropping the first vowel.
#
# Registering a name that already exists replaces the stored data. The name
# is then treated as roman only, and every memoized conversion map is
# discarded so later calls are rebuilt from the new data.
#
# @param name [#to_sym] the scheme name
# @param scheme [Hash] the scheme data, constructed as in {add_brahmic_scheme}.
#   The "vowel_marks" field can be omitted. The caller's object is copied,
#   never frozen or modified
# @return [Hash] the frozen scheme data as it exists inside the module
def add_roman_scheme(name, scheme)
  name = name.to_sym
  scheme = scheme.deep_dup
  scheme[:vowel_marks] = scheme[:vowels][1..-1] unless scheme.key?(:vowel_marks)
  @schemes[name] = IceNine.deep_freeze(scheme)
  # A scheme has exactly one type; drop a previous Brahmic registration so the
  # two predicates never both answer true for the same name.
  @brahmic_schemes.delete(name)
  @roman_schemes.add(name)
  @scheme_names.add(name)
  # Maps built from the previous definition of this name are now stale.
  @cache.clear
  scheme
end
# Set up various schemes. Runs once when the file is loaded and relies on
# @schemes and @all_alternates already holding the raw scheme data.
begin
  # Re-add existing Brahmic schemes in order to add them to `scheme_names`
  # and to freeze them up.
  brahmic_scheme_names = %i[bengali devanagari gujarati gurmukhi kannada malayalam
                            oriya tamil telugu]
  brahmic_scheme_names.each do |name|
    add_brahmic_scheme(name, @schemes[name])
  end

  # Set up roman schemes. Kolkata starts as a copy of IAST whose vowel list is
  # replaced by one that also contains "ē" and "ō".
  kolkata = @schemes[:kolkata] = @schemes[:iast].deep_dup
  roman_scheme_names = %i[iast itrans hk kolkata slp1 velthuis wx]
  kolkata[:vowels] = %w[a ā i ī u ū ṛ ṝ ḷ ḹ e ē ai o ō au]

  # These schemes already belong to Sanscript.schemes. But by adding
  # them again with `add_roman_scheme`, we automatically build up
  # `roman_schemes` and define a `vowel_marks` field for each one.
  roman_scheme_names.each do |name|
    add_roman_scheme(name, @schemes[name])
  end

  # Add Kyoto-Harvard copy (alternate name of Harvard-Kyoto scheme).
  add_roman_scheme(:kh, @schemes[:hk])

  # ITRANS variant, which supports Dravidian short 'e' and 'o'.
  # The tenth vowel must be "LLI": repeating "LLi" would give short and long
  # vocalic L the same token, so the long vowel could not round-trip.
  itrans_dravidian = @schemes[:itrans].deep_dup
  itrans_dravidian[:vowels] = %w[a A i I u U Ri RRI LLi LLI e E ai o O au]
  itrans_dravidian[:vowel_marks] = itrans_dravidian[:vowels][1..-1]
  @all_alternates[:itrans_dravidian] = @all_alternates[:itrans]
  add_roman_scheme(:itrans_dravidian, itrans_dravidian)

  # ensure deep freeze on alternates
  @all_alternates.each_value { |alternates| IceNine.deep_freeze(alternates) }
end
# Convert text written in one scheme into another scheme.
#
# When both scheme names are equal, the input object itself is returned and
# no scheme validation takes place. Otherwise a copy of the text is
# processed and the caller's string is left unchanged.
#
# @param data [#to_str] the text to convert
# @param from [#to_sym] name of the source scheme
# @param to [#to_sym] name of the target scheme
# @param opts [Hash] per-call options, layered over the module defaults
# @option opts [Boolean] :skip_sgml (false) wrap `<...>` spans in `##`
#   markers so the conversion routines leave them untransliterated
# @option opts [Boolean] :syncope (false) handed to the roman-source routine,
#   which then leaves out the virama after a trailing consonant
# @raise [SchemeNotSupportedError] if either scheme name is not registered
# @return [String] the converted text
def transliterate(data, from, to, **opts)
  from, to = from.to_sym, to.to_sym
  return data if from == to

  [from, to].each do |scheme_name|
    raise SchemeNotSupportedError, scheme_name unless @schemes.key?(scheme_name)
  end

  text = data.to_str.dup
  settings = @defaults.merge(opts)
  mapping = make_map(from, to)

  text.gsub!(/(<.*?>)/, "##\\1##") if settings[:skip_sgml]

  # Shortcuts for "{\m+}", "\", and ".h". They apply only when the source
  # scheme is exactly :itrans.
  if from == :itrans
    text.gsub!(/\{\\m\+\}/, ".h.N")
    text.gsub!(/\.h/, "")
    text.gsub!(/\\([^'`_]|$)/, "##\\1##")
  end

  return transliterate_roman(text, mapping, settings) if mapping[:from_roman?]

  transliterate_brahmic(text, mapping)
end
class << self
private
# Create a map from every character in `from` to its partner in `to`.
# Also, store any "marks" that `from` might have.
#
# Only groups present in both schemes contribute. Within a group, tokens are
# paired by index, and each alternate spelling registered for a source token
# maps to the same target as the token itself. The result is frozen and
# memoized per scheme pair.
#
# @param from [Symbol] name of input scheme
# @param to [Symbol] name of output scheme
# @return [Hash] a mapping from one scheme to another, with the keys
#   :letters, :marks, :consonants, :virama, :max_token_length,
#   :from_roman? and :to_roman?
def make_map(from, to)
  # Key on the pair itself. An interpolated "from_to" symbol is ambiguous for
  # scheme names containing underscores (:a + :b_c versus :a_b + :c), which
  # would hand one pair the map built for the other.
  @cache[[from, to]] ||= begin
    from_scheme = @schemes[from]
    to_scheme = @schemes[to]
    alternates = @all_alternates[from] || {}
    letters = {}
    marks = {}
    consonants = {}
    token_lengths = []

    from_scheme.each do |group, from_group|
      to_group = to_scheme[group]
      next if to_group.nil?

      mark_group = %i[vowel_marks virama].include?(group)
      consonant_group = %i[consonants other].include?(group)

      from_group.each_with_index do |token, index|
        target = to_group[index]
        # The token itself first, then its alternates, all sharing one target.
        spellings = [token, *(alternates[token] || [])]
        token_lengths.concat(spellings.map(&:length))

        spellings.each do |spelling|
          if mark_group
            marks[spelling] = target
          else
            letters[spelling] = target
            consonants[spelling] = target if consonant_group
          end
        end
      end
    end

    IceNine.deep_freeze(
      consonants: consonants,
      from_roman?: roman_scheme?(from),
      letters: letters,
      marks: marks,
      max_token_length: token_lengths.max,
      to_roman?: roman_scheme?(to),
      virama: to_scheme[:virama].first
    )
  end
end
# Convert text whose source scheme is romanized.
#
# The input is scanned left to right. At each position the longest candidate
# (up to the map's maximum token length) is tried first, then shorter
# prefixes. `##` toggles transliteration on and off and is dropped from the
# output. `{#` switches it off and `#}` switches it back on, and both of
# those markers are copied to the output.
#
# @param data [#to_str] the string to transliterate
# @param map [Hash] map data generated from {#make_map}
# @param options [Hash] conversion options
# @option options [Boolean] :syncope when truthy, no virama is written after
#   a consonant that is followed by an unmapped character or the end of input
# @return [String] the transliterated string
def transliterate_roman(data, map, options = {})
  chars = data.to_str.chars
  out = []
  after_consonant = false
  enabled = true
  in_control = false
  longest = map[:max_token_length]
  letters = map[:letters]
  marks = map[:marks]
  consonants = map[:consonants]
  virama = map[:virama]
  to_roman = map[:to_roman?]

  until chars.empty?
    window = chars[0, longest].join("")

    longest.downto(1) do |len| # rubocop:disable BlockLength
      candidate = window[0, len]

      # Control markers are always two characters wide.
      if len == 2
        if candidate == "##" && !in_control
          enabled = !enabled
          chars.shift(2)
          break
        elsif candidate == "#}" && in_control
          enabled = true
          in_control = false
          out << candidate
          chars.shift(2)
          break
        elsif candidate == "{#" && enabled
          enabled = false
          in_control = true
          out << candidate
          chars.shift(2)
          break
        end
      end

      replacement = enabled ? letters[candidate] : nil

      if replacement
        if to_roman
          out << replacement
        else
          # Right after a consonant a vowel must appear as its mark, the
          # implicit "a" produces nothing, and any other letter needs a
          # virama in front of it.
          # rubocop:disable Metrics/BlockNesting
          if !after_consonant
            out << replacement
          elsif (mark = marks[candidate])
            out << mark
          elsif candidate != "a"
            out.push(virama, replacement)
          end
          # rubocop:enable Metrics/BlockNesting
          after_consonant = consonants.key?(candidate)
        end
        chars.shift(len)
        break
      elsif len == 1
        # Nothing matched: copy the single character through unchanged.
        if after_consonant
          after_consonant = false
          out << virama unless options[:syncope]
        end
        out << candidate
        chars.shift
      end
    end
  end

  out << virama if after_consonant && !options[:syncope]
  out.join("")
end
# Convert text whose source scheme is Brahmic.
#
# The input is read one character at a time. When the target is roman, a
# consonant that is not followed by a mark gets an "a" appended. `##` toggles
# transliteration on and off and is dropped from the output. `{#` switches it
# off and `#}` switches it back on, and both of those markers are copied to
# the output.
#
# @param data [#to_str] the string to transliterate
# @param map [Hash] map data generated from {#make_map}
# @return [String] the transliterated string
def transliterate_brahmic(data, map)
  chars = data.to_str.chars
  out = []
  pending_a = false
  enabled = true
  in_control = false
  marks = map[:marks]
  letters = map[:letters]

  until chars.empty?
    pair = chars[0, 2].join("")

    if pair == "##" && !in_control
      if pending_a
        out << "a" if enabled
        pending_a = false
      end
      enabled = !enabled
      chars.shift(2)
      next
    elsif pair == "#}" && in_control
      enabled = true
      in_control = false
      out << pair
      chars.shift(2)
      next
    elsif pair == "{#" && enabled
      if pending_a
        out << "a"
        pending_a = false
      end
      enabled = false
      in_control = true
      out << pair
      chars.shift(2)
      next
    end

    char = chars.shift

    # While disabled, characters are copied through verbatim.
    unless enabled
      out << char
      next
    end

    mark = marks[char]
    unless mark.nil?
      # A mark replaces the inherent vowel of the preceding consonant.
      out << mark
      pending_a = false
      next
    end

    if pending_a
      out << "a"
      pending_a = false
    end

    # Emit the mapped letter when there is one; otherwise keep the character.
    letter = letters[char]
    if letter.nil?
      out << char
    else
      out << letter
      pending_a = map[:to_roman?] && map[:consonants].key?(char)
    end
  end

  out << "a" if pending_a
  out.join("")
end
end
# rubocop:enable AbcSize, CyclomaticComplexity, MethodLength, PerceivedComplexity
end
end