lib/ting/hanyu_pinyin_parser.rb
module Ting
class HanyuPinyinParser
include Procable
def hanyu_writer
@hanyu_writer ||= Ting.writer(:hanyu, :accents)
end
def hanyu_reader
@hanyu_reader ||= Ting.reader(:hanyu, :accents)
end
def all_syllables
@all_syllables ||= Ting.all_syllables.map(&hanyu_writer).sort_by(&:length).reverse
end
def consonant_syllables
@consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
end
def pinyin_regexp
# This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
@pinyin_cluster_regexp ||= /\A
# Every syllable can appear at the start of a cluster.
(#{Regexp.union(all_syllables)})
# However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
# be prefixed with an apostrophe.
# Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
# a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
# syllable.
(#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
(r)?
\Z/x
end
def pinyin_separator_regexp
# A regular expression that matches every character that can *not* appear in pinyin.
@pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
end
def parse_cluster(pinyin)
syllables = []
# Chop off one syllable at a time from the end by continuously matching the same regular expression.
# This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
# only contain the *last* content it has matched, we have to use a loop.
while match = pinyin_regexp.match(pinyin)
# If an 'r' at the end was matched, this implies that all other parts of the string were matched as
# syllables, and this cluster uses erhua.
if 'r' == match[3]
syllables << 'er'
pinyin = pinyin.chop
end
last_syllable = match[2] || match[1]
syllables << last_syllable
pinyin = pinyin[0, pinyin.length - last_syllable.length]
end
raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
syllables.reverse
end
def parse(pinyin)
# hanyu_reader cannot parse uppercase pinyin.
pinyin = pinyin.downcase
clusters = pinyin.split(pinyin_separator_regexp)
clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
end
alias call parse
end
end