lib/ruby_speech/ssml/prosody.rb
module RubySpeech
module SSML
##
# The prosody element permits control of the pitch, speaking rate and volume of the speech output.
#
# http://www.w3.org/TR/speech-synthesis/#S3.2.4
#
# Although each attribute individually is optional, it is an error if no attributes are specified when the prosody element is used. The "x-foo" attribute value names are intended to be mnemonics for "extra foo". Note also that customary pitch levels and standard pitch ranges may vary significantly by language, as may the meanings of the labelled values for pitch targets and ranges.
#
# The duration attribute takes precedence over the rate attribute. The contour attribute takes precedence over the pitch and range attributes.
#
# The default value of all prosodic attributes is no change. For example, omitting the rate attribute means that the rate is the same within the element as outside.
#
class Prosody < Element
%w{
audio
break
desc
emphasis
mark
p
phoneme
s
say_as
speak
sub
voice
}.each { |f| require "ruby_speech/ssml/#{f}" }
register :prosody
VALID_PITCHES = [:'x-low', :low, :medium, :high, :'x-high', :default].freeze
VALID_VOLUMES = [:silent, :'x-soft', :soft, :medium, :loud, :'x-loud', :default].freeze
VALID_RATES = [:'x-slow', :slow, :medium, :fast, :'x-fast', :default].freeze
VALID_CHILD_TYPES = [Nokogiri::XML::Element, Nokogiri::XML::Text, String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, S, SayAs, Sub, Voice].freeze
##
# The baseline pitch for the contained text. Although the exact meaning of "baseline pitch" will vary across synthesis processors, increasing/decreasing this value will typically increase/decrease the approximate pitch of the output. Legal values are: a number followed by "Hz", a relative change or "x-low", "low", "medium", "high", "x-high", or "default". Labels "x-low" through "x-high" represent a sequence of monotonically non-decreasing pitch levels.
#
# @return [Symbol, String]
#
def pitch
value = read_attr :pitch
return unless value
if value.include?('Hz')
value
elsif VALID_PITCHES.include?(value.to_sym)
value.to_sym
end
end
##
# @param [Symbol, String] p
#
# @raises ArgumentError if p is not a string that contains 'Hz' or one of VALID_PITCHES
#
def pitch=(p)
set_frequency_attribute :pitch, p
end
##
# The actual pitch contour for the contained text.
#
# The pitch contour is defined as a set of white space-separated targets at specified time positions in the speech output. The algorithm for interpolating between the targets is processor-specific. In each pair of the form (time position,target), the first value is a percentage of the period of the contained text (a number followed by "%") and the second value is the value of the pitch attribute (a number followed by "Hz", a relative change, or a label value). Time position values outside 0% to 100% are ignored. If a pitch value is not defined for 0% or 100% then the nearest pitch target is copied. All relative values for the pitch are relative to the pitch value just before the contained text.
#
# @return [Symbol]
#
def contour
read_attr :contour
end
##
# @param [String] v
#
def contour=(v)
self[:contour] = v
end
##
# The pitch range (variability) for the contained text. Although the exact meaning of "pitch range" will vary across synthesis processors, increasing/decreasing this value will typically increase/decrease the dynamic range of the output pitch. Legal values are: a number followed by "Hz", a relative change or "x-low", "low", "medium", "high", "x-high", or "default". Labels "x-low" through "x-high" represent a sequence of monotonically non-decreasing pitch ranges.
#
# @return [Symbol]
#
def range
value = read_attr :range
return unless value
if value.include?('Hz')
value
elsif VALID_PITCHES.include?(value.to_sym)
value.to_sym
end
end
##
# @param [Symbol, String] p
#
# @raises ArgumentError if p is not a string that contains 'Hz' or one of VALID_PITCHES
#
def range=(p)
set_frequency_attribute :range, p
end
##
# A change in the speaking rate for the contained text. Legal values are: a relative change or "x-slow", "slow", "medium", "fast", "x-fast", or "default". Labels "x-slow" through "x-fast" represent a sequence of monotonically non-decreasing speaking rates. When a number is used to specify a relative change it acts as a multiplier of the default rate. For example, a value of 1 means no change in speaking rate, a value of 2 means a speaking rate twice the default rate, and a value of 0.5 means a speaking rate of half the default rate. Further, changes can be specified as percentages (e.g. '10%' or '+15%'), but these are the only string entries permitted. The default rate for a voice depends on the language and dialect and on the personality of the voice. The default rate for a voice should be such that it is experienced as a normal speaking rate for the voice when reading aloud text. Since voices are processor-specific, the default rate will be as well.
#
# @return [Symbol, Float, String]
#
def rate
value = read_attr :rate
return unless value
if VALID_RATES.include?(value.to_sym)
value.to_sym
elsif value.include? "%"
value
else
value.to_f
end
end
##
# @param [Symbol, Numeric, String] v
#
# @raises ArgumentError if v is not either a positive Numeric or one of VALID_RATES
#
def rate=(v)
raise ArgumentError, "You must specify a valid rate ([positive-number](multiplier), #{VALID_RATES.map(&:inspect).join ', '})" unless (v.is_a?(Numeric) && v >= 0) || VALID_RATES.include?(v) || (v.is_a?(String) && v.include?("%"))
self[:rate] = v
end
##
# A value in seconds for the desired time to take to read the element contents.
#
# @return [Integer]
#
def duration
value = get_time_attribute :duration
value.round if value
end
##
# @param [Numeric] t
#
# @raises ArgumentError if t is not a positive numeric value
#
def duration=(t)
set_time_attribute :duration, t
end
##
# The volume for the contained text in the range 0.0 to 100.0 (higher values are louder and specifying a value of zero is equivalent to specifying "silent"). Legal values are: number, a relative change or "silent", "x-soft", "soft", "medium", "loud", "x-loud", or "default". The volume scale is linear amplitude. The default is 100.0. Labels "silent" through "x-loud" represent a sequence of monotonically non-decreasing volume levels.
#
# @return [Symbol, Float]
#
def volume
value = read_attr :volume
return unless value
if VALID_VOLUMES.include?(value.to_sym)
value.to_sym
else
value.to_f
end
end
##
# @param [Numeric, Symbol] v
#
# @raises ArgumentError if v is not one of VALID_VOLUMES or a numeric value between 0.0 and 100.0
#
def volume=(v)
raise ArgumentError, "You must specify a valid volume ([positive-number](0.0 -> 100.0), #{VALID_VOLUMES.map(&:inspect).join ', '})" unless (v.is_a?(Numeric) && (0..100).include?(v)) || VALID_VOLUMES.include?(v)
self[:volume] = v
end
def <<(arg)
raise InvalidChildError, "A Prosody can only accept String, Audio, Break, Emphasis, Mark, P, Phoneme, Prosody, SayAs, Sub, S, Voice as children" unless VALID_CHILD_TYPES.include? arg.class
super
end
def eql?(o)
super o, :pitch, :contour, :range, :rate, :duration, :volume
end
private
def set_frequency_attribute(key, value)
hz = value.is_a?(String) && value.include?('Hz') && value.to_f > 0
raise ArgumentError, "You must specify a valid #{key} (\"[positive-number]Hz\", #{VALID_PITCHES.map(&:inspect).join ', '})" unless hz || VALID_PITCHES.include?(value)
self[key] = value
end
end # Prosody
end # SSML
end # RubySpeech