lib/twitter_cldr/utils/yaml.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
# This code was adapted from the ya2yaml gem, maintained by Akira Funai.
# https://github.com/afunai/ya2yaml
# Copyright (c) 2006 Akira FUNAI <funai.akira@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
module TwitterCldr
module Utils
class YAML
UCS_0X85 = [0x85].pack('U') # c285@UTF8 Unicode next line
UCS_0XA0 = [0xa0].pack('U') # c2a0@UTF8 Unicode non-breaking space
UCS_0X2028 = [0x2028].pack('U') # e280a8@UTF8 Unicode line separator
UCS_0X2029 = [0x2029].pack('U') # e280a9@UTF8 Unicode paragraph separator
# non-break characters
ESCAPE_SEQ = {
"\x00" => '\\0',
"\x07" => '\\a',
"\x08" => '\\b',
"\x0b" => '\\v',
"\x0c" => '\\f',
"\x1b" => '\\e',
"\"" => '\\"',
"\\" => '\\\\',
}
# non-breaking space
ESCAPE_SEQ_NS = {
UCS_0XA0 => '\\_',
}
# white spaces
ESCAPE_SEQ_WS = {
"\x09" => '\\t',
" " => '\\x20',
}
# line breaks
ESCAPE_SEQ_LB ={
"\x0a" => '\\n',
"\x0d" => '\\r',
UCS_0X85 => '\\N',
UCS_0X2028 => '\\L',
UCS_0X2029 => '\\P',
}
# regexps for line breaks
REX_LF = Regexp.escape("\x0a")
REX_CR = Regexp.escape("\x0d")
REX_CRLF = Regexp.escape("\x0d\x0a")
REX_NEL = Regexp.escape(UCS_0X85)
REX_LS = Regexp.escape(UCS_0X2028)
REX_PS = Regexp.escape(UCS_0X2029)
REX_ANY_LB = /(#{REX_LF}|#{REX_CR}|#{REX_NEL}|#{REX_LS}|#{REX_PS})/
REX_NORMAL_LB = /(#{REX_LF}|#{REX_LS}|#{REX_PS})/
# regexps for language-Independent types for YAML1.1
REX_BOOL = /
y|Y|yes|Yes|YES|n|N|no|No|NO
|true|True|TRUE|false|False|FALSE
|on|On|ON|off|Off|OFF
/x
REX_FLOAT = /
[-+]?([0-9][0-9_]*)?\.[0-9.]*([eE][-+][0-9]+)? # (base 10)
|[-+]?[0-9][0-9_]*(:[0-5]?[0-9])+\.[0-9_]* # (base 60)
|[-+]?\.(inf|Inf|INF) # (infinity)
|\.(nan|NaN|NAN) # (not a number)
/x
REX_INT = /
[-+]?0b[0-1_]+ # (base 2)
|[-+]?0[0-7_]+ # (base 8)
|[-+]?(0|[1-9][0-9_]*) # (base 10)
|[-+]?0x[0-9a-fA-F_]+ # (base 16)
|[-+]?[1-9][0-9_]*(:[0-5]?[0-9])+ # (base 60)
/x
REX_MERGE = /
<<
/x
REX_NULL = /
~ # (canonical)
|null|Null|NULL # (English)
| # (Empty)
/x
REX_TIMESTAMP = /
[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] # (ymd)
|[0-9][0-9][0-9][0-9] # (year)
-[0-9][0-9]? # (month)
-[0-9][0-9]? # (day)
([Tt]|[ \t]+)[0-9][0-9]? # (hour)
:[0-9][0-9] # (minute)
:[0-9][0-9] # (second)
(\.[0-9]*)? # (fraction)
(([ \t]*)Z|[-+][0-9][0-9]?(:[0-9][0-9])?)? # (time zone)
/x
REX_VALUE = /
=
/x
REX_SYMBOL = /
\A:.*
/x
class << self
def dump(obj, opts = {})
@options = opts.dup
@options[:indent_size] = 2 if @options[:indent_size].to_i <= 0
@options[:minimum_block_length] = 0 if @options[:minimum_block_length].to_i <= 0
@options.update(
{
printable_with_syck: true,
escape_b_specific: true,
escape_as_utf8: true,
}
) if @options[:syck_compatible]
"--- #{emit(obj, 1)}\n"
rescue SystemStackError
raise ArgumentError, "TwitterCLDR yaml dumper can't handle circular references"
end
private
def emit(obj, level)
case obj
when Array
if (obj.length == 0)
'[]'
else
indent = "\n#{s_indent(level - 1)}"
obj.collect do |o|
"#{indent}- #{emit(o, level + 1)}"
end.join('')
end
when Hash
if (obj.length == 0)
'{}'
else
indent = "\n#{s_indent(level - 1)}"
hash_order = @options[:hash_order]
if (hash_order && level == 1)
hash_keys = obj.keys.sort do |x, y|
x_order = hash_order.index(x) ? hash_order.index(x) : Float::MAX
y_order = hash_order.index(y) ? hash_order.index(y) : Float::MAX
o = (x_order <=> y_order)
(o != 0) ? o : (x.to_s <=> y.to_s)
end
elsif @options[:preserve_order]
hash_keys = obj.keys
else
hash_keys = obj.keys.sort { |x, y| x.to_s <=> y.to_s }
end
hash_keys.collect do |k|
key = emit(k, level + 1)
if (
is_one_plain_line?(key) ||
key =~ /\A(#{REX_BOOL}|#{REX_FLOAT}|#{REX_INT}|#{REX_NULL}|#{REX_SYMBOL})\z/x
)
"#{indent}#{key}: #{emit(obj[k], level + 1)}"
else
"#{indent}? #{key}#{indent}: #{emit(obj[k], level + 1)}"
end
end.join('')
end
when NilClass
'~'
when String
emit_string(obj, level)
when TrueClass, FalseClass
obj.to_s
when Integer, Float
obj.to_s
when Date
obj.to_s
when Time
offset = obj.gmtoff
off_hm = sprintf(
'%+.2d:%.2d',
(offset / 3600.0).to_i,
(offset % 3600.0) / 60
)
u_sec = (obj.usec != 0) ? sprintf(".%.6d", obj.usec) : ''
obj.strftime("%Y-%m-%d %H:%M:%S#{u_sec} #{off_hm}")
when Symbol
prefix = @options[:use_natural_symbols] && is_one_plain_line?(obj.to_s) ? ":" : "!ruby/symbol "
"#{prefix}#{emit_string(obj, level)}"
when Range
'!ruby/range ' + obj.to_s
when Regexp
'!ruby/regexp ' + obj.inspect
else
case
when obj.is_a?(Struct)
struct_members = {}
obj.each_pair { |k, v| struct_members[k.to_s] = v }
"!ruby/struct:#{obj.class.to_s.sub(/^(Struct::(.+)|.*)$/, '\2')} #{emit(struct_members, level + 1)}"
else
# serialized as a generic object
object_members = {}
obj.instance_variables.each do |k, v|
object_members[k.to_s.sub(/^@/, '')] = obj.instance_variable_get(k)
end
"!ruby/object:#{obj.class.to_s} #{emit(object_members, level + 1)}"
end
end
end
def emit_string(str, level)
if @options[:quote_all_strings] && !str.is_a?(Symbol)
emit_quoted_string(str, level)
else
str = str.to_s
(is_string, is_printable, is_one_line, is_one_plain_line) = string_type(str)
if is_string
if is_printable
if is_one_plain_line
emit_simple_string(str, level)
else
(is_one_line || str.length < @options[:minimum_block_length]) ?
emit_quoted_string(str, level) :
emit_block_string(str, level)
end
else
emit_quoted_string(str, level)
end
else
emit_base64_binary(str, level)
end
end
end
def emit_simple_string(str, level)
str
end
def emit_block_string(str, level)
str = normalize_line_break(str)
indent = s_indent(level)
indentation_indicator = (str =~ /\A /) ? indent.size.to_s : ''
str =~ /(#{REX_NORMAL_LB}*)\z/
chomping_indicator = case $1.length
when 0
'-'
when 1
''
else
'+'
end
str.chomp!
str.gsub!(/#{REX_NORMAL_LB}/) { $1 + indent }
"|#{indentation_indicator}#{chomping_indicator}\n#{indent}#{str}"
end
def emit_quoted_string(str, level)
str = yaml_escape(normalize_line_break(str))
if (str.length < @options[:minimum_block_length])
str.gsub!(/#{REX_NORMAL_LB}/) { ESCAPE_SEQ_LB[$1] }
else
str.gsub!(/#{REX_NORMAL_LB}$/) { ESCAPE_SEQ_LB[$1] }
str.gsub!(/(#{REX_NORMAL_LB}+)(.)/) do
trail_c = $3
$1 + trail_c.sub(/([\t ])/) { ESCAPE_SEQ_WS[$1] }
end
indent = s_indent(level)
str.gsub!(/#{REX_NORMAL_LB}/) { "#{ESCAPE_SEQ_LB[$1]}\\\n#{indent}" }
end
%Q("#{str}")
end
def emit_base64_binary(str, level)
indent = "\n#{s_indent(level)}"
base64 = [str].pack('m')
"!binary |#{indent}#{base64.gsub(/\n(?!\z)/, indent)}"
end
def string_type(str)
if str.respond_to?(:encoding) && (!str.valid_encoding? || str.encoding == Encoding::ASCII_8BIT)
return false, false, false, false
end
(ucs_codes = str.unpack('U*')) rescue (
# ArgumentError -> binary data
return false, false, false, false
)
if (
@options[:printable_with_syck] &&
str =~ /\A#{REX_ANY_LB}* | #{REX_ANY_LB}*\z|#{REX_ANY_LB}{2}\z/
)
# detour Syck bug
return true, false, nil, false
end
ucs_codes.each {|ucs_code|
return true, false, nil, false unless is_printable?(ucs_code)
}
return true, true, is_one_line?(str), is_one_plain_line?(str)
end
def is_printable?(ucs_code)
# YAML 1.1 / 4.1.1.
(
[0x09, 0x0a, 0x0d, 0x85].include?(ucs_code) ||
(ucs_code <= 0x7e && ucs_code >= 0x20) ||
(ucs_code <= 0xd7ff && ucs_code >= 0xa0) ||
(ucs_code <= 0xfffd && ucs_code >= 0xe000) ||
(ucs_code <= 0x10ffff && ucs_code >= 0x10000)
) &&
!(
# treat LS/PS as non-printable characters
@options[:escape_b_specific] &&
(ucs_code == 0x2028 || ucs_code == 0x2029)
)
end
def is_one_line?(str)
str !~ /#{REX_ANY_LB}(?!\z)/
end
def is_one_plain_line?(str)
# YAML 1.1 / 4.6.11.
str !~ /^([\-\?:,\[\]\{\}\#&\*!\|>'"%@`\s]|---|\.\.\.)/ &&
str !~ /[:\#\s\[\]\{\},]/ &&
str !~ /#{REX_ANY_LB}/ &&
str !~ /^(#{REX_BOOL}|#{REX_FLOAT}|#{REX_INT}|#{REX_MERGE}
|#{REX_NULL}|#{REX_TIMESTAMP}|#{REX_VALUE})$/x
end
def s_indent(level)
# YAML 1.1 / 4.2.2.
' ' * (level * @options[:indent_size])
end
def normalize_line_break(str)
# YAML 1.1 / 4.1.4.
str.gsub(/(#{REX_CRLF}|#{REX_CR}|#{REX_NEL})/, "\n")
end
def yaml_escape(str)
# YAML 1.1 / 4.1.6.
str.gsub(/[^a-zA-Z0-9]/u) do |c|
ucs_code, = (c.unpack('U') rescue [??])
case
when ESCAPE_SEQ[c]
ESCAPE_SEQ[c]
when is_printable?(ucs_code)
c
when @options[:escape_as_utf8]
c.respond_to?(:bytes) ?
c.bytes.collect { |b| '\\x%.2x' % b }.join :
'\\x' + c.unpack('H2' * c.size).join('\\x')
when ucs_code == 0x2028 || ucs_code == 0x2029
ESCAPE_SEQ_LB[c]
when ucs_code <= 0x7f
sprintf('\\x%.2x', ucs_code)
when ucs_code <= 0xffff
sprintf('\\u%.4x', ucs_code)
else
sprintf('\\U%.8x', ucs_code)
end
end
end
end
end
end
end