lib/twitter_cldr/resources/transforms_importer.rb
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
require 'fileutils'
require 'nokogiri'
module TwitterCldr
module Resources
class TransformsImporter < Importer
# Transforms that are skipped during import. Each entry is matched against a parsed
# transform's :source and :target values. The list and its entries are frozen so the
# shared constant cannot be modified by accident at runtime.
IGNORED_TRANSFORMS = [
  # This transform appears to be broken. Two of its rules that deal with separator
  # characters don't make sense to me:
  #
  # "[:Separator:]* > ' ';"
  # "$space = [:Separator:]*;"
  #
  # Why would zero characters be replaced with a space? And why would zero characters
  # be considered a separator?
  #
  # More importantly, the algorithm in this library works with every other transform,
  # provisional and otherwise, except this one. Something's amiss with these rules.
  # ICU probably works because they, once again, fixed the rules but neglected to
  # upstream them into CLDR.
  #
  { source: 'ug', target: 'Latin' }.freeze
].freeze
# Registers a requirement under the :cldr key using Versions.cldr_version; this class
# later reads it back via requirements[:cldr] to locate the CLDR transform files.
requirement :cldr, Versions.cldr_version
# NOTE(review): presumably the default destination for the generated YAML, exposed to
# instances through params[:output_path] - confirm against Importer.
output_path 'shared/transforms'
# NOTE(review): presumably restricts this importer to the MRI engine - confirm against
# Importer.
ruby_engine :mri
private
# Runs the import.
#
# For every transform XML file: parses it, drops the transforms listed as ignored, and
# (when anything is left) writes the remainder to a YAML file in the output directory
# named after the XML file. The IDs and aliases of everything written are collected and
# saved in a single ID map once all files have been processed.
def execute
  id_map = {}
  FileUtils.mkdir_p(output_path)

  each_transform_file do |xml_path|
    parsed = parse_transform_data(File.read(xml_path))
    kept = parsed.reject { |datum| ignored_transform?(datum) }

    # A file whose transforms were all ignored produces no YAML file and no ID entries.
    next if kept.empty?

    yaml_name = "#{File.basename(xml_path).chomp('.xml')}.yml"
    yaml_path = File.join(output_path, yaml_name)

    id_map.merge!(map_aliases(kept, yaml_path))
    write_transform_data(kept, yaml_path)
  end

  write_transform_id_map(id_map)
end
# Tells whether a parsed transform is on the ignore list.
#
# transform_data - Hash for a single transform; only :source and :target are consulted.
#
# Returns true when some IGNORED_TRANSFORMS entry has the same :source and :target.
def ignored_transform?(transform_data)
  IGNORED_TRANSFORMS.any? do |ignored|
    ignored.values_at(:source, :target) == transform_data.values_at(:source, :target)
  end
end
# Builds the ID-to-file lookup for the transforms stored in one output file.
#
# transform_data - Array of transform Hashes (:source, :target, :variant, optional :aliases).
# path           - Path of the YAML file the transforms are written to.
#
# Returns a Hash whose keys are every alias plus the joined source/target/variant ID of
# each transform (duplicates removed), all mapped to the file's base name without ".yml".
def map_aliases(transform_data, path)
  base_name = File.basename(path).chomp('.yml')

  ids = transform_data.flat_map do |datum|
    own_id = join_transform_id(datum[:source], datum[:target], datum[:variant])
    # :aliases may be nil; splatting nil contributes nothing.
    [*datum[:aliases], own_id]
  end

  Hash[ids.uniq.map { |id| [id, base_name] }]
end
# Combines source, target and variant into one transform ID by delegating to
# TwitterCldr::Transforms::TransformId.join and returning its result.
def join_transform_id(source, target, variant)
  transform_id = TwitterCldr::Transforms::TransformId
  transform_id.join(source, target, variant)
end
# Round-trips an ID string through TwitterCldr::Transforms::TransformId: the string is
# parsed and the parsed object's #to_s is returned.
def normalize_transform_id(id_str)
  parsed_id = TwitterCldr::Transforms::TransformId.parse(id_str)
  parsed_id.to_s
end
# Serializes the transforms of one file to YAML and writes them to path (UTF-8).
#
# transform_data - Array of transform Hashes; stored under the top-level :transforms key.
# path           - Destination file; it is opened in write mode, replacing any content.
def write_transform_data(transform_data, path)
  File.open(path, 'w:utf-8') do |output|
    symbolized = TwitterCldr::Utils.deep_symbolize_keys(transforms: transform_data)
    yaml = TwitterCldr::Utils::YAML.dump(symbolized, use_natural_symbols: true)
    output.write(yaml)
  end
end
# Dumps the ID map as YAML into "transform_id_map.yml" inside the output directory.
#
# transform_id_map - Hash of transform ID/alias => output file base name.
def write_transform_id_map(transform_id_map)
  map_path = File.join(output_path, 'transform_id_map.yml')
  File.open(map_path, 'w+') { |output| output.write(YAML.dump(transform_id_map)) }
end
# Parses the XML text of one transform file.
#
# transform_data - String holding the XML document.
#
# Returns an Array with one Hash per supplementalData/transforms/transform node, with the
# keys :source, :target, :aliases, :variant, :direction and :rules (in that order).
# The source, target and direction attributes are read unconditionally.
def parse_transform_data(transform_data)
  transform_nodes = Nokogiri.XML(transform_data).xpath('supplementalData/transforms/transform')

  transform_nodes.map do |node|
    attribute_value = lambda { |name| node.attribute(name).value }

    {
      source: attribute_value.call('source'),
      target: attribute_value.call('target'),
      aliases: get_aliases(node),
      variant: get_variant(node),
      direction: attribute_value.call('direction'),
      rules: rules(node)
    }
  end
end
# Reads the node's "alias" attribute.
#
# Returns the attribute value split on whitespace (Array of Strings), or nil when the
# node has no such attribute.
def get_aliases(node)
  alias_attribute = node.attribute('alias')
  return nil unless alias_attribute

  alias_attribute.value.split(' ')
end
# Reads the node's "variant" attribute.
#
# Returns the attribute's value, or nil when the node has no such attribute.
def get_variant(node)
  variant_attribute = node.attribute('variant')
  variant_attribute ? variant_attribute.value : nil
end
# Extracts the rule lines of a transform node.
#
# The content of every tRule child is passed through fix_rule, split into lines and
# stripped; wrapped lines are then merged by fix_rule_wrapping, and finally blank lines
# and lines starting with '#' are discarded.
#
# Returns an Array of rule Strings.
def rules(transform_node)
  raw_lines = transform_node.xpath('tRule').flat_map do |rule_node|
    fix_rule(rule_node.content).split("\n").map(&:strip)
  end

  fix_rule_wrapping(raw_lines).reject do |line|
    stripped = line.strip
    stripped.empty? || stripped.start_with?('#')
  end
end
# Merges rule lines that were wrapped over several physical lines.
#
# A line ending in a backslash continues on the following line: the trailing backslash
# is removed and the following line is appended in its place. Continuations may chain
# across more than two lines.
#
# rules - Array of rule line Strings. The input strings are not modified.
#
# Returns a new Array of Strings with every continuation merged into its first line.
def fix_rule_wrapping(rules)
  continued = false

  rules.each_with_object([]) do |rule, merged|
    if continued
      # Concatenate literally. Passing the line to String#sub as a replacement string
      # would interpret backslash sequences in the rule text (\', \\, \0, \1, ...) as
      # substitution escapes and corrupt the merged rule.
      merged[-1] = merged.last.chomp('\\') + rule
    else
      merged << rule
    end

    continued = rule.end_with?('\\')
  end
end
# Rewrites the arrow characters found in rule text to their ASCII forms:
# "←" becomes '<', "→" becomes '>' and "↔" becomes '<>'.
#
# Returns a new String; the argument is left unchanged.
def fix_rule(rule)
  arrow_replacements = [["←", '<'], ["→", '>'], ["↔", '<>']]

  arrow_replacements.inject(rule) do |text, (arrow, ascii)|
    text.gsub(arrow, ascii)
  end
end
# Yields the path of every "*.xml" file directly inside transforms_path.
#
# Paths are yielded in sorted order. Dir.glob only guarantees sorted results on newer
# Rubies, and the caller merges alias maps in iteration order, so an unsorted listing
# could make the generated output differ between platforms.
def each_transform_file(&block)
  pattern = File.join(transforms_path, '*.xml')
  Dir.glob(pattern).sort.each(&block)
end
# Directory holding the transform XML files: the "transforms" folder beneath the
# common_path reported by the :cldr requirement.
def transforms_path
  cldr_requirement = requirements[:cldr]
  File.join(cldr_requirement.common_path, 'transforms')
end
# Directory the generated files are written to, taken from the :output_path entry of
# params. Uses fetch, so a missing entry raises instead of returning nil.
def output_path
  importer_params = params
  importer_params.fetch(:output_path)
end
end
end
end