twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/resources/transforms_importer.rb

Summary

Maintainability
A
0 mins
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'fileutils'
require 'nokogiri'

module TwitterCldr
  module Resources

    class TransformsImporter < Importer

      # Transforms that are skipped during import. Each entry is matched against
      # a parsed transform by its :source and :target values. The list and its
      # entries are frozen so the shared constant cannot be modified at runtime.
      IGNORED_TRANSFORMS = [
        # This transform appears to be broken. Two of its rules that deal with separator
        # characters don't make sense to me:
        #
        # "[:Separator:]* > ' ';"
        # "$space = [:Separator:]*;"
        #
        # Why would zero characters be replaced with a space? And why would zero characters
        # be considered a separator?
        #
        # More importantly, the algorithm in this library works with every other transform,
        # provisional and otherwise, except this one. Something's amiss with these rules.
        # ICU probably works because they, once again, fixed the rules but neglected to
        # upstream them into CLDR.
        #
        { source: 'ug', target: 'Latin' }.freeze
      ].freeze

      # Importer configuration.
      # NOTE(review): these class-level macros are presumably provided by
      # Importer, which is not shown in this file - confirm their exact
      # semantics there.
      #
      # The :cldr requirement is read back through requirements[:cldr] when
      # locating the transform XML files (see transforms_path).
      requirement :cldr, Versions.cldr_version
      output_path 'shared/transforms'
      ruby_engine :mri

      private

      # Runs the import: every transform XML file is parsed, ignored transforms
      # are dropped, and the remaining transforms are written to a YAML file
      # named after the XML file. Files left with no transforms are skipped.
      # Once all files are processed, the accumulated transform id => file name
      # map is written out as well.
      def execute
        FileUtils.mkdir_p(output_path)

        id_map = {}

        each_transform_file do |xml_path|
          transforms = parse_transform_data(File.read(xml_path)).reject do |datum|
            ignored_transform?(datum)
          end

          unless transforms.empty?
            yaml_name = "#{File.basename(xml_path).chomp('.xml')}.yml"
            yaml_path = File.join(output_path, yaml_name)

            id_map.merge!(map_aliases(transforms, yaml_path))
            write_transform_data(transforms, yaml_path)
          end
        end

        write_transform_id_map(id_map)
      end

      # Returns true when the given transform's :source and :target both match
      # an entry in IGNORED_TRANSFORMS, false otherwise.
      def ignored_transform?(transform_data)
        IGNORED_TRANSFORMS.any? do |ignored|
          [:source, :target].all? { |key| ignored[key] == transform_data[key] }
        end
      end

      # Builds a Hash that maps every id under which the given transforms can be
      # referenced to the base name (no directory, no ".yml") of the file at
      # path. For each transform the ids are its declared aliases, if any,
      # followed by the id joined from its source, target and variant.
      def map_aliases(transform_data, path)
        base_name = File.basename(path).chomp('.yml')

        transform_data.each_with_object({}) do |datum, id_map|
          joined_id = join_transform_id(datum[:source], datum[:target], datum[:variant])
          ids = (datum[:aliases] || []) + [joined_id]

          # Re-assigning an id that is already present keeps its original
          # position, so key order follows first occurrence.
          ids.each { |id| id_map[id] = base_name }
        end
      end

      # Builds a transform id from its parts by delegating to
      # TwitterCldr::Transforms::TransformId.join. map_aliases uses the result as
      # one of the keys of the transform id map; variant is nil when the
      # transform node has no "variant" attribute.
      def join_transform_id(source, target, variant)
        TwitterCldr::Transforms::TransformId.join(source, target, variant)
      end

      # Parses id_str with TwitterCldr::Transforms::TransformId.parse and returns
      # the string form (to_s) of the parsed id.
      # NOTE(review): no other method in this class calls this helper - confirm
      # whether it is still needed.
      def normalize_transform_id(id_str)
        TwitterCldr::Transforms::TransformId.parse(id_str).to_s
      end

      # Serializes the given transforms under a top-level :transforms key and
      # writes the result to path as UTF-8, replacing any existing file.
      def write_transform_data(transform_data, path)
        File.open(path, 'w:utf-8') do |file|
          payload = TwitterCldr::Utils.deep_symbolize_keys(transforms: transform_data)
          file.write(TwitterCldr::Utils::YAML.dump(payload, use_natural_symbols: true))
        end
      end

      # Writes the transform id => file name lookup table to
      # "transform_id_map.yml" inside output_path, replacing any existing file.
      #
      # The file is opened write-only with an explicit UTF-8 external encoding,
      # the same mode write_transform_data uses, so the output does not depend
      # on the process's default encoding.
      #
      # NOTE(review): this dumps with the bare YAML constant, whereas
      # write_transform_data uses TwitterCldr::Utils::YAML - presumably
      # intentional; confirm.
      def write_transform_id_map(transform_id_map)
        File.open(File.join(output_path, 'transform_id_map.yml'), 'w:utf-8') do |output|
          output.write(YAML.dump(transform_id_map))
        end
      end

      # Parses the XML text of a transform file and returns one Hash per
      # supplementalData/transforms/transform node with the keys :source,
      # :target, :aliases, :variant, :direction and :rules.
      #
      # source, target and direction are read unconditionally, so each transform
      # node is expected to carry those attributes; aliases and variant are nil
      # when their attribute is absent.
      def parse_transform_data(transform_data)
        transform_nodes =
          Nokogiri.XML(transform_data).xpath('supplementalData/transforms/transform')

        transform_nodes.map do |node|
          attribute_value = lambda { |name| node.attribute(name).value }

          {
            source: attribute_value.call('source'),
            target: attribute_value.call('target'),
            aliases: get_aliases(node),
            variant: get_variant(node),
            direction: attribute_value.call('direction'),
            rules: rules(node)
          }
        end
      end

      # Returns the whitespace-separated entries of the node's "alias" attribute
      # as an Array of Strings, or nil when the node has no such attribute.
      def get_aliases(node)
        alias_attr = node.attribute('alias')
        return unless alias_attr

        alias_attr.value.split(' ')
      end

      # Returns the value of the node's "variant" attribute, or nil when the
      # node has no such attribute.
      def get_variant(node)
        variant_attr = node.attribute('variant')
        variant_attr ? variant_attr.value : nil
      end

      # Collects the rule lines of a transform node: the content of every tRule
      # child is passed through fix_rule, split into lines and stripped; wrapped
      # lines are then merged by fix_rule_wrapping. Blank lines and lines that
      # start with '#' are dropped from the result.
      def rules(transform_node)
        raw_lines = transform_node.xpath('tRule').flat_map do |rule_node|
          fix_rule(rule_node.content).split("\n").map(&:strip)
        end

        fix_rule_wrapping(raw_lines).select do |line|
          stripped = line.strip
          !stripped.empty? && !stripped.start_with?('#')
        end
      end

      # Merges rules that were wrapped across several lines back into single
      # rules. A line ending in a backslash continues on the next line: the
      # trailing backslash is dropped and the next line is appended in its
      # place. Chains of continuation lines are merged into one rule.
      #
      # rules - Array of rule lines in document order.
      #
      # Returns a new Array of Strings; the strings passed in are not modified.
      def fix_rule_wrapping(rules)
        continued = false

        rules.each_with_object([]) do |rule, merged|
          if continued
            # Concatenate rather than sub!(pattern, rule): String#sub treats
            # backslash sequences in a replacement string (\', \\, \0, \1, ...)
            # specially, which would corrupt escaped characters in the rule.
            # Building a new string also leaves the caller's strings intact.
            merged[-1] = merged.last.chomp('\\') + rule
          else
            merged << rule
          end

          continued = rule.end_with?('\\')
        end
      end

      # Returns a copy of rule in which the arrow characters are replaced by
      # their ASCII operator equivalents: "←" by '<', "→" by '>' and "↔" by '<>'.
      def fix_rule(rule)
        ascii_operators = { "←" => '<', "→" => '>', "↔" => '<>' }

        rule.gsub(/[←→↔]/, ascii_operators)
      end

      # Yields the path of every "*.xml" file directly inside transforms_path,
      # in sorted order.
      #
      # Sorting makes the import deterministic: Dir.glob does not guarantee an
      # order on Ruby versions before 3.0, and the order in which files are
      # processed decides the key order of the transform id map and which file
      # wins when two files declare the same id.
      def each_transform_file(&block)
        Dir.glob(File.join(transforms_path, '*.xml')).sort.each(&block)
      end

      # Directory that is searched for transform XML files: the "transforms"
      # subdirectory of the :cldr requirement's common_path.
      def transforms_path
        File.join(requirements[:cldr].common_path, 'transforms')
      end

      # Directory the generated YAML files are written to, read from the
      # importer's params under :output_path.
      # NOTE(review): params is presumably a Hash supplied by Importer, in which
      # case fetch raises KeyError when :output_path is missing - confirm.
      def output_path
        params.fetch(:output_path)
      end

    end

  end
end