twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/resources/list_formats_importer.rb

Summary

Maintainability
A
0 mins
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'nokogiri'
require 'parallel'
require 'etc'
require 'set'

module TwitterCldr
  module Resources

    class ListFormatsImporter < Importer

      requirement :cldr, Versions.cldr_version
      output_path 'locales'
      locales TwitterCldr.supported_locales
      ruby_engine :mri

      private

      def execute
        locales = Set.new

        finish = -> (locale, *) do
          locales.add(locale)
          STDOUT.write "\rImported #{locale}, #{locales.size} of #{params[:locales].size} total"
        end

        Parallel.each(params[:locales], in_processes: Etc.nprocessors, finish: finish) do |locale|
          import_locale(locale)
          locales << locale
        end

        puts
      end

      def import_locale(locale)
        # The merging that happens here works at the listPatternPart level of granularity.
        # In other words, a missing part will be filled in by any part with the same key
        # in the locale's ancestor chain. The raw CLDR data contains the inheritance marker
        # (i.e. "↑↑↑") for listPatterns that are missing parts, but the expanded data we
        # get in the downloadable CLDR zip file doesn't include them or the inherited data,
        # making it impossible for TwitterCLDR to know how it should handle missing keys.
        # I believe whatever massage tool the CLDR maintainers use to generate the final
        # data set doesn't take aliases into account, which explains the holes in the data.
        # By allowing individual listPatternParts to be populated by data from ancestor
        # locales, we fill in any missing parts at the minor risk of being slightly wrong
        # when formatting lists. In my opinion, it's far better to produce a slightly wrong
        # string than to error or produce an entirely empty string.
        data = requirements[:cldr].build_data(locale) do |ancestor_locale|
          ListFormats.new(ancestor_locale, requirements[:cldr]).to_h
        end

        output_file = File.join(output_path, locale.to_s, 'lists.yml')

        File.open(output_file, 'w:utf-8') do |output|
          output.write(
            TwitterCldr::Utils::YAML.dump(
              TwitterCldr::Utils.deep_symbolize_keys(locale => data),
              use_natural_symbols: true
            )
          )
        end
      end

      def output_path
        params.fetch(:output_path)
      end

    end


    class ListFormats

      attr_reader :locale, :cldr_req

      def initialize(locale, cldr_req)
        @locale = locale
        @cldr_req = cldr_req
      end

      def to_h
        { lists: lists }
      end

      def lists
        doc.xpath('//ldml/listPatterns/listPattern').each_with_object({}) do |pattern_node, pattern_result|
          pattern_type = if attribute = pattern_node.attribute('type')
            attribute.value.to_sym
          else
            :default
          end

          pattern_node = pattern_for(pattern_type)

          pattern_result[pattern_type] = pattern_node.xpath('listPatternPart').each_with_object({}) do |type_node, type_result|
            type_result[type_node.attribute('type').value.to_sym] = type_node.content
          end
        end
      end

      def pattern_for(type)
        xpath = xpath_for(type)
        pattern_node = doc.xpath(xpath)[0]
        alias_node = pattern_node.xpath('alias')[0]

        if alias_node
          alias_type = alias_node.attribute('path').value[/@type='([\w-]+)'/, 1] || :default
          # follow aliases so we can fully expand them
          pattern_node = pattern_for(alias_type)
        end

        pattern_node
      end

      def xpath_for(type)
        if type == :default
          '//ldml/listPatterns/listPattern[not(@type)]'
        else
          "//ldml/listPatterns/listPattern[@type='#{type}']"
        end
      end

      def doc
        @doc ||= begin
          locale_fs = locale.to_s.gsub('-', '_')
          Nokogiri.XML(File.read(File.join(cldr_main_path, "#{locale_fs}.xml")))
        end
      end

      def cldr_main_path
        @cldr_main_path ||= File.join(cldr_req.common_path, 'main')
      end

    end

  end
end