twitter/twitter-cldr-rb

View on GitHub
lib/twitter_cldr/resources/language_codes_importer.rb

Summary

Maintainability
A
1 hr
Test Coverage
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'open-uri'

module TwitterCldr
  module Resources

    class LanguageCodesImporter < Importer

      # Names under which the downloaded source files are stored locally
      # (see source_path_for).
      BCP_47_FILE, ISO_639_FILE = %w[bcp-47.txt iso-639.txt].map(&:freeze)

      # Maps each local file name to the URL its content is downloaded from.
      # Frozen like the other constants of this class, so the download list
      # cannot be modified by accident at runtime.
      INPUT_DATA = {
        BCP_47_FILE  => 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry',

        # docs: https://iso639-3.sil.org/code_tables/download_tables#639-3%20Code%20Set
        ISO_639_FILE => 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab'
      }.freeze

      # Maps the field names used while importing (ISO 639-3 column names,
      # see ISO_639_COLUMNS, plus the two BCP 47 keys) to the names of the
      # standards under which the codes are stored.
      KEYS_TO_STANDARDS = {
        :Part1      => :iso_639_1,
        :Part2B     => :iso_639_2,
        :Part2T     => :iso_639_2_term,
        :Id         => :iso_639_3,
        :bcp_47     => :bcp_47,
        :bcp_47_alt => :bcp_47_alt
      }.freeze

      # Reverse lookup of KEYS_TO_STANDARDS: standard name => field name.
      STANDARDS_TO_KEYS = KEYS_TO_STANDARDS.map { |key, standard| [standard, key] }.to_h.freeze

      # Class-level importer settings. Both macros are presumably provided by
      # the Importer base class, which is not visible here (TODO confirm).
      # NOTE(review): 'shared' looks like the location that #write later reads
      # back through params[:output_path] - verify against Importer.
      output_path 'shared'
      ruby_engine :mri

      # Everything defined below is private to the importer.
      private

      # Runs the import: first makes sure the source files are available
      # locally, then builds and writes the language codes table from them.
      # Presumably invoked by the Importer base class (TODO confirm).
      def execute
        prepare_data
        import_data
      end

      # Downloads every file listed in INPUT_DATA to its local source path,
      # skipping the files that are already present.
      #
      # Errors raised while downloading or writing are not rescued here.
      def prepare_data
        INPUT_DATA.each do |file_name, url|
          source_path = source_path_for(file_name)
          next if File.exist?(source_path)

          # Fetch the complete payload before touching the file system: if the
          # download fails, no (empty) file may be left behind, because it
          # would pass the existence check above and never be fetched again.
          # The block form also closes the remote handle once it has been read.
          payload = URI.open(url, &:read)

          File.binwrite(source_path, payload)
        end
      end

      # Returns the path of the given source file inside the vendor directory.
      def source_path_for(file)
        vendor_dir = TwitterCldr::VENDOR_DIR
        File.join(vendor_dir, file)
      end

      # Collects the codes from both source files, orders them, builds the
      # lookup table and writes it, marshalled, to 'language_codes_table.dump'.
      def import_data
        codes_by_name = import_bcp_47(import_iso_639)

        # Languages are ordered by name and every language's codes by
        # standard before the table is built.
        sorted_codes = codes_by_name
          .map { |name, codes| [name, codes.sort.to_h] }
          .sort
          .to_h

        table = build_table(sorted_codes)

        write('language_codes_table.dump', Marshal.dump(table))
      end

      # Writes +data+ to +file+ inside the directory given by
      # params[:output_path].
      #
      # file - name of the file, relative to the output directory
      # data - String holding the raw bytes to store
      #
      # Returns the number of bytes written.
      def write(file, data)
        # Binary mode: the payload written by this importer is Marshal output,
        # which must reach the disk byte for byte (text mode may translate
        # line endings on some platforms).
        File.binwrite(File.join(params.fetch(:output_path), file), data)
      end

      # Reads the local copy of the ISO 639-3 code table and collects, per
      # language reference name, the codes of the ISO 639 standards that are
      # present for it. Names and codes are stored as symbols.
      #
      # Generates codes in the following format:
      #
      # {
      #   :Albanian => {
      #     :iso_639_1      => :sq,
      #     :iso_639_2      => :alb, # default (bibliographic) code
      #     :iso_639_2_term => :sqi, # terminology code (optional)
      #     :iso_639_3      => :sqi
      #   }
      # }
      #
      # result - Hash the codes are added to (modified in place).
      #
      # Returns +result+.
      def import_iso_639(result = {})
        # Read as UTF-8 explicitly so the generated names do not depend on the
        # locale of the machine that runs the importer.
        File.open(source_path_for(ISO_639_FILE), encoding: 'UTF-8') do |file|
          file.gets # skip header (returns nil, without raising, for an empty file)

          file.each_line do |line|
            # drop the quotes around quoted values together with any tabs inside
            # them, so that splitting on tabs below yields one field per column
            entry = line.chomp.gsub(/"(.*)"/) { Regexp.last_match(1).delete("\t") }
            data  = ISO_639_COLUMNS.zip(entry.split("\t")).to_h

            name = data[:Ref_Name]
            next if name.nil? || name.empty? # blank or truncated line

            codes = result[name.to_sym] ||= {}

            # only the ISO columns can be present here; the BCP 47 keys of
            # STANDARDS_TO_KEYS are never set in +data+ and are skipped
            STANDARDS_TO_KEYS.each do |standard, column|
              value = data[column]
              codes[standard] = value.to_sym if value && !value.empty?
            end
          end
        end

        result
      end

      # Reads the local copy of the BCP 47 subtag registry and hands every
      # record to process_bcp_47_data, which adds its codes to +result+.
      #
      # Records are separated by '%%' lines. A record consists of
      # "Key: value" fields; a field may be folded over several lines, in
      # which case the continuation lines start with whitespace.
      #
      # Generates codes in the following format:
      #
      # {
      #   :Bangka => {
      #       :bcp_47     => :mfb,     # preferred code
      #       :bcp_47_alt => :"ms-mfb" # alternative code (optional)
      #   }
      # }
      #
      # result - Hash the codes are added to (modified in place).
      #
      # Returns +result+.
      def import_bcp_47(result = {})
        # Read as UTF-8 explicitly so the generated names do not depend on the
        # locale of the machine that runs the importer.
        File.open(source_path_for(BCP_47_FILE), encoding: 'UTF-8') do |file|
          file.gets # skip header (returns nil, without raising, for an empty file)

          data  = {}
          entry = ''.dup

          file.each_line do |raw_line|
            line = raw_line.chomp

            if line == '%%'
              # end of record: flush the pending field, then the record itself
              process_bcp_47_entry(entry, data)
              process_bcp_47_data(data, result)
              entry = ''.dup
            elsif line.start_with?(' ', "\t") || !line.include?(':')
              # Continuation of a folded field. Leading whitespace has to be
              # checked first: a continuation line may itself contain a colon
              # and must not be mistaken for a new field.
              entry << line
            else
              # a new field starts: flush the previous one
              process_bcp_47_entry(entry, data)
              entry = line.dup
            end
          end

          # the last record is not followed by a separator
          process_bcp_47_entry(entry, data)
          process_bcp_47_data(data, result)
        end

        result
      end

      # Parses a single "Key: value" field and stores it in +data+.
      #
      # 'Description' values are collected, as symbols, in the data['names']
      # array; any other value is stored as a string under its downcased key.
      # Nothing happens for a nil or empty +entry+.
      #
      # Note that +entry+ is emptied in place once it has been processed.
      def process_bcp_47_entry(entry, data)
        return if entry.nil? || entry.empty?

        field, content = entry.chomp.split(':', 2).map(&:strip)

        case field
        when 'Description'
          data['names'] ||= []
          data['names'] << content.to_sym
        else
          data[field.downcase] = content
        end

        entry.clear
      end

      # Adds the codes of one parsed registry record to +result+ and then
      # empties +data+ so it can be reused for the next record.
      #
      # Only 'language' and 'extlang' records are imported; records whose
      # scope is 'collection' and 'Private use' records are skipped.
      #
      # data   - Hash filled by process_bcp_47_entry; the keys read here are
      #          'type', 'scope', 'names', 'subtag', 'prefix' and
      #          'preferred-value'
      # result - Hash of codes by language name (modified in place)
      #
      # Returns the emptied +data+ hash.
      def process_bcp_47_data(data, result)
        # a record without any description simply has no names
        names = Array(data['names'])

        # names are stored as symbols, so they are compared by their text
        private_use = names.any? { |name| name.to_s == 'Private use' }

        importable = %w[language extlang].include?(data['type']) &&
                     data['scope'] != 'collection' &&
                     !private_use

        if importable
          existing_names = names.select { |name| result.key?(name) }

          preferred   = data['preferred-value']
          alternative = [data['prefix'], data['subtag']].compact.join('-')

          # The preferred value, if there is one, becomes the main code and
          # the prefixed subtag the alternative one.
          bcp_47 = { bcp_47: (preferred || alternative).to_sym }
          bcp_47[:bcp_47_alt] = alternative.to_sym if preferred

          existing_names.each do |name|
            result[name.to_sym].merge!(bcp_47)
          end

          # names that are new inherit all codes known for the first name that
          # already exists
          bcp_47.merge!(result[existing_names.first]) unless existing_names.empty?

          (names - existing_names).each do |name|
            result[name.to_sym] = bcp_47.dup
          end
        end

        data.clear
      end

      # Builds the lookup table: one index keyed by language name (:name) and
      # one index per standard keyed by code. Every index is sorted by key and
      # all indexes point to the same per-language record objects.
      def build_table(language_codes_map)
        # can't use Hash with default proc here, because we won't be able to marshal this hash later in this case
        indexes = [:name] + KEYS_TO_STANDARDS.values.uniq.sort_by(&:to_s)
        table   = indexes.each_with_object({}) { |index, memo| memo[index] = {} }

        language_codes_map.each do |name, codes|
          table[:name][name] = { name: name }.merge(codes)
        end

        table[:name].each_value do |record|
          STANDARDS_TO_KEYS.each_key do |standard|
            code = record[standard]
            table[standard.to_sym][code.to_sym] = record if code
          end
        end

        # only existing keys are reassigned, which is allowed while iterating
        table.each_key { |index| table[index] = table[index].sort.to_h }
      end

      # Columns of the ISO 639-3 code table, in the order in which they are
      # matched against the tab-separated fields of each line:
      #
      #   Id       - The three-letter 639-3 identifier
      #   Part2B   - Equivalent 639-2 identifier of the bibliographic
      #              applications code set, if there is one
      #   Part2T   - Equivalent 639-2 identifier of the terminology
      #              applications code set, if there is one
      #   Part1    - Equivalent 639-1 identifier, if there is one
      #   Scope    - I(ndividual), M(acrolanguage), S(pecial)
      #   Type     - A(ncient), C(onstructed), E(xtinct), H(istorical),
      #              L(iving), S(pecial)
      #   Ref_Name - Reference language name
      #   Comment  - Comment relating to one or more of the columns
      ISO_639_COLUMNS = %i[Id Part2B Part2T Part1 Scope Type Ref_Name Comment].freeze

    end

  end
end