daddyz/phonelib

View on GitHub
lib/phonelib/data_importer.rb

Summary

Maintainability
A
25 mins
Test Coverage
require 'phonelib/data_importer_helper'

module Phonelib
  # @private module processes creation of data files needed for this gem
  module DataImporter
    require 'nokogiri'

    # official libphonenumber repo for cloning; the project lives under the
    # `google` GitHub organisation (the former `googlei18n` name only works
    # through a redirect)
    REPO = 'https://github.com/google/libphonenumber.git'.freeze

    # Entry point of the import. Creating an Importer is enough to run the
    # whole import, because Importer#initialize calls run_import itself.
    #
    # @return [Importer] the importer instance created for this run
    def self.import
      Importer.new
    end

    # @private class with functionality for importing data
    class Importer
      include Phonelib::DataImporterHelper

      # countries that can have double country prefix in number
      DOUBLE_COUNTRY_CODES_COUNTRIES = %w[IN DE BR IT NO PL CU VN].freeze

      # country whose formats are incomplete => country whose formats are
      # appended to it (see process_format_links)
      FORMAT_SHARING = {
        'CA' => 'US',
        'CC' => 'AU',
        'CX' => 'AU',
        'DM' => 'US',
        'DO' => 'US',
        'EH' => 'MA',
        'GD' => 'US',
        'GG' => 'GB',
        'GU' => 'US',
        'IM' => 'GB',
        'JE' => 'GB',
        'JM' => 'US',
        'KN' => 'US',
        'KY' => 'US',
        'KZ' => 'RU',
        'LC' => 'US',
        'MF' => 'GP',
        'MP' => 'US',
        'MS' => 'US',
        'PR' => 'US',
        'SJ' => 'NO',
        'SX' => 'US',
        'TA' => 'SH',
        'TC' => 'US',
        'TT' => 'US',
        'UM' => 'US',
        'VA' => 'IT',
        'VC' => 'US',
        'VG' => 'US',
        'VI' => 'US',
        'YT' => 'RE',
        'AG' => 'US',
        'AI' => 'US',
        'AS' => 'US',
        'AX' => 'FI',
        'BB' => 'US',
        'BL' => 'GP',
        'BM' => 'US',
        'BQ' => 'CW',
        'BS' => 'US',
      }.freeze

      # All paths below are relative to the cloned repo root and are frozen so
      # they cannot be modified in place by accident.

      # main data file in repo
      MAIN_FILE = 'resources/PhoneNumberMetadata.xml'.freeze
      # short number metadata
      SHORT_DATA_FILE = 'resources/ShortNumberMetadata.xml'.freeze
      # alternate formats data file in repo
      FORMATS_FILE = 'resources/PhoneNumberAlternateFormats.xml'.freeze
      # geocoding data dir in repo
      GEOCODING_DIR = 'resources/geocoding/en/'.freeze
      # carrier data dir in repo
      CARRIER_DIR = 'resources/carrier/en/'.freeze
      # timezones data dir in repo
      TIMEZONES_DIR = 'resources/timezones/'.freeze

      # Prepares empty containers for everything the import collects and then
      # starts the import straight away.
      def initialize
        checkout_dir = "#{File.dirname(__FILE__)}/../../data/libphonenumber/"
        @destination = File.path(checkout_dir)

        # hash-shaped state
        @data, @prefixes, @countries = {}, {}, {}
        # list-shaped state
        @geo_names, @timezones, @carriers = [], [], []

        run_import
      end

      private

      # Runs every import stage in a fixed order: the repo is cloned first,
      # the individual data sets are read next and the two save steps come
      # last.
      def run_import
        stages = %i[
          clone_repo
          import_main_data
          import_short_data
          import_alternate_formats
          process_format_links
          import_geocoding_data
          import_timezone_data
          import_carrier_data
          import_country_names
          save_data_file
          save_extended_data_file
        ]

        stages.each { |stage| send(stage) }
      end

      # Replaces whatever is at @destination with a fresh shallow clone
      # (depth 1, branch master) of the libphonenumber repo.
      #
      # The old checkout is removed with FileUtils and git receives its
      # arguments as a list, so no shell is involved and a destination path
      # containing spaces or shell metacharacters is handled correctly.
      #
      # @raise [RuntimeError] when git is missing or the clone fails
      def clone_repo
        # loaded here so the block does not depend on a file-level require
        require 'fileutils'

        repo = Phonelib::DataImporter::REPO

        FileUtils.rm_rf(@destination)
        cloned = system('git', 'clone', repo, @destination,
                        '--depth', '1', '-b', 'master')
        raise 'Could not clone repo' unless cloned
      end

      # Reads the main metadata XML and stores one hash per top-level element
      # in @data, keyed by the element's :id attribute.
      def import_main_data
        puts 'IMPORTING MAIN DATA'
        xml_path = "#{@destination}#{MAIN_FILE}"
        rule_key = Core::NATIONAL_PREFIX_TRANSFORM_RULE

        main_from_xml(xml_path).elements.each do |territory|
          entry = hash_from_xml(territory, :attributes)
          entry.merge!(types_and_formats(territory.children))
          entry = add_double_country_flag(entry)
          # every "$" in the transform rule becomes a backslash - presumably
          # to turn "$1"-style group references into Ruby's "\1" form
          entry[rule_key].gsub!('$', '\\') if entry[rule_key]
          @data[entry[:id]] = entry
        end
      end

      # Reads the short number metadata XML and merges every parsed type of
      # each element into the already imported main data.
      def import_short_data
        puts 'IMPORTING SHORT NUMBER DATA'
        xml_path = "#{@destination}#{SHORT_DATA_FILE}"

        main_from_xml(xml_path).elements.each do |territory|
          short = hash_from_xml(territory, :attributes)
          short.merge!(types_and_formats(territory.children))

          short[:types].each_pair do |type, patterns|
            merge_short_with_main_type(short[:id], type, patterns)
          end
        end
      end

      # Reads the alternate formats XML and appends the formats found under
      # each "availableFormats" node to the matching entry in @data.
      def import_alternate_formats
        puts 'IMPORTING ALTERNATE FORMATS'

        xml_path = "#{@destination}#{FORMATS_FILE}"
        main_from_xml(xml_path).elements.each do |territory|
          format_nodes = territory.children.select do |node|
            node.name == 'availableFormats'
          end

          format_nodes.each do |node|
            # the XML identifies the element by calling code only, so the id
            # used as key in @data has to be looked up
            id = country_by_code(territory.attribute('countryCode').value)
            entry = @data[id]
            entry[:formats] = entry[:formats] + parse_formats(node.children)
          end
        end
      end

      # Some countries lack formats of their own and borrow them from another
      # country: for every FORMAT_SHARING pair the lender's formats are
      # appended to the borrower's. Borrowers absent from @data are skipped.
      def process_format_links
        FORMAT_SHARING.each_pair do |borrower, lender|
          target = @data[borrower]
          next unless target

          target[:formats] ||= []
          target[:formats] += @data[lender][:formats]
        end
      end

      # Imports every file of the geocoding dir; names are collected in
      # @geo_names and registered under key :g.
      def import_geocoding_data
        puts 'IMPORTING GEOCODING DATA'
        pattern = "#{@destination}#{GEOCODING_DIR}*"
        import_raw_files_data(pattern, @geo_names, :g)
      end

      # Imports every file of the timezones dir; names are collected in
      # @timezones and registered under key :t.
      def import_timezone_data
        puts 'IMPORTING TIMEZONES DATA'
        pattern = "#{@destination}#{TIMEZONES_DIR}*"
        import_raw_files_data(pattern, @timezones, :t)
      end

      # Imports every file of the carrier dir; names are collected in
      # @carriers and registered under key :c.
      def import_carrier_data
        puts 'IMPORTING CARRIER DATA'
        pattern = "#{@destination}#{CARRIER_DIR}*"
        import_raw_files_data(pattern, @carriers, :c)
      end

      # Downloads the tab-separated GeoNames country table and fills
      # @countries with "two-character code (column 0) => name (column 4)".
      #
      # CSV options are passed as keywords: a positional options hash raises
      # ArgumentError on Ruby 3. The download is opened in block form so the
      # handle is closed once parsing is done.
      def import_country_names
        puts 'IMPORTING COUNTRY NAMES'

        # loaded lazily: only this step needs networking and CSV parsing
        require 'open-uri'
        require 'csv'

        URI.open('http://download.geonames.org/export/dump/countryInfo.txt') do |io|
          CSV.new(io, col_sep: "\t").each do |row|
            code = row[0]
            # skip blank rows, "#" comment rows and anything whose first
            # column is not exactly two characters long
            next if code.nil? || code.start_with?('#') || code.size != 2

            @countries[code] = row[4]
          end
        end
      end

      # Marks the country hash with double_prefix: true when its :id is listed
      # in DOUBLE_COUNTRY_CODES_COUNTRIES; returns the same hash either way.
      def add_double_country_flag(country)
        allows_double = DOUBLE_COUNTRY_CODES_COUNTRIES.include?(country[:id])
        country[:double_prefix] = true if allows_double
        country
      end

      # Splits the child nodes of an element into number types and formats.
      # "references" nodes are ignored, "availableFormats" supplies :formats
      # and every other node becomes an entry of :types keyed by its name.
      def types_and_formats(children)
        parsed = { types: {}, formats: [] }

        without_comments(children).each do |node|
          case node.name
          when 'references'
            next
          when 'availableFormats'
            parsed[:formats] = parse_formats(node.children)
          else
            parsed[:types][name2sym(node.name)] = hash_from_xml(node, :element)
          end
        end

        fill_possible_to_types_if_nil(parsed)
      end

      # Stores short number patterns under Core::SHORT of the given type in
      # the main data. A key that already holds a pattern gets the new one
      # appended after a "|".
      def merge_short_with_main_type(country_id, type, data)
        types = @data[country_id][:types]
        types[type] ||= {}
        short = (types[type][Core::SHORT] ||= {})

        data.each do |key, pattern|
          existing = short[key]
          short[key] = existing ? existing + "|#{pattern}" : pattern
        end
      end

      # Adds a possible pattern to every type that has a valid pattern but no
      # possible pattern yet. The general type gets the combination of all
      # possible patterns, any other type reuses its own valid pattern.
      # Returns the given result hash.
      def fill_possible_to_types_if_nil(result)
        all_types = result[:types]

        all_types.each do |type, data|
          next unless data[Core::VALID_PATTERN]
          next if data[Core::POSSIBLE_PATTERN]

          data[Core::POSSIBLE_PATTERN] =
            case type
            when Core::GENERAL then national_possible(all_types)
            else data[Core::VALID_PATTERN]
            end
        end

        result
      end

      # Collects the possible patterns of all types, splits them on "|",
      # removes duplicates and joins the alternatives with "|" again.
      def national_possible(types)
        patterns = types.each_value.map do |details|
          details[:possible_number_pattern]
        end

        patterns.compact.flat_map { |pattern| pattern.split('|') }.uniq.join('|')
      end

      # Builds one hash per format node: it starts from the hash produced by
      # hash_from_xml and gets one cleaned-up entry for each child node.
      def parse_formats(formats_children)
        without_comments(formats_children).map do |format_node|
          hash_from_xml(format_node, :children).tap do |parsed|
            without_comments(format_node.children).each do |part|
              key = name2sym(part.name)
              parsed[key] = str_clean(part.children.first,
                                      not_format?(part.name))
            end
          end
        end
      end

      # Parses every file matching the glob +dir+. Each distinct name is
      # pushed to +var+ once and every prefix is registered in @prefixes with
      # the index of its name in +var+, under +key+.
      #
      # Files are handled in sorted order: Dir[] only sorts its result by
      # default since Ruby 3.0, and the indexes stored here depend on the
      # order in which names are first seen.
      def import_raw_files_data(dir, var, key)
        name2index = {}
        Dir[dir].sort.each do |file|
          parse_raw_file(file).each do |prefix, name|
            unless name2index[name]
              var.push name
              name2index[name] = var.size - 1
            end

            @prefixes = fill_prefixes(key, name2index[name], prefix, @prefixes)
          end
        end
      end

      # Returns the id of the @data entry with the given country code. When
      # several entries share the code, only those flagged with
      # main_country_for_code == 'true' are considered. Returns nil when
      # nothing is left.
      def country_by_code(country_code)
        candidates = @data.select do |_id, info|
          info[:country_code] == country_code
        end

        if candidates.size > 1
          candidates = candidates.select do |_id, info|
            info[:main_country_for_code] == 'true'
          end
        end

        candidates.keys.first
      end
    end
  end
end