GlobalNamesArchitecture/dwc-archive

View on GitHub
lib/dwc_archive/ingester.rb

Summary

Maintainability
A
2 hrs
Test Coverage
class DarwinCore
  # This module abstracts information for reading csv file to be used
  # in several classes which need such functionality.
  #
  # The module reads @dwc, @path and @data but never assigns them, so the
  # including class has to set them and then run init_attributes before
  # the public readers or #read are used.
  module Ingester
    attr_reader :data, :properties, :encoding, :file_path, :fields,
                :line_separator, :quote_character, :ignore_headers

    # Column separator used to split each line of the data file.
    #
    # The value lives in @field_separator (singular), so a generated
    # attr_reader for :fields_separator would always return nil.
    #
    # @return [String, nil] nil until init_attributes has run
    def fields_separator
      @field_separator
    end

    # Number of newline characters in the data file. Computed on the first
    # call and memoized afterwards.
    #
    # @return [Integer]
    def size
      @size ||= init_size
    end

    # Parses the data file and sorts its rows into accepted rows and errors.
    #
    # A row is an error when it has fewer columns than the highest declared
    # field index requires, or when its content is not valid UTF-8.
    #
    # Without a block every row is accumulated and returned at the end.
    # With a block, [rows, errors] is yielded each time the row index is a
    # non-zero multiple of batch_size, and once more after the last row;
    # both arrays start over empty after every in-loop yield.
    #
    # @param batch_size [Integer] rows between progress logs / yields
    # @yield [Array] two-element array: [rows, errors]
    # @return [Array] [rows, errors] collected since the last in-loop yield
    #   (everything, when no block is given)
    def read(batch_size = 10_000)
      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
      res = []
      errors = []
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      # File.open with a block closes the handle even if parsing raises, and
      # unlike Kernel#open it never treats a leading "|" as a command.
      File.open(@file_path) do |io|
        CSV.new(io, **define_csv_args).each_with_index do |r, i|
          next if @ignore_headers && i.zero?

          if r.size < min_size
            errors << r
          else
            process_csv_row(res, errors, r)
          end
          next if i.zero? || i % batch_size != 0

          DarwinCore.logger_write(@dwc.object_id,
                                  format("Ingested %s records from %s",
                                         i, name))
          next unless block_given?

          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    # Builds the keyword arguments passed to CSV.new.
    #
    # When no quote character is configured, @quote_character is replaced
    # by "\b" (backspace) and that value is handed to CSV as quote_char.
    def define_csv_args
      @quote_character = "\b" if @quote_character.to_s.empty?
      { col_sep: @field_separator, quote_char: @quote_character }
    end

    # Lower-cased last segment of the including class name; used in logs.
    def name
      self.class.to_s.split("::").last.downcase
    end

    # Appends the row to result when its joined content is valid UTF-8,
    # otherwise appends it to errors. Accepted fields are tagged as UTF-8
    # in place; nil fields are kept as nil.
    def process_csv_row(result, errors, row)
      joined = row.join.force_encoding("utf-8")
      if joined.valid_encoding?
        result << row.map { |f| f&.force_encoding("utf-8") }
      else
        errors << row
      end
    end

    # Derives all reader state from @data. Raises whatever init_encoding,
    # init_file_path or init_fields raise.
    def init_attributes
      @properties = @data[:attributes]
      init_encoding
      @field_separator = init_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ""
      @line_separator = @properties[:linesTerminatedBy] || "\n"
      @ignore_headers = @properties[:ignoreHeaderLines] &&
                        [1, true].include?(@properties[:ignoreHeaderLines])
      init_file_path
      init_fields
    end

    # Stores the declared encoding (default "UTF-8").
    #
    # @raise [DarwinCore::EncodingError] for anything but utf-8 / utf-16
    def init_encoding
      @encoding = @properties[:encoding] || "UTF-8"
      return if %w[utf-8 utf8 utf-16 utf16].include?(@encoding.downcase)

      raise(
        DarwinCore::EncodingError,
        "No support for encodings other than utf-8 or utf-16 at the moment"
      )
    end

    # Resolves the data file location relative to @path.
    #
    # The location is validated before File.join is called: File.join
    # raises TypeError on nil and otherwise always returns a String, so a
    # check on the joined path could never fire.
    #
    # @raise [DarwinCore::FileNotFoundError] when no location is declared
    def init_file_path
      file = @data[:location] ||
             @data[:attributes][:location] ||
             (@data[:files] || {})[:location]
      raise DarwinCore::FileNotFoundError, "No file data" unless file

      @file_path = File.join(@path, file)
    end

    # Normalizes @data[:field] to an array and extracts the attributes of
    # every field. A single field is wrapped; a missing one becomes [].
    #
    # @raise [DarwinCore::InvalidArchiveError] when there are no fields
    def init_fields
      declared = @data[:field]
      declared = [declared] unless declared.is_a?(Array)
      @data[:field] = declared.compact
      @fields = @data[:field].map { |f| f[:attributes] }
      return unless @fields.empty?

      raise DarwinCore::InvalidArchiveError,
            "No data fields are found"
    end

    # Declared column separator (default ","); the two-character literal
    # backslash + "t" is translated into a real tab.
    def init_field_separator
      res = @properties[:fieldsTerminatedBy] || ","
      res = "\t" if res == "\\t"
      res
    end

    # Counts newline characters in the data file, as `wc -l` does, without
    # a shell: paths with spaces or shell metacharacters are safe and no
    # external binary is needed.
    def init_size
      newlines = 0
      File.open(@file_path, "rb") do |io|
        # Fixed-size chunks keep memory flat for large files.
        while (chunk = io.read(65_536))
          newlines += chunk.count("\n")
        end
      end
      newlines
    end
  end
end