# lib/dwc_archive/ingester.rb
class DarwinCore
  # This module abstracts information for reading csv file to be used
  # in several classes which need such functionality.
  #
  # The module reads @dwc, @path and @data but never assigns them, so the
  # including class has to set them before calling init_attributes.
  module Ingester
    attr_reader :data, :properties, :encoding, :file_path, :fields,
                :line_separator, :quote_character, :ignore_headers

    # Field separator used when parsing the file. The value lives in
    # @field_separator, so an explicit reader is needed to expose it under
    # the public name fields_separator.
    #
    # @return [String, nil] nil until init_attributes has run
    def fields_separator
      @field_separator
    end

    # Number of newline characters in the data file, computed once and
    # memoized.
    #
    # @return [Integer]
    def size
      @size ||= init_size
    end

    # Parses the data file and sorts its rows into well-formed rows and
    # rejected rows (too few columns or bytes that are not valid UTF-8).
    #
    # Every time the row index is a non-zero multiple of batch_size, progress
    # is logged and, when a block is given, the rows collected so far are
    # yielded as a [rows, errors] pair and the accumulators are reset. The
    # remaining pair is yielded once more after the last row.
    #
    # @param batch_size [Integer] number of row indexes between two batches
    # @yieldparam batch [Array(Array, Array)] [rows, errors] of one batch
    # @return [Array(Array, Array)] the last [rows, errors] pair; with a block
    #   this holds only what was collected after the last full batch
    def read(batch_size = 10_000)
      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
      res = []
      errors = []
      args = define_csv_args
      # A row must be long enough to hold the highest declared field index.
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      # Block form closes the handle even if parsing or the caller's block
      # raises; File.open never interprets the path as a command.
      File.open(@file_path) do |io|
        CSV.new(io, **args).each_with_index do |r, i|
          next if @ignore_headers && i.zero?

          if min_size > r.size
            errors << r
          else
            process_csv_row(res, errors, r)
          end
          next if i.zero? || i % batch_size != 0

          DarwinCore.logger_write(@dwc.object_id,
                                  format("Ingested %s records from %s",
                                         i, name))
          next unless block_given?

          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    # Builds the keyword arguments passed to CSV.new. An empty quote
    # character is replaced by "\b" (and stored back in @quote_character).
    def define_csv_args
      @quote_character = "\b" if @quote_character.empty?
      { col_sep: @field_separator, quote_char: @quote_character }
    end

    # Downcased last segment of the including class name, used in log lines.
    def name
      self.class.to_s.split("::").last.downcase
    end

    # Appends row to result when its joined content is valid UTF-8 (fields
    # are re-tagged as UTF-8 in place), otherwise appends it to errors.
    def process_csv_row(result, errors, row)
      joined = row.join("").force_encoding("utf-8")
      if joined.valid_encoding?
        result << row.map { |f| f&.force_encoding("utf-8") }
      else
        errors << row
      end
    end

    # Populates all reader-backed state from @data[:attributes].
    def init_attributes
      @properties = @data[:attributes]
      init_encoding
      @field_separator = init_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ""
      @line_separator = @properties[:linesTerminatedBy] || "\n"
      @ignore_headers = @properties[:ignoreHeaderLines] &&
                        [1, true].include?(@properties[:ignoreHeaderLines])
      init_file_path
      init_fields
    end

    # Stores the declared encoding (default "UTF-8").
    #
    # @raise [DarwinCore::EncodingError] for anything but utf-8 / utf-16
    def init_encoding
      @encoding = @properties[:encoding] || "UTF-8"
      return if %w[utf-8 utf8 utf-16 utf16].include?(@encoding.downcase)

      raise(
        DarwinCore::EncodingError,
        "No support for encodings other than utf-8 or utf-16 at the moment"
      )
    end

    # Resolves the data file location relative to @path.
    #
    # @raise [DarwinCore::FileNotFoundError] when no location is declared
    def init_file_path
      file = @data[:location] ||
             @data.dig(:attributes, :location) ||
             @data.dig(:files, :location)
      # Check before joining: File.join rejects nil and always returns a
      # String, so a check on the joined path could never fire.
      raise DarwinCore::FileNotFoundError, "No file data" unless file

      @file_path = File.join(@path, file)
    end

    # Normalizes @data[:field] to an Array and collects the :attributes of
    # every entry into @fields.
    #
    # @raise [DarwinCore::InvalidArchiveError] when there are no fields
    def init_fields
      raw = @data[:field]
      # A lone field arrives as a single non-Array value; a missing one
      # becomes an empty list so the check below can report it.
      @data[:field] = raw.is_a?(Array) ? raw : [raw].compact
      @fields = @data[:field].map { |f| f[:attributes] }
      return unless @fields.empty?

      raise DarwinCore::InvalidArchiveError, "No data fields are found"
    end

    # Declared field separator (default ","); a literal backslash-t is
    # translated into a real tab character.
    def init_field_separator
      res = @properties[:fieldsTerminatedBy] || ","
      res = "\t" if res == "\\t"
      res
    end

    # Counts newline characters in the data file, which is what `wc -l`
    # reports, without going through a shell so any path is handled safely.
    def init_size
      newlines = 0
      File.open(@file_path, "rb") do |io|
        while (chunk = io.read(65_536))
          newlines += chunk.count("\n")
        end
      end
      newlines
    end
  end
end