# lib/dwc_archive.rb
# frozen_string_literal: true
require "fileutils"
require "ostruct"
require "digest"
require "csv"
require "logger"
require "nokogiri"
require "biodiversity"
require_relative "dwc_archive/xml_reader"
require_relative "dwc_archive/ingester"
require_relative "dwc_archive/errors"
require_relative "dwc_archive/expander"
require_relative "dwc_archive/archive"
require_relative "dwc_archive/core"
require_relative "dwc_archive/extension"
require_relative "dwc_archive/metadata"
require_relative "dwc_archive/generator"
require_relative "dwc_archive/generator_meta_xml"
require_relative "dwc_archive/generator_eml_xml"
require_relative "dwc_archive/taxon_normalized"
require_relative "dwc_archive/gnub_taxon"
require_relative "dwc_archive/classification_normalizer"
require_relative "dwc_archive/version"
# Main class for handling Darwin Core archives.
#
# An instance wraps the archive found at +dwc_path+ and exposes its archive
# object, core, metadata and extensions. Class-level helpers deal with
# clean-up of +dwc_<digits>+ working directories and with a shared logger.
class DarwinCore
  DEFAULT_TMP_DIR = "/tmp"

  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
                                 :local_id, :global_id)

  class << self
    attr_writer :logger

    # Recursively removes +path+ when it exists; a missing path is a no-op.
    def clean(path)
      # FileTest.exists? was removed in Ruby 3.2; exist? works everywhere.
      FileUtils.rm_rf(path) if FileTest.exist?(path)
    end

    # Returns the sorted entry names found in +path+ (without "." and ".."),
    # or nil when +path+ is nil or does not exist.
    def files(path)
      return nil unless path && FileTest.exist?(path)

      # Compare exactly: a regexp such as /[.]{1,2}$/ would also drop
      # legitimate entries whose names merely end with a dot.
      Dir.entries(path).reject { |entry| %w[. ..].include?(entry) }.sort
    end

    # Builds a randomized "dwc_<number>" path inside +tmp_dir+.
    # The path is only computed, nothing is created on disk.
    def random_path(tmp_dir)
      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
    end
  end

  attr_reader :archive, :core, :metadata, :classification_normalizer
  alias eml metadata

  # Returns true when +field+ is nil, an empty string or the "/N" marker,
  # false otherwise.
  # NOTE(review): "/N" may have been meant as the "\N" null marker used by
  # some database dumps -- confirm with callers before changing it.
  def self.nil_field?(field)
    [nil, "", "/N"].include?(field)
  end

  # Removes every directory directly under +tmp_dir+ whose name is exactly
  # "dwc_" followed by digits (the shape produced by random_path).
  # Regular files and differently named directories are left untouched.
  def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
    Dir.entries(tmp_dir).each do |entry|
      # \A and \z anchor the whole name; ^ and $ would match per line.
      next unless entry.match?(/\Adwc_\d+\z/)

      path = File.join(tmp_dir, entry)
      FileUtils.rm_rf(path) if FileTest.directory?(path)
    end
  end

  # Returns the shared logger, lazily defaulting to one that discards output.
  def self.logger
    @logger ||= Logger.new(nil)
  end

  # Replaces the shared logger with one that discards output.
  def self.logger_reset
    self.logger = Logger.new(nil)
  end

  # Logs "|obj_id|message|" through the shared logger.
  # +method+ is the logger severity method to call (:info by default).
  def self.logger_write(obj_id, message, method = :info)
    # public_send keeps a caller-supplied name away from private methods.
    logger.public_send(method, "|#{obj_id}|#{message}|")
  end

  # dwc_path:: location of the Darwin Core archive
  # tmp_dir::  directory handed over to DarwinCore::Archive
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
    @core = DarwinCore::Core.new(self)
    @metadata = DarwinCore::Metadata.new(@archive)
    extensions
  end

  # Last component of the archive path.
  def file_name
    File.split(@dwc_path).last
  end

  # Absolute form of the archive path.
  def path
    File.expand_path(@dwc_path)
  end

  # Generates a hash from classification data with a path to each node,
  # a list of synonyms and vernacular names.
  # Returns nil when the core has no parent-id field (see parent_id?).
  def normalize_classification
    return nil unless parent_id?

    @classification_normalizer ||=
      DarwinCore::ClassificationNormalizer.new(self)
    @classification_normalizer.normalize
  end

  # True when the joined core fields mention "highertaxonid" or
  # "parentnameusageid" (case-insensitive).
  def parent_id?
    @core.fields.join("|").downcase.
      match?(/highertaxonid|parentnameusageid/)
  end

  # SHA1 hex digest of the archive file.
  def checksum
    # Digest#file reads the file in binary mode instead of loading it
    # whole into memory.
    Digest::SHA1.file(@dwc_path).hexdigest
  end

  # Returns the memoized list of DarwinCore::Extension objects built from
  # the :extension entry under the first key of the archive meta data;
  # an empty array when there is no such entry.
  def extensions
    return @extensions if @extensions

    root_key = @archive.meta.keys.first
    ext = @archive.meta[root_key][:extension]
    return @extensions = [] unless ext

    # A single extension is not wrapped in an array. Kernel#Array is
    # deliberately avoided: it would split a Hash into key/value pairs.
    ext = [ext] unless ext.is_a?(Array)
    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
  end
end