GlobalNamesArchitecture/dwc-archive

View on GitHub
lib/dwc_archive.rb

Summary

Maintainability
A
0 mins
Test Coverage
# frozen_string_literal: true

require "fileutils"
require "ostruct"
require "digest"
require "csv"
require "logger"
require "nokogiri"
require "biodiversity"
require_relative "dwc_archive/xml_reader"
require_relative "dwc_archive/ingester"
require_relative "dwc_archive/errors"
require_relative "dwc_archive/expander"
require_relative "dwc_archive/archive"
require_relative "dwc_archive/core"
require_relative "dwc_archive/extension"
require_relative "dwc_archive/metadata"
require_relative "dwc_archive/generator"
require_relative "dwc_archive/generator_meta_xml"
require_relative "dwc_archive/generator_eml_xml"
require_relative "dwc_archive/taxon_normalized"
require_relative "dwc_archive/gnub_taxon"
require_relative "dwc_archive/classification_normalizer"
require_relative "dwc_archive/version"

# Main class for handling Darwin Core Archives.
#
# An instance ties together the archive wrapper (DarwinCore::Archive), its
# core data (DarwinCore::Core), its metadata (DarwinCore::Metadata, also
# reachable as #eml) and any extensions declared in the archive's meta
# description. Class-level helpers manage temporary directories and a shared
# logger.
class DarwinCore
  DEFAULT_TMP_DIR = "/tmp"
  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
                                 :local_id, :global_id)

  class << self
    attr_writer :logger

    # Recursively removes +path+ when it exists; does nothing otherwise.
    def clean(path)
      FileUtils.rm_rf(path) if FileTest.exist?(path)
    end

    # Returns the sorted entry names of the directory +path+, without the
    # "." and ".." pseudo-entries. Returns nil when +path+ is nil or does
    # not exist.
    def files(path)
      return nil unless path && FileTest.exist?(path)

      # Compare names exactly: a pattern anchored only at the end would also
      # drop real entries whose names merely end in a dot.
      Dir.entries(path).reject { |entry| %w[. ..].include?(entry) }.sort
    end

    # Builds a randomly named "dwc_<number>" path inside +tmp_dir+.
    # Nothing is created on disk.
    def random_path(tmp_dir)
      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
    end

    # Returns true when +field+ is one of the values treated as empty:
    # nil, an empty string, or the literal "/N".
    # NOTE(review): "/N" may have been intended as the "\N" NULL marker used
    # by some database dumps -- confirm before changing it.
    def nil_field?(field)
      [nil, "", "/N"].include?(field)
    end

    # Removes every directory directly under +tmp_dir+ whose name has the
    # form produced by .random_path ("dwc_" followed only by digits).
    def clean_all(tmp_dir = DEFAULT_TMP_DIR)
      Dir.entries(tmp_dir).each do |entry|
        # \A and \z anchor the whole name; ^ and $ would match per line.
        next unless entry =~ /\Adwc_\d+\z/

        path = File.join(tmp_dir, entry)
        FileUtils.rm_rf(path) if FileTest.directory?(path)
      end
    end

    # Shared logger. Defaults to a Logger created with a nil device until
    # one is assigned through .logger=.
    def logger
      @logger ||= Logger.new(nil)
    end

    # Replaces the shared logger with a fresh Logger created with a nil
    # device.
    def logger_reset
      self.logger = Logger.new(nil)
    end

    # Sends "|obj_id|message|" to the shared logger using the logger method
    # named by +method+ (:info by default).
    def logger_write(obj_id, message, method = :info)
      logger.send(method, "|#{obj_id}|#{message}|")
    end
  end

  attr_reader :archive, :core, :metadata, :classification_normalizer
  alias eml metadata

  # dwc_path:: path to the archive; handed to DarwinCore::Archive
  # tmp_dir::  directory handed to DarwinCore::Archive alongside the path
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
    @core = DarwinCore::Core.new(self)
    @metadata = DarwinCore::Metadata.new(@archive)
    extensions # build and memoize the extension list up front
  end

  # Last component of the path the instance was created with.
  def file_name
    File.split(@dwc_path).last
  end

  # Absolute form of the path the instance was created with.
  def path
    File.expand_path(@dwc_path)
  end

  # Generates a hash from classification data with the path to each node,
  # a list of synonyms and vernacular names. Returns nil when the core has
  # no parent-id field (see #parent_id?).
  def normalize_classification
    return nil unless parent_id?

    @classification_normalizer ||=
      DarwinCore::ClassificationNormalizer.new(self)
    @classification_normalizer.normalize
  end

  # True when the textual form of any core field mentions "highertaxonid" or
  # "parentnameusageid" (case-insensitive).
  def parent_id?
    # The fields are joined into one string instead of being tested one by
    # one, so elements that are not plain strings are searched through their
    # string form as well.
    field_text = @core.fields.join("|").downcase
    %w[highertaxonid parentnameusageid].any? do |term|
      field_text.include?(term)
    end
  end

  # Hex-encoded SHA1 digest of the archive file's bytes.
  def checksum
    # Digest streams the file in binary mode, so a large archive is never
    # loaded into memory in one piece.
    Digest::SHA1.file(@dwc_path).hexdigest
  end

  # Memoized list of DarwinCore::Extension objects, one for each extension
  # found under the first key of the archive's meta hash. Empty when none
  # is declared.
  def extensions
    return @extensions if @extensions

    root_key = @archive.meta.keys[0]
    ext = @archive.meta[root_key][:extension]
    return @extensions = [] unless ext

    # A lone extension is not wrapped in an Array by the meta hash.
    ext = [ext] unless ext.is_a?(Array)
    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
  end
end