GlobalNamesArchitecture/dwca-hunter

View on GitHub
lib/dwca_hunter/resources/nom-zoologicus.rb

Summary

Maintainability
A
1 hr
Test Coverage
# frozen_string_literal: true
require 'find'

module DwcaHunter
  # Resource for the "Nomenclator Zoologicus" dataset.
  #
  # Downloads the zip archive at @url, locates the names.tsv file inside the
  # unpacked directory, and turns each row with a non-empty scientificName
  # into a DarwinCore core record (taxonID, scientificName, taxonRank,
  # nomenclaturalCode).
  class ResourceNomZoologicus < DwcaHunter::Resource
    # Sets the resource metadata and empty accumulators, then hands the
    # options to the superclass initializer.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = { download: false, unpack: false })
      @parser = Biodiversity::Parser
      @command = "nom-zoologicus"
      @title = "Nomenclator Zoologicus"
      @url = "https://zenodo.org/record/7013826/files/rdmpage/nomenclator-zoologicus-coldp-v0.2.1.zip?download=1"
      # Must be @uuid (lower case, no trailing newline): generate_dwca reads
      # @uuid when it fills in the EML id.
      @uuid = "02fd9b10-78e4-43a5-889e-0639a771c576"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "nom-zoologicus",
                                 "data.zip")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the archive at @url into @download_path using curl.
    #
    # curl is started from an argv array, so no shell is involved and the
    # "?" in the URL or unusual characters in the temp path are passed
    # through literally.
    #
    # @return [String] whatever curl wrote to standard output
    def download
      DwcaHunter.logger_write(object_id, "Downloading")
      IO.popen(["curl", "-s", "-L", @url, "-o", @download_path], &:read)
    end

    # Unzips the downloaded archive into @download_dir.
    def unpack
      self.class.unzip(@download_path, @download_dir)
    end

    # Reads the names from the unpacked data and builds the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir and collects names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Walks @download_dir and returns the first path ending in "names.tsv".
    #
    # @return [String] path of the names file
    # @raise [RuntimeError] when no such file exists under @download_dir
    def find_csv_file
      path = Find.find(@download_dir).find { |f| f.end_with?("names.tsv") }
      return path if path

      raise "names.tsv not found under #{@download_dir}"
    end

    # Parses the tab-separated names file and appends one hash per usable
    # row to @names. Rows whose scientificName is blank are skipped.
    def collect_names
      @names_index = {}
      # quote_char "\b" is a character not expected in the data, so quotes
      # inside fields are treated as ordinary text. The block form closes
      # the file when iteration ends.
      CSV.foreach(find_csv_file,
                  headers: true, col_sep: "\t", quote_char: "\b") do |row|
        name_string = row["scientificName"]
        # Empty cells are returned as nil by CSV, hence the to_s guards.
        next if name_string.to_s.strip.empty?

        name_full = [name_string, row["authorship"]].join(" ")

        @names << {
          taxon_id: row["ID"].to_s.strip,
          name_string: name_full.strip,
          rank: row["rank"].to_s.strip,
          nom_code: row["code"].to_s.strip
        }
      end
    end

    # Builds the core table (@core) and EML metadata (@eml) from @names,
    # then lets the superclass finish the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      # First row holds the DarwinCore term URIs for the columns below.
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonRank",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], n[:rank], n[:nom_code]]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "David",
            last_name: "Remsen" },
          { first_name: "Cathy",
            last_name: "Norton" },
          { first_name: "David",
            last_name: "Patterson" }
        ],
        metadata_providers: [
          { first_name: "Roderic",
            last_name: "Page",
            email: "Roderic.Page@glasgow.ac.uk" }
        ],
        abstract: "Nomenclator Zoologicus is a catalog of the bibliographic " \
                  "origins of the names of every genus and subgenus in the " \
                  "published literature since the tenth edition of " \
                  "Linnaeus' System Natureae in 1758 (Linnæus, 1758) up " \
                  "to 1994. An estimated 340,000 genera are represented " \
                  "in the text and there are approximately 3000 " \
                  "supplemental corrections. It provides a nucleus of core " \
                  "genera data and is recognized as an essential reference " \
                  "document by the zoological taxonomic community.",
        url: "https://doi.org/10.5281/zenodo.7010676"
      }
      super
    end
  end
end