GlobalNamesArchitecture/dwca-hunter

View on GitHub
lib/dwca_hunter/resources/mcz.rb

Summary

Maintainability
A
3 hrs
Test Coverage
# frozen_string_literal: true

module DwcaHunter
  # Resource that turns the MCZbase taxonomy export into a DarwinCore Archive.
  #
  # The steps visible in this class are: #download fetches a gzipped tarball
  # from @url, #unpack delegates to unpack_tar, and #make_dwca reads the
  # exported CSV, fills @core and @eml, and hands over to the parent's
  # generate_dwca.
  class ResourceMCZ < DwcaHunter::Resource
    # Sets the resource metadata and working collections, then lets the
    # parent class finish initialization.
    #
    # @param opts [Hash] options forwarded unchanged to the parent initializer
    def initialize(opts = {})
      @command = "mcz"
      @title = "MCZbase"
      @url = "https://uofi.box.com/shared/static/cdupnudo5usgbcjeds83fw2884jg9n5j.gz"
      # generate_dwca reads @uuid for the EML id. The value used to be stored
      # only in @UUID, which is a different instance variable, so the id came
      # out nil unless something else assigned @uuid.
      @uuid = "c79d055b-211b-40de-8e27-618011656265"
      # Kept in case anything outside this file still reads the old variable.
      @UUID = @uuid
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "mcz",
                                 "data.tar.gz")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the cached export from @url into @download_path using curl.
    #
    # @return [String] whatever curl wrote to stdout
    def download
      puts "Downloading cached verion of the file. Ask MCZ for update."
      # Arguments are passed as an array so no shell is involved; a temp
      # directory containing spaces or shell metacharacters cannot break or
      # alter the command.
      IO.popen(["curl", "-s", "-L", @url, "-o", @download_path], &:read)
    end

    # Unpacks the downloaded tarball via unpack_tar.
    def unpack
      unpack_tar
    end

    # Reads the names from the unpacked export and builds the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per CSV row to @names. Taxon ids are synthetic
    # ("gn_1", "gn_2", ...) and follow the row order of the file.
    def collect_names
      @names_index = {}
      csv_path = File.join(@download_dir, "taxonomy_export_2023Mar6.csv")
      # The block form closes the file handle when reading finishes or when
      # an exception is raised part-way through.
      CSV.open(csv_path, headers: true) do |csv|
        csv.each_with_index do |row, i|
          canonical = row["SCIENTIFIC_NAME"]
          authors = row["AUTHOR_TEXT"]
          # strip removes the dangling space left when either part is missing.
          name_string = "#{canonical} #{authors}".strip
          @names << { taxon_id: "gn_#{i + 1}",
                      name_string: name_string,
                      kingdom: row["KINGDOM"],
                      phylum: row["PHYLUM"],
                      klass: row["PHYLCLASS"],
                      order: row["PHYLORDER"],
                      family: row["FAMILY"],
                      genus: row["GENUS"],
                      code: row["NOMENCLATURAL_CODE"] }
          puts "Processed %s names" % i if i % 10_000 == 0
        end
      end
    end

    # Fills @core (a header row of DarwinCore term URIs followed by one row
    # per collected name) and @eml (dataset metadata), then calls the parent
    # implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      # The key order below must match the header row above.
      @core.concat(
        @names.map do |n|
          n.values_at(:taxon_id, :name_string, :kingdom, :phylum, :klass,
                      :order, :family, :genus, :code)
        end
      )

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "MCZ",
            last_name: "Harvard University" }
        ],
        metadata_providers: [
          { first_name: "Paul",
            last_name: "Morris" }
        ],

        abstract: "The Museum of Comparative Zoology was founded in 1859 on " \
        "the concept that collections are an integral and fundamental " \
        "component of zoological research and teaching. This more than " \
        "150-year-old commitment remains a strong and proud tradition for " \
        "the MCZ. The present-day MCZ contains over 21-million specimens in " \
        "ten research collections which comprise one of the world's richest " \
        "and most varied resources for studying the diversity of life. The " \
        "museum serves as the primary repository for zoological specimens " \
        "collected by past and present Harvard faculty-curators, staff and " \
        "associates conducting research around the world. As a premier " \
        "university museum and research institution, the specimens and " \
        "their related data are available to researchers of the scientific " \
        "and museum community. doi:10.5281/zenodo.891420",
        url: @url
      }
      super
    end
  end
end