GlobalNamesArchitecture/dwca-hunter

View on GitHub
lib/dwca_hunter/resources/mycobank.rb

Summary

Maintainability
A
2 hrs
Test Coverage
# frozen_string_literal: true

module DwcaHunter
  # Resource that turns the MycoBank name list (a CSV export) into a
  # DarwinCore Archive.
  #
  # +download+ fetches the CSV with curl, +unpack+ is a no-op because the
  # download is already a plain CSV file, and +make_dwca+ reads the rows into
  # @names, builds @core and @eml, and then calls the parent implementation
  # of +generate_dwca+.
  class ResourceMycoBank < DwcaHunter::Resource
    # Sets the resource metadata and the local download location, then
    # delegates to the parent initializer.
    #
    # @param opts [Hash] options passed unchanged to DwcaHunter::Resource
    def initialize(opts = { download: true, unpack: true })
      @command = "mycobank"
      @title = "MycoBank"
      # Download https://www.mycobank.org/images/MBList.zip, open in
      # LibreOffice, save csv file, upload it to box.com
      @url = "https://uofi.box.com/shared/static/4pcbwj40ut17ejemdzxwpio1jmxccwwc.csv"
      # Must be the lowercase @uuid: generate_dwca reads @uuid for the EML id.
      @uuid = "b0ac4f6f-fc56-41b4-ad69-6af30a881e7e"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "mycobank",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches @url into @download_path with curl (silent, following
    # redirects).
    #
    # curl is started from an argument vector instead of a shell command
    # line, so spaces or shell metacharacters in the URL or in the temporary
    # directory cannot break or alter the command.
    #
    # @return [String] whatever curl wrote to standard output
    def download
      IO.popen(["curl", "-s", "-L", @url, "-o", @download_path], &:read)
    end

    # Nothing to unpack: the downloaded file is used as-is.
    def unpack; end

    # Reads the downloaded CSV and builds the DarwinCore Archive data.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir and loads the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Splits a comma-separated string and keeps at most the first six
    # stripped elements.
    #
    # @param s [String] comma-separated values
    # @return [Array<String>]
    def classification(s)
      s.split(",").map(&:strip).first(6)
    end

    # Appends one record per row of data.csv (in @download_dir) to @names and
    # prints a progress line every 10,000 rows.
    def collect_names
      @names_index = {}
      csv_path = File.join(@download_dir, "data.csv")
      # Block form closes the file handle even when a row raises.
      CSV.open(csv_path, headers: true) do |csv|
        csv.each_with_index do |row, i|
          @names << name_record(row)
          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Builds the hash stored in @names for a single CSV row.
    #
    # "ID" and "Taxon_name" are required, so a row lacking either still
    # raises NoMethodError. Authors, rank, status and reference become ""
    # when the cell is empty; year stays nil when the cell is empty.
    #
    # NOTE(review): "Current name" is the only header spelled with a space
    # and it ends up in the namePublishedIn column -- confirm both against
    # the CSV export.
    #
    # @param row [CSV::Row] a row of the MycoBank CSV
    # @return [Hash] record with the keys read by generate_dwca
    def name_record(row)
      name = row["Taxon_name"].strip
      authors = row["Authors__abbreviated_"].to_s.strip
      { taxon_id: row["ID"].strip,
        name_string: "#{name} #{authors}".strip,
        rank: row["Rank"].to_s.strip,
        status: row["Name_status"].to_s.strip,
        year: row["Year_of_effective_publication"]&.strip,
        reference: row["Current name"].to_s.strip,
        code: "ICN" }
    end

    # Fills @core (a header row of DarwinCore term URIs followed by one row
    # per collected name) and @eml, then calls the parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonRank",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalStatus",
                "http://rs.tdwg.org/dwc/terms/namePublishedInYear",
                "http://rs.tdwg.org/dwc/terms/namePublishedIn",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      # Column order must match the header row above.
      @names.each do |n|
        @core << n.values_at(:taxon_id, :name_string, :rank, :status,
                             :year, :reference, :code)
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "MycoBank is an on-line database aimed as a service " \
        "to the mycological and scientific community by documenting " \
        "mycological nomenclatural novelties (new names and combinations) " \
        "and associated data. Westerdijk Fungal Biodiversity Institute.",
        url: "https://www.mycobank.org/"
      }
      super
    end
  end
end