GlobalNamesArchitecture/dwca-hunter

View on GitHub
lib/dwca_hunter/resources/ion.rb

Summary

Maintainability
A
1 hr
Test Coverage
# frozen_string_literal: true

module DwcaHunter
  class ResourceION < DwcaHunter::Resource
    def initialize(opts = {})
      @command = "ion"
      @title = "Index to Organism Names"
      @url = "https://uofi.box.com/shared/static/tklh8i6q2kb33g6ki33k6s3is06lo9np.gz"
      @UUID = "1137dfa3-5b8c-487d-b497-dc0938605864"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "ion",
                                 "data.tar.gz")
      @names = []
      @extensions = []
      super(opts)
    end

    def download
      puts "Downloading cached verion of the file. Ask Rod Page to make new."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    def unpack
      unpack_tar
    end

    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    def collect_names
      file = CSV.open(File.join(@download_dir, "ion.tsv"),
                      headers: true, col_sep: "\t", quote_char: "\b")
      file.each_with_index do |row, i|
        id = row["id"]
        name_string = row["nameComplete"]
        auth = row["taxonAuthor"]

        @names << { taxon_id: id,
                    name_string: name_string,
                    auth: auth }

        puts "Processed %s names" % i if i % 10_000 == 0
      end
    end

    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/scientificNameAuthorship"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], n[:auth]]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Nigel",
            last_name: "Robinson",
            email: "nigel.robinson@thomsonreuters.com" }
        ],
        metadata_providers: [
          { first_name: "Roderic",
            last_name: "Page",
            email: "rdmpage@gmail.com" }
        ],
        abstract: "ION contains millions of animal names, both fossil and " \
          "recent, at all taxonomic ranks, reported from the scientific " \
          "literature. (Bacteria, plant and virus names will be added soon)." \
          "\n\n" \
          "These names are derived from premier Clarivate databases: " \
          "Zoological Record®, BIOSIS Previews®, and Biological Abstracts®. " \
          "All names are tied to at least one published article. Together, " \
          "these resources cover every aspect of the life sciences - " \
          "providing names from over 30 million scientific records, " \
          "including approximately ,000 international journals, patents, " \
          "books, and conference proceedings. They provide a powerful " \
          "foundation for the most complete collection of organism names " \
          "available today.",
        url: @url
      }
      super
    end
  end
end