lib/dwca_hunter/resources/itis.rb from GlobalNamesArchitecture/dwca-hunter

lib/dwca_hunter/resources/itis.rb
Summary

Maintainability

4 hrs
Test Coverage

Issues
# frozen_string_literal: true

module DwcaHunter
  class ResourceITIS < DwcaHunter::Resource
    def initialize(opts = {})
      @command = "itis"
      @title = "Integrated Taxonomic Information SystemITIS"
      @url = "https://www.itis.gov/downloads/itisMySQLTables.tar.gz"
      @uuid = "5d066e84-e512-4a2f-875c-0a605d3d9f35"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "itis",
                                 "data.tar.gz")
      @ranks = {}
      @kingdoms = {}
      @authors = {}
      @vernaculars = {}
      @synonyms = {}
      @synonym_of = {}
      @names = {}
      @extensions = []
      super(opts)
      @itis_dir = File.join(@download_dir, "itis")
    end

    def unpack
      unpack_tar
      dir = Dir.entries(@download_dir).select { |e| e.match(/itisMySQL/) }[0]
      FileUtils.mv(File.join(@download_dir, dir), @itis_dir)

      # Create a file with the same name as the directory we extracted.
      FileUtils.touch(File.join(@itis_dir, "version_" + dir))
    end

    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_ranks
      get_kingdoms
      get_authors
      get_vernaculars
      get_synonyms
      get_names
      generate_dwca
    end

    private

    def get_ranks
      # 0    kingdom_id integer not null
      # 1    rank_id smallint not null
      # 2    rank_name char(15) not null
      # 3    dir_parent_rank_id smallint not null
      # 4    req_parent_rank_id smallint not null
      # 5    update_date date not null
      rank_file = File.join(@itis_dir, "taxon_unit_types")
      f = open(rank_file, "r:utf-8")
      f.each do |l|
        l.encode!("UTF-8",
                  "ISO-8859-1",
                  invalid: :replace,
                  replace: "?")
        row = l.strip.split("|")
        @ranks[row[0].strip + "/" + row[1].strip] = row[2].strip
      end
    end

    def get_kingdoms
      # 0    kingdom_id serial not null
      # 1    kingdom_name char(10) not null
      # 2    update_date date not null

      f = open(File.join(@itis_dir, "kingdoms"))
      f.each do |l|
        data = l.strip.split("|")
        @kingdoms[data[0].strip] = data[1].strip
      end
    end

    def get_authors
      # 0    taxon_author_id serial not null
      # 1    taxon_author varchar(100,30) not null
      # 2    update_date date not null
      # 3    kingdom_id smallint not null

      f = open(File.join(@itis_dir, "taxon_authors_lkp"))
      f.each do |l|
        l.encode!("UTF-8",
                  "ISO-8859-1",
                  invalid: :replace,
                  replace: "?")
        data = l.strip.split("|")
        @authors[data[0].strip] = data[1].strip
      end
    end

    def get_vernaculars
      # 0    tsn integer not null
      # 1    vernacular_name varchar(80,5) not null
      # 2    language varchar(15) not null
      # 3    approved_ind char(1)
      # 4    update_date date not null
      # 5    primary key (tsn,vernacular_name,language)
      #      constraint "itis".vernaculars_key

      f = open(File.join(@itis_dir, "vernaculars"))
      f.each_with_index do |l, i|
        if i % BATCH_SIZE == 0
          DwcaHunter.logger_write(object_id,
                                  "Extracted %s vernacular names" % i)
        end
        l.encode!("UTF-8",
                  "ISO-8859-1",
                  invalid: :replace,
                  replace: "?")
        data = l.split("|").map(&:strip)
        name_tsn = data[0]
        string   = data[1]
        language = data[2]
        language = "Common name" if language == "unspecified"
        @vernaculars[name_tsn] = { name: string, language: language }
      end
    end

    def get_synonyms
      # 0    tsn integer not null
      # 1    tsn_accepted integer not null
      # 2    update_date date not null

      f = open(File.join(@itis_dir, "synonym_links"))
      f.each_with_index do |l, i|
        if i % BATCH_SIZE == 0
          DwcaHunter.logger_write(object_id,
                                  "Extracted %s synonyms" % i)
        end
        l.encode!("UTF-8",
                  "ISO-8859-1",
                  invalid: :replace,
                  replace: "?")
        data = l.split("|").map(&:strip)
        synonym_name_tsn = data[0]
        accepted_name_tsn = data[1]
        @synonyms[synonym_name_tsn] = accepted_name_tsn
      end
    end

    def get_names
      # 0    tsn serial not null
      # 1    unit_ind1 char(1)
      # 2    unit_name1 char(35) not null
      # 3    unit_ind2 char(1)
      # 4    unit_name2 varchar(35)
      # 5    unit_ind3 varchar(7)
      # 6    unit_name3 varchar(35)
      # 7    unit_ind4 varchar(7)
      # 8    unit_name4 varchar(35)
      # 9    unnamed_taxon_ind char(1)
      # 10   usage varchar(12,5) not null
      # 11   unaccept_reason varchar(50,9)
      # 12   credibility_rtng varchar(40,17) not null
      # 13   completeness_rtng char(10)
      # 14   currency_rating char(7)
      # 15   phylo_sort_seq smallint
      # 16   initial_time_stamp datetime year to second not null
      # 17   parent_tsn integer
      # 18   taxon_author_id integer
      # 19   hybrid_author_id integer
      # 20   kingdom_id smallint not null
      # 21   rank_id smallint not null
      # 22   update_date date not null
      # 23   uncertain_prnt_ind char(3)

      f = open(File.join(@itis_dir, "taxonomic_units"))
      f.each_with_index do |l, i|
        if i % BATCH_SIZE == 0
          DwcaHunter.logger_write(object_id,
                                  "Extracted %s names" % i)
        end
        l.encode!("UTF-8",
                  "ISO-8859-1",
                  invalid: :replace,
                  replace: "?")
        data = l.split("|").map(&:strip)
        name_tsn = data[0]
        x1 = data[1]
        name_part1 = data[2]
        x2         = data[3]
        name_part2 = data[4]
        sp_marker1 = data[5]
        name_part3 = data[6]
        sp_marker2 = data[7]
        name_part4 = data[8]
        status     = data[10]
        parent_tsn = data[17]
        author_id  = data[18]
        kingdom_id = data[20]
        rank_id    = data[21]

        parent_tsn = nil if parent_tsn == ""
        name = [x1, name_part1, x2, name_part2,
                sp_marker1, name_part3, sp_marker2, name_part4]
        canonical_name = name.clone
        name << @authors[author_id] if @authors[author_id]
        name = name.join(" ").strip.gsub(/\s+/, " ")
        canonical_name = canonical_name.join(" ").strip.gsub(/\s+/, " ")
        rank = @ranks[kingdom_id + "/" + rank_id] ||
               ""
        @names[name_tsn] = { name: name,
                             canonical_name: canonical_name,
                             status: status,
                             parent_tsn: parent_tsn,
                             rank: rank }
      end
    end

    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
                "http://rs.tdwg.org/dwc/terms/taxonRank"]]
      @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
                               "http://rs.tdwg.org/dwc/terms/vernacularName",
                               "http://purl.org/dc/terms/language"]],
                       file_name: "vernacular_names.txt",
                       row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
      @names.keys.each_with_index do |k, _i|
        d = @names[k]
        accepted_id = @synonyms[k] || nil
        parent_id = d[:parent_tsn].to_i == 0 ? nil : d[:parent_tsn]
        row = [k, parent_id, accepted_id, d[:name], d[:canonical_name], d[:status], d[:rank]]
        @core << row
      end

      @vernaculars.keys.each_with_index do |k, _i|
        d = @vernaculars[k]
        @extensions[0][:data] << [k, d[:name], d[:language]]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { email: "itiswebmaster@itis.gov" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "The White House Subcommittee on Biodiversity and " \
                    "Ecosystem Dynamics has identified systematics as a " \
                    "research priority that is fundamental to ecosystem " \
                    "management and biodiversity conservation. This primary " \
                    "need identified by the Subcommittee requires " \
                    "improvements in the organization of, and access to, " \
                    "standardized nomenclature. ITIS (originally referred " \
                    "to as the Interagency Taxonomic Information System) " \
                    "was designed to fulfill these requirements. In the " \
                    "future, the ITIS will provide taxonomic data and a " \
                    "directory of taxonomic expertise that will support " \
                    "the system",
        url: "http://www.itis.gov"
      }
      super
    end
  end
end