lib/dwca_hunter/resources/opentree.rb
# frozen_string_literal: true
module DwcaHunter
# Harvesting resource for Open Tree of Life
class ResourceOpenTree < DwcaHunter::Resource
# Configures the harvester for the Open Tree of Life reference taxonomy:
# resource identity, download location, the header row of the DwCA core
# table and the EML metadata. All state is assigned before `super` runs.
#
# @param opts [Hash] options handed on unchanged to the parent initializer
#   by the bare `super`
def initialize(opts = {})
  @command = "open-tree"
  @title = "Open Tree of Life Reference Taxonomy"
  @uuid = "e10865e2-cdd9-4f97-912f-08f3d5ef49f7"
  # Assigned ahead of @eml: the EML hash embeds this value, and reading it
  # before the assignment would store nil as the resource URL.
  @url = "http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                             "opentree", "data.tar.gz")
  @data = []
  @extensions = []
  @count = 1
  @clades = {}
  # Header row of the core table; generate_core appends one data row per
  # taxon in the same column order. scientificName, parentNameUsageId and
  # taxonRank are Darwin Core terms, so they use the dwc namespace (the same
  # one the synonyms extension uses), not the Dublin Core one.
  # NOTE(review): Darwin Core spells the identifiers taxonID and
  # parentNameUsageID; the casing is left as it was -- confirm whether
  # consumers of the archive care.
  @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://globalnames.org/terms/localID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/parentNameUsageId",
            "http://rs.tdwg.org/dwc/terms/taxonRank",
            "http://globalnames.org/ottCrossMaps",
            "http://globalnames.org/ottNotes"]]
  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "https://tree.opentreeoflife.org" }],
    # Each fragment ends with a space so the joined sentence keeps its
    # word boundaries.
    abstract: "Open Tree of Life aims to construct a comprehensive, " \
              "dynamic and digitally-available tree of life by " \
              "synthesizing published phylogenetic trees along with " \
              "taxonomic data. The project is a collaborative effort " \
              "between 11 PIs across 10 institutions.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    url: @url
  }
  super
end
# Extracts the downloaded archive, but only when @needs_unpack is truthy.
# The extraction itself is delegated to unpack_tar, which is not defined in
# this file (presumably inherited from DwcaHunter::Resource).
#
# @return [Object, nil] the result of unpack_tar, or nil when skipped
def unpack
  return unless @needs_unpack

  unpack_tar
end
# Public harvest step: logs "Extracting data", reads the taxonomy into memory
# through collect_data, then assembles the archive through generate_dwca.
# The return value is whatever generate_dwca returns.
def make_dwca
DwcaHunter.logger_write(object_id, "Extracting data")
collect_data
generate_dwca
end
# Fetches the OTT archive with curl and stores it at @download_path.
#
# The notice is printed on every call; the transfer itself only runs when
# @needs_download is truthy. curl is started from an argument array, so no
# shell is involved and a URL or path containing spaces or shell
# metacharacters reaches curl as a single argument.
# `url` is not defined in this file (presumably an accessor provided by the
# parent class).
#
# @return [String, nil] whatever curl wrote to stdout, or nil when the
#   download is skipped
def download
  puts "Downloading cached data, update it at oot website!!"
  return unless @needs_download

  DwcaHunter.logger_write(object_id, "Downloading file -- "\
                                     "it will take some time...")
  # to_s mirrors the string interpolation the shell command used to apply.
  IO.popen(["curl", "-L", url.to_s, "-o", @download_path.to_s], &:read)
end
private
# Resolves the input file paths (set_vars) and then reads the taxonomy rows
# into @classification (classification).
def collect_data
set_vars
classification
end
# Records where the two input tables live inside the download directory:
# @taxonomy and @synonyms both point into its "ott3.2" subdirectory.
#
# @return [String] the synonyms path (value of the last assignment)
def set_vars
  release_dir = File.join(@download_dir, "ott3.2")
  @taxonomy = File.join(release_dir, "taxonomy.tsv")
  @synonyms = File.join(release_dir, "synonyms.tsv")
end
# Reads the taxonomy table at @taxonomy into @classification: one array per
# line, obtained by splitting on "|" and stripping whitespace from each cell.
# @names is reset to an empty hash. Progress is logged every BATCH_SIZE lines.
#
# The file is read with File.foreach, which closes the handle once the last
# line has been consumed (or when an error interrupts reading), so no open
# descriptor is left behind.
#
# NOTE(review): every line is stored, including the first one. If the table
# begins with a column-header line, that line ends up as a data row --
# confirm against the input file.
#
# @return [Array<Array<String>>] the collected rows (@classification)
def classification
  @classification = []
  @names = {}
  DwcaHunter.logger_write(object_id, "Building classification")
  File.foreach(@taxonomy).with_index(1) do |line, line_no|
    if (line_no % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversed #{line_no} taxonomy lines")
    end
    @classification << line.split("|").map(&:strip)
  end
  @classification
end
# Builds the archive content: writes two progress messages to the log, fills
# the core table (generate_core), registers the synonyms extension
# (generate_synonyms) and finally hands over to the parent implementation
# through `super`.
#
# @return [Object] whatever the parent implementation returns
def generate_dwca
  ["Creating DarwinCore Archive file", "Assembling Core Data"].each do |message|
    DwcaHunter.logger_write(object_id, message)
  end
  generate_core
  generate_synonyms
  super
end
# Appends one core row per entry of @classification to @core.
#
# Column mapping, matching the header row set up in the initializer:
# cell 0 is written twice (taxonId and localID), cell 2 is the scientific
# name, cell 1 the parent id, cell 3 the rank, cells 4 and 5 the
# ottCrossMaps and ottNotes values. Missing cells become nil.
#
# @count is advanced for every record so that the progress message is logged
# each time it reaches a multiple of BATCH_SIZE; without the increment the
# counter never moves and the message never reflects progress.
#
# @return [Array] @classification (the receiver of `each`)
def generate_core
  @classification.each do |row|
    if (@count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id, "Traversing #{@count} core " \
                                         "data record")
    end
    id, parent_id, name, rank, cross_maps, notes = row
    @core << [id, id, name, parent_id, rank, cross_maps, notes]
    @count += 1
  end
end
# Source of the rows that generate_synonyms writes to the synonyms extension.
# It always returns an empty array, so the extension holds only its header.
# NOTE(review): set_vars stores a synonyms.tsv path in @synonyms, but nothing
# reads it here -- presumably synonym loading is still to be implemented.
#
# @return [Array] an empty array
def synonyms
[]
end
# Registers the "synonyms.txt" extension in @extensions and fills it with one
# row per entry returned by `synonyms`.
#
# Rows are appended to the table created here rather than to
# @extensions.first, so the data cannot land in a different extension that
# was registered earlier. The taxon id is read from the synonym entry itself;
# the previous code read it from an undefined local and would raise NameError
# as soon as `synonyms` returned anything.
# NOTE(review): assumes each synonym is a hash with :id, :scientificName and
# :taxonomicStatus keys -- confirm once `synonyms` yields real data.
#
# @return [Array] the collection returned by `synonyms`
def generate_synonyms
  rows = [["http://rs.tdwg.org/dwc/terms/taxonId",
           "http://rs.tdwg.org/dwc/terms/scientificName",
           "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]]
  @extensions << { data: rows, file_name: "synonyms.txt" }
  synonyms.each do |synonym|
    rows << [synonym[:id], synonym[:scientificName],
             synonym[:taxonomicStatus]]
  end
end
end
end