# lib/export/coldp.rb
require 'zip'
require 'yaml'
module Export
# Exports to the Catalog of Life in the new "coldp" format.
# http://api.col.plus/datapackage
#
# * write tests to check for coverage (missing methods)
# * Update all file formats to use tabs
# * Pending handling of both BibTeX and Verbatim
module Coldp
# Names of the CoLDP tables handled by `export`. Every entry except `Name` is
# resolved there to `Export::Coldp::Files::<entry>` and written to
# `<entry>.tsv`; `Name` is generated by a separate, explicit call.
FILETYPES = %w{Description Name Synonym NameRelation TaxonConceptRelation TypeMaterial VernacularName}.freeze
# Builds the scope of Otus (= Taxa in CoLDP) that are to be sent, rooted at
# the Otu `otu_id`.
#
# An Otu is included when its taxon name
# * has the root Otu's taxon name as `ancestor_id` in `taxon_name_hierarchies`,
# * is within `TaxonName.that_is_valid`, and
# * the Otu either has no `name`, or its `name` equals the taxon name's
#   `cached` value.
#
# @param otu_id id of the root Otu (looked up with `::Otu.find`)
# @return [Scope]
#   `::Otu.none` when the root Otu has no taxon name
def self.otus(otu_id)
  root_taxon_name_id = ::Otu.find(otu_id).taxon_name_id
  return ::Otu.none if root_taxon_name_id.nil?

  unlabelled_or_same_as_name = '(otus.name IS NULL) OR (otus.name = taxon_names.cached)'

  Otu
    .joins(taxon_name: [:ancestor_hierarchies])
    .where('taxon_name_hierarchies.ancestor_id = ?', root_taxon_name_id)
    .where(taxon_name_id: TaxonName.that_is_valid)
    .where(unlabelled_or_same_as_name)
end
# Builds a lookup from user id to a printable identifier for every member of
# a project: the user's ORCID when one is set, otherwise the user's name.
#
# @param project_id id of the project whose members are listed
# @return [Hash] `user_id => orcid_or_name`
def self.project_members(project_id)
  ProjectMember.where(project_id:).each_with_object({}) do |member, lookup|
    lookup[member.user_id] = member.user.orcid.nil? ? member.user.name : member.user.orcid
  end
end
# Formats a timestamp as an ISO 8601 string.
#
# @param updated_at [#iso8601] any object responding to `iso8601`
# @return [String]
def self.modified(updated_at) = updated_at.iso8601
# Looks up the identifier recorded for the user who last updated a record.
#
# @param updated_by_id key into `project_members`
# @param project_members [Hash] lookup keyed by user id
# @return the stored value, or nil when `updated_by_id` is not a key
def self.modified_by(updated_by_id, project_members) = project_members[updated_by_id]
# Flattens free-text remarks onto a single line: every run of carriage
# returns, newlines, tabs and spaces is replaced by one space, so the value
# can sit in a single cell of a tab-separated file.
#
# @param remarks [String, nil]
# @return [String, nil] nil when `remarks` is nil
def self.sanitize_remarks(remarks)
  # A regexp (or double-quoted string) is required here: single-quoted '\n'
  # is a literal backslash followed by "n" and never matches a real newline.
  remarks&.gsub(/[\r\n\t ]+/, ' ')
end
# Writes the CoLDP zip archive for the Otu `otu_id` and returns the path to
# the data itself.
#
# The archive holds one `<type>.tsv` per entry of FILETYPES, plus `Taxon.tsv`,
# `References.tsv` and `metadata.yaml` (project name, version and issue date).
#
# @param otu_id id of the root Otu
# @param prefer_unlabelled_otus [Boolean] accepted, but not read in this method
# @return [String] path of the zip file created under /tmp
def self.export(otu_id, prefer_unlabelled_otus: true)
  taxa = otus(otu_id)

  # source_id: [csv_array]
  # Handed to every generator below, then written out as References.tsv.
  ref_tsv = {}

  otu = ::Otu.find(otu_id)
  project = ::Project.find(otu.project_id)
  members = project_members(otu.project_id)

  # TODO: This will likely have to change, it is renamed on serving the file.
  zip_file_path = "/tmp/_#{SecureRandom.hex(8)}_coldp.zip"

  metadata_path = Zaru::sanitize!("/tmp/#{project.name}_#{DateTime.now}_metadata.yaml").gsub(' ', '_').downcase

  # Sandboxes report the commit they were built from instead of a release.
  version = TaxonWorks::VERSION
  version = Settings.sandbox_commit_sha if Settings.sandbox_mode?

  metadata = {
    'title' => project.name,
    'version' => version,
    'issued' => DateTime.now.strftime('%Y-%m-%d'),
  }

  metadata_file = Tempfile.new(metadata_path)
  metadata_file.write(metadata.to_yaml)
  metadata_file.close

  Zip::File.open(zip_file_path, Zip::File::CREATE) do |zipfile|
    # Name takes the root Otu rather than the scope, so it is written on its own.
    FILETYPES.reject { |filetype| filetype == 'Name' }.each do |filetype|
      generator = "Export::Coldp::Files::#{filetype}".safe_constantize
      zipfile.get_output_stream("#{filetype}.tsv") do |stream|
        stream.write generator.generate(taxa, members, ref_tsv)
      end
    end

    zipfile.get_output_stream('Name.tsv') do |stream|
      stream.write Export::Coldp::Files::Name.generate(otu, members, ref_tsv)
    end

    zipfile.get_output_stream('Taxon.tsv') do |stream|
      stream.write Export::Coldp::Files::Taxon.generate(taxa, members, otu_id, ref_tsv)
    end

    references = ::CSV.generate(col_sep: "\t") do |tsv|
      tsv << %w{ID citation doi modified modifiedBy} # author year source details

      # Sort the refs by full citation string (second column of each row)
      ref_tsv.values.sort { |a, b| a[1] <=> b[1] }.each { |row| tsv << row }
    end
    zipfile.get_output_stream('References.tsv') { |stream| stream.write references }

    zipfile.add('metadata.yaml', metadata_file.path)
  end

  zip_file_path
end
# Builds the filename offered for an Otu's CoLDP download from the project
# name, the Otu id and the current time; the result is passed through
# `Zaru::sanitize!`, has spaces replaced by underscores and is lower-cased.
#
# @param otu must respond to `project_id` and `id`
# @return [String]
def self.filename(otu)
  project_name = ::Project.find(otu.project_id).name
  unsanitized = "#{project_name}_coldp_otu_id_#{otu.id}_#{DateTime.now}.zip"

  Zaru::sanitize!(unsanitized).gsub(' ', '_').downcase
end
# Builds the CoLDP archive for `otu` right away and registers it as a
# `::Download::Coldp` record that expires in two days.
#
# @param otu root Otu of the export
# @param request value stored in the download's `request` attribute
# @param prefer_unlabelled_otus [Boolean] forwarded to `export`
# @return the record returned by `::Download::Coldp.create!`
def self.download(otu, request = nil, prefer_unlabelled_otus: true)
  file_path = ::Export::Coldp.export(
    otu.id,
    prefer_unlabelled_otus:
  )

  # The served filename comes from `filename(otu)`; no separate local name is
  # needed here.
  ::Download::Coldp.create!(
    name: "ColDP Download for #{otu.otu_name} on #{Time.now}.",
    description: 'A zip file containing CoLDP formatted data.',
    filename: filename(otu),
    source_file_path: file_path,
    request:,
    expires: 2.days.from_now
  )
end
# Registers a `::Download::Coldp` record without a source file, then enqueues
# `ColdpCreateDownloadJob` with the Otu and that record.
#
# @param otu root Otu of the export
# @param request value stored in the download's `request` attribute
# @param prefer_unlabelled_otus [Boolean] forwarded to the job
# @return the record returned by `::Download::Coldp.create!`
def self.download_async(otu, request = nil, prefer_unlabelled_otus: true)
  ::Download::Coldp.create!(
    name: "ColDP Download for #{otu.otu_name} on #{Time.now}.",
    description: 'A zip file containing CoLDP formatted data.',
    filename: filename(otu),
    request:,
    expires: 2.days.from_now
  ).tap do |pending_download|
    ColdpCreateDownloadJob.perform_later(otu, pending_download, prefer_unlabelled_otus:)
  end
end
# TODO - perhaps a utilities file --
# @param taxon_name must respond to `type` and `is_original_name?`
# @return [Boolean]
#   `false` when `type` is not 'Protonym', otherwise whatever
#   `is_original_name?` returns.
#   NOTE(review): previously documented as "`true` if no parens in
#   `cached_author_year`" - confirm against `is_original_name?`.
def self.original_field(taxon_name)
  return false unless taxon_name.type == 'Protonym'

  taxon_name.is_original_name?
end
# @param taxon_name [a valid Protonym or a Combination]
#   see also exclusion of OTUs/Names based on Ranks not handled
# @return
#   for a Protonym its own `reified_id`; for a Combination the `reified_id`
#   of the last of its `protonyms`; nil for any other `type`
def self.basionym_id(taxon_name)
  case taxon_name.type
  when 'Protonym' then taxon_name.reified_id
  when 'Combination' then taxon_name.protonyms.last.reified_id
  end
end
# Replicate TaxonName.reified_id without having to use AR
#
# @param taxon_name_id id of the taxon name
# @param cached cached name
# @param cached_original_combination cached original combination, may be nil
# @return
#   `"<taxon_name_id>-<MD5 hex of cached_original_combination>"` when an
#   original combination is present and differs from `cached`; otherwise
#   `taxon_name_id` exactly as passed in
def self.reified_id(taxon_name_id, cached, cached_original_combination)
  # Protonym#has_alternate_original?
  alternate_original = cached_original_combination && (cached != cached_original_combination)
  return taxon_name_id unless alternate_original

  "#{taxon_name_id}-#{Digest::MD5.hexdigest(cached_original_combination)}"
end
# Reification spec
# Duplicate Combination check -> is the Combination in question already represented in the current *classification*
end
end