# lib/miga/cli/action/derep_wf.rb
# @package MiGA
# @license Artistic-2.0
require 'miga/cli/action'
# CLI action implementing the dereplication workflow: it creates a project
# from input genome assemblies, groups them into genomospecies clades using
# ANI or AAI at a given threshold, and reports one representative per clade.
#
# Helpers called without a receiver and not defined here
# (+default_opts_for_wf+, +opts_for_wf+, +opts_for_wf_distances+,
# +create_project+, +run_daemon+, +summarize+, +cleanup+) are presumably
# provided by the Wf mixin or the parent class -- verify there.
class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
  require 'fileutils'
  require 'shellwords'
  require 'miga/cli/action/wf'
  include MiGA::Cli::Action::Wf

  # Sets the default options of this action and registers its command-line
  # switches. Defaults: ANI metric, 95.0% threshold, quality-based
  # representatives, with step summaries and a representative collection.
  def parse_cli
    default_opts_for_wf
    cli.defaults = {
      metric: :ani, threshold: 95.0, criterion: :quality,
      summaries: true, collection: true
    }
    cli.parse do |opt|
      opt.on(
        '--aai',
        'Use Average Amino Acid Identity (AAI) as genome similarity metric',
        'By default: Use Average Nucleotide Identity (ANI)'
      ) { cli[:metric] = :aai }
      opt.on(
        '--ani',
        'Use Average Nucleotide Identity (ANI) as similarity metric (default)'
      ) { cli[:metric] = :ani }
      opt.on(
        '--threshold FLOAT', Float,
        "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
      ) { |v| cli[:threshold] = v }
      opt.on(
        '--quality',
        'Use genome with highest quality as clade representatives (default)'
      ) { cli[:criterion] = :quality }
      opt.on(
        '--medoids',
        'Use medoids as clade representatives'
      ) { cli[:criterion] = :medoids }
      # OptionParser yields +false+ for switches declared as `--no-*`, so
      # storing the yielded value turns the corresponding feature off.
      opt.on(
        '--no-collection',
        'Do not generate a dereplicated collection of assemblies'
      ) { |v| cli[:collection] = v }
      opt.on(
        '--no-summaries',
        'Do not generate intermediate step summaries'
      ) { |v| cli[:summaries] = v }
      opts_for_wf_distances(opt)
      opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
    end
  end

  # Runs the workflow: creates the project, stores the metric and threshold
  # as project options, launches the daemon, dereplicates, optionally
  # summarizes, and cleans up.
  def perform
    # Input data
    project = create_project(
      :assembly,
      { run_clades: false },
      { run_mytaxa_scan: false, run_ssu: false }
    )
    project.set_option(:gsp_metric, cli[:metric].to_s)
    # The threshold is stored under a metric-specific key (gsp_ani / gsp_aai)
    project.set_option(:"gsp_#{cli[:metric]}", cli[:threshold])

    # Run
    run_daemon
    dereplicate(project)
    summarize(%w[cds assembly essential_genes]) if cli[:summaries]
    cleanup
  end

  private

  # Writes +genomospecies.tsv+ in the output directory (one row per clade:
  # identifier, representative, comma-separated members) and, when the
  # +:collection+ option is set, copies the assembly of each representative
  # into the +representatives+ subdirectory. Does nothing when the
  # +:prepare_and_exit+ option is set.
  #
  # Raises RuntimeError if the clade-finding result or any of the required
  # files is unavailable.
  def dereplicate(project)
    return if cli[:prepare_and_exit]

    cli.say 'Extracting genomospecies clades'
    result = project.result(:clade_finding)
    raise 'Result unavailable: run failed' unless result

    clades_file = result.file_path(:clades_gsp)
    raise 'Result incomplete: run failed' unless clades_file

    clades = File.readlines(clades_file).map { |ln| ln.chomp.split("\t") }
    reps = representatives(project)
    File.open(File.join(cli[:outdir], 'genomospecies.tsv'), 'w') do |fh|
      fh.puts "Clade\tRepresentative\tMembers"
      # Representatives are matched to clades by position
      # NOTE(review): assumes both lists share the same order -- confirm
      clades.each_with_index do |members, idx|
        fh.puts ["gsp_#{idx + 1}", reps[idx], members.join(',')].join("\t")
      end
    end

    if cli[:collection]
      dir = File.join(cli[:outdir], 'representatives')
      FileUtils.mkdir_p(dir)
      reps.each do |name|
        # Fail with an explicit message instead of a NoMethodError on nil
        dataset = project.dataset(name)
        assembly = dataset && dataset.result(:assembly)
        contigs = assembly && assembly.file_path(:largecontigs)
        raise "Assembly unavailable for representative: #{name}" unless contigs

        FileUtils.cp(contigs, dir)
      end
    end
  end

  # Writes +representatives.txt+ in the output directory and returns its
  # lines (without line terminators) as an Array of String. With the
  # +:medoids+ criterion the list is copied from the clade-finding result;
  # otherwise it is the second column of the output of
  # +utils/representatives.rb+ run on the project.
  #
  # Raises RuntimeError if the medoids file is unavailable.
  def representatives(project)
    cli.say 'Identifying representatives'
    list = File.join(cli[:outdir], 'representatives.txt')
    if cli[:criterion] == :medoids
      result = project.result(:clade_finding)
      medoids = result && result.file_path(:medoids_gsp)
      raise 'Result incomplete: run failed' unless medoids

      FileUtils.cp(medoids, list)
    else
      src = File.join(MiGA::MiGA.root_path, 'utils/representatives.rb')
      # Escape both paths so that quotes or other shell metacharacters in
      # them cannot break (or alter) the command line
      cmd = [
        'ruby', Shellwords.escape(src), Shellwords.escape(project.path)
      ].join(' ')
      MiGA::MiGA.run_cmd("#{cmd} | cut -f 2", stdout: list)
    end
    File.readlines(list).map(&:chomp)
  end
end