bio-miga/miga

View on GitHub
lib/miga/cli/action/tax_dist.rb

Summary

Maintainability
B
5 hrs
Test Coverage
# @package MiGA
# @license Artistic-2.0

require 'miga/cli/action'
require 'miga/tax_index'
require 'tmpdir'

class MiGA::Cli::Action::TaxDist < MiGA::Cli::Action
  def parse_cli
    cli.parse do |opt|
      cli.opt_object(opt, [:project])
      cli.opt_filter_datasets(opt)
      opt.on(
        '-i', '--index FILE',
        'Pre-calculated tax-index (in tabular format) to be used',
        'If passed, dataset filtering arguments are ignored'
      ) { |v| cli[:index] = v }
      opt.on(
        '-m', '--metric STR',
        'Distance metric used to evaluate the distribution',
        'By default: AAI for genomes projects, ANI for clade projects'
      ) { |v| cli[:metric] = v.downcase }
    end
  end

  def cannid(a, b)
    (a > b ? [b, a] : [a, b]).join('-')
  end

  def perform
    dist = read_distances
    Dir.mktmpdir do |dir|
      tab = get_tab_index(dir)
      dist = traverse_taxonomy(tab, dist)
    end

    cli.say 'Generating report'
    dist.keys.each do |k|
      dist[k][5] = dist[k][4].reverse.join(' ')
      dist[k][4] = dist[k][4].first
      puts (k.split('-') + dist[k]).join("\t")
    end
  end

  private

  def read_distances
    p = cli.load_project
    cli[:metric] ||= p.clade? ? 'ani' : 'aai'
    res_n = "#{cli[:metric]}_distances"
    cli.say "Reading distances: 1-#{cli[:metric].upcase}"
    res = p.result(res_n)
    raise "#{res_n} not yet calculated" if res.nil?

    matrix = res.file_path(:matrix)
    raise "#{res_n} has no matrix" if matrix.nil?

    dist = {}
    mfh = (matrix =~ /\.gz$/) ?
      Zlib::GzipReader.open(matrix) : File.open(matrix, 'r')
    mfh.each_line do |ln|
      next if mfh.lineno == 1

      row = ln.chomp.split("\t")
      dist[cannid(row[1], row[2])] = [row[3], row[5], row[6], 0, ['root:biota']]
      cli.advance('Lines:', mfh.lineno, nil, false) if (mfh.lineno % 1_000) == 0
    end
    cli.say ''
    mfh.close
    dist
  end

  def get_tab_index(dir)
    if cli[:index].nil?
      ds = cli.load_and_filter_datasets
      ds.keep_if { |d| !d.metadata[:tax].nil? }

      cli.say 'Indexing taxonomy'
      tax_index = TaxIndex.new
      ds.each { |d| tax_index << d }
      tab = File.expand_path('index.tab', dir)
      File.open(tab, 'w') { |fh| fh.print tax_index.to_tab }
    else
      tab = cli[:index]
    end
    tab
  end

  def traverse_taxonomy(tab, dist)
    cli.say 'Traversing taxonomy'
    rank_i = 0
    Taxonomy.KNOWN_RANKS.each do |rank|
      next if rank == :ns

      rank_n = 0
      rank_i += 1
      in_rank = nil
      ds_name = []
      File.open(tab, 'r') do |fh|
        fh.each_line do |ln|
          if ln =~ /^ {0,#{(rank_i - 1) * 2}}\S+:\S+:/
            in_rank = nil
            ds_name = []
          elsif ln =~ /^ {#{rank_i * 2}}(#{rank}:(\S+)):/
            in_rank = $2 == '?' ? nil : $1
            ds_name = []
          elsif ln =~ /^ *# (\S+)/ && !in_rank.nil?
            ds_i = $1
            ds_name << ds_i
            ds_name.each do |ds_j|
              k = cannid(ds_i, ds_j)
              next if dist[k].nil?

              rank_n += 1
              dist[k][3] = rank_i
              dist[k][4].unshift in_rank
            end
          end
        end
      end
      cli.say "o #{rank}: #{rank_n} pairs of datasets"
    end
    dist
  end
end