Blahah/transrate

View on GitHub
lib/transrate/comparative_metrics.rb

Summary

Maintainability
C
1 day
Test Coverage
require 'set'
require 'crb-blast'

module Transrate

  class ComparativeMetrics

    attr_reader :rbh_per_contig
    attr_reader :rbh_per_reference
    attr_reader :reciprocal_hits
    attr_reader :has_run
    attr_reader :reference_coverage
    attr_reader :comp_stats

    def initialize assembly, reference, threads
      @assembly = assembly
      @reference = reference
      @threads = threads
      @comp_stats = Hash.new
    end

    def run
      crbblast = run_crb_blast
      calculate_reference_coverage crbblast
      @has_run = true
    end

    def calculate_reference_coverage crbblast
      # The reciprocals hash in crb blast has contig names as the key.
      # In order to look up by the reference name we need to reverse this.
      # Scan through the reciprocals and get this Hit objects and add them to
      #   the @reference object for each reference sequence
      get_reference_hits crbblast
      per_query_contig_reference_coverage
      per_target_contig_reference_coverage crbblast
    end

    def get_reference_hits crbblast
      @reference.each do |name, contig|
        contig.hits = []
      end
      crbblast.reciprocals.each do |query_id, list|
        list.each do |hit|
          unless @reference.assembly.key? hit.target
            raise TransrateError.new "#{hit.target} not in reference"
          end
          @reference[hit.target].hits << hit
        end
      end
      @comp_stats[:CRBB_hits] = crbblast.size
      @comp_stats[:n_contigs_with_CRBB] = crbblast.reciprocals.size
      @comp_stats[:p_contigs_with_CRBB] = crbblast.reciprocals.size/@assembly.size.to_f
    end

    def per_query_contig_reference_coverage
      # for each query contig in the @assembly find out how much it covers
      # the reference
      n_refs_with_recip = 0
      total_crbb_hits = 0
      @reference.each do |ref_contig_name, ref_contig|
        ref_contig.hits.each do |hit| # a Hit from query to target
          query_contig_name = hit.query
          unless @assembly.assembly.key? query_contig_name
            raise TransrateError.new "#{query_contig_name} not in assembly"
          end
          @assembly[query_contig_name].has_crb = true
          @assembly[query_contig_name].hits << hit
          raise TransrateError.new "query should not be protein" if hit.qprot
          if hit.tprot
            coverage = 3*hit.alnlen+2 - 3*hit.mismatches - 3*hit.gaps
            coverage /= 3.0*hit.tlen
          else
            coverage = hit.alnlen - hit.mismatches - hit.gaps
            coverage /= hit.tlen.to_f
          end
          @assembly[query_contig_name].reference_coverage = coverage
        end

        if ref_contig.hits.size > 0 # this reference has a crbblast hit
          n_refs_with_recip += 1
        end
        total_crbb_hits += ref_contig.hits.size
      end
      @comp_stats[:rbh_per_reference] = total_crbb_hits / @reference.size.to_f
      @comp_stats[:n_refs_with_CRBB] = n_refs_with_recip
      @comp_stats[:p_refs_with_CRBB] = n_refs_with_recip / @reference.size.to_f
    end

    def per_target_contig_reference_coverage crbblast
      # each target sequence in the reference can have multiple query contigs
      # hit it. to calculate the reference coverage you can't just add up the
      # alignment lengths. you have to make sure that overlaps are taken into
      # account
      coverage_thresholds = [0.25, 0.5, 0.75, 0.85, 0.95]
      coverage_totals = [0, 0, 0, 0, 0]
      prot = crbblast.target_is_prot
      total_coverage = 0
      total_length = 0
      @reference.each do |ref_contig_name, ref_contig|
        if prot
          covered = Array.new(ref_contig.length*3, false)
        else
          covered = Array.new(ref_contig.length, false)
        end
        ref_contig.hits.each_with_index do |hit, i| # a Hit from query to target
          if prot
            if hit.qstart % 3 == 0
              tstart = 3*hit.tstart-4
              tend = 3*hit.tend
            elsif hit.qstart % 3 == 1
              tstart = 3*hit.tstart-2
              tend = 3*hit.tend
            elsif hit.qstart % 3 == 2
              tstart = 3*hit.tstart-3
              tend = 3*hit.tend-1
            end
            if hit.qlen % 3 == 1
              tend += 1
            elsif hit.qlen % 3 == 2
              tend += 2
            end
          else
            tstart = hit.tstart
            tend = hit.tend
          end
          (tstart..tend).each do |b|
            covered[b-1] = true # blast coords are 1 indexed
          end
        end
        coverage = covered.reduce(0) { |sum, v| v ? sum + 1 : sum }
        ref_p = coverage / covered.length.to_f
        ref_contig.reference_coverage = ref_p
        coverage_thresholds.each_with_index do |n, index|
          if ref_p >= n
            coverage_totals[index] += 1
          end
        end

        total_coverage += coverage
        total_length += covered.length
      end

      # calculate proportion of ref sequences with coveragre over thresholds
      coverage_thresholds.each_with_index do |p, i|
        @comp_stats["cov#{(100*p).to_i}".to_sym] = coverage_totals[i]
        @comp_stats["p_cov#{(100*p).to_i}".to_sym] =
                                        coverage_totals[i]/@reference.size.to_f
      end
      @comp_stats[:reference_coverage] = total_coverage / total_length.to_f
    end

    def run_crb_blast
      crbblast = CRB_Blast::CRB_Blast.new @assembly.file, @reference.file
      crbblast.run(1e-5, @threads, true)
      crbblast
    end

  end # ComparativeMetrics

end # Transrate