lib/genevalidator/validation_duplication.rb from wurmlab/GeneValidator

lib/genevalidator/validation_duplication.rb
Summary

Maintainability

1 day
Test Coverage

84%
Issues
Coverage
require 'bio'
require 'forwardable'
require 'statsample'

require 'genevalidator/exceptions'
require 'genevalidator/ext/array'
require 'genevalidator/get_raw_sequences'
require 'genevalidator/validation_report'
require 'genevalidator/validation_test'

module GeneValidator
  ##
  # Class that stores the validation output information
  class DuplicationValidationOutput < ValidationReport
    attr_reader :pvalue
    attr_reader :average
    attr_reader :threshold
    attr_reader :result

    def initialize(short_header, header, description, pvalue, averages,
                   threshold = 0.05, expected = :yes)
      @short_header = short_header
      @header = header
      @description = description
      @pvalue      = pvalue
      @threshold   = threshold
      @result      = validation
      @expected    = expected
      @average     = averages.mean
      @approach    = 'We expect each BLAST hit to match each region of the' \
                     ' query at most once. Here, we calculate the' \
                     ' distribution of hit coverage against the query' \
                     ' sequence and use the Wilcoxon test to determine if it' \
                     ' is higher than 1.'
      @explanation = explain
      @conclusion  = conclude
    end

    def explain
      "The Wilcoxon test produced a p-value of #{prettify_evalue(@pvalue)}" \
      "#{@result == :no ? " (average = #{@average.round(2)})." : '.'}"
    end

    def conclude
      if @result == :yes
        'This suggests that the query sequence contains no erroneous' \
        ' duplications.'
      else
        'The null hypothesis is rejected - thus a region of the query' \
        ' sequence is likely repeated more than once.'
      end
    end

    def print
      @pvalue.round(2).to_s
    end

    def validation
      @pvalue > @threshold ? :yes : :no
    end

    def color
      validation == :yes ? 'success' : 'danger'
    end

    private

    # Copied from SequenceServer
    # Formats evalue (a float expressed in scientific notation) to "a x b^c".
    def prettify_evalue(evalue)
      evalue.to_s.sub(/(\d*\.\d*)e?([+-]\d*)?/) do
        s = format('%.3f', Regexp.last_match[1])
        s << " x 10<sup>#{Regexp.last_match[2]}</sup>" if Regexp.last_match[2]
        s
      end
    end
  end

  ##
  # This class contains the methods necessary for
  # finding duplicated subsequences in the predicted gene
  class DuplicationValidation < ValidationTest
    extend Forwardable
    def_delegators GeneValidator, :opt, :config

    attr_reader :raw_seq_file
    attr_reader :index_file_name
    attr_reader :raw_seq_file_load

    def initialize(prediction, hits)
      super
      @short_header      = 'Duplication'
      @header            = 'Duplication'
      @description       = 'Check whether there is a duplicated subsequence' \
                           ' in the predicted gene by counting the hsp' \
                           ' residue coverage of the prediction, for each hit.'
      @cli_name          = 'dup'
      @raw_seq_file      = opt[:raw_sequences]
      @index_file_name   = config[:raw_seq_file_index]
      @raw_seq_file_load = config[:raw_seq_file_load]
      @db                = opt[:db]
      @num_threads       = opt[:mafft_threads]
      @type              = config[:type]
    end

    ##
    # Check duplication in the first n hits
    # Output:
    # +DuplicationValidationOutput+ object
    def run(n = 10)
      raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
      raise unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
                   hits[0].is_a?(Query)

      start = Time.new
      # get the first n hits
      n_hits = [n - 1, @hits.length].min
      less_hits = @hits[0..n_hits]
      # get raw sequences for less_hits
      less_hits.delete_if do |hit|
        if hit.raw_sequence.nil?
          hit.raw_sequence = FetchRawSequences.run(hit.identifier,
                                                   hit.accession_no)
        end
        hit.raw_sequence.nil? ? true : false
      end

      raise NoInternetError if less_hits.length.zero?

      averages = []

      less_hits.each do |hit|
        coverage = Array.new(hit.length_protein, 0)
        # each residue of the protein should be evluated once only
        ranges_prediction = []

        hit.hsp_list.each do |hsp|
          # align subsequences from the hit and prediction that match
          if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
            hit_alignment   = hsp.hit_alignment
            query_alignment = hsp.query_alignment
          else
            align = find_local_alignment(hit, prediction, hsp)
            hit_alignment   = align[0]
            query_alignment = align[1]
          end

          coverage = check_multiple_coverage(hit_alignment, query_alignment,
                                             hsp, coverage, ranges_prediction)

          ranges_prediction << (hsp.match_query_from..hsp.match_query_to)
        end
        overlap = coverage.reject(&:zero?)
        if overlap != []
          averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
        end
      end

      # if all hsps match only one time
      if averages.reject { |x| x == 1 } == []
        @validation_report = DuplicationValidationOutput.new(@short_header,
                                                             @header,
                                                             @description, 1,
                                                             averages)
        @validation_report.run_time = Time.now - start
        return @validation_report
      end

      pval = wilcox_test(averages)

      @validation_report = DuplicationValidationOutput.new(@short_header,
                                                           @header,
                                                           @description, pval,
                                                           averages)
      @run_time = Time.now - start
      @validation_report
    rescue NotEnoughHitsError
      @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                                @short_header, @header,
                                                @description)
    rescue NoMafftInstallationError
      @validation_report = ValidationReport.new('Mafft error', :error,
                                                @short_header, @header,
                                                @description)
      @validation_report.errors.push NoMafftInstallationError
    rescue NoInternetError
      @validation_report = ValidationReport.new('Internet error', :error,
                                                @short_header, @header,
                                                @description)
      @validation_report.errors.push NoInternetError
    rescue StandardError
      @validation_report = ValidationReport.new('Unexpected error', :error,
                                                @short_header, @header,
                                                @description)
      @validation_report.errors.push 'Unexpected Error'
    end

    # Only run if the BLAST output does not contain hit alignmment
    def find_local_alignment(hit, prediction, hsp)
      # indexing in blast starts from 1
      hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
      query_local = prediction.raw_sequence[hsp.match_query_from -
                                            1..hsp.match_query_to - 1]

      # in case of nucleotide prediction sequence translate into protein
      # use translate with reading frame 1 because
      # to/from coordinates of the hsp already correspond to the
      # reading frame in which the prediction was read to match this hsp
      if @type == :nucleotide
        s = Bio::Sequence::NA.new(query_local)
        query_local = s.translate
      end

      opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
             '--thread', @num_threads.to_s]
      mafft = Bio::MAFFT.new('mafft', opt)

      # local alignment for hit and query
      seqs = [hit_local, query_local]
      report = mafft.query_align(seqs)
      report.alignment.map(&:to_s)
    rescue StandardError
      raise NoMafftInstallationError
    end

    def check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage,
                                ranges_prediction)
      # for each hsp of the curent hit
      # iterate through the alignment and count the matching residues
      [*(0..hit_alignment.length - 1)].each do |i|
        residue_hit   = hit_alignment[i]
        residue_query = query_alignment[i]
        next if [' ', '+', '-'].include?(residue_hit)
        next if residue_hit != residue_query
        # indexing in blast starts from 1
        idx_hit   = i + (hsp.hit_from - 1) -
                    hit_alignment[0..i].scan(/-/).length
        idx_query = i + (hsp.match_query_from - 1) -
                    query_alignment[0..i].scan(/-/).length
        coverage[idx_hit] += 1 unless in_range?(ranges_prediction, idx_query)
      end
      coverage
    end

    def in_range?(ranges, idx)
      ranges.each { |range| return true if range.member?(idx) }
      false
    end

    ##
    # wilcox test implementation from statsample ruby gem
    # many thanks to Claudio for helping us with the implementation!
    def wilcox_test(averages)
      wilcox = Statsample::Test.wilcoxon_signed_rank(
        Daru::Vector.new(averages),
        Daru::Vector.new(Array.new(averages.length, 1))
      )

      averages.length < 15 ? wilcox.probability_exact : wilcox.probability_z
    end
  end
end