wurmlab/GeneValidator

View on GitHub
lib/genevalidator/validation_gene_merge.rb

Summary

Maintainability
C
1 day
Test Coverage
A
97%
require 'forwardable'
require 'statsample'

require 'genevalidator/exceptions'
require 'genevalidator/ext/array'
require 'genevalidator/validation_report'
require 'genevalidator/validation_test'

module GeneValidator
  ##
  # Class that stores the validation output information
  class GeneMergeValidationOutput < ValidationReport
    attr_reader :slope
    attr_reader :threshold_down
    attr_reader :threshold_up
    attr_reader :unimodality
    attr_reader :result

    # These thresholds are emperically chosen.
    UPPER_THRESHOLD = 1.2 # radians
    LOWER_THRESHOLD = 0.4 # radians

    def initialize(short_header, header, description, slope, unimodality,
                   expected = :no)
      @short_header = short_header
      @header = header
      @description = description
      @slope          = slope.round(1)
      @slope          = @slope.abs if @slope == -0.0
      @unimodality    = unimodality
      @threshold_down = LOWER_THRESHOLD
      @threshold_up   = UPPER_THRESHOLD
      @result         = validation
      @expected       = expected
      @plot_files     = []
      @approach       = 'We expect the query sequence to encode a single' \
                        ' protein-coding gene. Here, we analyse the' \
                        ' High-scoring Segment Pairs (HSPs) identified by' \
                        ' BLAST to determine whether the query includes' \
                        ' sequence from two or more genes.'
      @explanation    = explain
      @conclusion     = conclude
    end

    def explain
      if @unimodality
        'The start coordinates and the end coordinates of HSPs are unimodally' \
        ' distributed.'
      else
        'The distribution of start and/or end-coordinates of HSPs are' \
        ' multi-modal. To detect potential problems we performed a linear'\
        ' regression (with coordinates weighted inversely proportionally to '\
        " hit strength). The resulting slope is #{@slope}."
      end
    end

    def conclude
      if @unimodality
        'This suggest that the query sequence represents a single gene.'
      else
        diff = @result == :yes ? ' within' : ' outside'
        t = "This slope is #{diff} our empirically calculated thresholds" \
            ' (0.4 and 1.2).'
        t << if @result == :yes
               ' This suggests the query contains sequence from two or more' \
                    ' different genes.'
             else
               ' There is no evidence that the query contains sequence from' \
                    ' multiple genes.'
             end
        t
      end
    end

    def print
      @slope.nan? ? 'Inf' : @slope.to_s
    end

    def validation
      @slope > threshold_down && @slope < threshold_up ? :yes : :no
    end

    def color
      validation == :no ? 'success' : 'danger'
    end
  end

  ##
  # This class contains the methods necessary for
  # checking whether there is evidence that the
  # prediction is a merge of multiple genes
  class GeneMergeValidation < ValidationTest
    extend Forwardable
    def_delegators GeneValidator, :opt

    attr_reader :prediction
    attr_reader :hits

    ##
    # Initilizes the object
    # Params:
    # +prediction+: a +Sequence+ object representing the blast query
    # +hits+: a vector of +Sequence+ objects (representing blast hits)
    # +plot_path+: name of the input file, used when generatig the plot files
    # +boundary+: the offset of the hit from which we start analysing the hit
    def initialize(prediction, hits, boundary = 10)
      super
      @short_header = 'GeneMerge'
      @header       = 'Gene Merge'
      @description  = 'Check whether BLAST hits make evidence about a merge' \
                      ' of two genes that match the predicted gene.'
      @cli_name     = 'merge'
      @boundary     = boundary
    end

    ##
    # Validation test for gene merge
    # Output:
    # +GeneMergeValidationOutput+ object
    def run
      raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
      raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)

      start = Time.now

      pairs = hits.map do |hit|
        Pair.new(hit.hsp_list.map(&:match_query_from).min,
                 hit.hsp_list.map(&:match_query_to).max)
      end
      xx_0 = pairs.map(&:x)
      yy_0 = pairs.map(&:y)

      # minimum start shoud be at 'boundary' residues
      xx = xx_0.map { |x| x < @boundary ? @boundary : x }

      # maximum end should be at length - 'boundary' residues
      yy = yy_0.map do |y|
        if y > @prediction.raw_sequence.length - @boundary
          @prediction.raw_sequence.length - @boundary
        else
          y
        end
      end

      line_slope = slope(xx, yy, (1..hits.length).map { |x| 1 / (x + 0.0) })
      ## YW - what is this weighting?

      unimodality = false
      if unimodality_test(xx, yy)
        unimodality = true
        lm_slope = 0.0
      else
        lm_slope = line_slope[1]
      end

      y_intercept = line_slope[0]

      @validation_report = GeneMergeValidationOutput.new(@short_header, @header,
                                                         @description, lm_slope,
                                                         unimodality)
      plot1 = if unimodality
                plot_2d_start_from
              else
                plot_2d_start_from(lm_slope, y_intercept)
              end

      @validation_report.plot_files.push(plot1)
      plot2 = plot_matched_regions
      @validation_report.plot_files.push(plot2)
      @validation_report.run_time = Time.now - start
      @validation_report
    rescue NotEnoughHitsError
      @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                                @short_header, @header,
                                                @description)
    rescue StandardError
      @validation_report = ValidationReport.new('Unexpected error', :error,
                                                @short_header, @header,
                                                @description)
      @validation_report.errors.push 'Unexpected Error'
    end

    ##
    # Generates a json file containing data used for
    # plotting the matched region of the prediction for each hit
    # Param
    # +output+: location where the plot will be saved in jped file format
    # +hits+: array of Sequence objects
    # +prediction+: Sequence objects
    def plot_matched_regions(hits = @hits)
      no_lines = hits.length

      hits_less = hits[0..[no_lines, hits.length - 1].min]

      data = hits_less.each_with_index.map do |hit, i|
        { 'y' => i,
          'start' => hit.hsp_list.map(&:match_query_from).min,
          'stop' => hit.hsp_list.map(&:match_query_to).max,
          'color' => 'black',
          'dotted' => 'true' }
      end .flatten +
             hits_less.each_with_index.map do |hit, i|
               hit.hsp_list.map do |hsp|
                 { 'y' => i,
                   'start' => hsp.match_query_from,
                   'stop' => hsp.match_query_to,
                   'color' => 'orange' }
               end
             end .flatten

      Plot.new(data,
               :lines,
               'Gene Merge Validation: Query coord covered by blast hit' \
               ' (1 line/hit)',
               '',
               'Offset in Prediction',
               'Hit Number',
               hits_less.length)
    end

    ##
    # Generates a json file containing data used for
    # plotting the start/end of the matched region offsets in the prediction
    # Param
    # +slope+: slope of the linear regression line
    # +y_intercept+: the ecuation of the line is y= slope*x + y_intercept
    # +output+: location where the plot will be saved in jped file format
    # +hits+: array of Sequence objects
    def plot_2d_start_from(slope = nil, y_intercept = nil, hits = @hits)
      pairs = hits.map do |hit|
        Pair.new(hit.hsp_list.map(&:match_query_from).min,
                 hit.hsp_list.map(&:match_query_to).max)
      end

      data = hits.map do |hit|
        { 'x' => hit.hsp_list.map(&:match_query_from).min,
          'y' => hit.hsp_list.map(&:match_query_to).max,
          'color' => 'red' }
      end

      Plot.new(data,
               :scatter,
               'Gene Merge Validation: Start/end of matching hit coord. on' \
               ' query (1 point/hit)',
               '',
               'Start Offset (most left hsp)',
               'End Offset (most right hsp)',
               y_intercept.to_s,
               slope.to_s)
    end

    ##
    # Caclulates the slope of the regression line
    # give a set of 2d coordonates of the start/stop offests of the hits
    # Param
    # xx: +Array+ of integers
    # yy : +Array+ of integers
    # weights: +Array+ of integers
    # Output:
    # The ecuation of the regression line: [y slope]
    def slope(xx, yy, weights = nil)
      weights = Array.new(hits.length, 1) if weights.nil?

      # calculate the slope
      xx_weighted = xx.each_with_index.map { |x, i| x * weights[i] }
      yy_weighted = yy.each_with_index.map { |y, i| y * weights[i] }

      denominator = weights.reduce(0) { |a, e| a + e }

      x_mean = xx_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)
      y_mean = yy_weighted.reduce(0) { |a, e| a + e } / (denominator + 0.0)

      numerator = (0...xx.length).reduce(0) do |a, e|
        a + (weights[e] * (xx[e] - x_mean) * (yy[e] - y_mean))
      end

      denominator = (0...xx.length).reduce(0) do |a, e|
        a + (weights[e] * ((xx[e] - x_mean)**2))
      end

      slope = numerator / (denominator + 0.0)
      y_intercept = y_mean - (slope * x_mean)

      [y_intercept, slope]
    end

    ##
    # Caclulates the slope of the regression line
    # give a set of 2d coordonates of the start/stop offests of the hits
    # Param
    # xx : +Array+ of integers
    # yy : +Array+ of integers
    # Output:
    # The ecuation of the regression line: [y slope]
    def slope_statsample(xx, yy)
      sr = Statsample::Regression.simple(xx.to_scale, yy.to_scale)
      [sr.a, sr.b]
    end

    ##
    # xx and yy are the projections of the 2-d data on the two axes
    def unimodality_test(xx, yy)
      mean_x = xx.mean
      median_x = xx.median
      mode_x = xx.mode
      sd_x = xx.standard_deviation

      if sd_x == 0
        cond1_x = true
        cond2_x = true
        cond3_x = true
      else
        cond1_x = ((mean_x - median_x).abs / (sd_x + 0.0)) < Math.sqrt(0.6)
        cond2_x = ((mean_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
        cond3_x = ((median_x - mode_x).abs / (sd_x + 0.0)) < Math.sqrt(0.3)
      end

      mean_y = yy.mean
      median_y = yy.median
      mode_y = yy.mode
      sd_y = yy.standard_deviation

      if sd_y == 0
        cond1_y = true
        cond2_y = true
        cond3_y = true
      else
        cond1_y = ((mean_y - median_y).abs / (sd_y + 0.0)) < Math.sqrt(0.6)
        cond2_y = ((mean_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
        cond3_y = ((median_y - mode_y).abs / (sd_y + 0.0)) < Math.sqrt(0.3)
      end

      if cond1_x && cond2_x && cond3_x && cond1_y && cond2_y && cond3_y
        true
      else
        false
      end
    end
  end
end