lib/genevalidator/validation_open_reading_frame.rb
require 'bio'
require 'forwardable'
require 'genevalidator/exceptions'
require 'genevalidator/validation_report'
require 'genevalidator/validation_test'
module GeneValidator
##
# Class that stores the validation output information
class ORFValidationOutput < ValidationReport
attr_reader :orfs
attr_reader :coverage
attr_reader :threshold
attr_reader :mainORFFrame
attr_reader :result
def initialize(short_header, header, description, orfs, coverage,
longest_orf_frame, threshold = 80, expected = :yes)
@short_header = short_header
@header = header
@description = description
@orfs = orfs
@coverage = coverage
@threshold = threshold
@expected = expected
@result = validation
@plot_files = []
@mainORFFrame = longest_orf_frame
@approach = 'We expect the query sequence to encode a single gene,' \
' thus it should contain one main Open Reading Frame' \
' (ORF) that occupies most of the query sequence.' \
' This validation does not require any BLAST hits.'
@explanation = " The longest ORF is in frame #{@mainORFFrame}, where" \
" it occupies #{@coverage.round}% of the query" \
' sequence.'
@conclusion = conclude
end
def conclude
if @result == :yes
'There is no evidence to believe that there is any problem with the' \
' ORF of the query sequence.'
else
'This only represents a portion of the query sequence. In some cases' \
' this indicates that a frame shift exists in the query sequence.'
end
end
def print
@orfs.map { |elem| elem[1].length }.reduce(:+)
orf_list = ''
@orfs.map { |elem| orf_list << "#{elem[0]}:#{elem[1]}," }
"#{@coverage.round}% (frame #{@mainORFFrame})"
end
def validation
@coverage > @threshold ? :yes : :no
end
end
##
# This class contains the methods necessary for checking whether there is
# a main Open Reading Frame in the predicted sequence
class OpenReadingFrameValidation < ValidationTest
extend Forwardable
def_delegators GeneValidator, :config
##
# Initilizes the object
# Params:
# +prediction+: a +Sequence+ object representing the blast query
# +hits+: a vector of +Sequence+ objects (representing blast hits)
def initialize(prediction, hits)
super
@short_header = 'MainORF'
@header = 'Main ORF'
@description = 'Check whether there is a single main Open Reading' \
' Frame in the predicted gene. This validation does not' \
' require any BLAST hits.'
@cli_name = 'orf'
@type = config[:type]
end
##
# Check whether there is a main reading frame
# Output:
# +ORFValidationOutput+ object
def run
if type.to_s != 'nucleotide'
@validation_report = ValidationReport.new('', :unapplicable)
return @validation_report
end
raise unless prediction.is_a?(Query)
start = Time.new
orfs = get_orfs
longest_orf = orfs.max_by { |_key, hash| hash[:coverage] }
longest_orf_frame = longest_orf[1][:frame]
coverage = longest_orf[1][:coverage]
translated_length = longest_orf[1][:translated_length]
plot1 = plot_orfs(orfs, translated_length)
@validation_report = ORFValidationOutput.new(@short_header, @header,
@description, orfs,
coverage, longest_orf_frame)
@validation_report.run_time = Time.now - start
@validation_report.plot_files.push(plot1)
@validation_report
rescue StandardError
@validation_report = ValidationReport.new('Unexpected error', :error,
@short_header, @header,
@description)
@validation_report.errors.push 'Unexpected Error'
end
##
# Find open reading frames in the original sequence
# Applied only to nucleotide sequences
# Params:
# +orf_length+: minimimum ORF length, default 100
# +prediction+: +Sequence+ object
# Output:
# +Hash+ containing the data on ORFs
def get_orfs(_orf_length = 100, prediction = @prediction)
'-' if prediction.type != 'nucleotide'
seq = Bio::Sequence::NA.new(prediction.raw_sequence)
result = {}
key = 0
(1..6).each do |f|
s = seq.translate(f)
f = -1 if f == 4
f = -2 if f == 5
f = -3 if f == 6
s.scan(/(\w{30,})/) do |_orf|
orf_start = $LAST_MATCH_INFO.offset(0)[0] + 1
orf_end = $LAST_MATCH_INFO.offset(0)[1] + 1
coverage = (((orf_end - orf_start) / s.length.to_f) * 100).ceil
# reduce the orf_end and the translated length by 2% to increase the
# width between ORFs on the plot
chopping = s.length * 0.02
orf_end = (orf_end.to_f - chopping).floor
translated_length = (s.length - chopping).ceil
key += 1
result[key] = { frame: f, orf_start: orf_start, orf_end: orf_end,
coverage: coverage,
translated_length: translated_length }
end
end
result
end
##
# Plots the resions corresponding to open reading frames
# Param
# +orfs+: +Hash+ containing the open reading frame
# +output+: location where the plot will be saved in jped file format
# +prediction+: Sequence objects
def plot_orfs(orfs, translated_length)
raise QueryError unless orfs.is_a? Hash
data = []
# Create hashes for the Background
(-3..3).each do |frame|
next if frame == 0
data << { 'y' => frame, 'start' => 1, 'stop' => translated_length,
'color' => 'gray' }
end
# Create the hashes for the ORFs...
orfs.each do |_key, h|
data << { 'y' => h[:frame], 'start' => h[:orf_start],
'stop' => h[:orf_end], 'color' => 'red' }
end
Plot.new(data,
:lines,
'Open Reading Frames in all 6 Frames',
'Open Reading Frame (Minimimum Length: 30 amino acids),red',
'Offset in the Prediction',
'Reading Frame',
14)
end
end
end