lib/genevalidator/validation_alignment.rb
require 'bio'
require 'forwardable'
require 'genevalidator/exceptions'
require 'genevalidator/get_raw_sequences'
require 'genevalidator/validation_report'
require 'genevalidator/validation_test'
module GeneValidator
##
# Class that stores the validation output information
class AlignmentValidationOutput < ValidationReport
attr_reader :gaps
attr_reader :extra_seq
attr_reader :consensus
attr_reader :threshold
attr_reader :result
def initialize(short_header, header, description, gaps = 0, extra_seq = 0,
consensus = 1, threshold = 20, expected = :yes)
@short_header = short_header
@header = header
@description = description
@gaps = (gaps * 100).round.to_s + '%'
@extra_seq = (extra_seq * 100).round.to_s + '%'
@consensus = (consensus * 100).round.to_s + '%'
@threshold = threshold
@result = validation
@expected = expected
@plot_files = []
@approach = 'We expect the query sequence to be similar to the top' \
' ten BLAST hits. Here, we create a statistical' \
' consensus model of those top hits and compare the' \
' query to this model.'
@explanation = "The query sequence includes #{@consensus} amino-acid" \
' residues present in the consensus model.' \
" #{@extra_seq} of residues in the query sequence are" \
' absent from the consensus profile. ' \
" #{@gaps} of residues in the consensus profile are" \
' absent from the query sequence.'
@conclusion = conclude
end
def conclude
if @result == :yes
'There is no evidence based on the top 10 BLAST hits to suggest any' \
' problems with the query sequence.'
else
t = 'These results suggest that there may be some problems with' \
' the query sequence.'
t1 = ''
t2 = ''
t3 = '' # Create empty string variables
if (1 - consensus.to_i) > @threshold
t1 = ' There is low conservation of residues between the' \
' statistical profile and the query sequence (the cut-off' \
' is 80%).'
end
if extra_seq.to_i > @threshold
t2 = " The query sequence has a high percentage (#{@extra_seq})" \
' of extra residues absent from the statistical profile' \
' (the cut-off is 20%).'
end
if gaps.to_i > @threshold
t3 = " The query sequence has a high percentage (#{@gaps}) of" \
' missing residues when compared to the statistical profile' \
' (the cut-off is 20%).'
end
t + t1 + t2 + t3
end
end
def print
"#{@consensus} conserved; #{@extra_seq} extra;" \
" #{@gaps} missing."
end
def validation
if gaps.to_i < @threshold && extra_seq.to_i < @threshold &&
(1 - consensus.to_i) < @threshold
:yes
else
:no
end
end
end
##
# This class contains the methods necessary for
# validations based on multiple alignment
class AlignmentValidation < ValidationTest
extend Forwardable
def_delegators GeneValidator, :opt, :config
attr_reader :multiple_alignment
attr_reader :raw_seq_file
attr_reader :index_file_name
attr_reader :raw_seq_file_load
##
# Initilizes the object
# Params:
# +prediction+: a +Sequence+ object representing the blast query
# +hits+: a vector of +Sequence+ objects (representing blast hits)
# +plot_path+: name of the fasta file
def initialize(prediction, hits)
super
@short_header = 'MissingExtraSequences'
@cli_name = 'align'
@header = 'Missing/Extra Sequences'
@description = 'Finds missing and extra sequences in the' \
' prediction, based on the multiple alignment of' \
' the best hits. Also counts the percentage of' \
' the conserved regions that appear in the' \
' prediction.'
@raw_seq_file = opt[:raw_sequences]
@index_file_name = config[:raw_seq_file_index]
@raw_seq_file_load = config[:raw_seq_file_load]
@db = opt[:db]
@multiple_alignment = []
@num_threads = opt[:mafft_threads]
@type = config[:type]
end
##
# Find gaps/extra regions based on the multiple alignment
# of the first n hits
# Output:
# +AlignmentValidationOutput+ object
def run
n = opt[:min_blast_hits] < 10 ? 10 : opt[:min_blast_hits]
n = 50 if n > 50
raise NotEnoughHitsError if hits.length < n
raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
start = Time.new
# get the first n hits
n_hits = [n - 1, @hits.length].min
less_hits = @hits[0..n_hits]
# get raw sequences for less_hits
less_hits.delete_if do |hit|
if hit.raw_sequence.nil?
hit.raw_sequence = FetchRawSequences.run(hit.identifier,
hit.accession_no)
end
hit.raw_sequence.nil? ? true : false
end
raise NoInternetError if less_hits.length.zero?
# in case of nucleotide prediction sequence translate into protein
# translate with the reading frame of all hits considered for alignment
reading_frames = less_hits.map(&:reading_frame).uniq
raise ReadingFrameError if reading_frames.length != 1
if @type == :nucleotide
s = Bio::Sequence::NA.new(prediction.raw_sequence)
prediction.protein_translation = s.translate(reading_frames[0])
end
# multiple align sequences from less_hits with the prediction
# the prediction is the last sequence in the vector
@multiple_alignment = multiple_align_mafft(prediction, less_hits)
out = get_sm_pssm(@multiple_alignment[0..@multiple_alignment.length - 2])
sm = out[0]
freq = out[1]
# remove isolated residues from the predicted sequence
index = @multiple_alignment.length - 1
prediction_raw = remove_isolated_residues(@multiple_alignment[index])
# remove isolated residues from the statistical model
sm = remove_isolated_residues(sm)
a1 = get_consensus(@multiple_alignment[0..@multiple_alignment.length - 2])
plot1 = plot_alignment(freq)
gaps = gap_validation(prediction_raw, sm)
extra_seq = extra_sequence_validation(prediction_raw, sm)
consensus = consensus_validation(prediction_raw, a1)
@validation_report = AlignmentValidationOutput.new(@short_header, @header,
@description, gaps,
extra_seq, consensus)
@validation_report.plot_files.push(plot1)
@validation_report.run_time = Time.now - start
@validation_report
rescue NotEnoughHitsError
@validation_report = ValidationReport.new('Not enough evidence',
:warning, @short_header,
@header, @description)
rescue NoMafftInstallationError
@validation_report = ValidationReport.new('Mafft error', :error,
@short_header, @header,
@description)
@validation_report.errors.push NoMafftInstallationError
rescue NoInternetError
@validation_report = ValidationReport.new('Internet error', :error,
@short_header, @header,
@description)
@validation_report.errors.push NoInternetError
rescue ReadingFrameError
@validation_report = ValidationReport.new('Multiple reading frames',
:error, @short_header,
@header, @description)
@validation_report.errors.push 'Multiple reading frames Error'
rescue StandardError
@validation_report = ValidationReport.new('Unexpected error', :error,
@short_header, @header,
@description)
@validation_report.errors.push 'Unexpected Error'
end
##
# Builds the multiple alignment between
# all the hits and the prediction
# using MAFFT tool
# Also creates a fasta file with the alignment
# Params:
# +prediction+: a +Sequence+ object representing the blast query
# +hits+: a vector of +Sequience+ objects (usually representing blast hits)
# +path+: path of mafft installation
# Output:
# Array of +String+s, corresponding to the multiple aligned sequences
# the prediction is the last sequence in the vector
def multiple_align_mafft(prediction, hits)
raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
'--thread', @num_threads.to_s]
mafft = Bio::MAFFT.new('mafft', opt)
sequences = hits.map do |h|
# remove the seq id - as MAFFT sometimes has an issue with this
f = Bio::FastaFormat.new(h.raw_sequence)
# check if fasta sequence otherwise returne original entry
f.seq.empty? ? h.raw_sequence : f.seq
end
sequences.push(prediction.protein_translation)
report = mafft.query_align(sequences)
alignment = report.alignment.map(&:to_s)
raise NoMafftInstallationError if alignment.empty?
alignment
rescue StandardError
raise NoMafftInstallationError
end
##
# Returns the consensus regions among
# a set of multiple aligned sequences
# i.e positions where there is the same
# element in all sequences
# Params:
# +ma+: array of +String+s, corresponding to the multiple aligned sequences
# Output:
# +String+ with the consensus regions
def get_consensus(ma)
align = Bio::Alignment.new(ma)
align.consensus
end
##
# Returns the percentage of gaps in the prediction
# with respect to the statistical model
# Params:
# +prediction+: +String+ corresponding to the prediction sequence
# +sm+: +String+ corresponding to the statistical model
# Output:
# +Fixnum+ with the score
def gap_validation(prediction_raw, sm)
return 1 if prediction_raw.length != sm.length
# find gaps in the prediction and
# not in the statistical model
no_gaps = 0
(0..sm.length - 1).each do |i|
no_gaps += 1 if prediction_raw[i] == '-' && sm[i] != '-'
end
no_gaps / (sm.length + 0.0)
end
##
# Returns the percentage of extra sequences in the prediction
# with respect to the statistical model
# Params:
# +prediction+: +String+ corresponding to the prediction sequence
# +sm+: +String+ corresponding to the statistical model
# Output:
# +Fixnum+ with the score
def extra_sequence_validation(prediction_raw, sm)
return 1 if prediction_raw.length != sm.length
# find residues that are in the prediction
# but not in the statistical model
no_insertions = 0
(0..sm.length - 1).each do |i|
no_insertions += 1 if prediction_raw[i] != '-' && sm[i] == '-'
end
no_insertions / (sm.length + 0.0)
end
##
# Returns the percentage of consesnsus residues from the ma
# that are in the prediction
# Params:
# +prediction_raw+: +String+ corresponding to the prediction sequence
# +consensus+: +String+ corresponding to the statistical model
# Output:
# +Fixnum+ with the score
def consensus_validation(prediction_raw, consensus)
return 1 if prediction_raw.length != consensus.length
# no of conserved residues among the hits
no_conserved_residues = consensus.length - consensus.scan(/[\?-]/).length
return 1 if no_conserved_residues.zero?
# no of conserved residues from the hita that appear in the prediction
no_conserved_pred = consensus.split(//).each_index.count { |j| consensus[j] != '-' && consensus[j] != '?' && consensus[j] == prediction_raw[j] }
no_conserved_pred / (no_conserved_residues + 0.0)
end
##
# Builds a statistical model from
# a set of multiple aligned sequences
# based on PSSM (Position Specific Matrix)
# Params:
# +ma+: array of +String+s, corresponding to the multiple aligned sequences
# +threshold+: percentage of genes that are considered in statistical model
# Output:
# +String+ representing the statistical model
# +Array+ with the maximum frequeny of the majoritary residue for each
# position
def get_sm_pssm(ma, threshold = 0.7)
sm = ''
freq = []
(0..ma[0].length - 1).each do |i|
freqs = Hash.new(0)
ma.map { |seq| seq[i] }.each { |res| freqs[res] += 1 }
# get the residue with the highest frequency
max_freq = freqs.map { |_res, n| n }.max
residue = freqs.map { |res, n| n == max_freq ? res : [] }.flatten[0]
freq << residue == '-' ? 0 : (max_freq / ma.length.to_f)
sm += (max_freq / ma.length.to_f) >= threshold ? residue : '?'
end
[sm, freq]
end
##
# Remove isolated residues inside long gaps
# from a given sequence
# Params:
# +seq+:+String+: sequence of residues
# +len+:+Fixnum+: number of isolated residues to be removed
# Output:
# +String+: the new sequence
def remove_isolated_residues(seq, len = 2)
gap_starts = seq.to_enum(:scan, /(-\w{1,#{len}}-)/i).map { |_m| $`.size + 1 }
# remove isolated residues
gap_starts.each do |i|
(i..i + len - 1).each { |j| seq[j] = '-' if isalpha(seq[j]) }
end
# remove isolated gaps
res_starts = seq.to_enum(:scan, /([?\w]-{1,2}[?\w])/i).map { |_m| $`.size + 1 }
res_starts.each do |i|
(i..i + len - 1).each { |j| seq[j] = '?' if seq[j] == '-' }
end
seq
end
##
# Returns true if the string contains only letters
# and false otherwise
def isalpha(str)
!str.match(/[^A-Za-z]/)
end
##
# converts an array of integers into array of ranges
def array_to_ranges(ar)
prev = ar[0]
ranges = ar.slice_before do |e|
prev2 = prev
prev = e
prev2 + 1 != e
end.map { |a| a[0]..a[-1] }
ranges
end
# Generates a json file cotaining data used for plotting
# lines for multiple hits alignment, prediction and statistical model
# Params:
# +freq+: +String+ residue frequency from the statistical model
# +output+: plot_path of the json file
# +ma+: +String+ array with the multiple alignmened hits and prediction
def plot_alignment(freq, ma = @multiple_alignment)
# get indeces of consensus in the multiple alignment
consensus = get_consensus(ma[0..ma.length - 2])
consensus_idxs = consensus.split(//).each_index.select { |j| isalpha(consensus[j]) }
consensus_ranges = array_to_ranges(consensus_idxs)
consensus_all = get_consensus(ma)
consensus_all_idxs = consensus_all.split(//).each_index.select { |j| isalpha(consensus_all[j]) }
consensus_all_ranges = array_to_ranges(consensus_all_idxs)
match_alignment = ma[0..ma.length - 2].each_with_index.map { |seq, _j| seq.split(//).each_index.select { |j| isalpha(seq[j]) } }
match_alignment_ranges = match_alignment.map { |e| array_to_ranges(e) }
query_alignment = ma[ma.length - 1].split(//).each_index.select { |j| isalpha(ma[ma.length - 1][j]) }
query_alignment_ranges = array_to_ranges(query_alignment)
len = ma[0].length
# plot statistical model
data = freq.each_with_index.map { |h, j| { 'y' => ma.length, 'start' => j, 'stop' => j + 1, 'color' => 'orange', 'height' => h } } +
# hits
match_alignment_ranges.each_with_index.map { |ranges, j| ranges.map { |range| { 'y' => ma.length - j - 1, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } } }.flatten +
ma[0..ma.length - 2].each_with_index.map { |_seq, j| consensus_ranges.map { |range| { 'y' => j + 1, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } } }.flatten +
# plot prediction
[{ 'y' => 0, 'start' => 0, 'stop' => len, 'color' => 'gray', 'height' => -1 }] +
query_alignment_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'red', 'height' => -1 } }.flatten +
# plot consensus
consensus_all_ranges.map { |range| { 'y' => 0, 'start' => range.first, 'stop' => range.last, 'color' => 'yellow', 'height' => -1 } }.flatten
y_axis_values = 'Prediction'
(1..ma.length - 1).each { |i| y_axis_values << ", hit #{i}" }
y_axis_values << ', Statistical Model'
Plot.new(data,
:align,
'Missing/Extra sequences Validation: Multiple Align. &' \
'Statistical model of hits',
'Aligned Hit Sequence, red; Conserved Region, Yellow',
'Offset in the Alignment',
'',
ma.length + 1,
y_axis_values)
end
end
end