lib/genevalidator/validation_length_cluster.rb
require 'forwardable'
require 'genevalidator/clusterization'
require 'genevalidator/exceptions'
require 'genevalidator/validation_report'
require 'genevalidator/validation_test'
module GeneValidator
##
# Class that stores the validation output information
class LengthClusterValidationOutput < ValidationReport
attr_reader :query_length
attr_reader :limits
attr_reader :result
def initialize(short_header, header, description, query_length, limits,
expected = :yes)
@short_header = short_header
@header = header
@description = description
@limits = limits
@query_length = query_length
@expected = expected
@result = validation
@plot_files = []
@approach = 'If the query sequence is well conserved and similar' \
' sequences (BLAST hits) are correct, we can expect' \
' query and hit sequences to have similar lengths.' \
' Here, we cluster the lengths of hit sequences and' \
' compare the length of our query sequence to the most' \
' dense cluster of hit lengths. '
@explanation = explain
@conclusion = conclude
end
def explain
diff = @result == :yes ? 'inside' : 'outside'
'The most dense length-cluster of BLAST hits includes' \
" sequences that are from #{@limits[0]} to #{@limits[1]} amino-acids" \
" long. The query sequence is #{@query_length} amino-acids long and" \
" is thus #{diff} the most dense length-cluster of BLAST hits."
end
def conclude
if @result == :yes # i.e. if inside the main cluster
'There is no reason to believe there is any problem with the length' \
' of the query sequence.'
else
size_diff = @query_length > @limits[1] ? 'long' : 'short'
"This suggests that the query sequence may be too #{size_diff}."
end
end
def print
"#{@query_length} #{@limits.to_s.gsub(' ', ' ')}"
end
def validation
return if @limits.nil?
if @query_length >= @limits[0] && @query_length <= @limits[1]
:yes
else
:no
end
end
end
##
# This class contains the methods necessary for
# length validation by hit length clusterization
class LengthClusterValidation < ValidationTest
extend Forwardable
def_delegators GeneValidator, :opt
attr_reader :clusters
attr_reader :max_density_cluster
##
# Initilizes the object
# Params:
# +type+: type of the predicted sequence (:nucleotide or :protein)
# +prediction+: a +Sequence+ object representing the blast query
# +hits+: a vector of +Sequence+ objects (representing blast hits)
# +dilename+: +String+ with the name of the fasta file
def initialize(prediction, hits)
super
@short_header = 'LengthCluster'
@header = 'Length Cluster'
@description = 'Check whether the prediction length fits most of the' \
' BLAST hit lengths, by 1D hierarchical clusterization.' \
' Meaning of the output displayed: Query_length' \
' [Main Cluster Length Interval]'
@cli_name = 'lenc'
end
##
# Validates the length of the predicted gene by comparing the length
# of the prediction to the most dense cluster
# The most dense cluster is obtained by hierarchical clusterization
# Plots are generated if required (see +plot+ variable)
# Output:
# +LengthClusterValidationOutput+ object
def run
raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
raise unless prediction.is_a?(Query) && hits[0].is_a?(Query)
start = Time.now
# get [clusters, max_density_cluster_idx]
clusterization = clusterization_by_length
@clusters = clusterization[0]
@max_density_cluster = clusterization[1]
limits = @clusters[@max_density_cluster].get_limits
query_length = @prediction.length_protein
@validation_report = LengthClusterValidationOutput.new(@short_header,
@header,
@description,
query_length,
limits)
plot1 = plot_histo_clusters
@validation_report.plot_files.push(plot1)
@validation_report.run_time = Time.now - start
@validation_report
rescue NotEnoughHitsError
@validation_report = ValidationReport.new('Not enough evidence', :warning,
@short_header, @header,
@description)
rescue StandardError
@validation_report = ValidationReport.new('Unexpected error', :error,
@short_header, @header,
@description)
@validation_report.errors.push 'Unexpected Error'
end
##
# Clusterization by length from a list of sequences
# Params:
# +debug+ (optional):: true to display debug information, false by default
# +lst+:: array of +Query+ objects
# +predicted_seq+:: +Query+ objetc
# Output
# output 1:: array of Cluster objects
# output 2:: the index of the most dense cluster
def clusterization_by_length(_debug = false,
lst = @hits,
predicted_seq = @prediction)
raise TypeError unless lst[0].is_a?(Query) && predicted_seq.is_a?(Query)
contents = lst.map { |x| x.length_protein.to_i }.sort { |a, b| a <=> b }
hc = HierarchicalClusterization.new(contents)
clusters = hc.hierarchical_clusterization
max_density = 0
max_density_cluster_idx = 0
clusters.each_with_index do |item, i|
next unless item.density > max_density
max_density = item.density
max_density_cluster_idx = i
end
[clusters, max_density_cluster_idx]
rescue TypeError => error
error_location = error.backtrace[0].scan(%r{([^/]+:\d+):.*})[0][0]
warn "Type error at #{error_location}."
warn ' Possible cause: one of the arguments of the' \
' "clusterization_by_length" method has not the proper type.'
exit 1
end
##
# Generates a json file containing data used for plotting the histogram
# of the length distribution given a lust of Cluster objects
# +output+: plot_path where to save the graph
# +clusters+: array of +Cluster+ objects
# +max_density_cluster+: index of the most dense cluster
# +prediction+: +Sequence+ object
# Output:
# +Plot+ object
def plot_histo_clusters(clusters = @clusters,
max_density_cluster = @max_density_cluster,
prediction = @prediction)
data = clusters.each_with_index.map do |cluster, i|
cluster.lengths.collect do |k, v|
{ 'key' => k, 'value' => v, 'main' => (i == max_density_cluster) }
end
end
Plot.new(data,
:bars,
'Length Cluster Validation: Distribution of BLAST hit lengths',
'Query Sequence, black;Most Dense Cluster,red;Other Hits, blue',
'Sequence Length',
'Number of Sequences',
prediction.length_protein)
end
end
end