wurmlab/GeneValidator

View on GitHub
lib/genevalidator/blast.rb

Summary

Maintainability
A
25 mins
Test Coverage
D
68%
require 'bio'
require 'bio-blastxmlparser'
require 'forwardable'

require 'genevalidator/exceptions'
require 'genevalidator/hsp'
require 'genevalidator/output'
require 'genevalidator/query'

module GeneValidator
  # Contains methods that run BLAST and methods that analyse sequences
  class BlastUtils
    class << self
      extend Forwardable
      def_delegators GeneValidator, :opt, :config, :dirs

      EVALUE = 1e-5

      # Runs BLAST on an input file
      # Params:
      # +blast_type+: blast command in String format (e.g 'blastx' or 'blastp')
      # +opt+: Hash made of :input_fasta_file :blast_xml_file, :db, :num_threads
      # +gapopen+: gapopen blast parameter
      # +gapextend+: gapextend blast parameter
      # +nr_hits+: max number of hits
      # Output:
      # XML file
      def run_blast_on_input_file
        remote = opt[:db].match?(/remote/) ? true : false
        print_blast_info_text(remote)

        log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt')
        `#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1`

        return unless File.zero?(opt[:blast_xml_file])
        warn 'Blast failed to run on the input file.'
        if remote
          warn 'You are using BLAST with a remote database. Please'
          warn 'ensure that you have internet access and try again.'
        else
          warn 'Please ensure that the BLAST database exists and try again.'
        end
        exit 1
      end

      ##
      # Parses the next query from the blast xml output query
      # Params:
      # +iterator+: blast xml iterator for hits
      # +type+: the type of the sequence: :nucleotide or :protein
      # Outputs:
      # Array of +Sequence+ objects corresponding to the list of hits
      def parse_next(iterator)
        iter = iterator.next

        # parse blast the xml output and get the hits
        # hits obtained are proteins! (we use only blastp and blastx)
        hits = []
        iter.each do |hit|
          seq                = Query.new
          seq.length_protein = hit.len.to_i
          seq.type           = :protein
          seq.identifier     = hit.hit_id
          seq.definition     = hit.hit_def
          seq.accession_no   = hit.accession
          seq.hsp_list       = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) }

          hits << seq
        end
        hits
      rescue StopIteration
        nil
      end

      ##
      # Method copied from sequenceserver/sequencehelpers.rb
      # Splits input at putative fasta definition lines (like ">adsfadsf");
      # then guesses sequence type for each sequence.
      # If not enough sequence to determine, returns nil.
      # If 2 kinds of sequence mixed together, raises ArgumentError
      # Otherwise, returns :nucleotide or :protein
      # Params:
      # +sequence_string+: String to validate
      # Output:
      # nil, :nucleotide or :protein
      def type_of_sequences(fasta_format_string)
        # the first sequence does not need to have a fasta definition line
        sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
        # get all sequence types
        sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
                                  .uniq.compact

        return nil if sequence_types.empty?
        sequence_types.first if sequence_types.length == 1
      end

      ##
      # Strips all non-letter characters. guestimates sequence based on that.
      # If less than 10 useable characters... returns nil
      # If more than 90% ACGTU returns :nucleotide. else returns :protein
      # Params:
      # +sequence_string+: String to validate
      # Output:
      # nil, :nucleotide or :protein
      def guess_sequence_type(sequence_string)
        # removing non-letter and ambiguous characters
        cleaned_sequence = sequence_string.gsub(/[^A-Z]|[NX]/i, '')
        return nil if cleaned_sequence.length < 10 # conservative

        type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
        type == Bio::Sequence::NA ? :nucleotide : :protein
      end

      ##
      #
      def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
        lines = File.foreach(file).first(10)
        seqs = ''
        lines.each { |l| seqs += l.chomp unless l[0] == '>' }
        guess_sequence_type(seqs)
      end

      private

      def blast_cmd(opt, config, remote)
        blast_type = config[:type] == :protein ? 'blastp' : 'blastx'
        # -num_threads is not supported on remote databases
        threads = remote ? '' : "-num_threads #{opt[:num_threads]}"

        "#{blast_type} -query '#{opt[:input_fasta_file]}'" \
        " -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \
        " -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}"
      end

      def print_blast_info_text(remote)
        warn '' # a blank line
        if remote
          warn '==> BLAST search and subsequent analysis will be done on a remote'
          warn '    database. Please use a local database for larger analysis.'
        else
          warn '==> Running BLAST. This may take a while.'
        end
        warn '' # a blank line
      end
    end
  end
end