wurmlab/GeneValidator

View on GitHub
lib/genevalidator/get_raw_sequences.rb

Summary

Maintainability
A
3 hrs
Test Coverage
F
45%
require 'bio-blastxmlparser'
require 'forwardable'
require 'net/http'
require 'tempfile'
require 'uri'
require 'yaml'

require 'genevalidator/exceptions'
require 'genevalidator/query'

module GeneValidator
  # Gets the raw sequences for each hit in a BLAST output file
  class RawSequences
    class <<self
      extend Forwardable
      def_delegators GeneValidator, :opt, :config, :dirs

      def init
        warn '==> Extracting fasta sequences for each BLAST HSP from the' \
             ' BLAST database'

        @blast_file = opt[:blast_xml_file] if opt[:blast_xml_file]
        @blast_file = opt[:blast_tabular_file] if opt[:blast_tabular_file]

        fname = File.basename(@blast_file)
        opt[:raw_sequences] = File.join(dirs[:tmp_dir], "#{fname}.raw_seq")
        @index_file         = File.join(dirs[:tmp_dir], "#{fname}.index")
      end

      ##
      # Obtains raw_sequences from BLAST output file...
      def run
        init
        if opt[:db].match?(/remote/)
          write_a_raw_seq_file(opt[:raw_sequences], 'remote')
        else
          write_an_index_file(@index_file, 'local')
          FetchRawSequences.extract_from_local_db(true, nil, @index_file)
        end
        index_raw_seq_file(opt[:raw_sequences])
      end

      ##
      #
      # Index the raw sequences file...
      def index_raw_seq_file(raw_seq_file = opt[:raw_sequences])
        # leave only the identifiers in the fasta description
        content = File.open(raw_seq_file, 'rb').read.gsub(/ .*/, '')
        File.open(raw_seq_file, 'w+') { |f| f.write(content) }

        # index the fasta file
        keys   = content.scan(/>(.*)\n/).flatten
        values = content.enum_for(:scan, /(>[^>]+)/).map { Regexp.last_match.begin(0) }

        # remove trailing version numberr from the accession field
        keys.map! { |key| key.sub(/\.\d+$/, '') }

        # make an index hash
        index_hash = {}
        keys.each_with_index do |k, i|
          start = values[i]
          endf  = i == values.length - 1 ? content.length - 1 : values[i + 1]
          index_hash[k] = [start, endf]
        end

        # create FASTA index
        fname = File.basename(raw_seq_file)
        config[:raw_seq_file_index] = File.join(dirs[:tmp_dir], "#{fname}.idx")
        config[:raw_seq_file_load]  = index_hash

        File.open(config[:raw_seq_file_index], 'w') do |f|
          YAML.dump(index_hash, f)
        end
        content = nil
      end

      private

      def write_an_index_file(output_file, db_type)
        file = File.open(output_file, 'w+')
        iterate_xml(file, db_type) if opt[:blast_xml_file]
        iterate_tabular(file, db_type) if opt[:blast_tabular_file]
      rescue BLASTDBError
        warn '*** BLAST Database Error: Genevalidator requires BLAST' \
        " databases to be created with the '-parse_seqids argument."
        warn '    See https://github.com/wurmlab/genevalidator' \
        '#setting-up-a-blast-database for more information'
        exit 1
      rescue StandardError
        warn '*** Error: There was an error in analysing the BLAST'
        warn '    output file. Please ensure that BLAST output file'
        warn '    is in the correct format and then try again. If you'
        warn '    are using a remote database, please ensure that you'
        warn '    have internet access.'
        exit 1
      ensure
        file.close unless file.nil?
      end

      alias write_a_raw_seq_file write_an_index_file

      def iterate_xml(file, db_type)
        n = Bio::BlastXMLParser::XmlIterator.new(opt[:blast_xml_file]).to_enum
        n.each do |iter|
          iter.each do |hit|
            raise BLASTDBError if hit.hit_id =~ /\|BL_ORD_ID\|/
            if db_type == 'remote' || hit.hit_id.nil?
              file.puts FetchRawSequences.extract_from_remote_db(hit.accession)
            else
              file.puts hit.accession
            end
          end
        end
      end

      def iterate_tabular(file, db_type)
        table_headers = opt[:blast_tabular_options].split(/[ ,]/)
        tab_file      = File.read(opt[:blast_tabular_file])
        rows = CSV.parse(tab_file, col_sep: "\t",
                                   skip_lines: /^#/,
                                   headers: table_headers)

        rows.each do |row|
          raise BLASTDBError if row['sseqid'] =~ /\|BL_ORD_ID\|/i
          if db_type == 'remote' || row['sseqid'].nil?
            file.puts FetchRawSequences.extract_from_remote_db(row['sacc'])
          else
            file.puts row['sseqid']
          end
        end
      end
    end
  end

  class FetchRawSequences
    class << self
      extend Forwardable
      def_delegators GeneValidator, :opt, :config

      def run(identifier, accession)
        # first try to extract from previously created raw_sequences HASH
        raw_seq = extract_from_index(accession) if opt[:raw_sequences]
        # then try to just extract that sequence based on accession.
        if opt[:db] !~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/i)
          raw_seq = extract_from_local_db(false, accession)
        end
        # then try to extract from remote database
        if opt[:db] =~ /remote/ && (raw_seq.nil? || raw_seq =~ /Error/i)
          raw_seq = extract_from_remote_db(accession)
        end
        # return nil if the raw_sequence still produces an error.
        raw_seq.nil? || raw_seq.empty? || raw_seq =~ /Error/i ? nil : raw_seq
      end

      ##
      # Gets raw sequence by fasta identifier from a fasta index file
      # Params:
      # +identifier+: String
      # Output:
      # String with the nucleotide sequence corresponding to the identifier
      def extract_from_index(accession)
        # remove trailing version numberr
        accession.sub!(/\.\d+$/, '')
        idx         = config[:raw_seq_file_load][accession]
        query       = IO.binread(opt[:raw_sequences], idx[1] - idx[0], idx[0])
        parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]
        parse_query[1].delete("\n")
      rescue StandardError
        'Error' # return error so it can then try alternative fetching method.
      end

      ##
      # Gets raw sequence by accession number from a givem database
      # Params:
      # +accno+: accession number as String
      # +db+: database as String
      # Output:
      # String with the nucleotide sequence corresponding to the accession
      def extract_from_local_db(batch, accno = nil, idx_file = nil)
        efile = Tempfile.new('blast_out')
        cmd = batch ? batch_raw_seq_cmd(idx_file) : single_raw_seq_cmd(accno, efile.path)
        `#{cmd}`
        raw_seqs = batch ? File.read(opt[:raw_sequences]) : efile.read
        failed_raw_sequences(raw_seqs) if batch && raw_seqs =~ /Error/
        raw_seqs # when obtaining a single raw_seq, this contains the sequence
      ensure
        efile.close
        efile.unlink
      end

      def batch_raw_seq_cmd(index_file)
        "blastdbcmd -entry_batch '#{index_file}' -db '#{opt[:db]}'" \
        " -outfmt '%f' -out '#{opt[:raw_sequences]}'"
      end

      def single_raw_seq_cmd(accession, file)
        "blastdbcmd -entry '#{accession}' -db '#{opt[:db]}' -outfmt '%s' -out '#{file}'"
      end

      def failed_raw_sequences(blast_output)
        blast_output.each_line do |line|
          acc = line.match(/Error: (\w+): OID not found/)[1]
          warn "\nCould not find sequence '#{acc.chomp}' within the" \
                       ' BLAST database.'
          warn "Attempting to obtain sequence '#{acc.chomp}' from" \
                       ' remote BLAST databases.'
          File.open(opt[:raw_sequences], 'a+') do |f|
            f.puts extract_from_remote_db(acc)
          end
        end
      end

      def extract_from_remote_db(accession, db_seq_type = 'protein')
        uri     = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' \
                  "db=#{db_seq_type}&retmax=1&usehistory=y&term=#{accession}/"
        result  = Net::HTTP.get(URI.parse(uri))
        query   = result.match(%r{<\bQueryKey\b>([\w\W\d]+)</\bQueryKey\b>})[1]
        web_env = result.match(%r{<\bWebEnv\b>([\w\W\d]+)</\bWebEnv\b>})[1]
        uri     = 'https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' \
                  'rettype=fasta&retmode=text&retstart=0&retmax=1&' \
                  "db=#{db_seq_type}&query_key=#{query}&WebEnv=#{web_env}"
        result  = Net::HTTP.get(URI.parse(uri))
        result[0..result.length - 2]
      end
    end
  end
end