wurmlab/sequenceserver

View on GitHub
lib/sequenceserver/doctor.rb

Summary

Maintainability
A
0 mins
Test Coverage
require 'set'
require 'sequenceserver/config'
require 'sequenceserver/database'
require 'sequenceserver/sequence'

module SequenceServer
  # Doctor detects inconsistencies likely to cause problems with Sequenceserver
  # operation.
  class Doctor
    ERROR_PARSE_SEQIDS        = 1
    ERROR_NUMERIC_IDS         = 2
    ERROR_PROBLEMATIC_IDS     = 3

    AVOID_ID_REGEX            = /^(?!gi|bbs)\w+\|\w*\|?/

    class << self
      extend Forwardable

      def_delegators SequenceServer, :config

      # Returns an array of database objects in which each of the object has an
      # array of sequence_ids satisfying the block passed to the method.
      def inspect_seqids(seqids, &block)
        seqids.map do |sq|
          sq[:db] unless sq[:seqids].select(&block).empty?
        end.compact
      end

      # Retrieve sequence ids (specified by %i) from all databases. Using
      # accession number is problematic because of several reasons.[2]
      def all_sequence_ids(ignore)
        Database.map do |db|
          next if ignore.include? db

          out = `blastdbcmd -entry all -db #{db.name} -outfmt "%i" 2> /dev/null`
          {
            db:     db,
            seqids: out.to_s.split
          }
        end.compact
      end

      # FASTA files formatted without -parse_seqids option won't support the
      # blastdbcmd command of fetching sequence ids using '%i' identifier.
      # In such cases, an array of 'N/A' values are returned which is
      # checked in this case.
      def inspect_parse_seqids(seqids)
        seqids.map do |sq|
          sq[:db] if sq[:seqids].include? 'N/A'
        end.compact
      end

      # Pretty print database list.
      def bullet_list(values)
        list = ''
        values.each do |value|
          list << "      - #{value}\n"
        end
        list
      end

      # Print diagnostic error messages according to the type of error.
      # rubocop:disable Metrics/MethodLength
      def show_message(error, values)
        return if values.empty?

        case error
        when ERROR_PARSE_SEQIDS
          puts <<~MSG
            *** Doctor has found improperly formatted database:
            #{bullet_list(values)}
            Please reformat your databases with -parse_seqids switch (or use
            sequenceserver -m) for using SequenceServer as the current format
            may cause problems.

            These databases are ignored in further checks.
          MSG

        when ERROR_NUMERIC_IDS
          puts <<~MSG
            *** Doctor has found databases with numeric sequence ids:
            #{bullet_list(values)}
            Note that this may cause problems with sequence retrieval.
          MSG

        when ERROR_PROBLEMATIC_IDS
          puts <<~MSG
            *** Doctor has found databases with problematic sequence ids:
            #{bullet_list(values)}
            This causes some sequence to contain extraneous words like `gnl|`
            appended to their id string.
          MSG
        end
      end
      # rubocop:enable Metrics/MethodLength
    end

    def initialize
      @ignore     = []
      @all_seqids = Doctor.all_sequence_ids(@ignore)
    end

    attr_reader :invalids, :all_seqids

    def diagnose
      puts "\n1/3 Inspecting databases for proper -parse_seqids formatting.."
      check_parse_seqids
      remove_invalid_databases

      puts "\n2/3 Inspecting databases for numeric sequence ids.."
      check_numeric_ids

      puts "\n3/3 Inspecting databases for problematic sequence ids.."
      check_id_format
    end

    # Remove entried which are in ignore list or not formatted with
    # -parse_seqids option.
    def remove_invalid_databases
      @all_seqids.delete_if { |sq| @ignore.include? sq[:db] }
    end

    # Obtain files which aren't formatted with -parse_seqids and add them to
    # ignore list.
    def check_parse_seqids
      without_parse_seqids = Doctor.inspect_parse_seqids(@all_seqids)
      Doctor.show_message(ERROR_PARSE_SEQIDS, without_parse_seqids)

      @ignore.concat(without_parse_seqids)
    end

    # Check for the presence of numeric sequence ids within a database.
    def check_numeric_ids
      selector = proc { |id| !id.to_i.zero? }

      Doctor.show_message(ERROR_NUMERIC_IDS,
                          Doctor.inspect_seqids(@all_seqids, &selector))
    end

    # Warn users about sequence identifiers of format abc|def because then
    # BLAST+ appends a gnl (for general) infront of the database
    # identifiers. There are only two identifiers that we need to avoid
    # when searching for this format.[1]
    # bbs|number, gi|number
    # Note that while sequence ids could have been arbitrary, using
    # parse_seqids reduces our search space substantially.
    def check_id_format
      selector = proc { |id| id.match(AVOID_ID_REGEX) }

      Doctor.show_message(ERROR_PROBLEMATIC_IDS,
                          Doctor.inspect_seqids(@all_seqids, &selector))
    end
  end
end

# [1]: http://etutorials.org/Misc/blast/Part+IV+Industrial-Strength+BLAST/Chapter+11.+BLAST+Databases/11.1+FASTA+Files/
# [2]: For sequence deflines of the kind abc|def, accession number is returned
# as abc:def. Even though you take hacky measure and ensure that latter is
# queried, correct entries are not returned. Using the value returned by %i
# works in all cases. This may change in later versions of BLAST+.