yannickwurm/sequenceserver

View on GitHub
lib/sequenceserver/blast/report.rb

Summary

Maintainability
A
45 mins
Test Coverage
require 'ox'
Ox.default_options = { skip: :skip_none }

require 'sequenceserver/report'
require 'sequenceserver/links'

require_relative 'formatter'
require_relative 'query'
require_relative 'hit'
require_relative 'hsp'

module SequenceServer
  module BLAST
    # Captures results of a BLAST search.
    #
    # A report is constructed from a search id. Search id is simply the
    # basename of the temporary file that holds BLAST results in binary
    # BLAST archive format.
    #
    # For a given search id, result is obtained in XML format using the
    # Formatter class, parsed into a simple intermediate representation
    # (Array of values and Arrays) and information extracted from the
    # intermediate representation (ir).
    class Report < Report
      def initialize(job)
        super do
          @querydb = job.databases
        end
      end

      def to_json(*_args)
        %i[querydb program program_version params stats
           queries].inject({}) do |h, k|
          h[k] = send(k)
          h
        end.update(search_id: job.id,
                   submitted_at: job.submitted_at.utc,
                   imported_xml: !job.imported_xml_file.nil?,
                   seqserv_version: SequenceServer::VERSION,
                   cloud_sharing_enabled: SequenceServer.config[:cloud_share_url].start_with?('http'),
                   non_parse_seqids: !!job.databases&.any?(&:non_parse_seqids?)).to_json
      end

      def xml_file_size
        return File.size(job.imported_xml_file) if job.imported_xml_file

        xml_formatter.size
      end

      def done?
        return true if job.imported_xml_file

        File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath)
      end

      def program
        @program ||= xml_ir[0]
      end

      def program_version
        @program_version ||= xml_ir[1]
      end

      def querydb
        @querydb ||= xml_ir[3].split.map do |path|
          { title: File.basename(path) }
        end
      end

      def dbtype
        @dbtype ||= querydb&.first&.type || dbtype_from_program
      end

      def params
        @params ||= extract_params
      end

      def stats
        @stats ||= extract_stats
      end

      def queries
        @queries ||= xml_ir[8].map do |n|
          query = Query.new(self, n[0], n[2], n[3], [])
          query.hits = query_hits(n[4], tsv_ir[query.id], query)

          query
        end
      end

      private

      def xml_ir
        @xml_ir ||=
          if job.imported_xml_file
            parse_xml File.read(job.imported_xml_file)
          else
            job.raise!
            parse_xml(xml_formatter.read_file)
          end
      end

      def tsv_ir
        @tsv_ir ||=
          if job.imported_xml_file
            Hash.new do |h1, k1|
              h1[k1] = Hash.new do |h2, k2|
                h2[k2] = ['', '', []]
              end
            end
          else
            job.raise!
            parse_tsv(tsv_formatter.read_file)
          end
      end

      def xml_formatter
        @xml_formatter ||= Formatter.run(job, 'xml')
      end

      def tsv_formatter
        @tsv_formatter ||= Formatter.run(job, 'custom_tsv')
      end

      # Search params tweak the results. Like evalue cutoff or penalty to open
      # a gap. BLAST+ doesn't list all input params in the XML output. Only
      # matrix, evalue, gapopen, gapextend, and filters are available from XML
      # output.
      def extract_params
        # Parse/get params from the job first.
        job_params = parse_advanced(job.advanced)
        # Old jobs from beta releases may not have the advanced key but they
        # will have the deprecated advanced_params key.
        job_params.update(job.advanced_params) if job.advanced_params

        # Parse params from BLAST XML.
        @params = Hash[
          *xml_ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten
        ]
        @params['evalue'] = @params.delete('expect')

        # Merge into job_params.
        @params = job_params.merge(@params)
      end

      # Search stats are computed metrics. Like total number of sequences or
      # effective search space.
      def extract_stats
        stats = xml_ir[8].first[5][0]
        {
          nsequences: stats[0],
          ncharacters: stats[1],
          hsp_length: stats[2],
          search_space: stats[3],
          kappa: stats[4],
          labmda: stats[5],
          entropy: stats[6]
        }
      end

      # Create Hit objects for the given query from the given ir.
      def query_hits(xml_ir, tsv_ir, query)
        return [] if xml_ir == ["\n"] # => No hits.

        xml_ir.map do |n|
          # If hit comes from a non -parse_seqids database, then id (n[1]) is a
          # BLAST assigned internal id of the format 'gnl|BL_ORD_ID|serial'. We
          # assign the id to accession (because we use accession for sequence
          # retrieval and this id is what blastdbcmd expects for non
          # -parse_seqids databases) and parse the hit defline to
          # obtain id and title ourselves (we use id and title
          # for display purposes).
          if n[1] =~ /^gnl\|BL_ORD_ID\|\d+/
            n[3] = n[1]
            defline = n[2].split
            n[1] = defline.shift
            n[2] = defline.join(' ')
          end

          hit = Hit.new(query, n[0], n[1], n[3], n[2], n[4],
                        tsv_ir[n[1]][0], tsv_ir[n[1]][1], [])

          hit.hsps = hsps(n[5], tsv_ir[n[1]][2], hit)

          hit
        end
      end

      def hsps(xml_ir, tsv_ir, hit)
        xml_ir.map.with_index do |n, i|
          n.insert(14, tsv_ir[i])

          HSP.new(hit, *n)
        end
      end

      def parse_xml(xml)
        node_to_array Ox.parse(xml).root
      rescue Ox::ParseError
        raise 'Error parsing XML file' if job.imported_xml_file

        raise InputError, <<~MSG
          BLAST generated incorrect XML output. This can happen if sequence ids in your
          databases are not unique across all files. As a temporary workaround, you can
          repeat the search with one database at a time. Proper fix is to recreate the
          following databases with unique sequence ids:

              #{querydb.map(&:title).join(', ')}

          If you are not the one managing this server, try to let the manager know
          about this.
        MSG
      end

      PARSEABLE_AS_HASH  = %w[Parameters].freeze
      PARSEABLE_AS_ARRAY = %w[BlastOutput_param Iteration_stat Statistics
                              Iteration_hits BlastOutput_iterations
                              Iteration Hit Hit_hsps Hsp].freeze

      def node_to_hash(element)
        Hash[*element.nodes.map { |n| [n.name, node_to_value(n)] }.flatten]
      end

      def node_to_array(element)
        element.nodes.map { |n| node_to_value n }
      end

      def node_to_value(node)
        # Ensure that the recursion doesn't fails when String value is received.
        return node if node.is_a?(String)

        if PARSEABLE_AS_HASH.include? node.name
          node_to_hash(node)
        elsif PARSEABLE_AS_ARRAY.include? node.name
          node_to_array(node)
        else
          first_text(node)
        end
      end

      def first_text(node)
        node.nodes.find { |n| n.is_a? String }
      end

      # Parses the given TSV string as:
      #
      # {
      #    qseqid: {
      #      sseqid: [sciname, qcovs, [qcovhsp]],
      #      ...
      #    },
      #    ...
      # }
      def parse_tsv(tsv)
        ir = Hash.new { |h, k| h[k] = {} }
        tsv.each_line do |line|
          next if line.start_with? '#'

          row = line.chomp.split("\t")

          (ir[row[0]][row[1]] ||= [row[2], row[3], []])[2] << row[4]
        end
        ir
      end

      # Parse BLAST CLI string from job.advanced.
      def parse_advanced(param_line)
        param_list = (param_line || '').split(' ')
        res = {}

        param_list.each_with_index do |word, i|
          nxt = param_list[i + 1]
          next unless word.start_with? '-'

          word.sub!('-', '')
          res[word] = unless nxt.nil? || nxt.start_with?('-')
                        nxt
                      else
                        'True'
                      end
        end
        res
      end

      # Returns database type (nucleotide or protein) inferred from
      # Report#program (i.e., the BLAST algorithm)
      def dbtype_from_program
        case program
        when /blastn|tblastn|tblastx/
          'nucleotide'
        when /blastp|blastx/
          'protein'
        end
      end
    end
  end
end