princelab/mspire

View on GitHub
lib/mspire/quant/qspec.rb

Summary

Maintainability
A
3 hrs
Test Coverage
module Mspire ; end
module Mspire::Quant ; end

class Mspire::Quant::Qspec
  # This is my current best guess based on the behavior of the original QSpec
  # and going into the source code and looking at the paired and param
  # versions.
  
  # qspec: discrete spectral count data
  # qprot: continuous protein abundance data (could be non-discrete spectral
  # counts or quantitation data)
  # paired: one sample against another sample
  # param: one sample against another sample but with one or more replicates
  EXE = {
    qspec: {
      paired: 'qspec-paired',  # <- the old qspec (use qspec here if you have old software)
      param: 'qspec-param',    # <  the old qspecgp (use qspecgp if you have old software)
    },
    qprot: {
      paired: 'qprot-paired',
      param: 'qprot-param',
    },
    getfdr: 'getfdr',
  }

  # personal communication with Hyungwon Choi: "We typically use nburn=2000,
  # niter=10000, which is quite sufficient to guarantee the reproducibility of
  # results using the same data."
  NBURNIN = 2000
  NITER = 10000
  INIT_HEADER = %w(protid protLen)
  DELIMITER = "\t"

  # takes an ordered list of conditions ['cond1', 'cond1', 'cond2', 'cond2'] and
  # returns an array of ints [0,0,0,1,1,1...]
  def self.conditions_to_ints(conditions)
    i = 0
    current_condition = conditions.first
    conditions.map do |cond| 
      if current_condition == cond ; i
      else
        i += 1
        current_condition = cond
        i
      end
    end
  end

  # returns an array of Results structs which is each row of the returned file
  # works with version 1.2.2 of Qprot
  def self.results_array(resultsfile)
    rows = IO.readlines(resultsfile).map {|line| line.chomp.split("\t") }
    headers = rows.shift
    start_log_fold = headers.index {|v| v =~ /LogFoldChange/i }
    rows.map do |row| 
      data = [row[0]]
      data.push( row[1...start_log_fold].map(&:to_f) )
      data.push( *row[start_log_fold,5].map(&:to_f) )
      Results.new(*data)
    end
  end

  # returns the right executable based on the array of conditions
  def executable
    biggest_size = conditions.group_by {|v| v }.values.map(&:size).max
    EXE[@protnames ? :qprot : :qspec][(biggest_size >= 3) ? :param : :paired]
  end

  # protname is a list of protein names.
  # by default, qprot will be run.  If you really want qspec to be run, then
  # supply a [protname, length] doublet in place of each protname.
  # condition_to_count_array is an array doublets: [condition, array_of_counts]
  def initialize(protnames, condition_to_count_array)
    @protnames = protnames
    if @protnames.first.is_a?(Array)
      @protname_length_pairs = @protnames
      @protnames = nil
    end
    @condition_to_count_array = condition_to_count_array
  end

  def conditions
    @condition_to_count_array.map(&:first)
  end

  # writes a qspec formatted file to filename
  def write(filename)
    header_cats = %w(protid)
    header_cats << 'protLen' if @protname_length_pairs
    header_cats.push(*Mspire::Quant::Qspec.conditions_to_ints(conditions))
    ar = @protnames || @protname_length_pairs
    rows = ar.map {|obj| Array(obj) }
    @condition_to_count_array.each do |cond,counts|
      rows.zip(counts) {|row,cnt| row << cnt }
    end
    File.open(filename,'w') do |out|
      out.puts header_cats.join(DELIMITER)
      rows.each {|row| out.puts row.join(DELIMITER) }
    end
  end

  # returns an array of Qspec::Results objects (each object can be considered
  # a row of data)
  def run(normalize=true, opts={})
    exe = executable
    puts "using #{exe}" if $VERBOSE
    executable_base = exe.split('-')[0]

    puts "normalize: #{normalize}" if $VERBOSE
    tfile = Tempfile.new(executable_base)
    write(tfile.path)
    if opts[:keep]
      local_file = File.join(Dir.pwd,File.basename(tfile.path))
      FileUtils.cp(tfile.path, local_file, :verbose => $VERBOSE)
      puts "(copy of) file submitted to #{exe}: #{local_file}" if $VERBOSE
    end
    cmd = [exe, tfile.path, NBURNIN, NITER, (normalize ? 1 : 0)].join(' ')
    if $VERBOSE
      puts "running #{cmd}" if $VERBOSE
    else
      cmd << " 2>&1"
    end
    reply = `#{cmd}`
    puts reply if $VERBOSE
    outfile = tfile.path + '_' + executable_base
    system EXE[:getfdr], outfile
    fdr_file = outfile + "_fdr"
    puts "FDR_FILE: #{fdr_file} exists? #{fdr_file}" if $VERBOSE
    results = self.class.results_array(fdr_file)
    if opts[:keep]
      local_outfile = File.join(Dir.pwd, File.basename(outfile))
      local_fdrfile = File.join(Dir.pwd, File.basename(fdr_file))
      FileUtils.cp(outfile, local_outfile, :verbose => $VERBOSE)
      FileUtils.cp(fdr_file, local_fdrfile, :verbose => $VERBOSE)
      if $VERBOSE
        puts "(copy of) file returned from qspec: #{outfile}"
        puts "(copy of) file returned from qspec: #{fdr_file}"
      end
    end
    tfile.unlink
    results
  end

  # for version 2 of QSpec
  # counts array is parallel to the experiment names passed in originally
  #Results = Struct.new(:protid, :counts_array, :bayes_factor, :fold_change, :rb_stat, :fdr, :flag)
  
  # for version 1.2.2 of QProt
  # counts array is parallel to the experiment names passed in originally
  Results = Struct.new(:protid, :counts_array, :log_fold_change, :z_statistic, :fdr, :fdr_up, :fdr_down)
end