princelab/mspire

View on GitHub
lib/mspire/peaklist.rb

Summary

Maintainability
C
1 day
Test Coverage
require 'mspire/bin'
require 'mspire/peak'

module Mspire
  # a collection of Peak objects.  At a minimum, each peak should respond to
  # :x and :y
  class Peaklist < Array

    def lo_x
      self.first[0] if size > 0
    end

    def hi_x
      self.last[0] if size > 0
    end

    DEFAULT_MERGE = {
      :bin_width => 5,
      :bin_unit => :ppm,
      :normalize => true,
      :return_data => false,
      :split => :share,
      :centroided => true,
    }

    # for spectral peaks, this is the weighted m/z
    def weighted_x
      tot_intensity = self.inject(0.0) {|sum,peak| sum + peak.y }
      _weighted_x = 0.0
      self.each do |peak|
        int = peak.y
        signal_by_sample_index[peak.sample_id] += int
        _weighted_x += (peak.first * (int/tot_intensity))
      end
      _weighted_x
    end

    # class methods
    class << self

      # creates a new Mspire::Peaklist and coerces each peak into an
      # Mspire::Peak.  If your peaks already behave like peaks you should use
      # .new
      def [](*peaks)
        self.new( peaks.map {|peak| Mspire::Peak.new(peak) } )
      end

      def create_bins(peaklists, opts)
        min, max = min_max_x(peaklists)

        divisions = []
        bin_width = opts[:bin_width]
        use_ppm = (opts[:bin_unit] == :ppm)

        puts "using bin width: #{bin_width}" if $VERBOSE
        puts "using ppm for bins: #{use_ppm}" if $VERBOSE

        current_x = min
        loop do
          if current_x >= max
            divisions << max
            break
          else
            divisions << current_x
            current_x += ( use_ppm ? current_x./(1e6).*(bin_width) : bin_width )
          end
        end
        # make each bin exclusive so there is no overlap
        bins = divisions.each_cons(2).map {|pair| Mspire::Bin.new(*pair, true) }
        # make the last bin *inclusive* of the terminating value
        bins[-1] = Mspire::Bin.new(bins.last.begin, bins.last.end)
        bins
      end

      def min_max_x(peaklists)
        # find the min and max across all spectra
        first_peaklist = peaklists.first
        min = first_peaklist.first.x; max = first_peaklist.last.x
        peaklists.each do |peaklist|
          if peaklist.size > 0
            min = peaklist.lo_x if peaklist.lo_x < min
            max = peaklist.hi_x if peaklist.hi_x > max
          end
        end
        [min, max]
      end

      def merge_centroids(peaklists, opts={})
        opts[:return_data] = true if opts[:only_data]

        # Create Mspire::Bin objects
        bins = opts[:bins] ? opts[:bins] : create_bins(peaklists, opts)
        puts "created #{bins.size} bins" if $VERBOSE

        peaklists.each do |peaklist|
          Mspire::Bin.bin(bins, peaklist, &:x)
        end

        pseudo_peaks = bins.map do |bin|
          Mspire::Peak.new( [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak.y }] )
        end

        pseudo_peaklist = Mspire::Peaklist.new(pseudo_peaks)

        separate_peaklists = pseudo_peaklist.split(opts[:split])

        normalize_factor = opts[:normalize] ? peaklists.size : 1

        return_data = []
        final_peaklist = Mspire::Peaklist.new unless opts[:only_data]

        separate_peaklists.each do |pseudo_peaklist|
          data_peaklist = Mspire::Peaklist.new
          weight_x = 0.0
          tot_intensity = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.y }
          #puts "TOT INTENSITY:"
          #p tot_intensity
          calc_from_lil_bins = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.x.data.map(&:y).reduce(:+) }
          #puts "LILBINS: "
          #p calc_from_lil_bins
          pseudo_peaklist.each do |bin_peak|

            # For the :share method, the psuedo_peak intensity may have been
            # adjusted, but the individual peaks were not.  Correct this.             
            if opts[:split] == :share
              post_scaled_y = bin_peak.y
              pre_scaled_y = bin_peak.x.data.reduce(0.0) {|sum,peak| sum + peak.last }
              #puts "PRESCALED Y:"
              #p pre_scaled_y
              if (post_scaled_y - pre_scaled_y).abs.round(10) != 0.0
                correction = post_scaled_y / pre_scaled_y
                bin_peak.x.data.each {|peak| peak.y = (peak.y * correction) }
              end
            end

            unless opts[:only_data]
              bin_peak.x.data.each do |peak|
                weight_x += peak.x * ( peak.y.to_f / tot_intensity)
              end
            end
            (data_peaklist.push( *bin_peak.x.data )) if opts[:return_data]
          end
          final_peaklist << Mspire::Peak.new([weight_x, tot_intensity / normalize_factor]) unless opts[:only_data]
          return_data << data_peaklist if opts[:return_data]
        end
        [final_peaklist, return_data]
      end

      # returns a new peaklist which has been merged with the others. 
      # opts[:resolution]) and then segment according to monotonicity (sharing
      # intensity between abutting points).  The  final m/z is the weighted
      # averaged of all the m/z's in each peak.  Valid opts (with default listed
      # first):
      #
      #     :bin_width => 5 
      #     :bin_unit => :ppm|:amu        interpret bin_width as ppm or amu
      #     :bins => array of Mspire::Bin objects  for custom bins (overides other bin options)
      #     :normalize => true              if true, divides total intensity by 
      #                                     number of spectra
      #     :return_data => false           returns a parallel array containing
      #                                     the peaks associated with each returned peak
      #     :only_data => false           only returns the binned peaks
      #     :split => :zero|:greedy_y|:share  see Mspire::Peak#split
      #     :centroided => true             treat the data as centroided
      #
      # The binning algorithm is roughly the fastest possible algorithm that
      # would allow for arbitrary, non-constant bin widths (a ratcheting
      # algorithm O(n + m))
      #
      # Assumes the peaklists are already sorted by m/z.
      #
      # Note that the peaks themselves will be altered if using the :share
      # split method.
      def merge(peaklists, opts={})
        opts = DEFAULT_MERGE.merge(opts)

        (peaklist, returned_data) =  
          if opts[:centroided]
            merge_centroids(peaklists, opts.dup)
          else
            raise NotImplementedError, "need to implement profile merging"
          end

        if opts[:only_data]
          returned_data
        elsif opts[:return_data]
          [peaklist, returned_data]
        else
          peaklist
        end
      end

      # takes an array of peaklists, merges them together (like merge), then
      # returns an array of doublets.  Each doublet is an m/z and an array of
      # values (parallel to input) of intensities]
      #
      # If you are already using tagged peak objects, then you should set
      # have_tagged_peaks to false (default is true).  The peaks will be
      # deconvolved based on the sample_id in this case.
      #
      #     :have_tagged_peaks => false|true
      #
      def merge_and_deconvolve(peaklists, opts={})
        unless htp=opts.delete(:have_tagged_peaks)
          peaklists.each_with_index do |peaklist,i|
            peaklist.map! do |peak|
              TaggedPeak.new(peak, i)
            end
          end
        end

        # make sure we get the data back out
        opts[:return_data] = true
        opts[:only_data] = false

        sample_ids = peaklists.map {|pl| pl.first.sample_id }
        peaks, data = merge(peaklists, opts)
        data.zip(peaks).map do |bucket_of_peaks, meta_peak|
          signal_by_sample_id = Hash.new {|h,k| h[k] = 0.0 }
          bucket_of_peaks.each do |peak|
            signal_by_sample_id[peak.sample_id] += peak.y
          end
          [meta_peak.x, sample_ids.map {|sample_id| signal_by_sample_id[sample_id] } ]
        end
      end

    end # end class << self

    # returns an array with the indices outlining each peak.  The first index
    # is the start of the peak, the last index is the last of the peak.
    # Interior indices represent local minima.  So, peaks that have only two
    # indices have no local minima.
    def peak_boundaries(gt=0.0)
      in_peak = false
      prev_y = gt
      prev_prev_y = gt
      peak_inds = []
      self.each_with_index do |peak, index|
        curr_y = peak.y
        if curr_y > gt
          if !in_peak
            in_peak = true
            peak_inds << [index]
          else
            # if on_upslope
            if prev_y < curr_y
              # If we were previously on a downslope and we are now on an upslope
              # then the previous index is a local min
              # on_downslope(prev_previous_y, prev_y)
              if prev_prev_y > prev_y
                # We have found a local min
                peak_inds.last << (index - 1)
              end
            end # end if (upslope)
          end # end if !in_peak
        elsif in_peak
          peak_inds.last << (index - 1)
          in_peak = false
        end 
        prev_prev_y = prev_y
        prev_y = curr_y
      end
      # add the last one to the last peak if it is a boundary
      if self[-1].y > gt
        peak_inds.last << (self.size-1)
      end
      peak_inds
    end

    # returns an array of Peaklist objects
    def split_on_zeros(given_peak_boundaries=nil)
      pk_bounds = given_peak_boundaries || peak_boundaries(0.0)
      pk_bounds.map do |indices|
        self.class.new self[indices.first..indices.last]
      end
    end

    # returns an array of Peaklist objects
    # assumes that this is one connected list of peaks (i.e., no
    # zeros/whitespace on the edges or internally)
    #
    #       /\       
    #      /  \/\
    #     /      \
    #
    # if there are no local minima, just returns self inside the array
    def split_contiguous(methd=:greedy_y, local_min_indices=nil)
      local_min_indices ||= ((pb=peak_boundaries.first) && pb.shift && pb.pop && pb)

      if local_min_indices.size == 0
        self
      else
        peak_class = first.class
        prev_lm_i = 0  # <- don't worry, will be set to bumped to zero
        peaklists = [ self.class.new([self[0]]) ]
        local_min_indices.each do |lm_i|
          peaklists.last.push( *self[(prev_lm_i+1)..(lm_i-1)] )
          case methd
          when :greedy_y
            if self[lm_i-1].y >= self[lm_i+1].y
              peaklists.last << self[lm_i]
              peaklists << self.class.new
            else
              peaklists << self.class.new( [self[lm_i]] )
            end
          when :share
            # for each local min, two new peaks will be created, with
            # intensity shared between adjacent peaklists
            lm = self[lm_i]
            sum = self[lm_i-1].y + self[lm_i+1].y
            # push onto the last peaklist its portion of the local min
            peaklists.last << peak_class.new( [lm.x, lm.y * (self[lm_i-1].y.to_f/sum)] )
            # create a new peaklist that contains its portion of the local min
            peaklists << self.class.new( [peak_class.new([lm.x, lm.y * (self[lm_i+1].y.to_f/sum)])] )
          end
          prev_lm_i = lm_i
        end
        peaklists.last.push(*self[(prev_lm_i+1)...(self.size)] )
        peaklists
      end
    end

    # returns an Array of peaklist objects.  Splits run of 1 or more local
    # minima into multiple peaklists.  When a point is 'shared' between two
    # adjacent hill-ish areas, the choice of how to resolve multi-hills (runs
    # of data above zero) is one of:
    #
    #     :zero = only split on zeros
    #     :share = give each peak its rightful portion of shared peaks, dividing the
    #               intensity based on the intensity of adjacent peaks
    #     :greedy_y = give the point to the peak with highest point next to
    #                  the point in question. tie goes lower.
    #
    # Note that the peak surrounding a local_minima may be altered if using
    # :share
    #
    # assumes that a new peak can be made with an array containing the x
    # value and the y value.
    def split(split_multipeaks_mthd=:zero)
      if split_multipeaks_mthd == :zero
        split_on_zeros
      else
        boundaries = peak_boundaries(0.0)
        no_lm_pklsts = []
        boundaries.each do |indices|
          peak = self[indices.first..indices.last]
          if indices.size == 2
            no_lm_pklsts << peak
          else # have local minima
            multipeak = Peaklist.new(peak)
            local_min_inds = indices[1..-2].map {|i| i-indices.first}
            peaklists = multipeak.split_contiguous(split_multipeaks_mthd, local_min_inds)
            no_lm_pklsts.push *peaklists
          end
        end
        #$stderr.puts "now #{no_lm_pklsts.size} peaks." if $VERBOSE
        no_lm_pklsts 
      end 
    end # def split

    # returns an Mspire::Spectrum object
    def to_spectrum
      Mspire::Spectrum.new(self.transpose)
    end
  end
end



=begin
if !opts[:only_data]
=end