princelab/mspire

View on GitHub
lib/mspire/mzml.rb

Summary

Maintainability
A
3 hrs
Test Coverage

require 'stringio'

require 'mspire'
require 'builder'
require 'core_ext/enumerable'

require 'mspire/mzml/reader'

require 'mspire/mzml/scan_settings'

module Mspire
  # Reading an mzml file:
  #
  #     Mspire::Mzml.open("somefile.mzML") do |mzml|
  #       mzml.each do |spectrum|
  #         scan = spectrum.scan
  #         spectrum.mzs                  # array of m/zs
  #         spectrum.intensities          # array of intensities
  #         spectrum.peaks do |mz,intensity|
  #           puts "mz: #{mz} intensity: #{intensity}" 
  #         end
  #
  #         spectrum.params  # list all the params associated with an object
  #
  #         # true if key exists and no value, the value if present, or false
  #         if spectrum.fetch_by_acc('MS:1000128')
  #           puts "this is a profile spectrum!"
  #         end
  #
  #         if spectrum.ms_level == 2
  #           low_mz = spectrum.scan_list.first.scan_windows.first.to_i
  #           puts "begin scan at #{low_mz} m/z"
  #         end
  #       end
  #
  #       mzml.each_chromatogram do |chrm|
  #         chrm.times
  #         chrm.intensities
  #       end
  #     end
  #
  # Note that the mzml object supports random spectrum access (even if the
  # mzml was not indexed):
  #
  #     mzml[22]  # retrieve spectrum at index 22
  #
  # Writing an mzml file from scratch:
  #
  #    spec1 = Mspire::Mzml::Spectrum.new('scan=1') do |spec|
  #      # profile and ms_level 1
  #      spec.describe_many!(['MS:1000128', ['MS:1000511', 1]])
  #      spec.data_arrays = [
  #        Mspire::Mzml::DataArray[1,2,3].describe!('MS:1000514'),  
  #        Mspire::Mzml::DataArray[4,5,6].describe!('MS:1000515')   
  #      ]
  #      spec.scan_list = Mspire::Mzml::ScanList.new do |sl|
  #        scan = Mspire::Mzml::Scan.new do |scan|
  #          # retention time of 40 seconds
  #          scan.describe! 'MS:1000016', 40.0, 'UO:0000010'
  #        end
  #        sl << scan
  #      end
  #    end
  #  
  #    spec2 = Mspire::Mzml::Spectrum.new('scan=2') do |spec| 
  #      # centroid,  ms_level 2, MSn spectrum, 
  #      spec.describe_many!(['MS:1000127', ['MS:1000511', 2], "MS:1000580"])
  #      spec.data_arrays = [
  #        Mspire::Mzml::DataArray[1,2,3.5].describe!('MS:1000514'),  
  #        Mspire::Mzml::DataArray[5,6,5].describe!('MS:1000515')   
  #      ]
  #      spec.scan_list = Mspire::Mzml::ScanList.new do |sl|
  #        scan = Mspire::Mzml::Scan.new do |scan|
  #          # retention time of 45 seconds
  #          scan.describe! 'MS:1000016', 45.0, 'UO:0000010'
  #        end
  #        sl << scan
  #      end
  #      precursor = Mspire::Mzml::Precursor.new( spec1.id )
  #      si = Mspire::Mzml::SelectedIon.new
  #      # the selected ion m/z:
  #      si.describe! "MS:1000744", 2.0
  #      # the selected ion charge state
  #      si.describe! "MS:1000041", 2
  #      # the selected ion intensity
  #      si.describe! "MS:1000042", 5
  #      precursor.selected_ions = [si]
  #      spec.precursors = [precursor]
  #    end
  #  
  #    mzml = Mspire::Mzml.new do |mzml|
  #      mzml.id = 'ms1_and_ms2'
  #      mzml.cvs = Mspire::Mzml::CV::DEFAULT_CVS
  #      mzml.file_description = Mspire::Mzml::FileDescription.new  do |fd|
  #        fd.file_content = Mspire::Mzml::FileContent.new
  #        fd.source_files << Mspire::Mzml::SourceFile.new
  #      end
  #      default_instrument_config = Mspire::Mzml::InstrumentConfiguration.new("IC").describe!('MS:1000031')
  #      mzml.instrument_configurations << default_instrument_config
  #      software = Mspire::Mzml::Software.new
  #      mzml.software_list << software
  #      default_data_processing = Mspire::Mzml::DataProcessing.new("did_nothing")
  #      mzml.data_processing_list << default_data_processing
  #      mzml.run = Mspire::Mzml::Run.new("little_run", default_instrument_config) do |run|
  #        spectrum_list = Mspire::Mzml::SpectrumList.new(default_data_processing, [spec1, spec2])
  #        run.spectrum_list = spectrum_list
  #      end
  #    end
  #    
  #    mzml.write("writtenxml.mzML")
  class Mzml
    include Enumerable  # each_spectrum

    class << self
      # Opens the file at +filename+ for reading, wraps the io in a new
      # object of this class and hands that object to the block.  The file
      # is closed once the block returns (File.open block form).
      #
      # read-only right now
      #
      # @param [String] filename path of the mzML file to read
      # @yield [mzml] the object built from the open file
      # @return [Object] whatever the block returns
      # @raise [ArgumentError] if no block is given
      def open(filename, &block)
        # fail fast with a clear message, before the file is even opened,
        # instead of a NoMethodError on nil once the object has been built
        raise ArgumentError, "#{self}.open requires a block" unless block
        File.open(filename) do |io|
          block.call(new(io))
        end
      end

      # Opens +filename+ (see open) and passes the given block to the
      # object's #each.
      #
      # @param [String] filename path of the mzML file to read
      # @return [Enumerator] when called without a block
      # @return [Object] otherwise, the return value of #each
      def foreach(filename, &block)
        return to_enum(__method__, filename) unless block
        open(filename) { |mzml| mzml.each(&block) }
      end
    end

    # Default values used when writing a document.
    module Default
      # Attributes placed on the root mzML element.  Frozen so the shared
      # constant cannot be modified by accident; to_xml works on a dup.
      NAMESPACE = {
        :xmlns => "http://psi.hupo.org/ms/mzml",
        "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
        "xmlns:xsd" => "http://www.w3.org/2001/XMLSchema",
        "xsi:schemaLocation" => "http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd"
      }.freeze

      # version attribute written by to_xml when the object has no version set
      VERSION = '1.1.0'.freeze
    end

    ###############################################
    # ATTRIBUTES
    ###############################################

    # (optional) an id for accessing from external files; to_xml writes it
    # as the id attribute of the root mzML element only when it is set
    attr_accessor :id

    # (required) the Mzml document version; when unset, to_xml writes
    # Default::VERSION instead
    attr_accessor :version

    # (optional) e.g. a PRIDE accession number; to_xml writes it as the
    # accession attribute of the root mzML element only when it is set
    attr_accessor :accession

    ###############################################
    # SUBELEMENTS
    ###############################################

    # (required) an array of Mspire::Mzml::CV objects.  Starts out as an
    # empty array; to_xml raises unless it holds at least one object.
    attr_accessor :cvs

    # (required) an Mspire::Mzml::FileDescription
    attr_accessor :file_description

    # (optional) an array of CV::ReferenceableParamGroup objects; written by
    # to_xml only when set (it is not initialized to an array)
    attr_accessor :referenceable_param_groups

    # (optional) an array of Mspire::Mzml::Sample objects.  Starts out as an
    # empty array; written by to_xml only when non-empty.
    attr_accessor :samples

    # (required) an array of Mspire::Mzml::Software objects (starts out as
    # an empty array)
    attr_accessor :software_list

    # (optional) an array of Mspire::Mzml::ScanSettings objects; written by
    # to_xml only when set and non-empty
    attr_accessor :scan_settings_list

    # (required) an array of Mspire::Mzml::InstrumentConfiguration objects
    # (starts out as an empty array)
    attr_accessor :instrument_configurations

    # (required) an array of Mspire::Mzml::DataProcessing objects (starts
    # out as an empty array)
    attr_accessor :data_processing_list

    # (required) an Mspire::Mzml::Run object; the spectrum and chromatogram
    # accessors in Convenience read their lists from it
    attr_accessor :run

    # the io object of the mzml file
    attr_accessor :io

    # Mspire::Mzml::IndexList object associated with the file (only expected when reading
    # mzml files at the moment).  spectrum_from_scan_num uses its first entry.
    attr_accessor :index_list

    # xml file encoding
    attr_accessor :encoding


    # Builds a new object.  cvs, software_list, instrument_configurations,
    # samples and data_processing_list each start out as their own empty
    # array.  What happens next depends on arg:
    #
    # * an IO object: passed to set_from_xml_io! so that automatic index and
    #   header parsing occurs.  The io must respond_to?(:size), giving the
    #   size of the io object in bytes, which allows seeking;
    #   get_index_list is called to get or create the index list.
    # * a Hash: every key/value pair is assigned through the "key=" setter.
    # * anything else (including nil): ignored.
    #
    # NOTE(review): the check is for IO instances, and StringIO is not an IO
    # subclass, so a StringIO is ignored here -- confirm that is intended.
    #
    # In addition (or alternatively) a block may be given; it is called with
    # self, after arg has been handled, to set up the object.
    def initialize(arg=nil, &block)
      list_setters = [
        :cvs=,
        :software_list=,
        :instrument_configurations=,
        :samples=,
        :data_processing_list=
      ]
      # a fresh array per attribute, so the lists are never shared
      list_setters.each { |setter| send(setter, []) }

      if arg.is_a?(IO)
        set_from_xml_io!(arg)
      elsif arg.is_a?(Hash)
        arg.each_pair { |key, value| send("#{key}=", value) }
      end

      yield self if block_given?
    end

    # Shortcuts to the spectra and chromatograms held by the run, plus
    # lookup of a spectrum by scan number through the index list.
    module Convenience
      # Passes the block to #each of the run's chromatogram list.
      def each_chromatogram(&block)
        @run.chromatogram_list.each(&block)
      end

      # Passes the block to #each of the run's spectrum list.
      def each_spectrum(&block)
        @run.spectrum_list.each(&block)
      end

      alias_method :each, :each_spectrum

      # @param [Object] arg an index number (Integer) or id string (String)
      # @return [Mspire::Mzml::Spectrum] a spectrum object
      def spectrum(arg)
        run.spectrum_list[arg]
      end
      alias_method :'[]', :spectrum

      # @param [Object] arg an index number (Integer) or id string (String)
      # @return [Mspire::Mzml::Chromatogram] a chromatogram object
      def chromatogram(arg)
        run.chromatogram_list[arg]
      end

      # returns the number of chromatograms
      def num_chromatograms
        run.chromatogram_list.size
      end

      # returns the number of spectra
      def length
        run.spectrum_list.size
      end
      alias_method :size, :length

      # Looks up a spectrum by scan number.  The scan number to index
      # mapping is created from the first entry of the index list on first
      # use and kept for later calls.
      #
      # @param [Integer] scan_num the scan number
      # @return [Mspire::Mzml::Spectrum, nil] a spectrum object, or nil if
      #   not found
      # @raise [ScanNumbersNotUnique] if scan numbers are not unique
      # @raise [ScanNumbersNotFound] if spectra exist but scan numbers were not
      #   found
      def spectrum_from_scan_num(scan_num)
        @scan_to_index ||= @index_list[0].create_scan_to_index
        # false and nil are distinct signals from create_scan_to_index
        raise ScanNumbersNotUnique if @scan_to_index == false
        raise ScanNumbersNotFound if @scan_to_index.nil?

        index = @scan_to_index[scan_num]
        # an unknown scan number has no index: return nil as documented
        # rather than handing nil to the spectrum list lookup
        spectrum(index) unless index.nil?
      end
    end
    include Convenience
    
    # Because mzml files are often very large, we try to avoid storing the
    # entire object tree in memory before writing.
    #
    # Serializes the document with Builder, writing straight to the target
    # io.  Optional lists (referenceable param groups, samples, scan
    # settings) are written only when present; the other elements are
    # always written.
    #
    # @param [String, nil] filename the file to write to; if no filename is
    #   given, the xml is built in memory
    # @return [Mspire::Mzml, String] self if a filename was given, otherwise
    #   the xml as a string
    # @raise [RuntimeError] if cvs is nil or empty.  This is checked before
    #   the file is opened, so a failed call never creates or truncates it.
    def to_xml(filename=nil)
      # TODO: support indexed mzml files
      unless @cvs && @cvs.size > 0
        raise "#{self.class}#cvs must have > 0 Mspire::Mzml::CV objects"
      end

      io = filename ? File.open(filename, 'w') : StringIO.new
      begin
        xml = Builder::XmlMarkup.new(:target => io, :indent => 2)
        xml.instruct!

        # dup so the shared default attributes are never modified
        mzml_atts = Default::NAMESPACE.dup
        mzml_atts[:version] = @version || Default::VERSION
        mzml_atts[:accession] = @accession if @accession
        mzml_atts[:id] = @id if @id

        xml.mzML(mzml_atts) do |mzml_n|
          # the 'if' statements capture whether or not the list is required or not
          Mspire::Mzml::CV.list_xml(@cvs, mzml_n)
          @file_description.to_xml(mzml_n)
          if @referenceable_param_groups
            Mspire::Mzml::ReferenceableParamGroup.list_xml(@referenceable_param_groups, mzml_n)
          end
          if @samples && @samples.size > 0
            Mspire::Mzml::Sample.list_xml(@samples, mzml_n)
          end
          Mspire::Mzml::Software.list_xml(@software_list, mzml_n)
          if @scan_settings_list && @scan_settings_list.size > 0
            Mspire::Mzml::ScanSettings.list_xml(@scan_settings_list, mzml_n)
          end
          Mspire::Mzml::InstrumentConfiguration.list_xml(@instrument_configurations, mzml_n)
          Mspire::Mzml::DataProcessing.list_xml(@data_processing_list, mzml_n)
          @run.to_xml(mzml_n)
        end
      ensure
        # close the file even when serialization raises; the in-memory
        # StringIO is left open so its contents can be returned below
        io.close if filename
      end

      filename ? self : io.string
    end
    alias_method :write, :to_xml

    # Raised by spectrum_from_scan_num when scan numbers are not unique.
    # Subclasses StandardError so that a plain `rescue` catches it.
    class ScanNumbersNotUnique < StandardError
    end
    # Raised by spectrum_from_scan_num when spectra exist but scan numbers
    # were not found.  Subclasses StandardError so that a plain `rescue`
    # catches it.
    class ScanNumbersNotFound < StandardError
    end
  end
end