princelab/mspire

View on GitHub
lib/mspire/mzml/reader.rb

Summary

Maintainability
A
2 hrs
Test Coverage
require 'io/bookmark'

%w(
  parser

  index_list

  cv
  referenceable_param_group
  file_description

  sample
  software
  instrument_configuration
  data_processing
  run
).each do |file|
  require "mspire/mzml/#{file}"
end

# Forward declaration: makes sure the Mspire::Mzml constant exists so that
# modules can be nested underneath it before the class body is filled in.
module Mspire
  class Mzml; end
end

module Mspire::Mzml::Reader

  # Hash of lookup tables assembled by read_header! (e.g. :ref_hash,
  # :source_file_hash, :software_hash, :data_processing_hash, :index_list).
  # It is handed to the various from_xml constructors so they can resolve
  # references by id.
  attr_accessor :link

  # Attaches an mzML XML stream to this object and reads its header.
  #
  # The first line of xml_io is inspected (without disturbing the current
  # position) for the XML encoding declaration, which is stored in
  # @encoding.  The index list is then built with
  # Mspire::Mzml::IndexList.from_io and the header is read with
  # read_header!.
  #
  # xml_io must respond to #bookmark (see 'io/bookmark') in addition to the
  # usual IO reading methods.
  #
  # Raises RuntimeError if the stream is empty or if its first line does not
  # declare an encoding.
  def set_from_xml_io!(xml_io)
    @io = xml_io
    first_line =
      begin
        @io.bookmark(true) { |io| io.readline }
      rescue EOFError
        # empty stream: handled below together with a missing declaration
        nil
      end
    # String#[] with a capture index yields nil when the declaration is
    # absent, rather than failing on nil[1] the way match(...)[1] does.
    @encoding = first_line && first_line[/encoding=["'](.*?)["']/, 1]
    raise RuntimeError, "no encoding present in XML!  (Is this even an xml file?)" unless @encoding
    @index_list = Mspire::Mzml::IndexList.from_io(@io)
    read_header!( get_default_data_processing_ids(@io, @index_list) )
  end

  # Scrapes the defaultDataProcessingRef attribute of the <spectrumList>
  # and/or <chromatogramList> opening tag by inspecting the bytes that
  # immediately precede the first indexed entry of each list.
  #
  # io         - a seekable IO that responds to #bookmark; its position is
  #              restored by bookmark after each lookup
  # index_list - responds to each_pair, yielding a list name (:spectrum or
  #              :chromatogram) and an index whose first element is used as
  #              the byte offset of the first entry
  # lookback   - maximum number of bytes before the first entry to inspect
  #
  # returns a hash keyed by :spectrum or :chromatogram that gives the id
  # (aka ref) as a string (nil if the attribute was not found in the window).
  def get_default_data_processing_ids(io, index_list, lookback=300)
    ids = {}
    index_list.each_pair do |name, index|
      # we cannot quickly retrieve a defaultDataProcessingRef unless there
      # is at least one spectrum/chromatogram to start with.  However, if
      # there is no spectrum/chromatogram, then the defaultDataProcessingRef
      # will not be needed either.
      next unless index.size > 0

      first_offset = index[0]
      # never seek before the beginning of the stream when the first entry
      # sits closer than +lookback+ bytes to the start
      window_start = [first_offset - lookback, 0].max
      io.bookmark do |inner_io|
        inner_io.pos = window_start
        window = inner_io.read(first_offset - window_start)
        ids[name] = window[/<#{name}List.*defaultDataProcessingRef=['"](.*?)['"]/m, 1]
      end
    end
    ids
  end

  # Reads +io+ in 4096-byte chunks from its current position and returns
  # everything read up to and including the chunk in which the first
  # <spectrum...> or <chromatogram...> tag appears (or everything up to EOF
  # if no such tag is found).
  #
  # saves ~ 3 seconds when reading a 83M mzML file to scrape off the
  # header string (even though we're just handing in an IO object to
  # Nokogiri::XML::Document.parse and we are very careful to not parse too
  # far).
  def get_header_string(io)
    chunk_size = 2**12
    # re-scan a little of the previous chunk so that a tag split across a
    # chunk boundary is still detected
    overlap = 20
    string = ''
    while (chunk = io.read(chunk_size))
      start_looking = [string.length - overlap, 0].max
      string << chunk
      break if string.index(/<(spectrum|chromatogram)/, start_looking)
    end
    string
  end

  # Rewinds @io, parses the mzML header (as returned by get_header_string)
  # and populates this object from it: @id, cvs, referenceable_param_groups,
  # file_description, samples, software_list, instrument_configurations,
  # data_processing_list and run.  Along the way it fills @link with the
  # id-keyed lookup hashes that later elements need to resolve references.
  #
  # list_type_to_default_data_processing_id is a hash keyed by :spectrum or
  # :chromatogram that gives the id of the default data processing for the
  # SpectrumList and/or the ChromatogramList.  This information is not
  # obtainable from the header string, so must be pre-obtained.
  #
  # Raises RuntimeError if the header contains no <run> element.
  def read_header!(list_type_to_default_data_processing_id)
    @io.rewind

    string = get_header_string(@io)
    doc = Nokogiri::XML.parse(string, nil, @encoding, Mspire::Mzml::Parser::NOBLANKS)

    doc.remove_namespaces!
    mzml_n = doc.root
    # an indexed file wraps the <mzML> element in <indexedmzML>
    if mzml_n.name == 'indexedmzML'
      mzml_n = mzml_n.child
    end

    @id = mzml_n[:id]

    cv_list_n = mzml_n.child
    self.cvs = cv_list_n.children.map do |cv_n|
      Mspire::Mzml::CV.from_xml(cv_n)
    end

    # get the file description node but deal with it after getting ref_hash
    file_description_n = cv_list_n.next

    xml_n = file_description_n.next

    # lookup tables (each keyed by id) shared with the from_xml constructors
    @link = {}

    if xml_n.name == 'referenceableParamGroupList'
      self.referenceable_param_groups = xml_n.children.map do |rpg_n|
        Mspire::Mzml::ReferenceableParamGroup.from_xml(rpg_n) # <- no ref_hash (not made yet)
      end
      @link[:ref_hash] = self.referenceable_param_groups.index_by(&:id)
      xml_n = xml_n.next
    end

    # now we can set the file description because we have the ref_hash
    self.file_description = Mspire::Mzml::FileDescription.from_xml(file_description_n, @link)
    @link[:source_file_hash] = self.file_description.source_files.index_by(&:id)

    # walk the remaining siblings in document order; each list may reference
    # ids registered in @link by the lists before it
    run_found = false
    while xml_n
      case xml_n.name
      when 'sampleList'
        self.samples = xml_n.children.map do |sample_n|
          Mspire::Mzml::Sample.from_xml(sample_n, @link)
        end
        @link[:sample_hash] = self.samples.index_by(&:id)
      when 'softwareList'  # required
        self.software_list = xml_n.children.map do |software_n|
          Mspire::Mzml::Software.from_xml(software_n, @link)
        end
        @link[:software_hash] = self.software_list.index_by(&:id)
      when 'instrumentConfigurationList'
        self.instrument_configurations = xml_n.children.map do |inst_config_n|
          Mspire::Mzml::InstrumentConfiguration.from_xml(inst_config_n, @link)
        end
        @link[:instrument_configuration_hash] = self.instrument_configurations.index_by(&:id)
      when 'dataProcessingList'
        self.data_processing_list = xml_n.children.map do |data_processing_n|
          Mspire::Mzml::DataProcessing.from_xml(data_processing_n, @link)
        end
        @link[:data_processing_hash] = self.data_processing_list.index_by(&:id)
      when 'run'
        @link[:index_list] = @index_list
        list_type_to_default_data_processing_id.each do |type, process_id|
          @link["#{type}_default_data_processing".to_sym] = @link[:data_processing_hash][process_id]
        end
        self.run = Mspire::Mzml::Run.from_xml(@io, xml_n, @link)
        run_found = true
        break
      end
      xml_n = xml_n.next
    end

    # running off the last sibling used to surface as NoMethodError on
    # nil.name; fail with an explanation instead
    raise RuntimeError, "no <run> element found in the mzML header" unless run_found
  end
end

# Mix the reader methods into Mspire::Mzml so that every Mzml object can
# populate itself from an XML IO.
module Mspire
  class Mzml
    include Mspire::Mzml::Reader
  end
end