lib/mspire/imzml/writer.rb from princelab/mspire

lib/mspire/imzml/writer.rb
Summary

Maintainability

2 days
Test Coverage

Issues

require 'nokogiri'
require 'securerandom'
require 'cv'
require 'mspire/mzml'
require 'mspire/mzml/spectrum'
require 'mspire/spectrum'
require 'pathname'
require 'digest/sha1'

module Mspire end

module Mspire::Imzml
  # array length (number of values), the offset (start in file in bytes),
  # and the encoded length (in bytes).
  DataArrayInfo = Struct.new :array_length, :offset, :encoded_length

  class Writer

    DEFAULTS = {
      mz_data_type: :float,
      mz_precision: 64,
      intensity_data_type: :float, 
      intensity_precision: 32,
    }

    # Integer values (dtype=:int) may be stored with precision = 8, 16, 32,
    # 64 . Floating point values may be stored as 32 or 64 bit precision.
    # The byte order is always little endian (intel style).
    #
    # NOTE: the documentation is not clear whether they want signed or
    # unsigned integers!  Right now this outputs signed integers (my
    # educated guess as to what is required)
    def write_data_array(fh, values, dtype=:float, precision=32)
      pack_code = 
        case dtype
        when :float
          precision == 64 ? 'E*' : 'e*'
        when :int
          # using signed!!!! (should these be unsigned??? NOT DOCUMENTED in
          # imzml resources anywhere!)
          case precision
          when 8  ; 'c<*'
          when 16 ; 's<*'
          when 32 ; 'l<*'
          when 64 ; 'q<*'
          else
            raise 'precision must be 8, 16, 32, or 64 for dtype==:int'
          end
        else
          raise 'dtype must be :int or :float!'
        end
      fh.print values.pack(pack_code)
    end

    # returns an array of DataArrayInfo pairs
    #
    # These must be defined in the config hash with valid values (see
    # write_data_array for what those are):
    # 
    #     :mz_data_type
    #     :mz_precision
    #     :intensity_data_type
    #     :intensity_precision
    #
    # Also recognizes :data_structure (:processed or :continuous).  If
    # :continuous, then the first spectrum is used to write the initial m/z
    # and intensity pair, then in every spectrum after that the m/z data (if
    # any) is ignored and only the intensities are written).  The return is
    # the same, the data info for each intensity is coupled with the m/z
    # info from the first m/z data.
    def write_binary(filename, spectra_iterator, config)
      config = DEFAULTS.merge(config)
      raise ":data_structure must be :continuous or :processed" unless [:processed, :continuous].include?(config[:data_structure])
      (mz_dtype, mz_prec, int_dtype, int_prec) = config.values_at(:mz_data_type, :mz_precision, :intensity_data_type, :intensity_precision)
      mz_prec_in_bytes = mz_prec / 8
      int_prec_in_bytes = int_prec / 8
      File.open(filename, 'wb') do |out|
        out.print [config[:uuid]].pack("H*")

        if config[:data_structure] == :continuous
          # write the first m/z and get its info
          mzs = spectra_iterator.peek.mzs
          mz_info = Mspire::Imzml::DataArrayInfo.new mzs.size, out.pos, mzs.size * mz_prec_in_bytes
          write_data_array(out, mzs, mz_dtype, mz_prec)
        end
        spectra_iterator.map do |spec|

          if config[:data_structure] == :processed
            mzs = spec.mzs
            mz_info = Mspire::Imzml::DataArrayInfo.new mzs.size, out.pos, mzs.size * mz_prec_in_bytes
            write_data_array(out, spec.mzs, mz_dtype, mz_prec)
          end

          ints = spec.intensities
          int_info = Mspire::Imzml::DataArrayInfo.new ints.size, out.pos, ints.size * int_prec_in_bytes
          write_data_array(out, spec.intensities, int_dtype, int_prec)
          [mz_info, int_info]
        end
      end
    end

    # converts image related hash values to an array of loose describe!
    # params
    def image_hash_to_cvs(hash)
      cvs = []
      cvs << case hash[:data_structure].to_sym
      when :processed ; 'IMS:1000031'
      when :continuous ; 'IMS:1000030'
      else ; raise ":data_structure must be :processed or :continuous"
      end 

      cvs << case hash[:scan_pattern].to_sym
      when :meandering ; 'IMS:1000410'
      when :random ; 'IMS:1000412'
      when :flyback ; 'IMS:1000413'
      else ; raise ":scan_pattern must be :meandering, :random or :flyback"
      end

      cvs << case hash[:scan_type].to_sym
      when :horizontal ; 'IMS:1000480'
      when :vertical ; 'IMS:1000481'
      else ; raise ":scan_type must be :horizontal or :vertical"
      end

      cvs << case hash[:linescan_direction].to_s
      when 'left-right' ; 'IMS:1000402'
      when 'right-left' ; 'IMS:1000403'
      when 'bottom-up' ; 'IMS:1000400'
      when 'top-down' ; 'IMS:1000401'
      when 'none' ; 'IMS:1000404'
      else ; raise ":linescan_direction unacceptable"
      end

      cvs << case hash[:linescan_sequence].to_s
      when 'top-down' ; 'IMS:1000493'
      when 'bottom-up' ; 'IMS:1000492'
      when 'left-right' ; 'IMS:1000491'
      when 'right-left' ; 'IMS:1000490'
      else ; raise "linescan_sequence unacceptable"
      end

      max_pix_dims = hash[:max_dimensions_pixels].split(/x/i)
      cvs.push *['IMS:1000042', 'IMS:1000043'].zip(max_pix_dims).to_a

      real_dims = hash[:max_dimensions_microns].split(/x/i)
      cvs.push *['IMS:1000044', 'IMS:1000045'].zip(real_dims).map {|pair| [*pair, "UO:0000017"] }

      pix_dims = hash[:pixel_size].split(/x/i)
      cvs.push *["IMS:1000046", "IMS:1000047"].zip(pix_dims).map {|pair| [*pair, "UO:0000017"] }
      cvs
    end

    def experiment_hash_to_cvs(hash)
      cvs = { 
        :matrix_solution_concentration => "MS:1000835",
        :matrix_solution => "MS:1000834",
        :solvent => 'IMS:1001211',
        :solvent_flowrate => 'IMS:1001213',
        :spray_voltage => 'IMS:1001212',
        :target_material => 'IMS:10000202'
      }.map do |key,cv_acc|
        [cv_acc, hash[key]] if hash[key]
      end.compact

      mats = hash[:matrix_application_types]
      if mats
        app_types = mats.map do |typ|
          case typ.to_s
          when 'sprayed' ; 'MS:1000838'
          when 'precoated' ; 'MS:1000839'
          when 'printed' ; 'MS:1000837'
          when 'drieddroplet' ; 'MS:1000836'
          else ; raise "invalid matrix_application_type"
          end
        end
        cvs.push(*app_types)
      end
      cvs
    end

    # returns an array with x, y pairs in the correct order
    # accounts for scan_pattern, scan_type, linescan_direction,
    # linescan_sequence and shots_per_position
    def x_y_positions(config)
      (scan_pattern, scan_type, scan_direction, scan_sequence, shots_per_pos) = config.values_at(:scan_pattern, :scan_type, :linescan_direction, :linescan_sequence, :shots_per_position).map(&:to_s)
      shots_per_pos = shots_per_pos.to_i
      #puts "EXAMIN ARGS: "
      #p [ scan_pattern, scan_type, scan_direction, scan_sequence, shots_per_pos, shots_per_pos]

      # the true dimensions we need to work off come from the pixel dimensions.
      (slen, plen) = config[:max_dimensions_pixels].split(/x/i).map(&:to_i)

      flip = (scan_type == 'vertical')
      plen, slen = slen, plen if flip

      pindices = (1..plen).to_a  # mindices if linescan_direction is 'horizontal'
      sindices = (1..slen).to_a  # nindices if linescan_direction is 'horizontal'

      if flip
        sindices.reverse! if scan_direction == 'bottom-top' || scan_direction == 'right-left'
        pindices.reverse! if scan_sequence == 'right-left' || scan_sequence == 'bottom-top'
      end

      indices = pindices.map do |a|
        row = sindices.map do |b|
          flip ? [a,b] : [b,a]
        end
        sindices.reverse! if scan_pattern == 'meandering'
        row
      end.flatten(1)

      indices.map {|pair| shots_per_pos.times.map { pair } }.flatten(1)
    end

    def create_file_description(source_files, config)
      Mspire::Mzml::FileDescription.new  do |fd|

        fd.file_content = Mspire::Mzml::FileContent.new.describe_many!( [
          'MS:1000579', # MS1 Spectrum
          config[:profile] ? 'MS:1000128' : 'MS:1000127',
          ['IMS:1000080', "{"+config[:uuid_hyphenated]+"}"],
          ['IMS:1000091', config[:ibd_sha1]],
          (config[:data_structure] == :processed) ? 'IMS:1000031' : 'IMS:1000030',
        ] )

        fd.source_files.replace(source_files)
        if [:name, :organization, :address, :email].any? {|key| config[key] }
          contact = Mspire::Mzml::Contact.new
          [ [:name, 'MS:1000586'], 
            [:organization, 'MS:1000590'],
            [:address, 'MS:1000587'],
            [:email, 'MS:1000589']
          ].each do |key, accession|
            contact.describe!(accession, config[key]) if config[key]
          end
          fd.contacts << contact
        end
      end
    end

    def create_referenceable_params_by_id_hash
      # m/z array and no compression (because we'll ref peaks in different
      # file)
      rparms = {
        :mz_array => [
          ["MS:1000514", "MS:1000040"], # m/z array, units = m/z
          "MS:1000576",                 # no compression
          ["IMS:1000101", true],        # external data
          'MS:1000523'                  # 64-bit float
      ],
        :intensity_array =>  [
          ["MS:1000515", "MS:1000131"], # intensity array, units = number of counts
          "MS:1000576",                 # no compression
          ["IMS:1000101", true],        # external data
          'MS:1000521'                  # 32-bit float
      ],
        :scan1 => [
          # this should probably be ascertained from the mzml file:
          "MS:1000093", # increasing m/z scan
          # "MS:1000095" # linear  # <- this should probably be gathered
          # from mzml (leave outfor now)
          # could include the filter string here in future
      ],
      :spectrum1 => [
        "MS:1000579", # MS1 spectrum
        ["MS:1000511", 1], # ms level - default implementation uses 0 but I disagree...
        "MS:1000127",  # centroid spectrum
        "MS:1000130" # <- positive scan
      ]}.map do |id, list|
        Mspire::Mzml::ReferenceableParamGroup.new(id).describe_many!(list)
      end
      Hash[ rparms.map {|prm_group_obj| [prm_group_obj.id, prm_group_obj]} ]
    end

    # mzml_filenames can each be a partial or relative path
    # by default the file will write to the same directory and basename
    #
    #     contact:
    #       :name => name of the person or organization
    #       :address => address of the person or organization
    #       :url => url of person or organization
    #       :email => email of person or organization
    #       :organization => home institution of contact
    #     experiment:
    #       :solvent => the solvent used
    #       :solvent_flowrate => flowrate of the solvent (ml/min)
    #       :spray_voltage => spray voltage in kV
    #       :target_material => the material the target is made of
    #     editing:
    #       :trim_files => determines min # spectra in file and lops
    #                      off the ends of other files.
    #     general:
    #       :omit_zeros => remove zero values
    #       :combine => use this outfile base name to combine files
    #                   also works on single files to rename them.
    #                   MUST be present if more than one mzml given.
    #     imaging:
    #       :data_structure => :processed or :continuous  
    #       :scan_pattern => meandering random flyback
    #       :scan_type => horizontal or vertical
    #       :linescan_direction => scan_direction.join('|')
    #       :linescan_sequence => scan_sequence.join('|')
    #       :max_dimensions_pixels => maximum X by Y pixels (e.g. 300x100)
    #       :shots_per_position => number of spectra per position
    #       :pixel_size => X by Y of a single pixel in microns (μm)
    #       :max_dimensions_microns => maximum X by Y in microns (e.g. 25x20)
    def write(mzml_filenames, config={})

      base = config[:combine] || mzml_filenames.first.chomp(File.extname(mzml_filenames.first))
      config[:imzml_filename] = base + ".imzML"
      config[:ibd_filename] = base + ".ibd"

      uuid_with_hyphens = SecureRandom.uuid
      config[:uuid_hyphenated] = uuid_with_hyphens
      config[:uuid] = uuid_with_hyphens.gsub('-','')

      if config[:trim_files]
        config[:trim_to] = mzml_filenames.map {|fn| Mspire::Mzml.open(fn, &:size) }.min
      end

      sourcefile_id_parallel_to_spectra = []
      sourcefile_ids = []
      all_spectra_iter = Enumerator.new do |yielder|
        mzml_filenames.each_with_index do |mzml_filename,i|
          sourcefile_id = "source_file_#{i}"
          sourcefile_ids << sourcefile_id
          Mspire::Mzml.open(mzml_filename) do |mzml|
            enumerates_spectra = 
              # handle SIM files "MS:1001472", "selected ion monitoring chromatogram"
              if mzml.file_description.file_content.fetch_by_acc('MS:1001472')
                # handle normal mzml files
                mz_ars = []
                its_ars = []
                mzml.each_chromatogram.each do |chromatogram,i| 
                  next unless chromatogram.fetch_by_acc('MS:1001472')
                  target_mz = chromatogram.precursor.isolation_window.fetch_by_acc('MS:1000827').to_f
                  its = chromatogram.intensities
                  mz_ars << Array.new(its.size, target_mz)
                  its_ars << its
                end
                mz_ars.transpose.zip(its_ars.transpose).map {|mzs, its| Mspire::Spectrum.new([mzs, its]) }
              else
                # normal mzml file with spectra
                mzml
              end
            enumerates_spectra.each_with_index do |spec,i| 
              break if config[:trim_to] && (i >= config[:trim_to])
              sourcefile_id_parallel_to_spectra << sourcefile_id 
              yielder << spec
            end
          end
        end
      end

      data_info_pairs = write_binary(config[:ibd_filename], all_spectra_iter, config)
      xy_positions_array = x_y_positions(config)

      unless data_info_pairs.size == xy_positions_array.size
        STDERR.puts "Input Error! The number of calculated scans must equal the number of actual scans!" 
        STDERR.puts "The number of calculated positions: #{xy_positions_array.size}"
        STDERR.puts "The number of actual scans: #{data_info_pairs.size}"
        raise
      end

      config[:ibd_sha1] = Digest::SHA1.hexdigest(IO.read(config[:ibd_filename]))

      source_files = mzml_filenames.zip(sourcefile_ids).map do |mzml_filename, source_file_id|
        sfile = Mspire::Mzml::SourceFile[ mzml_filename ]
        sfile.id = source_file_id
        sfile.describe! 'MS:1000584'
        sfile.describe! 'MS:1000569', Digest::SHA1.hexdigest(IO.read(mzml_filename))
        sfile
      end
      sourcefile_id_to_sourcefile = Hash[ source_files.group_by(&:id).map {|k,v| [k,v.first] } ]

      imzml_obj = Mspire::Mzml.new do |imzml|
        imzml.id = SecureRandom.uuid.gsub('-','')
        imzml.cvs = Mspire::Mzml::CV::DEFAULT_CVS

        imzml.file_description = create_file_description(source_files, config)

        rparms_by_id = create_referenceable_params_by_id_hash

        warn "using positive scan in every case but need to get this from original mzml!"

        imzml.referenceable_param_groups = rparms_by_id.values
        # skip sample list for now
        mspire_software = Mspire::Mzml::Software.new("mspire", Mspire::VERSION).describe!("MS:1000799")
        imzml.software_list << mspire_software

        scan_setting_params = image_hash_to_cvs( config )
        scan_setting_params.push *experiment_hash_to_cvs( config )
        imzml.scan_settings_list = [Mspire::Mzml::ScanSettings.new("scansettings1").describe_many!(scan_setting_params)]

        warn 'todo: need to borrow instrumentConfiguration from original mzml'

        default_instrument_config = Mspire::Mzml::InstrumentConfiguration.new("borrow_from_mzml")
        warn 'todo: need to include default softare from mzml in default_instrument_config'
        #default_instrument_config.software = Software.new( from the mzml file! )
        imzml.instrument_configurations << default_instrument_config

        # this is a generic 'file format conversion' but its the closest we
        # have for mzml to imzml (which is really mzml to mzml)
        data_processing_obj = Mspire::Mzml::DataProcessing.new('mzml_to_imzml')
        data_processing_obj.processing_methods << Mspire::Mzml::ProcessingMethod.new(mspire_software).describe!('MS:1000530')
        imzml.data_processing_list << data_processing_obj

        warn "not implemented 'omit_zeros' yet"
        # low intensity data point removal: "MS:1000594"
        imzml.run = Mspire::Mzml::Run.new("run1", default_instrument_config) do |run|
          spectra = []
          data_info_pairs.zip(xy_positions_array, sourcefile_id_parallel_to_spectra).each_with_index do |(pair, xy, sourcefile_id),i|
            # TODO: we should probably copy the id from the orig mzml (e.g.
            # scan=1)
            spectrum = Mspire::Mzml::Spectrum.new("spectrum#{i}").describe!(rparms_by_id[:spectrum1])
            spectrum.source_file = sourcefile_id_to_sourcefile[sourcefile_id]
            scan_list = Mspire::Mzml::ScanList.new.describe!('MS:1000795')  # no combination
            scan = Mspire::Mzml::Scan.new.describe_many!([rparms_by_id[:scan1], ["IMS:1000050", xy[0]], ["IMS:1000051", xy[1]]])
            scan.instrument_configuration = default_instrument_config
            spectrum.scan_list = (scan_list << scan)
            
            data_arrays = %w(mz intensity).zip(pair).map do |type, data_array_info|
              rparmgroup = rparms_by_id[(type + "_array").to_sym]
              # the type is defined in the refparams
              data_array = Mspire::Mzml::DataArray.new
              data_array.external = true
              data_array.describe_many! [rparmgroup, *%w(IMS:1000103 IMS:1000102 IMS:1000104).zip(data_array_info).map.to_a]
              data_array
            end
            spectrum.data_arrays = data_arrays
            spectra << spectrum
          end
          run.spectrum_list = Mspire::Mzml::SpectrumList.new(data_processing_obj, spectra)
        end # run
      end # imzml
      imzml_obj.to_xml(config[:imzml_filename])
    end
  end
end