SysMO-DB/seek

View on GitHub
lib/seek/parser_mapper.rb

Summary

Maintainability
F
6 days
Test Coverage
module Seek

  class ParserMapper


    # how to define a new parser mapping with name NEW_MAPPER
    # * create a private method NEW_MAPPER_mapping
    # * copy the hash from empty mapping to NEW_MAPPER_mapping
    # * specify data like :name, :data_offset, :probing_column etc...
    # * fill in your mapped fields
    # ** see existing mappings for examples for using different blocks to handle FIXED-data columns
    # ** or columns that share some raw data that has be extracted by a regular expression
    # * (OPTIONAL) define a (filename-pattern-to-parser-mapping-name)-mapping in filename_to_mapping_name to enable automatic detection of parser mappings by filename

    def initialize
    end

    # returns a mapping given a mapping name (convention: name of method without "_mapping")
    def mapping mapping_name
      mapping = self.send(mapping_name+"_mapping")
      if mapping
        if check_mapping mapping
          return mapping
        end

        nil

      else
        nil

      end

    end

    # this method defines a (filename-pattern-to-parser-mapping-name)-mapping
    # to add a mapping just add a when clause relating a file pattern to the name of the parser-mapping (without "_mapping")
    def filename_to_mapping_name filename
      case filename
        when /^jena.*/i then "jena"
        when /^do_hengstler.*/i then "dortmund_hengstler"
        when /^do_bcat_ko.*/i then "dortmund_bcat_ko"
        when /^due_bode.*/i then "duesseldorf_bode"
        when /^due_bode_surgical.*/i then "duesseldorf_bode_surgical"
        else "unknown"
      end
    end


    private

    def check_mapping mapping      #TODO check whether all necessary fields are mapped
      true
    end

    # the default mapping entry
    # if no block is specified a simple block is defined that just returns the input
    def mapping_entry column_name, block=nil
      unless block
        block = proc { |data| data}
      end

      {:column => column_name, :value => block}
    end

    # mapping for jena excel sheets
    # example file: jenage-excel_template-without-protection correct.xlsm
    def jena_mapping
      {
          :name => "jena",
          :data_row_offset => 2,



          :samples_mapping => {

              :samples_sheet_name => "SDRF",

              :add_specimens => true,
              :add_treatments => true,
              :add_samples => true,

              :probing_column => :"specimens.title",


              :"organisms.title" => mapping_entry("Organism"),

              :"strains.title" => mapping_entry("Strain or Line"),

              :"specimens.sex" => mapping_entry("Sex"),
              :"specimens.title" => mapping_entry("# Specimen"),
              :"specimens.lab_internal_number" => mapping_entry("# Specimen"),
              :"specimens.age" => mapping_entry("Age"),
              :"specimens.age_unit" => mapping_entry("Age Time Unit"),
              :"specimens.comments" => mapping_entry("FIXED", proc {""}),
              :"specimens.genotype.title" => mapping_entry("FIXED", proc {"none"}),
              :"specimens.genotype.modification" => mapping_entry("FIXED", proc{""}),

              :"treatment.treatment_protocol" => mapping_entry("Treatment Protocol"),
              :"treatment.type" => mapping_entry("FIXED", proc {"concentration"}),
              :"treatment.comments" => mapping_entry("FIXED", proc {""}),
              :"treatment.substance" => mapping_entry("Substance"),
              :"treatment.concentration" => mapping_entry("Concentration"),
              :"treatment.unit" => mapping_entry("Unit"),
              :"treatment.incubation_time" => mapping_entry("FIXED", proc {nil}),
              :"treatment.incubation_time_unit" => mapping_entry("FIXED", proc {""}),

              :"samples.comments" => mapping_entry("Additional Information"),
              :"samples.title" => mapping_entry("Sample Name"),
              :"samples.sample_type" => mapping_entry("Material Type"),
              :"samples.donation_date" => mapping_entry("Storage Date", proc  {|data| data != "" ? data : Time.now}), # the default value is certainly wrong -- but we need some donation_date
              :"samples.organism_part"  => mapping_entry("FIXED", proc {""}),

              :"tissue_and_cell_types.title" => mapping_entry("Organism Part"),

              :"sop.title" => mapping_entry("Storage Protocol"),
              :"institution.title" => mapping_entry("Storage Location")


          },

          :assay_mapping => {

              :assay_sheet_name => "IDF",
              :parsing_direction => "horizontal",
              :probing_column => "",

              :"investigation.title" => mapping_entry("Investigation Title"),
              :"assay_type.title" => mapping_entry("Experiment Class"),
              :"study.title" => mapping_entry("Experiment Description"),

              :"creator.email" => mapping_entry("Person Email"),
              :"creator.last_name" => mapping_entry("Person Last Name"),
              :"creator.first_name" => mapping_entry("Person First Name")

          }



      }

    end

    # mapping for dortmund excel sheets (I)
    # example file: do_hengstler_Sample Data Hengstler_corrected.xls
    def dortmund_hengstler_mapping

      age_regex = /(\d+-?\d*)\s*(day|week|month|year)s?/i
      treatment_regex = /(\d*\.?\d*)\s*(\w+\/\w+)\s*([\w\.\s,']*),?\s+([\w\.]*)/
      incubation_time_regex = /(\d+\.?\d*)(\w{1})/




      {
          :name => "dortmund_hengstler",
          :data_row_offset => 1,


          :samples_mapping => {

              :samples_sheet_name => "Tabelle1",
              :add_specimens => true,
              :add_treatments => true,
              :add_samples => true,

              :probing_column => :"specimens.title",


              :"organisms.title" => mapping_entry("FIXED", proc {"Mus musculus"}),

              :"strains.title" => mapping_entry("Mouse strain"),


              :"specimens.institution_id" => mapping_entry("Responsible Lab"),
              :"specimens.title" => mapping_entry("Lab internal number", proc { |data| data.chomp(".0")}),
              :"specimens.lab_internal_number" => mapping_entry("Lab internal number", proc { |data| data.chomp(".0")}),
              :"specimens.sex" => mapping_entry("FIXED", proc {"unknown"}),
              :"specimens.age" => mapping_entry("Age", proc do |data|
                  if data =~ age_regex
                    $1 ? $1 : ""
                  else
                    ""
                  end
              end),
              :"specimens.age_unit" => mapping_entry("Age", proc do |data|
                  if data =~ age_regex
                    $2 ? $2 : ""
                  else
                    ""
                  end
              end),




              :"specimens.donation_date" => mapping_entry("date of experiment"),
              :"specimens.comments" => mapping_entry("FIXED", proc {""}),
              :"specimens.genotype.title" => mapping_entry("FIXED", proc {"none"}),
              :"specimens.genotype.modification" => mapping_entry("FIXED", proc{""}),

              :"treatment.concentration" => mapping_entry("Treatment", proc do |data|
                if data =~ treatment_regex
                  $1 ? $1 : ""
                else
                  ""
                end
              end),
              :"treatment.unit" => mapping_entry("Treatment", proc do |data|
                if data =~ treatment_regex
                  $2 ? $2 : ""
                else
                  ""
                end
              end),
              :"treatment.substance" => mapping_entry("Treatment", proc do |data|
                if data =~ treatment_regex
                  if $3
                    substance = $3
                    if substance.end_with?(',')
                      substance.chop
                    else
                      substance
                    end
                  else
                    ""
                  end
                else
                  ""
                end
              end),
              :"treatment.treatment_protocol" => mapping_entry("Treatment", proc do |data|
                if data =~ treatment_regex
                  $4 ? $4 : ""
                else
                  ""
                end
              end),
              :"treatment.type" => mapping_entry("FIXED", proc {"concentration"}),
              :"treatment.comments" => mapping_entry("FIXED", proc {""}),
              :"treatment.incubation_time" => mapping_entry("Explantation", proc do |data|
                  if data =~ incubation_time_regex
                    $1
                  else
                    ""
                  end
              end),

              :"treatment.incubation_time_unit" => mapping_entry("Explantation", proc do |data|
                if data =~ incubation_time_regex
                  case $2
                    when "d" then "day"
                    when "h" then "hour"
                    when "w" then "week"
                    else ""
                  end
                else
                  ""
                end
              end),

              :"samples.comments" => mapping_entry("Comments"),
              :"samples.title" => mapping_entry("Tissue specimen no.", proc { |data| data.chomp(".0")}),
              :"samples.sample_type" => mapping_entry("FIXED", proc {""}),
              :"samples.donation_date" => mapping_entry("date of experiment"),
              :"samples.organism_part"  => mapping_entry("FIXED", proc {"organ"}),

              :"tissue_and_cell_types.title" => mapping_entry("FIXED", proc {"Liver"}),

              :"sop.title" => mapping_entry("FIXED", proc {""}),
              :"institution.title" => mapping_entry("FIXED", proc{""}),








      },

          :assay_mapping => {

              :assay_sheet_name => "Tabelle1",
              :parsing_direction => "vertical",
              :probing_column => :"creator.last_name",

              :"investigation.title" => mapping_entry("FIXED", proc { nil }),
              :"assay_type.title" => mapping_entry("FIXED", proc { nil }),
              :"study.title" => mapping_entry("FIXED", proc { nil }),

              :"creator.email" => mapping_entry("FIXED", proc { "" }),
              :"creator.last_name" => mapping_entry("Experimentator", proc do |data|
                if data.split(/\s+/)
                  data.split(/\s+/).last
                end
              end),
              :"creator.first_name" => mapping_entry("Experimentator", proc do |data|
                if data.split(/\s+/)
                  data.gsub(data.split(/\s+/).last, "").chop
                end
              end)

          }




      }

    end

    # mapping for dortmund excel sheets (II)
    # example file: BCat KO sample data Dortmund.xls
    def dortmund_bcat_ko_mapping

      age_regex = /(\d+-?\d*)\s*(day|week|month|year)s?/i
      treatment_substance_regex = /.*\((control|treated)\s?=\s?(.*)\)/i
      treatment_concentration_unit_regex = /(\d*[,\.]?\d*)\s*([\w\s\/]*).*$/
      treatment_protocol_regex = /.*,\s*([\w\s\.]*)$/
      genotype_modification_regex = /(WT|KO)/
      incubation_time_regex = /(\d+\.?\d*)\s+(\w*)s/

      {
          :name => "dortmund_bcat_ko",
          :data_row_offset => 1,

          :samples_mapping => {     # TODO: which fields are really necessary?
              :samples_sheet_name => "BCat KO",

              :add_specimens => true,
              :add_treatments => true,
              :add_samples => true,


              :probing_column => :"specimens.title",


              :"organisms.title" => mapping_entry("FIXED", proc {"Mus musculus"}),

              :"strains.title" => mapping_entry("FIXED", proc {"Wildtype"}),

              :"specimens.sex" => mapping_entry("Gender", proc { |data| data.downcase }),
              :"specimens.title" => mapping_entry("Sample ID"),
              :"specimens.lab_internal_number" => mapping_entry("Sample ID"),
              :"specimens.age" => mapping_entry("Age", proc do |data|
                   if data =~ age_regex
                     $1 ? $1 : ""
                   else
                     ""
                   end
              end),
              :"specimens.age_unit" => mapping_entry("Age", proc do |data|
                  if data =~ age_regex
                    $2 ? $2 : ""
                  else
                    ""
                  end
              end),
              :"specimens.comments" => mapping_entry("FIXED", proc {""}),
              :"specimens.genotype.title" => mapping_entry("FIXED", proc {"Bcat"}),
              :"specimens.genotype.modification" => mapping_entry("Genotype", proc do |data|
                  if data =~ genotype_modification_regex
                    case $1
                      when "KO" then "Knock out"
                      when "WT" then "Wildtype"
                      else "Wildtype"
                    end
                  else
                    "Wildtype"
                  end
              end),

              :"treatment.treatment_protocol" => mapping_entry("Dose/route of adminstration", proc do |data|         # sic!?
                if data =~ treatment_protocol_regex
                  $1 ? $1 : ""
                else
                  ""
                end
              end),
              :"treatment.type" => mapping_entry("FIXED", proc {"concentration"}),
              :"treatment.comments" => mapping_entry("FIXED", proc {""}),
              :"treatment.substance" => mapping_entry("Genotype", proc do |data|
                if data =~ treatment_substance_regex
                  $2 ? $2 : ""
                else
                  ""
                end
              end),
              :"treatment.concentration" => mapping_entry("Dose/route of adminstration", proc do |data|
                if data =~ treatment_concentration_unit_regex
                  $1 ? $1 : nil
                else
                  nil
                end
              end),
              :"treatment.unit" => mapping_entry("Dose/route of adminstration", proc do |data|
                if data =~ treatment_concentration_unit_regex
                  $2 ? $2 : ""
                else
                  ""
                end
              end),
              :"treatment.incubation_time" => mapping_entry("Time point", proc do |data|
                if data =~ incubation_time_regex
                  $1
                else
                  ""
                end
              end),

              :"treatment.incubation_time_unit" => mapping_entry("Time point", proc do |data|
                if data =~ incubation_time_regex
                  $2
                else
                  ""
                end
              end),

              :"samples.comments" => mapping_entry("FIXED", proc {""}),
              :"samples.title" => mapping_entry("Sample ID"),
              :"samples.sample_type" => mapping_entry("FIXED", proc {""}),
              :"samples.donation_date" => mapping_entry("Arrival Date"),
              :"samples.organism_part"  => mapping_entry("FIXED", proc {"organ"}),

              :"tissue_and_cell_types.title" => mapping_entry("FIXED", proc {"Liver"}),

              :"sop.title" => mapping_entry("FIXED", proc {""}),
              :"institution.title" => mapping_entry("FIXED", proc {""})
          },


          :assay_mapping => nil


      }

    end

    # mapping for duesseldorf excel sheet
    # example file: due_bode_Tierbestandsliste G96 parsing format.xls
    def duesseldorf_bode_mapping

      concentration_regex = /(\d*,?\.?\d*).*/
      gene_modification_regex = /([\w\d]+)([\/+-]+)/

      {
              :name => "duesseldorf_bode",
              :data_row_offset => 1,


              :samples_mapping => {
                  :samples_sheet_name => "Tabelle2",
                  :add_specimens => true,
                  :add_treatments => true,
                  :add_samples => true,

                  :probing_column => :"specimens.title",

                  :"organisms.title" => mapping_entry("species"),

                  :"strains.title" => mapping_entry("strain"),

                  :"specimens.sex" => mapping_entry("sex"),
                  :"specimens.title" => mapping_entry("Animal Nr."),
                  :"specimens.lab_internal_number" => mapping_entry("Animal Nr."),
                  :"specimens.age" => mapping_entry("Age (Weeks)"),
                  :"specimens.age_unit" => mapping_entry("FIXED", proc {"week"}),
                  :"specimens.comments" => mapping_entry("Specials", proc {|data| data == "-" ? "" : data }),
                  :"specimens.genotype.title" => mapping_entry("Genotype", proc do |data|
                       if data =~ gene_modification_regex
                         $1
                       else
                         "none"
                       end
                  end),
                  :"specimens.genotype.modification" => mapping_entry("Genotype", proc do |data|
                      if data =~ gene_modification_regex
                        $2
                      else
                        ""
                      end
                  end),

                  :"treatment.treatment_protocol" => mapping_entry("FIXED", proc {""}),
                  :"treatment.type" => mapping_entry("FIXED", proc {"concentration"}),
                  :"treatment.comments" => mapping_entry("FIXED", proc {""}),
                  :"treatment.substance" => mapping_entry("FIXED", proc {"LPS"}),
                  :"treatment.concentration" => mapping_entry("LPS (µg/g KG)", proc do |data|
                    if data =~ concentration_regex
                      $1.gsub(/,/, ".")
                    else
                      ""
                    end
                  end),
                  :"treatment.unit" => mapping_entry("FIXED", proc {"µg/g KG"}),
                  :"treatment.incubation_time" => mapping_entry("Incubation period x Std."),
                  :"treatment.incubation_time_unit" => mapping_entry("FIXED", proc {"hour"}),


                  :"samples.comments" => mapping_entry("FIXED", proc {""}),
                  :"samples.title" => mapping_entry("Animal Nr.", proc { |data| data + "_liver"}),
                  :"samples.sample_type" => mapping_entry("FIXED", proc {"liver"}),
                  :"samples.donation_date" => mapping_entry("Donation Date"),
                  :"samples.organism_part"  => mapping_entry("FIXED", proc {"organ"}),

                  :"tissue_and_cell_types.title" => mapping_entry("FIXED", proc {"Liver"}),

                  :"sop.title" => mapping_entry("FIXED", proc {""}),
                  :"institution.title" => mapping_entry("FIXED", proc {""})
              },


              :assay_mapping => {

                  :assay_sheet_name => "Tabelle2",
                  :parsing_direction => "vertical",
                  :probing_column => :"creator.last_name",

                  :"investigation.title" => mapping_entry("FIXED", proc { nil }),
                  :"assay_type.title" => mapping_entry("FIXED", proc { nil }),
                  :"study.title" => mapping_entry("FIXED", proc { nil }),

                  :"creator.email" => mapping_entry("FIXED", proc { "" }),
                  :"creator.last_name" => mapping_entry("experimentator", proc do |data|
                    if data.split(/\s+/)
                      data.split(/\s+/).last
                    end
                  end),
                  :"creator.first_name" => mapping_entry("experimentator", proc do |data|
                    if data.split(/\s+/)
                      data.split(/\s+/).first
                    end
                  end)

              }

      }

    end

    # mapping for duesseldorf excel sheet with surgical procedure
    # example file: due_bode_Tierbestandsliste G96 parsing format.xls
    def duesseldorf_bode_surgical_mapping

      gene_modification_regex = /([\w\d]+)([\/+-]+)/

      {
          :name => "duesseldorf_bode",
          :data_row_offset => 3,


          :samples_mapping => {
              :samples_sheet_name => "Tabelle1",
              :add_specimens => true,
              :add_treatments => true,
              :add_samples => true,

              :probing_column => :"specimens.title",

              :"organisms.title" => mapping_entry("species"),

              :"strains.title" => mapping_entry("strain"),

              :"specimens.sex" => mapping_entry("sex"),
              :"specimens.title" => mapping_entry("Animal Nr."),
              :"specimens.lab_internal_number" => mapping_entry("Animal Nr."),
              :"specimens.age" => mapping_entry("Age (Weeks)"),
              :"specimens.age_unit" => mapping_entry("FIXED", proc {"week"}),
              :"specimens.comments" => mapping_entry("Specials", proc {|data| data == "-" ? "" : data }),
              :"specimens.genotype.title" => mapping_entry("Genotype", proc do |data|
                if data =~ gene_modification_regex
                  $1
                else
                  "none"
                end
              end),
              :"specimens.genotype.modification" => mapping_entry("Genotype", proc do |data|
                if data =~ gene_modification_regex
                  $2
                else
                  ""
                end
              end),

              :"treatment.treatment_protocol" => mapping_entry("FIXED", proc {""}),
              :"treatment.type" => mapping_entry("FIXED", proc {"surgical procedure"}),
              :"treatment.comments" => mapping_entry("Treatment"),
              :"treatment.substance" => mapping_entry("FIXED", proc {""}),
              :"treatment.concentration" => mapping_entry("FIXED", proc {""}),
              :"treatment.unit" => mapping_entry("FIXED", proc {""}),
              :"treatment.incubation_time" => mapping_entry("Incubation period (hours)"),
              :"treatment.incubation_time_unit" => mapping_entry("FIXED", proc {"hour"}),


              :"samples.comments" => mapping_entry("FIXED", proc {""}),
              :"samples.title" => mapping_entry("Animal Nr.", proc { |data| data + "_liver"}),
              :"samples.sample_type" => mapping_entry("FIXED", proc {"liver"}),
              :"samples.donation_date" => mapping_entry("Donation Date"),
              :"samples.organism_part"  => mapping_entry("FIXED", proc {"organ"}),

              :"tissue_and_cell_types.title" => mapping_entry("FIXED", proc {"Liver"}),

              :"sop.title" => mapping_entry("FIXED", proc {""}),
              :"institution.title" => mapping_entry("FIXED", proc {""})
          },


          :assay_mapping => {

              :assay_sheet_name => "Tabelle1",
              :parsing_direction => "vertical",
              :probing_column => :"creator.last_name",

              :"investigation.title" => mapping_entry("FIXED", proc { nil }),
              :"assay_type.title" => mapping_entry("FIXED", proc { nil }),
              :"study.title" => mapping_entry("FIXED", proc { nil }),

              :"creator.email" => mapping_entry("FIXED", proc { "" }),
              :"creator.last_name" => mapping_entry("experimentator", proc do |data|
                if data.split(/\s+/)
                  data.split(/\s+/).last
                end
              end),
              :"creator.first_name" => mapping_entry("experimentator", proc do |data|
                if data.split(/\s+/)
                  data.split(/\s+/).first
                end
              end)

          }

      }

    end

    def unknown_mapping
      nil
    end


    def empty_mapping # defines basic mapping to start with, not that useful for the real parsing business ;-)
      {
          :name => "",
          :data_row_offset => 1, # add this to the row of a header column to get to row with the first data element

          :samples_mapping => {     # TODO: which fields are really necessary?
              :samples_sheet_name => "",  # required

              :add_specimens => true,
              :add_treatments => true,
              :add_samples => true,

              :probing_column => :"specimens.title",  # this should map to a column that has no blank elements in between other elements

              :"organisms.title" => mapping_entry(""),

              :"strains.title" => mapping_entry(""),

              :"specimens.sex" => mapping_entry(""),
              :"specimens.title" => mapping_entry(""),
              :"specimens.lab_internal_number" => mapping_entry(""),
              :"specimens.age" => mapping_entry(""),
              :"specimens.age_unit" => mapping_entry(""),
              :"specimens.comments" => mapping_entry(""),
              :"specimens.genotype.title" => mapping_entry("FIXED", proc {"none"}),
              :"specimens.genotype.modification" => mapping_entry("FIXED", proc {""}),

              :"treatment.treatment_protocol" => mapping_entry(""),
              :"treatment.type" => mapping_entry("FIXED", proc {"concentration"}),
              :"treatment.comments" => mapping_entry("FIXED", proc {""}),
              :"treatment.substance" => mapping_entry(""),
              :"treatment.concentration" => mapping_entry(""),
              :"treatment.unit" => mapping_entry(""),
              :"treatment.incubation_time" => mapping_entry(""),
              :"treatment.incubation_time_unit" => mapping_entry(""),

              :"samples.comments" => mapping_entry(""),
              :"samples.title" => mapping_entry(""),
              :"samples.sample_type" => mapping_entry(""),
              :"samples.donation_date" => mapping_entry(""),
              :"samples.organism_part"  => mapping_entry(""),

              :"tissue_and_cell_types.title" => mapping_entry(""),

              :"sop.title" => mapping_entry(""),
              :"institution.title" => mapping_entry("")
          },


          :assay_mapping => {      # can be nil if no assays are mapped

              :assay_sheet_name => "",
              :parsing_direction => "vertical",
              :probing_column => :"creator.last_name",

              :"investigation.title" => mapping_entry(""),
              :"assay_type.title" => mapping_entry(""),
              :"study.title" => mapping_entry(""),

              :"creator.email" => mapping_entry(""),
              :"creator.last_name" => mapping_entry(""),
              :"creator.first_name" => mapping_entry("")

          }


      }

    end



  end

end