bio-miga/miga
lib/miga/project/dataset.rb

# @package MiGA
# @license Artistic-2.0

##
# Helper module including specific functions to handle datasets.
module MiGA::Project::Dataset
  ##
  # Returns Array of MiGA::Dataset
  def datasets
    metadata[:datasets].map { |name| dataset(name) }
  end

  ##
  # Returns Array of String (without evaluating dataset objects)
  def dataset_names
    metadata[:datasets]
  end

  ##
  # Returns Hash of +{ String => true }+. Similar to +dataset_names+ but as
  # a Hash for efficiency
  def dataset_names_hash
    warn 'The Project#dataset_names_hash method will be deprecated soon'
    @dataset_names_hash ||= Hash[dataset_names.map { |i| [i, true] }]
  end

  ##
  # Returns Set of Strings. Similar to +dataset_names+ but as a Set for
  # efficiency
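  #
  # A minimal usage sketch for fast membership checks (the +project+
  # handle and the dataset name are hypothetical):
  #   project.dataset_names_set.include?('my_genome') # => true or false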
  def dataset_names_set
    @dataset_names_set ||= Set.new(dataset_names)
  end

  ##
  # Cache for the special set of datasets which are both reference and
  # active, returned as an Array. Use carefully, as it doesn't get
  # recalculated upon dataset (in)activation once loaded. To force
  # recalculating, use +dataset_ref_active!+
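  #
  # A hypothetical sketch of the caching behavior (+project+ is an assumed
  # handle, and MiGA::Dataset#inactivate! is assumed to be available):
  #   project.dataset_ref_active         # computed and cached
  #   project.dataset('g1').inactivate!  # cache is now stale
  #   project.dataset_ref_active!        # force recalculation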
  def dataset_ref_active
    @dataset_ref_active ||= dataset_ref_active!
  end

  ##
  # Force recalculation of +dataset_ref_active+ and returns the Array
  # of MiGA::Dataset objects
  def dataset_ref_active!
    @dataset_ref_active = datasets.select(&:ref?).select(&:active?)
  end

  ##
  # Returns MiGA::Dataset
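  # or nil if no dataset with that name exists in the project. A minimal
  # sketch (+project+ and the dataset name are hypothetical):
  #   d = project.dataset('my_genome')
  #   puts d.name unless d.nil?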
  def dataset(name)
    name = name.to_s.miga_name
    return nil unless MiGA::Dataset.exist?(self, name)

    @datasets ||= {}
    @datasets[name] ||= MiGA::Dataset.new(self, name)
    @datasets[name]
  end

  ##
  # Iterate through datasets (MiGA::Dataset)
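  # with a block, or return an Enumerator when no block is given. A minimal
  # sketch (+project+ is a hypothetical MiGA::Project handle):
  #   project.each_dataset { |d| puts d.name }
  #   project.each_dataset.select(&:ref?)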
  def each_dataset(&blk)
    if block_given?
      metadata[:datasets].each { |name| blk.call(dataset(name)) }
    else
      to_enum(:each_dataset)
    end
  end

  ##
  # Add dataset identified by +name+ and return MiGA::Dataset.
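  # The dataset is registered in the project metadata and the project is
  # saved. A minimal sketch (+project+ and the name are hypothetical):
  #   d = project.add_dataset('new_genome')
  #   d.class # => MiGA::Dataset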
  def add_dataset(name)
    unless metadata[:datasets].include? name
      d = MiGA::Dataset.new(self, name)
      @metadata[:datasets] << name
      @dataset_names_hash[name] = true if @dataset_names_hash
      @dataset_names_set << name if @dataset_names_set
      @dataset_ref_active = nil
      save
      if d.ref? && d.active?
        recalculate_tasks("Reference dataset added: #{d.name}")
      end
      pull_hook(:on_add_dataset, name)
    end
    dataset(name)
  end

  ##
  # Unlink dataset identified by +name+ and return MiGA::Dataset.
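  # Only the registration is removed from the project metadata; this method
  # does not delete any files. A minimal sketch (+project+ and the name are
  # hypothetical):
  #   d = project.unlink_dataset('old_genome') # => MiGA::Dataset or nil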
  def unlink_dataset(name)
    d = dataset(name)
    return nil if d.nil?

    @dataset_names_hash = nil
    @dataset_names_set  = nil
    @dataset_ref_active = nil
    self.metadata[:datasets].delete(name)
    save
    if d.ref? && d.active?
      recalculate_tasks("Reference dataset unlinked: #{d.name}")
    end
    pull_hook(:on_unlink_dataset, name)
    d
  end

  ##
  # Import the dataset +ds+, a MiGA::Dataset, using +method+, which is any
  # transfer mechanism supported by File.generic_transfer.
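  #
  # A minimal sketch (+project+ and +other_ds+ are hypothetical; +:symlink+
  # and +:copy+ are assumed to be supported alongside the default
  # +:hardlink+):
  #   project.import_dataset(other_ds, :symlink)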
  def import_dataset(ds, method = :hardlink)
    raise "Impossible to import dataset, it already exists: #{ds.name}." if
      MiGA::Dataset.exist?(self, ds.name)

    # Import dataset results
    ds.each_result do |task, result|
      # import result files
      result.each_file do |file|
        File.generic_transfer(
          File.join(result.dir, file),
          File.join(path, 'data', MiGA::Dataset.RESULT_DIRS[task], file),
          method
        )
      end
      # import result metadata
      %w(json start done).each do |suffix|
        if File.exist? File.join(result.dir, "#{ds.name}.#{suffix}")
          File.generic_transfer(
            File.join(result.dir, "#{ds.name}.#{suffix}"),
            File.join(
              path, 'data', MiGA::Dataset.RESULT_DIRS[task],
              "#{ds.name}.#{suffix}"
            ),
            method
          )
        end
      end
    end
    # Import dataset metadata
    File.generic_transfer(
      File.join(ds.project.path, 'metadata', "#{ds.name}.json"),
      File.join(self.path, 'metadata', "#{ds.name}.json"),
      method
    )
    # Save dataset
    self.add_dataset(ds.name)
  end

  ##
  # Find all datasets that have (potential) result files but are not yet
  # registered in the project.
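  # Returns an Array of String (dataset names). A minimal sketch that
  # registers every candidate (+project+ is hypothetical):
  #   project.unregistered_datasets.each { |n| project.add_dataset(n) }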
  def unregistered_datasets
    datasets = []
    MiGA::Dataset.RESULT_DIRS.values.each do |dir|
      dir_p = "#{path}/data/#{dir}"
      next unless Dir.exist? dir_p

      Dir.entries(dir_p).each do |file|
        next unless
          file =~ %r{
            \.(fa(a|sta|stqc?)?|fna|solexaqa|gff[23]?|done|ess)(\.gz)?$
          }x

        m = /([^.]+)/.match(file)
        datasets << m[1] unless m.nil? || m[1] == 'miga-project'
      end
    end
    datasets.uniq - metadata[:datasets]
  end

  ##
  # Are all the datasets in the project preprocessed? Save intermediate results
  # if +save+ (until the first incomplete dataset is reached).
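  # Only reference, active datasets are considered. A minimal sketch
  # (+project+ is a hypothetical handle):
  #   puts 'All reference datasets are ready' if project.done_preprocessing?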
  def done_preprocessing?(save = false)
    !each_dataset.any? do |d|
      d.ref? && d.active? && !d.done_preprocessing?(save)
    end
  end

  ##
  # Returns a two-dimensional matrix (Array of Array) where the first index
  # corresponds to the dataset, the second index corresponds to the dataset
  # task, and the value corresponds to:
  # - 0: Before execution.
  # - 1: Done (or not required).
  # - 2: To do.
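  #
  # A minimal sketch counting pending tasks per dataset (+project+ is
  # hypothetical; 2 marks a task still to do):
  #   project.profile_datasets_advance.map { |row| row.count(2) }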
  def profile_datasets_advance
    advance = []
    each_dataset_profile_advance { |adv| advance << adv }
    advance
  end

  ##
  # Call +blk+ passing the result of MiGA::Dataset#profile_advance for each
  # registered dataset.
  def each_dataset_profile_advance(&blk)
    each_dataset { |ds| blk.call(ds.profile_advance) }
  end
end