# lib/miga/project/dataset.rb
# @package MiGA
# @license Artistic-2.0
##
# Helper module including specific functions to handle datasets.
module MiGA::Project::Dataset
  ##
  # Returns Array of MiGA::Dataset with all the datasets registered in the
  # project (evaluates and caches each dataset object)
  def datasets
    metadata[:datasets].map { |name| dataset(name) }
  end

  ##
  # Returns Array of String (without evaluating dataset objects)
  def dataset_names
    metadata[:datasets]
  end

  ##
  # Returns Hash of +{ String => true }+. Similar to +dataset_names+ but as
  # Hash for efficiency
  def dataset_names_hash
    warn 'The Project#dataset_names_hash method will be deprecated soon'
    @dataset_names_hash ||= dataset_names.map { |i| [i, true] }.to_h
  end

  ##
  # Returns Set of Strings. Similar to +dataset_names+ but as Set for
  # efficiency
  def dataset_names_set
    @dataset_names_set ||= Set.new(dataset_names)
  end

  ##
  # Cache for the special set of datasets which are both reference and
  # active, returned as an Array. Use carefully, as it doesn't get
  # recalculated upon dataset (in)activation once loaded. To force
  # recalculating, use +dataset_ref_active!+
  def dataset_ref_active
    @dataset_ref_active ||= dataset_ref_active!
  end

  ##
  # Force recalculation of +dataset_ref_active+ and returns the Array
  # of MiGA::Dataset objects
  def dataset_ref_active!
    # Single pass instead of two chained selects
    @dataset_ref_active = datasets.select { |d| d.ref? && d.active? }
  end

  ##
  # Returns MiGA::Dataset for the dataset +name+ (any object responding to
  # +to_s+), or +nil+ if the dataset doesn't exist in the project. Dataset
  # objects are cached after the first instantiation
  def dataset(name)
    name = name.to_s.miga_name
    return nil unless MiGA::Dataset.exist?(self, name)

    @datasets ||= {}
    # ||= returns the (possibly memoized) value, no second lookup needed
    @datasets[name] ||= MiGA::Dataset.new(self, name)
  end

  ##
  # Iterate through datasets (MiGA::Dataset), or return an Enumerator if no
  # block is given
  def each_dataset(&blk)
    return to_enum(:each_dataset) unless block_given?

    metadata[:datasets].each { |name| blk.call(dataset(name)) }
  end

  ##
  # Add dataset identified by +name+ (String) and return MiGA::Dataset. If
  # the dataset is already registered, it's simply returned without re-adding
  def add_dataset(name)
    unless metadata[:datasets].include?(name)
      d = MiGA::Dataset.new(self, name)
      @metadata[:datasets] << name
      # Keep the cheap name caches in sync instead of invalidating them;
      # the ref/active cache must be recalculated lazily
      @dataset_names_hash[name] = true if @dataset_names_hash
      @dataset_names_set << name if @dataset_names_set
      @dataset_ref_active = nil
      save
      if d.ref? && d.active?
        recalculate_tasks("Reference dataset added: #{d.name}")
      end
      pull_hook(:on_add_dataset, name)
    end
    dataset(name)
  end

  ##
  # Unlink dataset identified by +name+ and return MiGA::Dataset, or +nil+
  # if the dataset doesn't exist. Unlinking removes the dataset from the
  # project registry, but it doesn't remove the dataset files
  def unlink_dataset(name)
    d = dataset(name)
    return nil if d.nil?

    # All name caches become stale after removal, so drop them entirely
    @dataset_names_hash = nil
    @dataset_names_set = nil
    @dataset_ref_active = nil
    metadata[:datasets].delete(name)
    save
    if d.ref? && d.active?
      recalculate_tasks("Reference dataset unlinked: #{d.name}")
    end
    pull_hook(:on_unlink_dataset, name)
    d
  end

  ##
  # Import the dataset +ds+, a MiGA::Dataset, using +method+ which is any
  # method supported by File#generic_transfer (e.g., +:hardlink+).
  # Raises if a dataset with the same name already exists in this project
  def import_dataset(ds, method = :hardlink)
    raise "Impossible to import dataset, it already exists: #{ds.name}." if
      MiGA::Dataset.exist?(self, ds.name)

    # Import dataset results
    ds.each_result do |task, result|
      # Target directory for this task's results (loop-invariant, hoisted)
      target_dir = File.join(path, 'data', MiGA::Dataset.RESULT_DIRS[task])
      # Import result files
      result.each_file do |file|
        File.generic_transfer(
          File.join(result.dir, file), File.join(target_dir, file), method
        )
      end
      # Import result metadata: description (json) and status markers
      # (start/done)
      %w(json start done).each do |suffix|
        basename = "#{ds.name}.#{suffix}"
        source = File.join(result.dir, basename)
        if File.exist?(source)
          File.generic_transfer(
            source, File.join(target_dir, basename), method
          )
        end
      end
    end
    # Import dataset metadata
    File.generic_transfer(
      File.join(ds.project.path, 'metadata', "#{ds.name}.json"),
      File.join(path, 'metadata', "#{ds.name}.json"),
      method
    )
    # Register the dataset in this project
    add_dataset(ds.name)
  end

  ##
  # Find all datasets with (potential) result files but are yet unregistered.
  # Returns an Array of String (candidate dataset names)
  def unregistered_datasets
    datasets = []
    MiGA::Dataset.RESULT_DIRS.each_value do |dir|
      dir_p = "#{path}/data/#{dir}"
      next unless Dir.exist? dir_p

      Dir.entries(dir_p).each do |file|
        # Only consider files with recognized result extensions
        next unless
          file =~ %r{
            \.(fa(a|sta|stqc?)?|fna|solexaqa|gff[23]?|done|ess)(\.gz)?$
          }x

        # The dataset name is everything up to the first dot; ignore the
        # reserved project-wide result name
        m = /([^.]+)/.match(file)
        datasets << m[1] unless m.nil? || m[1] == "miga-project"
      end
    end
    datasets.uniq - metadata[:datasets]
  end

  ##
  # Are all the datasets in the project preprocessed? Save intermediate
  # results if +save+ (until the first incomplete dataset is reached).
  # Only reference, active datasets are considered
  def done_preprocessing?(save = false)
    !each_dataset.any? do |d|
      d.ref? && d.active? && !d.done_preprocessing?(save)
    end
  end

  ##
  # Returns a two-dimensional matrix (Array of Array) where the first index
  # corresponds to the dataset, the second index corresponds to the dataset
  # task, and the value corresponds to:
  # - 0: Before execution.
  # - 1: Done (or not required).
  # - 2: To do.
  def profile_datasets_advance
    advance = []
    each_dataset_profile_advance { |adv| advance << adv }
    advance
  end

  ##
  # Call +blk+ passing the result of MiGA::Dataset#profile_advance for each
  # registered dataset.
  def each_dataset_profile_advance(&blk)
    each_dataset { |ds| blk.call(ds.profile_advance) }
  end
end