bio-miga/miga

View on GitHub
lib/miga/dataset/base.rb

Summary

Maintainability
A
20 mins
Test Coverage
A
96%
# frozen_string_literal: true

require 'miga/common/with_option'

class MiGA::Dataset < MiGA::MiGA
  include MiGA::Common::WithOption

  # Class-level readers. Each method below simply returns the class variable
  # carrying the same name. None of those variables is assigned in this class
  # body; the assignments visible in this file live in MiGA::Dataset::Base.
  # NOTE(review): presumably MiGA::Dataset includes MiGA::Dataset::Base
  # elsewhere so that the variables resolve here -- confirm.

  ##
  # Directories containing the results from dataset-specific tasks
  def self.RESULT_DIRS
    @@RESULT_DIRS
  end

  ##
  # Supported dataset types
  def self.KNOWN_TYPES
    @@KNOWN_TYPES
  end

  ##
  # Array of tasks (Symbols) to be executed before project-wide tasks
  def self.PREPROCESSING_TASKS
    @@PREPROCESSING_TASKS
  end

  ##
  # Tasks to be excluded from query datasets
  def self.EXCLUDE_NOREF_TASKS
    @@EXCLUDE_NOREF_TASKS
  end

  ##
  # Tasks to be excluded from datasets without markers
  def self.EXCLUDE_NOMARKER_TASKS
    @@EXCLUDE_NOMARKER_TASKS
  end

  ##
  # Tasks to be executed only in single-organism datasets. These tasks are
  # ignored for multi-organism datasets or for unknown types
  def self.ONLY_NONMULTI_TASKS
    @@ONLY_NONMULTI_TASKS
  end

  ##
  # Tasks to be executed only in multi-organism datasets. These tasks are
  # ignored for single-organism datasets or for unknown types
  def self.ONLY_MULTI_TASKS
    @@ONLY_MULTI_TASKS
  end

  ##
  # Options supported by datasets
  def self.OPTIONS
    @@OPTIONS
  end
end

module MiGA::Dataset::Base
  ##
  # Directory holding the results of each dataset-specific task, keyed by the
  # task name (Symbol)
  @@RESULT_DIRS = {
    # -- Preprocessing
    raw_reads: '01.raw_reads',
    trimmed_reads: '02.trimmed_reads',
    read_quality: '03.read_quality',
    trimmed_fasta: '04.trimmed_fasta',
    assembly: '05.assembly',
    cds: '06.cds',

    # -- Annotation
    essential_genes: '07.annotation/01.function/01.essential',
    mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
    mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',

    # -- Distances (for single-species datasets)
    taxonomy: '09.distances/05.taxonomy',
    distances: '09.distances',

    # -- Post-QC
    ssu: '07.annotation/01.function/02.ssu',
    stats: '90.stats'
  }

  ##
  # Supported dataset types, keyed by type name (Symbol). Every entry carries
  # a human-readable +description+, the +multi+ and +markers+ flags, and the
  # list of +project_types+ associated with the dataset type
  @@KNOWN_TYPES = {
    genome: {
      description: 'The genome from an isolate',
      multi: false,
      markers: true,
      project_types: %i[mixed genomes clade]
    },
    scgenome: {
      description: 'A Single-cell Amplified Genome (SAG)',
      multi: false,
      markers: true,
      project_types: %i[mixed genomes clade]
    },
    popgenome: {
      description: 'A Metagenome-Assembled Genome (MAG)',
      multi: false,
      markers: true,
      project_types: %i[mixed genomes clade]
    },
    metagenome: {
      description: 'A metagenome (excluding viromes)',
      multi: true,
      markers: true,
      project_types: %i[mixed metagenomes]
    },
    virome: {
      description: 'A viral metagenome',
      multi: true,
      # Not expected for viromes, but can be useful for contamination
      markers: true,
      project_types: %i[mixed metagenomes]
    },
    plasmid: {
      description: 'An individual plasmid',
      multi: false,
      markers: false,
      project_types: %i[mixed plasmids]
    }
  }

  ##
  # Array of tasks (Symbols) to be executed before project-wide tasks
  @@PREPROCESSING_TASKS = %i[
    raw_reads
    trimmed_reads
    read_quality
    trimmed_fasta
    assembly
    cds
    essential_genes
    mytaxa
    mytaxa_scan
    taxonomy
    distances
    ssu
    stats
  ]

  ##
  # Tasks to be excluded from query datasets. The +_H+ companion maps each of
  # these tasks to +true+
  @@EXCLUDE_NOREF_TASKS = %i[mytaxa_scan taxonomy]
  @@_EXCLUDE_NOREF_TASKS_H =
    @@EXCLUDE_NOREF_TASKS.each_with_object({}) { |t, h| h[t] = true }

  ##
  # Tasks to be excluded from datasets without markers. The +_H+ companion
  # maps each of these tasks to +true+
  @@EXCLUDE_NOMARKER_TASKS = %i[essential_genes ssu]
  @@_EXCLUDE_NOMARKER_TASKS_H =
    @@EXCLUDE_NOMARKER_TASKS.each_with_object({}) { |t, h| h[t] = true }

  ##
  # Tasks to be executed only in single-organism datasets. These tasks are
  # ignored for multi-organism datasets or for unknown types. The +_H+
  # companion maps each of these tasks to +true+
  @@ONLY_NONMULTI_TASKS = %i[mytaxa_scan taxonomy distances]
  @@_ONLY_NONMULTI_TASKS_H =
    @@ONLY_NONMULTI_TASKS.each_with_object({}) { |t, h| h[t] = true }

  ##
  # Tasks to be executed only in multi-organism datasets. These tasks are
  # ignored for single-organism datasets or for unknown types. The +_H+
  # companion maps each of these tasks to +true+
  @@ONLY_MULTI_TASKS = %i[mytaxa]
  @@_ONLY_MULTI_TASKS_H =
    @@ONLY_MULTI_TASKS.each_with_object({}) { |t, h| h[t] = true }

  ##
  # Options supported by datasets, keyed by option name. Each entry holds a
  # description (+desc+), the expected +type+ and, optionally, a +default+
  @@OPTIONS = {
    db_project: {
      desc: 'Project to use as database',
      type: String
    },
    dist_req: {
      desc: 'Run distances against these datasets',
      type: Array,
      default: []
    }
  }
end