CIMAC-CIDC/cidc-schemas

View on GitHub
cidc_schemas/pipeline_configs/rna_level1_analysis_config.yaml.j2

Summary

Maintainability
Test Coverage
{#
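  participant_id_from_cimac: Callable[[str], str]
    returns the participant id for a given CIMAC id
  data_bucket: str = GOOGLE_ACL_DATA_BUCKET
    the name of the data bucket where the input data files are expected to be
    available to the pipeline runner
  samples: List[dict]
    RNAseq assay records to be entered for this config;
    each record provides `cimac_id` and `files`, where `files` holds either
    `r1`/`r2` fastq lists or a `bam` list, and every file entry has an `object_url`
    see schemas/assays/rna_assay-v0.json#properties/records/items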
  google_bucket_path: str
    the path to a google bucket folder to store the results in
  instance_name: str
    the unique part of the instance name to use
    NOTE: "rima-auto" will automatically be prepended to this string
#}
###############################################################################
# Automator configuration file
###############################################################################

# Give the new instance a unique name, e.g. the rima run name
# NOTE: "rima-auto" will automatically be prepended to this string
# The value is quoted so it is always parsed as a string, even when the
# supplied name is empty or looks like a number or boolean.
instance_name: "{{ instance_name }}"

# Define the number of cores for the rima instance
# Options: 32 (default), 64, 96
cores: 64

# Define the disk size to use in GB, default 250
# The name of the persistent disk will be: "rima_auto_{instance_name}_disk"
disk_size: 250

# Define the path to the google bucket folder to store the results of the run in
google_bucket_path: "{{ google_bucket_path }}"

# Define the specific rima commit string to use
rima_commit: "f5533ec"

# NOTE(review): presumably the GCP service account the instance runs as;
# confirm against the automator documentation
serviceAcct: "biofxvm@cidc-biofx.iam.gserviceaccount.com"

# Define the specific rima GCP image to use
# NOTE: If a specific GCP image is not set via config['image'], then
# the default behavior is to get the latest rima image
image: 'rima-ver3-1'

# Define the rima reference snapshot to use
rima_ref_snapshot: 'rima-v27-reference-v1-0'

# Define the samples: each sample should have a name, e.g. SAMPLE1
# and a Google bucket path to the input file,
# e.g. gs://mybucket/data/sample1.fastq.gz
# Valid inputs: fastq, fastq.gz, bam
# NOTE: for PAIRED-END fastq/fastq.gz, give both pairs to the sample:
# SAMPLE_1_PE:
#   - gs://mybucket/data/sample1_pair1.fastq
#   - gs://mybucket/data/sample1_pair2.fastq
#
# Each sample below is keyed by its CIMAC id. When the record has r1 files,
# all r1 files are listed followed by all r2 files; otherwise the bam files
# are listed. Paths are quoted so they are always parsed as plain strings.
samples:
{% for sample in samples %}
  {{ sample.cimac_id }}:
      # either bam or both r1/r2
  {% if sample.files.r1 %}
      # r1
    {% for fastq_gz_artifact in sample.files.r1 %}
      - "gs://{{ data_bucket }}/{{ fastq_gz_artifact.object_url }}"
    {% endfor %}
      # r2
    {% for fastq_gz_artifact in sample.files.r2 %}
      - "gs://{{ data_bucket }}/{{ fastq_gz_artifact.object_url }}"
    {% endfor %}
  {% else %}
      # bam
    {% for bam_artifact in sample.files.bam %}
      - "gs://{{ data_bucket }}/{{ bam_artifact.object_url }}"
    {% endfor %}
  {% endif %}
{% endfor %}

# Metasheet: one entry per sample, keyed by CIMAC id, giving the sample name
# (the CIMAC id itself) and the participant id returned by
# participant_id_from_cimac for that CIMAC id.
# Values are quoted so they are always parsed as strings.
metasheet:
{% for sample in samples %}
  {{ sample.cimac_id }}:
    SampleName: "{{ sample.cimac_id }}"
    PatName: "{{ participant_id_from_cimac(sample.cimac_id) }}"
{% endfor %}