cidc_api/models/files/details.py from CIMAC-CIDC/cidc-api-gae

cidc_api/models/files/details.py
Summary

Maintainability

0 mins
Test Coverage

100%
Issues
Coverage
from typing import NamedTuple
from typing_extensions import Literal


# The closed set of purpose tags a file entry may carry.
FilePurpose = Literal[
    "source",
    "analysis",
    "clinical",
    "miscellaneous",
]


# Immutable record describing one kind of file: its purpose tag plus a
# short and a long human-readable description. Fields are positional, in
# the order listed below, so entries can be built as
# FileDetails(purpose, short, long).
FileDetails = NamedTuple(
    "FileDetails",
    [
        ("file_purpose", FilePurpose),
        ("short_description", str),
        ("long_description", str),
    ],
)


details_dict = {
    # ATACseq Assay
    "/atacseq/r1_L.fastq.gz": FileDetails(
        "source",
        "fastq file of raw Read 1, compressed",
        "The gzipped, FASTQ file that represents the 5' read of paired sequencing. Generated by the sequencing machine defined by the RNA assay this file is associated with.",
    ),
    "/atacseq/r2_L.fastq.gz": FileDetails(
        "source",
        "fastq file of raw Read 2, compressed",
        "The gzipped, FASTQ file that represents the 3' read of paired sequencing. Generated by the sequencing machine defined by the RNA assay this file is associated with.",
    ),
    # ATACseq Analysis
    "/atacseq/analysis/peaks/sorted_peaks.bed": FileDetails(
        "miscellaneous",
        "regular peaks called by MACS2, sorted",
        "The BED file that represents the regular peaks as called by MACS2, sorted. 5th: integer score for display. It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff 7th: fold-change at peak summit 8th: -log10pvalue at peak summit 9th: -log10qvalue at peak summit 10th: relative summit position to peak start. https://github.com/macs3-project/MACS",
    ),
    "/atacseq/analysis/peaks/sorted_summits.bed": FileDetails(
        "miscellaneous",
        "peak summits called by MACS2, sorted",
        "The BED file that represents the MACS2-called location with the highest fragment pileup aka the summit, sorted.",
    ),
    "/atacseq/analysis/peaks/sorted_peaks.narrowPeak": FileDetails(
        "miscellaneous",
        "narrowPeak called by MACS2 in BED6+4 format",
        "The narrowPeak file that represents the MACS2-called peak locations, summits, p-, and q-values in BED6+4 format",
    ),
    "/atacseq/analysis/peaks/treat_pileup.bw": FileDetails(
        "miscellaneous",
        "bigwig file, for visualization in IGV",
        "The bigwig file that represents the RPKM (reads per kilobase per million) normalized pile up, for visualization in IGV.",
    ),
    "/atacseq/analysis/aligned_sorted.bam": FileDetails(
        "source",
        "sorted bam file, aligned with bwa-mem ",
        "The BAM file that represents the aligned and sorted reads. Aligned with BWA-MEM.",
    ),
    # WES
    "/wes/r1_L.fastq.gz": FileDetails(
        "source",
        "fastq file of raw Read 1, compressed",
        "The gzipped, FASTQ file that represents the 5' read of paired sequencing. Generated by the sequencing machine defined by the RNA assay this file is associated with.",
    ),
    "/wes/r2_L.fastq.gz": FileDetails(
        "source",
        "fastq file of raw Read 2, compressed",
        "The gzipped, FASTQ file that represents the 3' read of paired sequencing. Generated by the sequencing machine defined by the RNA assay this file is associated with.",
    ),
    "/wes/reads_.bam": FileDetails(
        "source",
        "bam file containing both pairs of sequencing libraries",
        "The paired raw reads in the standard BAM binary format, generated from the raw R1 and R2 FASTQ files.",
    ),
    ## see: https://github.com/CIMAC-CIDC/cidc-ngs-pipeline-api/blob/master/cidc_ngs_pipeline_api/wes/wes_output_API.json
    "/wes/analysis/error.yaml": FileDetails(
        "analysis",
        "yaml file that specifies error codes for files",
        "Explanation of all files which are expected to be empty due to a failed/missing module.",
    ),
    "/wes/analysis/copynumber_segments.txt": FileDetails(
        "analysis",
        "copynumber: Sequenza CNV segments file",
        "Copy number variation segments file called by the Sequenza software package.  The column descriptions for the segment file could be found here (https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#plots-and-results)",
    ),
    "/wes/analysis/copynumber_genome_view.pdf": FileDetails(
        "analysis",
        "copynumber: Sequenza genome-wide plot of depth.ratio and B-allele frequency.",
        "Genome-wide plot (generated by Sequenza) showing depth.ratio and B-allele frequency.",
    ),
    "/wes/analysis/copynumber_chromosome_view.pdf": FileDetails(
        "analysis",
        "copynumber: Sequenza plot of depth.ratio and B-allele frequency chromosome by chromosome.",
        "Chromosome by chromosome plot (generated by Sequenza) showing depth.ratio and B-allele frequency.",
    ),
    "/wes/analysis/copynumber_sequenza_gainloss.bed": FileDetails(
        "analysis",
        "copynumber: Sequenza CNV segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
        "Filtered Sequenza segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
    ),
    "/wes/analysis/copynumber_sequenza_final.txt.gz": FileDetails(
        "analysis",
        "copynumber: Sequenza post-processed seqz file used for input to Sequenza CNV caller",
        "Sequenza seqz file generated by the bam2seqz software using a GC wiggle track with a window size of 50 (-w 50).",
    ),
    "/wes/analysis/alternative_solutions.txt": FileDetails(
        "analysis",
        "purity: Sequenza Cellularity and Ploidy estimate file",
        "Cellularity and ploidy estimates of the tumor sample using the Sequenza software package.  The columns of the file are follows: Cellularity, Ploidy, and SLPP (Scaled Log Posterior Probability).",
    ),
    "/wes/analysis/cp_contours.pdf": FileDetails(
        "analysis",
        "purity: Sequenza plot of likelihood densities for all cellularity/ploidy solutions.",
        "Sequenza generated plot showing the likelihood densities for each cellularity/ploidy solution (https://cran.r-project.org/web/packages/sequenza/vignettes/sequenza.html#plots-and-results).",
    ),
    "/wes/analysis/clonality_input.tsv": FileDetails(
        "analysis",
        "tumor clonality: PyClone-VI input file generated by sequenza library (https://cran.r-project.org/web/packages/sequenza/index.html)",
        "Input file generated for PyClone-VI analysis.  Sequenza was used to generate the expected file format (https://github.com/Roth-Lab/pyclone-vi#input-format).",
    ),
    "/wes/analysis/clonality_results.tsv": FileDetails(
        "analysis",
        "tumor clonality: PyClone-VI tumor clonality results file",
        "Tumor clone/cluster prevalence estimations generated by the PyClone-VI software package.  The format of the results file is described here (https://github.com/Roth-Lab/pyclone-vi#output-format).",
    ),
    "/wes/analysis/clonality_summary.tsv": FileDetails(
        "analysis",
        "tumor clonality: PyClone-VI tumor clonality results summary file",
        "Summary of Pyclone-VI results file condensed to only show the cluster_id, cellular_prevalence, and cellular_prevalence_std columns.",
    ),
    "/wes/analysis/copynumber_cnv_segments.cns": FileDetails(
        "analysis",
        "copynumber: CNVkit segments file",
        "CNVkit's Segmented log2 ratios file. The 'cn' column representes the total copy number of the segment.  The other columns of the results file are described here (https://cnvkit.readthedocs.io/en/stable/fileformats.html#segmented-log2-ratios-cns)",
    ),
    "/wes/analysis/copynumber_cnv_segments_enhanced.cns": FileDetails(
        "analysis",
        "copynumber: Enhanced CNVkit segments file with BAF and Major/minor allele information",
        "The enhanced CNVkit segments file incoporates somatic sNP and tumor purity information (called by the pipeline) to incorporate B-allele frequencies, major and minor allele (cn1 and cn2 respectively), and correct for tumor sample purity level.",
    ),
    "/wes/analysis/copynumber_cnv_scatterplot.png": FileDetails(
        "analysis",
        "copynumber: scatter plot of log2 coverage and segmentation call information",
        "Genome-wide scatter plot of log2 coverage ratios and called CNV segments",
    ),
    "/wes/analysis/copynumber_cnvkit_gainloss.bed": FileDetails(
        "analysis",
        "copynumber: CNVkit segments file filtered with hard cut-offs to call regions of GAIN/LOSS",
        "Filtered CNVkit segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
    ),
    "/wes/analysis/copynumber_consensus.bed": FileDetails(
        "analysis",
        "copynumber: Consensus CNV segments file",
        "Consensus CNV regions that are called by at least 2 of the 3 callers (CNVkit, Sequenza, or FACETS).  CNV Callers must agree on both the region (intersection of overlapped regions) and the call (GAIN or LOSS).",
    ),
    "/wes/analysis/copynumber_consensus_gain.bed": FileDetails(
        "analysis",
        "copynumber: Consensus CNV segments file of only GAIN regions",
        "GAIN only CNV regions derived from the consensus CNV file.  Regions are also merged if they have an overlap of at least 1bp.",
    ),
    "/wes/analysis/copynumber_consensus_loss.bed": FileDetails(
        "analysis",
        "copynumber: Consensus CNV segments file of only LOSS regions",
        "LOSS only CNV regions derived from the consensus CNV file.  Regions are also merged if they have an overlap of at least 1bp.",
    ),
    "/wes/analysis/copynumber_facets.cncf": FileDetails(
        "analysis",
        "copynumber: FACETS CNV segments file",
        "Copy number variation segments file called by the FACETS software (https://github.com/mskcc/facets).",
    ),
    "/wes/analysis/copynumber_facets_gainloss.bed": FileDetails(
        "analysis",
        "copynumber: FACETS CNV segments file filtered with hard-cutoff to call regions of GAIN/LOSS",
        "Filtered FACETS segments file after applying a hard cut-off to call regions of GAIN (total copy number >= 3) and regions of LOSS (total copy number <= 1.5).",
    ),
    "/wes/analysis/tnscope_output_twist.vcf": FileDetails(
        "analysis",
        "somatic variants: vcf file of somatic variants in TWIST targed capture region",
        "VCF file of variants that fall within the TWIST excome capture regions.  bcftools is used to filter reads in output.vcf.gz that intersect with the TWIST capture regions.",
    ),
    "/wes/analysis/tnscope_output_twist.maf": FileDetails(
        "analysis",
        "somatic variants: maf file of somatic variants in TWIST targed capture region",
        "MAF file of variants that fall within the TWIST excome capture regions generated using vcf2maf tool (https://github.com/mskcc/vcf2maf). VEP was used to annotate twist.vcf file, which was then used as input to vcf2maf. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://portal.cimac-network.org/pipelines/wes).",
    ),
    "/wes/analysis/tnscope_output_twist_filtered.vcf": FileDetails(
        "analysis",
        "somatic variants: vcf file of somatic variants in TWIST targed capture region filtered by PASS column",
        "VCF file of variants that fall within the TWIST excome capture regions filtered to remove vairants where the PASS column contained one of the following- germline-risk, low_t_alt_frac, t_lod_fstar, or triallelic_site",
    ),
    "/wes/analysis/tnscope_output_twist_filtered.maf": FileDetails(
        "analysis",
        "somatic variants: maf file of somatic variants in TWIST targed capture region filtered by PASS column",
        "MAF file generated by converting twist.filtered.vcf to maf using VEP to annotate variants and vcf2maf to do the conversion. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://portal.cimac-network.org/pipelines/wes).",
    ),
    "/wes/analysis/tcellextrect.txt": FileDetails(
        "analysis",
        "tcell: TCell fraction estimates generated by TcellExTRECT",
        "TCell fraction estimates generated by the TcellExTRECT software (https://github.com/McGranahanLab/TcellExTRECT)",
    ),
    "/wes/analysis/tumor/hla_final_result.txt": FileDetails(
        "analysis",
        "hla: MHC Class I and II results (using HLA-HD)",
        "Predicted MHC Class II and II results using the HLA-HD software (https://www.genome.med.kyoto-u.ac.jp/HLA-HD/).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the HLA-HD prediction algorithm.",
    ),
    "/wes/analysis/normal/hla_final_result.txt": FileDetails(
        "analysis",
        "hla: MHC Class I and II results (using HLA-HD)",
        "Predicted MHC Class II and II results using the HLA-HD software (https://www.genome.med.kyoto-u.ac.jp/HLA-HD/).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the HLA-HD prediction algorithm.",
    ),
    "/wes/analysis/vcf_compare.txt": FileDetails(
        "analysis",
        "plain-text file of overlaps of somatic and germline variants, from VCFtool's vcf-compare",
        "VCFtool's vcf-compare (http://vcftools.sourceforge.net/perl_module.html#vcf-compare) is used to compare somatic and germline variants.  The file shows the number of common variants, somatic only, and germline only variants.",
    ),
    "/wes/analysis/optimal_purity_value.txt": FileDetails(
        "analysis",
        "tumor purity: tumor purity estimates using the FACETS software package",
        "Tumor purity estimates using the FACETS software (https://github.com/mskcc/facets).",
    ),
    "/wes/analysis/clonality_pyclone.tsv": FileDetails(
        "analysis", "tab-separated input file for PyClone generated by Sequenza", ""
    ),
    "/wes/analysis/clonality_table.tsv": FileDetails("analysis", "", ""),
    "/wes/analysis/copynumber_cnvcalls.txt": FileDetails(
        "analysis", "plain-text copynumber analysis results, from Sentieon CNV", ""
    ),
    "/wes/analysis/copynumber_cnvcalls.txt.tn.tsv": FileDetails(
        "analysis",
        "tab-separated segmented copynumber variations, from Sentieon CNV",
        "",
    ),
    "/wes/analysis/vcf_gz_tnscope_output.vcf.gz": FileDetails(
        "analysis",
        "somatic variants: vcf file of somatic variants",
        "VCF file of somatic variants using one of the following the Sentieon somatic callers {tnscope (default), tnhaplotyper2, tnsnv}.\n\nTNscope algorithm- https://support.sentieon.com/manual/usages/general/#tnscope-algorithm\nTNhaplotyper2- https://support.sentieon.com/manual/usages/general/#tnhaplotyper2-algorithm\nTNsnv - https://support.sentieon.com/manual/usages/general/#tnsnv-algorithm",
    ),
    "/wes/analysis/msisensor.txt": FileDetails("analysis", "", ""),
    "/wes/analysis/vcf_gz_tnscope_filter.vcf.gz": FileDetails(
        "analysis",
        "",
        "",
    ),
    "/wes/analysis/maf_tnscope_output.maf": FileDetails(
        "analysis",
        "somatic variants: maf file of somatic variants",
        "MAF file of VEP annotated variants using vcf2maf tool (https://github.com/mskcc/vcf2maf).  The vep annotated vcf of the somatic variants (output.vcf.gz) file was converted to maf using vcf2maf.",
    ),
    "/wes/analysis/vcf_tnscope_filter_neoantigen.vcf": FileDetails(
        "analysis",
        "rna: Shared RNA and WES variants that is used for neoantigen prediction when RNA-seq data is provided with the WES run",
        "Variants file representing the common variants between RNA (haplotyper.rna.vcf.gz) and WES data (output.twist.neoantigen.vep.vcf).",
    ),
    "/wes/analysis/combined_filtered.tsv": FileDetails(
        "analysis",
        "neoantigen: list of predicted neoantigens",
        "The combined MHC class I and II predicted neoantigens using the pVACseq software.  The column definitions are given here (ref: https://pvactools.readthedocs.io/en/latest/pvacseq/output_files.html)",
    ),
    "/wes/analysis/maf_tnscope_filter.maf": FileDetails("analysis", "", ""),
    "/wes/analysis/tnscope_exons.vcf.gz": FileDetails("analysis", "", ""),
    "/wes/analysis/HLA_results.tsv": FileDetails("analysis", "", ""),
    "/wes/analysis/tumor_germline_overlap.tsv": FileDetails("analysis", "", ""),
    "/wes/analysis/tumor/recalibrated.bam": FileDetails(
        "analysis",
        "alignment: Base Qualtiy Score Recalibration (BQSR) bam file",
        "The Sentieon QualCal (https://support.sentieon.com/manual/usages/general/#qualcal-algorithm) is used to perform BSQR and remove any technical artifacts in the base quality scores.",
    ),
    "/wes/analysis/tumor/recalibrated.bam.bai": FileDetails(
        "analysis",
        "alignment: index file for Base Qualtiy Score Recalibration (BQSR) bam file",
        "Index file for the BQSR bam file",
    ),
    "/wes/analysis/tumor/sorted.dedup.bam": FileDetails(
        "analysis",
        "alignment: bam file with deduplicated reads",
        "Aligned reads were sorted and marked duplicates were removed using the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
    ),
    "/wes/analysis/tumor/sorted.dedup.bam.bai": FileDetails(
        "analysis",
        "alignment: index file for deduplicated bam",
        "Bam index file for deduplicated bam file generated by the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
    ),
    "/wes/analysis/tumor/haplotyper_targets.vcf.gz": FileDetails(
        "analysis",
        "germline: vcf of haplotype variants in targeted regions",
        "Haplotype variants within targeted capture regions using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/tumor/haplotyper_output.vcf": FileDetails(
        "analysis",
        "germline: germline variants",
        "Haplotype variants using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/normal/haplotyper_targets.vcf.gz": FileDetails(
        "analysis",
        "germline: vcf of haplotype variants in targeted regions",
        "Haplotype variants within targeted capture regions using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/normal/haplotyper_output.vcf": FileDetails(
        "analysis",
        "germline: germline variants",
        "Haplotype variants using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/normal/recalibrated.bam": FileDetails(
        "analysis",
        "alignment: Base Qualtiy Score Recalibration (BQSR) bam file",
        "The Sentieon QualCal (https://support.sentieon.com/manual/usages/general/#qualcal-algorithm) is used to perform BSQR and remove any technical artifacts in the base quality scores.",
    ),
    "/wes/analysis/normal/recalibrated.bam.bai": FileDetails(
        "analysis",
        "alignment: index file for Base Qualtiy Score Recalibration (BQSR) bam file",
        "Index file for the BQSR bam file",
    ),
    "/wes/analysis/normal/sorted.dedup.bam": FileDetails(
        "analysis",
        "alignment: bam file with deduplicated reads",
        "Aligned reads were sorted and marked duplicates were removed using the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
    ),
    "/wes/analysis/normal/sorted.dedup.bam.bai": FileDetails(
        "analysis",
        "alignment: index file for deduplicated bam",
        "Bam index file for deduplicated bam file generated by the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
    ),
    "/wes/analysis/tumor/coverage_metrics.txt": FileDetails(
        "analysis",
        "plain-text genome-wide coverage file from tumor sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes/analysis/tumor/target_metrics.txt": FileDetails(
        "analysis",
        "plain-text targeted exome regions coverage file from tumor sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes/analysis/tumor/coverage_metrics_summary.txt": FileDetails(
        "analysis",
        "plain-text genome-wide coverage summary file from tumor sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes/analysis/normal/coverage_metrics.txt": FileDetails(
        "analysis",
        "plain-text genome-wide coverage file from normal sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes/analysis/normal/target_metrics.txt": FileDetails(
        "analysis",
        "plain-text targeted exome regions coverage file from normal sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes/analysis/normal/coverage_metrics_summary.txt": FileDetails(
        "analysis",
        "plain-text genome-wide coverage summary file from normal sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes/analysis/tumor/optitype_result.tsv": FileDetails(
        "analysis",
        "hla: MHC Class I results (using OptiType)",
        "Predicted MHC Class I alleles using the Optitype software (https://github.com/FRED-2/OptiType).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the Optitype prediction algorithm.",
    ),
    "/wes/analysis/normal/optitype_result.tsv": FileDetails(
        "analysis",
        "hla: MHC Class I results (using OptiType)",
        "Predicted MHC Class I alleles using the Optitype software (https://github.com/FRED-2/OptiType).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the Optitype prediction algorithm.",
    ),
    # WES Tumor Only
    "/wes_tumor_only/analysis/haplotyper.vcf.gz": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/error.yaml": FileDetails(
        "analysis",
        "yaml file that specifies error codes for files",
        "Explanation of all files which are expected to be empty due to a failed/missing module.",
    ),
    "/wes_tumor_only/analysis/tnscope_output_twist.vcf": FileDetails(
        "analysis",
        "somatic variants: vcf file of somatic variants in TWIST targed capture region",
        "VCF file of variants that fall within the TWIST excome capture regions.  bcftools is used to filter reads in output.vcf.gz that intersect with the TWIST capture regions.",
    ),
    "/wes_tumor_only/analysis/tnscope_output_twist.maf": FileDetails(
        "analysis",
        "somatic variants: maf file of somatic variants in TWIST targed capture region",
        "MAF file of variants that fall within the TWIST excome capture regions generated using vcf2maf tool (https://github.com/mskcc/vcf2maf). VEP was used to annotate twist.vcf file, which was then used as input to vcf2maf.NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://portal.cimac-network.org/pipelines/wes).",
    ),
    "/wes_tumor_only/analysis/tnscope_output_twist_filtered.vcf": FileDetails(
        "analysis",
        "somatic variants: vcf file of somatic variants in TWIST targed capture region filtered by PASS column",
        "VCF file of variants that fall within the TWIST excome capture regions filtered to remove vairants where the PASS column contained one of the following- germline-risk, low_t_alt_frac, t_lod_fstar, or triallelic_site",
    ),
    "/wes_tumor_only/analysis/tnscope_output_twist_filtered.maf": FileDetails(
        "analysis",
        "somatic variants: maf file of somatic variants in TWIST targed capture region filtered by PASS column",
        "MAF file generated by converting twist.filtered.vcf to maf using VEP to annotate variants and vcf2maf to do the conversion. NOTE: Some columns in this maf file may be affected by the ExACdb assembly compatibility issue discussed in the WES pipeline overview page (https://portal.cimac-network.org/pipelines/wes).",
    ),
    "/wes_tumor_only/analysis/tcellextrect.txt": FileDetails(
        "analysis",
        "tcell: TCell fraction estimates generated by TcellExTRECT",
        "TCell fraction estimates generated by the TcellExTRECT software (https://github.com/McGranahanLab/TcellExTRECT)",
    ),
    "/wes_tumor_only/analysis/tumor/hla_final_result.txt": FileDetails(
        "analysis",
        "hla: MHC Class I and II results (using HLA-HD)",
        "Predicted MHC Class II and II results using the HLA-HD software (https://www.genome.med.kyoto-u.ac.jp/HLA-HD/).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the HLA-HD prediction algorithm.",
    ),
    "/wes_tumor_only/analysis/copynumber_cnvcalls.txt": FileDetails(
        "analysis", "plain-text copynumber analysis results, from Sentieon CNV", ""
    ),
    "/wes_tumor_only/analysis/copynumber_cnvcalls.txt.tn.tsv": FileDetails(
        "analysis",
        "tab-separated segmented copynumber variations, from Sentieon CNV",
        "",
    ),
    "/wes_tumor_only/analysis/msisensor.txt": FileDetails(
        "analysis",
        "",
        "",
    ),
    "/wes_tumor_only/analysis/maf_tnscope_output.maf": FileDetails(
        "analysis",
        "somatic variants: maf file of somatic variants",
        "MAF file of VEP annotated variants using vcf2maf tool (https://github.com/mskcc/vcf2maf).  The vep annotated vcf of the somatic variants (output.vcf.gz) file was converted to maf using vcf2maf.",
    ),
    "/wes_tumor_only/analysis/vcf_tnscope_filter_neoantigen.vcf": FileDetails(
        "analysis",
        "rna: Shared RNA and WES variants that is used for neoantigen prediction when RNA-seq data is provided with the WES run",
        "Variants file representing the common variants between RNA (haplotyper.rna.vcf.gz) and WES data (output.twist.neoantigen.vep.vcf).",
    ),
    "/wes_tumor_only/analysis/combined_filtered.tsv": FileDetails(
        "analysis",
        "neoantigen: list of predicted neoantigens",
        "The combined MHC class I and II predicted neoantigens using the pVACseq software.  The column definitions are given here (ref: https://pvactools.readthedocs.io/en/latest/pvacseq/output_files.html)",
    ),
    "/wes_tumor_only/analysis/vcf_gz_tnscope_filter.vcf.gz": FileDetails(
        "analysis", "", ""
    ),
    "/wes_tumor_only/analysis/vcf_gz_tnscope_output.vcf.gz": FileDetails(
        "analysis", "", ""
    ),
    "/wes_tumor_only/analysis/maf_tnscope_filter.maf": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/tnscope_exons.vcf.gz": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/HLA_results.tsv": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/tumor/sorted.dedup.bam": FileDetails(
        "analysis",
        "alignment: bam file with deduplicated reads",
        "Aligned reads were sorted and marked duplicates were removed using the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
    ),
    "/wes_tumor_only/analysis/tumor/sorted.dedup.bam.bai": FileDetails(
        "analysis",
        "alignment: index file for deduplicated bam",
        "Bam index file for deduplicated bam file generated by the Sentieon Dedup tool (https://support.sentieon.com/manual/usages/general/#dedup-algorithm)",
    ),
    "/wes/analysis/tumor/haplotyper_targets.vcf.gz": FileDetails(
        "analysis",
        "germline: vcf of haplotype variants in targeted regions",
        "Haplotype variants within targeted capture regions using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/tumor/haplotyper_output.vcf": FileDetails(
        "analysis",
        "germline: germline variants",
        "Haplotype variants using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/tumor/haplotyper_targets.vcf.gz": FileDetails(
        "analysis",
        "germline: vcf of haplotype variants in targeted regions",
        "Haplotype variants within targeted capture regions using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes/analysis/tumor/haplotyper_output.vcf": FileDetails(
        "analysis",
        "germline: germline variants",
        "Haplotype variants using Sentieon Haplotyper algorithm (https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm)",
    ),
    "/wes_tumor_only/analysis/tumor/coverage_metrics.txt": FileDetails(
        "analysis",
        "plain-text genome-wide coverage file from tumor sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes_tumor_only/analysis/tumor/target_metrics.txt": FileDetails(
        "analysis",
        "plain-text targeted exome regions coverage file from tumor sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes_tumor_only/analysis/tumor/coverage_metrics_summary.txt": FileDetails(
        "analysis",
        "plain-text genome-wide coverage summary file from tumor sample, from Sentieon's CoverageMetrics",
        "",
    ),
    "/wes_tumor_only/analysis/tumor/optitype_result.tsv": FileDetails(
        "analysis",
        "hla: MHC Class I results (using OptiType)",
        "Predicted MHC Class I alleles using the Optitype software (https://github.com/FRED-2/OptiType).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the Optitype prediction algorithm.",
    ),
    # WES Report
    "/wes/analysis/haplotyper.vcf.gz": FileDetails("analysis", "", ""),
    "/wes/analysis/tumor_mutational_burden.tsv": FileDetails("analysis", "", ""),
    "/wes/analysis/report.tar.gz": FileDetails("analysis", "", ""),
    "/wes/analysis/wes_run_version.tsv": FileDetails("analysis", "", ""),
    "/wes/analysis/config.yaml": FileDetails("analysis", "", ""),
    "/wes/analysis/metasheet.csv": FileDetails("analysis", "", ""),
    "/wes/analysis/wes_sample.json": FileDetails("analysis", "", ""),
    "/wes/analysis/tumor/xhla_report_hla.json": FileDetails(
        "analysis",
        "hla: MHC Class I and II results (using xhla)",
        "Predicted MHC Class I and II results using the xHLA software(https://github.com/humanlongevity/HLA).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the xHLA prediction algorithm.",
    ),
    "/wes/analysis/normal/xhla_report_hla.json": FileDetails(
        "analysis",
        "hla: MHC Class I and II results (using xhla)",
        "Predicted MHC Class I and II results using the xHLA software(https://github.com/humanlongevity/HLA).  Chromosome 6 reads from the deduplicated bam file were extracted and fed into the xHLA prediction algorithm.",
    ),
    # WES Report Tumor Only
    "/wes_tumor_only/analysis/report.tar.gz": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/wes_run_version.tsv": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/config.yaml": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/metasheet.csv": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/wes_sample.json": FileDetails("analysis", "", ""),
    "/wes_tumor_only/analysis/tumor/xhla_report_hla.json": FileDetails(
        "analysis", "", ""
    ),
    # RNA
    "/rna/r1_.fastq.gz": FileDetails(
        "source",
        "fastq file of raw Read 1, compressed",
        "The gzipped, FASTQ file that represents the 5' read of paired sequencing. Generated by the sequencing machine defined by the RNA assay this file is associated with.",
    ),
    "/rna/r2_.fastq.gz": FileDetails(
        "source",
        "fastq file of raw Read 2, compressed",
        "The gzipped, FASTQ file that represents the 3' read of paired sequencing. Generated by the sequencing machine defined by the RNA assay this file is associated with.",
    ),
    "/rna/reads_.bam": FileDetails(
        "source",
        "bam file containing the paired pairs",
        "The paired raw reads in the standard BAM binary format, generated from the raw R1 and R2 FASTQ files.",
    ),
    ## see: https://github.com/CIMAC-CIDC/cidc-ngs-pipeline-api/blob/master/rna/rna_level1_output_API.json
    "/rna/analysis/star/sorted.bam": FileDetails(
        "analysis",
        "bam file of the aligned reads with index sorted.bam.bai, sorted by their location in the genome from STAR",
        "Aligned reads in the standard BAM binary format, sorted by their coordinate in the genome (similar to `samtools sort`) which is required by many downstream applications.",
    ),
    "/rna/analysis/star/sorted.bam.bai": FileDetails(
        "analysis",
        "bam index file of the aligned reads that accompanies sorted.bam, sorted by their location in the genome from STAR",
        "The index for the aligned reads in the standard BAI binary format, sorted by their coordinate in the genome (similar to `samtools sort`) which is required by many downstream applications.",
    ),
    "/rna/analysis/star/sorted.bam.stat.txt": FileDetails(
        "analysis",
        "plain text statistics of sorted.bam file alignment",
        "A plain-text file of summary statistics of the alignment. Provides informaion useful for determining sample quality and discovering alignment errors.",
    ),
    "/rna/analysis/star/transcriptome.bam": FileDetails(
        "analysis",
        "transcriptome bam file",
        "A bam file containing the transcriptomefrom RNAseq expression analyses",
    ),
    "/rna/analysis/star/chimeric_out_junction.junction": FileDetails(
        "analysis",
        "Chimeric junction output",
        "Chimeric junction output for fusion calling",
    ),
    "/rna/analysis/star/chimeric_out.junction": FileDetails(
        "analysis",
        "Chimeric junction output",
        "Chimeric junction output for fusion calling",
    ),
    "/rna/analysis/rseqc/read_distrib.txt": FileDetails(
        "analysis",
        "plain-text statistics of the distribution of the aligned reads from RSeQC",
        "A plain-text file containing summary statistics about the overall mapping and rate of alignment for different types of sequence elements, using the downsampled BAM from STAR. Produced by RSeQC.",
    ),
    "/rna/analysis/rseqc/tin_score.summary.txt": FileDetails(
        "analysis",
        "tab-separated statistics of the Transcript Integrity Number (TIN) score calculated for each gene by RSeQC",
        "A two-line tab-separated file containing the mean, median, and stdev of the TIN scores, using the downsampled BAM from STAR. Produced by RSeQC.",
    ),
    "/rna/analysis/rseqc/tin_score.txt": FileDetails(
        "analysis",
        "tab-separated table of Transcript Integrity Number (TIN) scores calculated for each gene by RSeQC",
        "A five-column tab-separated table containing the gene, chromosome, start and end locations, and the TIN score, using the downsampled BAM from STAR. Produced by RSeQC.",
    ),
    "/rna/analysis/salmon/quant.sf": FileDetails(
        "analysis",
        "tab-separated quantification output (columns: Name, Length, EffectiveLength, TPM, NumReads) from Salmon",
        "A plain-text, tab-separated file with a single header line (which names all of the columns). Each subsequent row describes a single quantification record. The columns have the following interpretations:\nName — This is the name of the target transcript provided in the input transcript database (FASTA file).\nLength — This is the length of the target transcript in nucleotides.\nEffectiveLength — This is the computed effective length of the target transcript. It takes into account all factors being modeled that will effect the probability of sampling fragments from this transcript, including the fragment length distribution and sequence-specific and gc-fragment bias (if they are being modeled).\nTPM — This is salmon’s estimate of the relative abundance of this transcript in units of Transcripts Per Million (TPM). TPM is the recommended relative abundance measure to use for downstream analysis.\nNumReads — This is salmon’s estimate of the number of reads mapping to each transcript that was quantified. It is an “estimate” insofar as it is the expected number of reads that have originated from each transcript given the structure of the uniquely mapping and multi-mapping reads and the relative abundance estimates for each transcript. [Salmon documentation].",
    ),
    "/rna/analysis/salmon/transcriptome.bam.log": FileDetails(
        "analysis",
        "the log file produced during processing of the transcriptome by Salmon",
        "A plain-text file containing the time and output of all logging by Salmon during sample preparation of the target transcriptome against which samples can be analysed.",
    ),
    "/rna/analysis/salmon/aux_info_ambig_info.tsv": FileDetails(
        "analysis",
        "tab-separated statistics of ambiguously mapping reads for each transcript from Salmon",
        "A 2-column tab-separated file that contains information about the number of uniquely-mapping reads as well as the total number of ambiguously-mapping reads for each transcript. This file is provided mostly for exploratory analysis of the results; it gives some idea of the fraction of each transcript’s estimated abundance that derives from ambiguously-mappable reads. [Salmon documentation].",
    ),
    "/rna/analysis/salmon/aux_info_expected_bias.gz": FileDetails(
        "analysis",
        "compressed binary file with information about the expected (5') sequence-specific biases from Salmon",
        "A gzipped, binary file that encodes the expected parameters of the VLMM that were learned for the 5' end.\nIt starts with 3x 32-bit signed int that represent length of context window, length left of the read, and length right of the read.\nThen there are 3 arrays of 32-bit signed ints of the length as the context window which represent the order of the VLMM for that position, and the shift and width to extract each subcontext window.\nThen 2x 64-bit signed ints specify the dimension of the table that immediately follow, where each row represents the nonzero probabilities of the VLMM for one subcontext window.\nFinally, the file contains the distribution of nucleotides in each position in the context as a 4-column table preceded by its dimensions as 2x 64-bit signed int.\n[Adapted from Salmon documentation].",
    ),
    "/rna/analysis/salmon/aux_info_fld.gz": FileDetails("analysis", "", ""),
    "/rna/analysis/salmon/aux_info_meta_info.json": FileDetails(
        "analysis",
        "json file of statistics and meta information about the Salmon run",
        "A JSON format file which contains meta information about the run, including stats such as the number of observed and mapped fragments, details of the bias modeling etc.\nOne particularly important piece of information contained in this file is the inferred library type. Most of the information recorded in this file should be self-descriptive. [Salmon documentation].",
    ),
    "/rna/analysis/salmon/aux_info_observed_bias.gz": FileDetails(
        "analysis",
        "compressed binary file with information about the observed 5' sequence-specific biases from Salmon",
        "A gzipped, binary file that encodes the observed parameters of the VLMM that were learned for the 5' end.\nIt starts with 3x 32-bit signed int that represent length of context window, length left of the read, and length right of the read.\nThen there are 3 arrays of 32-bit signed ints of the length as the context window which represent the order of the VLMM for that position, and the shift and width to extract each subcontext window.\nThen 2x 64-bit signed ints specify the dimension of the table that immediately follow, where each row represents the nonzero probabilities of the VLMM for one subcontext window.\nFinally, the file contains the distribution of nucleotides in each position in the context as a 4-column table preceded by its dimensions as 2x 64-bit signed int.\n[Adapted from Salmon documentation].",
    ),
    "/rna/analysis/salmon/aux_info_observed_bias_3p.gz": FileDetails(
        "analysis",
        "compressed binary file with information about the observed 3' sequence-specific biases from Salmon",
        "A gzipped, binary file that encodes the observed parameters of the VLMM that were learned for the 3' end.\nIt starts with 3x 32-bit signed int that represent length of context window, length left of the read, and length right of the read.\nThen there are 3 arrays of 32-bit signed ints of the length as the context window which represent the order of the VLMM for that position, and the shift and width to extract each subcontext window.\nThen 2x 64-bit signed ints specify the dimension of the table that immediately follow, where each row represents the nonzero probabilities of the VLMM for one subcontext window.\nFinally, the file contains the distribution of nucleotides in each position in the context as a 4-column table preceded by its dimensions as 2x 64-bit signed int.\n[Adapted from Salmon documentation].",
    ),
    "/rna/analysis/salmon/cmd_info.json": FileDetails(
        "analysis",
        "json file that records the command parameters used to call Salmon",
        "A JSON format file that records the main command line parameters with which Salmon was invoked for the run that produced the output in this directory. [Salmon documentation].",
    ),
    "/rna/analysis/salmon/salmon_quant.log": FileDetails(
        "analysis",
        "the log file produced by the Salmon analysis",
        "A plain-text file containing the time and output of all logging by Salmon during sample analysis.",
    ),
    "/rna/analysis/microbiome/addSample_report.txt": FileDetails(
        "analysis",
        "centrifuge summary output file",
        "Centrifuge output file contains name of a genome, taxonomic ID and rank, and also the proportion of this genome normalized by its genomic length",
    ),
    "/rna/analysis/microbiome/sample_report.txt": FileDetails(
        "analysis",
        "centrifuge summary output file",
        "Centrifuge output file contains name of a genome, taxonomic ID and rank, and also the proportion of this genome normalized by its genomic length",
    ),
    "/rna/analysis/trust4/trust4_report.tsv": FileDetails(
        "analysis",
        "TRUST4 final report file",
        "This report file focuses on CDR3 and is compatible with other repertoire analysis tools, such as VDJTools.",
    ),
    "/rna/analysis/fusion/fusion_predictions.tsv": FileDetails(
        "analysis",
        "fusion analysis report file",
        "This report file contains valiated fusion gene pairs found in all samples including their gene expression.",
    ),
    "/rna/analysis/msisensor/msisensor_report.txt": FileDetails(
        "analysis",
        "msisensor report file",
        "This report file contains MSI score this report file contains MSI score.",
    ),
    "/rna/analysis/msisensor/msisensor.txt": FileDetails(
        "analysis",
        "msisensor report file",
        "This report file contains MSI score this report file contains MSI score.",
    ),
    "/rna/analysis/neoantigen/genotype.json": FileDetails(
        "analysis",
        "arcasHLA report file",
        "This report file contains MHC class I & II HLA alleles.",
    ),
    # Nanostring
    "/nanostring/.rcc": FileDetails(
        "source",
        "xml-encoded comma-separated direct outputs from a NanoString sample",
        "A plain-text XML file with comma-separated table elements for the raw NanoString output from a sample.",
    ),
    "/nanostring/control.rcc": FileDetails(
        "source",
        "xml-encoded csv direct outputs from a NanoString control",
        "A plain-text XML file with comma-separated table elements for the raw NanoString output from a control.",
    ),
    "/nanostring/raw_data.csv": FileDetails(
        "miscellaneous",
        "comma-separated values aggregated from the raw RCC files for all samples/controls",
        "A comma-separated file where each column represents the values pulled from a sample/control's RCC file.",
    ),
    "/nanostring/normalized_data.csv": FileDetails(
        "analysis",
        "comma-separated values aggregated for all samples/controls and normalized",
        "A comma-separated file where each column is the normalized values for a sample/control.",
    ),
    # mIF
    "/mif/roi_/composite_image.tif": FileDetails(
        "source",
        "tiff image of region-of-interest merging all of the signals into a single image",
        "The TIFF image of the region-of-interest that contains the composition of all components into a combined image. Exported from inForm (PerkinElmer).",
    ),
    "/mif/roi_/component_data.tif": FileDetails(
        "source",
        "multi-image tiff of region-of-interest holding all of the individual component signals",
        "The multi-image TIFF of the region-of-interest that contains all of the individual components, one for each marker. Exported from inForm (PerkinElmer).",
    ),
    "/mif/roi_/multispectral.im3": FileDetails(
        "source",
        "multispectral image in the PerkinElmer IM3 format",
        "The multispectral image of the region-of-interest taken by Mantra, in the PerkinElmer IM3 file format. Used as input for inForm.",
    ),
    "/mif/roi_/binary_seg_maps.tif": FileDetails(
        "miscellaneous",
        "multi-image tiff of region-of-interest holding all maps as binary in/out, from inForm",
        "A multi-image TIFF of the region-of-interest, all Tissue, Cell, and Object maps created in analysis are stored as binary in/out 'images', as well as any processing maps. Exported from inForm (PerkinElmer).",
    ),
    "/mif/roi_/phenotype_map.tif": FileDetails(
        "miscellaneous",
        "tiff image where each dot is a phenotyped cell, from inForm",
        "A TIFF image using dots to represent the location of each cell in the region-of-interest with each called phenotype in a different color. Exported from inForm (PerkinElmer).",
    ),
    "/mif/roi_/image_with_all_seg.tif": FileDetails("miscellaneous", "", ""),
    "/mif/roi_/image_with_cell_seg_map.tif": FileDetails("miscellaneous", "", ""),
    "/mif/roi_/image_with_phenotype_map.tif": FileDetails("miscellaneous", "", ""),
    "/mif/roi_/image_with_tissue_seg.tif": FileDetails("miscellaneous", "", ""),
    "/mif/roi_/tissue_seg_data_summary.txt": FileDetails("miscellaneous", "", ""),
    "/mif/roi_/tissue_seg_data.txt": FileDetails("miscellaneous", "", ""),
    "/mif/roi_/score_data_.txt": FileDetails(
        "miscellaneous",
        "plain-text statistics on the score step, from inForm",
        "A plain-text file containing at mimimum, 'Tissue Category' and 'Area (Percent)', 'Number of cells', 'Cell Compartment', 'Stain Component'. Exported from inForm (PerkinElmer).",
    ),
    "/mif/roi_/cell_seg_data.txt": FileDetails(
        "analysis",
        "tab-separated statistics on each cell in the region-of-interest, from inForm",
        "A wide tab-separated file containing information and statistics on each cell in the range of interest. For each cell, measures of its position, size, shape, and marker expression in several compartments are recorded. Exported from inForm (PerkinElmer).",
    ),
    "/mif/roi_/cell_seg_data_summary.txt": FileDetails(
        "miscellaneous",
        "tab-separated statistics of cells, summarized across each phenotype in the region-of-interest, from inForm",
        "A wide tab-separated file containing information and statistics across all cells of a given phenotype in the region-of-interest. Contains the same measures of position, size, shape and marker expression in several compartments as does the cell_seg_data file. Exported from inForm (PerkinElmer).",
    ),
    "/mif/qc_report.zip": FileDetails(
        "miscellaneous",
        "QC Report",
        "Spreadsheets containing info regarding Quality Control from pathology and reasoning for expected failures.",
    ),
    # Olink
    "/olink/study_npx.xlsx": FileDetails(
        "source",
        "excel file that has the Normalized Protein eXpression results for the full study",
        "An XML-based Excel file that contains the combined results across the full study in Log2-scaled NPX, or Normalized Protein eXpression (Olink’s arbitrary unit). NPX is calculated from Ct values and data pre-processing (normalization) is performed to minimize both intra- and inter-assay variation.\nNPX data allows users to identify changes for individual protein levels across their sample set, and then use this data to establish protein signatures.",
    ),
    "/olink/batch_/combined_npx.xlsx": FileDetails(
        "source",
        "excel file that has the Normalized Protein eXpression results for a batch of samples",
        "An XML-based Excel file that contains the combined results across a batch of samples in Log2-scaled NPX, or Normalized Protein eXpression (Olink’s arbitrary unit). NPX is calculated from Ct values and data pre-processing (normalization) is performed to minimize both intra- and inter-assay variation.\nNPX data allows users to identify changes for individual protein levels across their sample set, and then use this data to establish protein signatures.",
    ),
    "/olink/batch_/chip_/assay_npx.xlsx": FileDetails(
        "source",
        "excel file that has the Normalized Protein eXpression results for a single chip",
        "An XML-based Excel file that contains the results from a single chip in Log2-scaled NPX, or Normalized Protein eXpression (Olink’s arbitrary unit). Combined with other chips in study_npx.xlsx.\nNPX is calculated from Ct values and data pre-processing (normalization) is performed to minimize both intra- and inter-assay variation.\nNPX data allows users to identify changes for individual protein levels across their sample set, and then use this data to establish protein signatures.",
    ),
    "/olink/batch_/chip_/assay_raw_ct.csv": FileDetails(
        "source",
        "comma-separated table of Ct values results for a single chip",
        "The comma-separated, plain-text table that contains the raw Ct value results from a single chip. These values are not normalized for intra- or inter-assay variablity, and a high Ct value is related to a low protein concentration.",
    ),
    "npx|analysis_ready|csv": FileDetails(
        "analysis",
        "comma-separated table of Normalized Protein eXpressions for for all analytes/samples across the entire study",
        "The comma-separated, plain-text table of Normalized Protein eXpressions for all samples (valid CIMAC ID) and analytes (have Olink ID) across the entire study.\nEach row is a sample by CIMAC ID and each column is a sample by (Name, Uniprot ID, *Olink ID*).\nNote that quality control columns and non-sample rows have been removed.",
    ),
    # IHC
    "/ihc/ihc_image.": FileDetails(
        "source",
        "image file that is the result of an IHC experiment",
        "The image file generated from an IHC experiment. Generally, higher signal relates to higher local concentrations of protein, allowing for inferences of both tissue- and cell-level localization.",
    ),
    "csv|ihc marker combined": FileDetails(
        "analysis",
        "comma-separated quantification file of IHC image for all markers",
        "A comma-separated file containing quantification of each marker from the corresponding IHC image.",
    ),
    # Clinical
    "csv|participants info": FileDetails("clinical", "", ""),
    "csv|samples info": FileDetails("clinical", "", ""),
    "/clinical/.": FileDetails(
        "clinical",
        "A file containing unharmonized clinical data from the trial",
        "A file containing unharmonized clinical data from the trial. Files are generated by the trial \
         teams, and the CIDC applies minimal manipulation to ensure no PHI is present and each record is identified \
             by the appropriate CIMAC participant or sample id.",
    ),
    # Miscellaneous Data
    "/misc_data/": FileDetails(
        "source", "Unharmonized, one-off, or not-yet-supported data.", ""
    ),
    # TCR
    "/tcr/reads.tsv": FileDetails("source", "", ""),
    "/tcr/controls/reads.tsv": FileDetails("source", "", ""),
    "/tcr/replicate_/r1.fastq.gz": FileDetails("source", "", ""),
    "/tcr/replicate_/r2.fastq.gz": FileDetails("source", "", ""),
    "/tcr/replicate_/i1.fastq.gz": FileDetails("source", "", ""),
    "/tcr/replicate_/i2.fastq.gz": FileDetails("source", "", ""),
    "/tcr/SampleSheet.csv": FileDetails("miscellaneous", "", ""),
    "/tcr_analysis/summary_info.csv": FileDetails("miscellaneous", "", ""),
    "/tcr_analysis/tra_clone.csv": FileDetails("analysis", "", ""),
    "/tcr_analysis/trb_clone.csv": FileDetails("analysis", "", ""),
    "/tcr_analysis/report_trial.tar.gz": FileDetails("analysis", "", ""),
    # H&E
    "/hande/image_file.svs": FileDetails(
        "source",
        "stained image file that is the result of an H&E experiment",
        "An SVS image file stained with hematoxylin and eosin, generated from an H&E experiment.",
    ),
    "/hande/image_file.": FileDetails(
        "source",
        "stained image file that is the result of an H&E experiment",
        "An image file stained with hematoxylin and eosin, generated from an H&E experiment.",
    ),
    # ELISA
    "/elisa/assay.xlsx": FileDetails(
        "source",
        "xlsx file of measured values where rows are samples and columns are antigens",
        "An XML-based Excel file that contains the results of a single run in arbitrary units. Each row is a sample, though not all have CIMAC IDs, and each column is an antigen.",
    ),
    # CyTOF analysis
    f"/cytof_analysis/cell_counts_assignment.csv": FileDetails(
        "miscellaneous",
        "comma-separated two-column table with cell counts for each assigned cell type",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the called cell type, and 'N', the number of cells of that type seen in the sample.",
    ),
    f"/cytof_analysis/cell_counts_compartment.csv": FileDetails(
        "miscellaneous",
        "comma-separated two-column table with cell counts for each broad compartment assigned",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the broad compartment of the called cell types, and 'N', the number of cells within that compartment seen in the sample.",
    ),
    f"/cytof_analysis/cell_counts_profiling.csv": FileDetails(
        "miscellaneous",
        "comma-separated two-column table with cell counts for each profiled subset of assigned cell types",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the broad compartment of the called cell types, and 'N', the number of cells within that profiled subset seen in the sample.",
    ),
    "csv|cell counts assignment": FileDetails(
        "analysis",
        "comma-separated two-column table with cell counts for each assigned cell type",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the called cell type, and 'N', the number of cells of that type seen in the sample.",
    ),
    f"csv|cell counts compartment": FileDetails(
        "analysis",
        "comma-separated two-column table with cell counts for each broad compartment assigned",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the broad compartment of the called cell types, and 'N', the number of cells within that compartment seen in the sample.",
    ),
    f"csv|cell counts profiling": FileDetails(
        "analysis",
        "comma-separated two-column table with cell counts for each profiled subset of all assigned cell types",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the profiled subset of the assigned cell types, and 'N', the number of cells within that profiled subset seen in the sample.",
    ),
    f"/cytof_analysis/analysis.zip": FileDetails("analysis", "", ""),
    f"/cytof_analysis/assignment.csv": FileDetails(
        "miscellaneous",
        "comma-separated table of marker expression for each assigned cell type",
        "A plain-text, comma-separated table with a column for each assigned cell type, where rows are the signal on each channel for every cell type assigned.",
    ),
    f"/cytof_analysis/compartment.csv": FileDetails(
        "miscellaneous",
        "comma-separated table of marker expression for each broad compartment assigned",
        "A plain-text, comma-separated table with a column for each broad compartment of the called cell types, where rows are the signal on each channel for every compartment.",
    ),
    f"/cytof_analysis/control_files_analysis.zip": FileDetails("analysis", "", ""),
    f"/cytof_analysis/profiling.csv": FileDetails(
        "miscellaneous",
        "comma-separated two-column table with cell counts for each profiled subset of all assigned cell types",
        "A plain-text, comma-separated table with a numbered index column, the 'CellSubset' as the profiled subset of the assigned cell types, and 'N', the number of cells within that profiled subset seen in the sample.",
    ),
    f"/cytof_analysis/reports.zip": FileDetails("analysis", "", ""),
    f"/cytof_analysis/source.fcs": FileDetails(
        "source",
        "fcs data used as the input for this analysis",
        "The analysis-ready FCS file used as the input for this analysis. After normalization, debarcoding, and removal of Veri-Cells and other non-specimen cells.",
    ),
    # CyTOF assay
    f"/cytof/spike_in.fcs": FileDetails(
        "source",
        "normalized and debarcoded fcs data for a blank spike-in sample",
        "The FCS file that captures pure spike-in for use as a control, after normalization and debarcoding.",
    ),
    f"/cytof/controls/spike_in.fcs": FileDetails(
        "source",
        "normalized and debarcoded fcs data for a blank spike-in sample",
        "The FCS file that captures pure spike-in for use as a control, after normalization and debarcoding.",
    ),
    f"/cytof/controls/processed.fcs": FileDetails("source", "", ""),
    f"/cytof/source_.fcs": FileDetails(
        "source",
        "raw fcs data as generated by the machine, without normalization, debarcoding, or cleaning",
        "The raw FCS file as generated by the machine, without any normalization, debarcoding, cleaning, etc.",
    ),
    f"/cytof/debarcoding_key.csv": FileDetails(
        "source",
        "",
        "",
    ),
    f"/cytof/processed.fcs": FileDetails(
        "source",
        "fully processed fcs data: normalized, debarcoded, no Veri-Cells, cleaned",
        "The analysis-ready FCS file after normalization, debarcoding, and removal of Veri-Cells and other non-specimen cells.",
    ),
    # ctDNA assay
    "/ctdna/demultiplexed.bam": FileDetails(
        "source",
        "demultiplexed BAM files generated from circulating tumor DNA",
        "",
    ),
    "/ctdna/demultiplexed.bam.bai": FileDetails(
        "source",
        "index for demultiplexed BAM files generated from circulating tumor DNA",
        "",
    ),
    "/ctdna/summary_plots.pdf": FileDetails(
        "source",
        "CNA plots for top 2 models per sample generated by ichorCNA",
        "Plots showing Copy Number Alteration (CNA) across the whole genome for each sample for the top 2 solutions generated by ichorCNA.",
    ),
    "/ctdna/genome-wide_plots.pdf": FileDetails(
        "source",
        "CNA plots for all ichorCNA solutions",
        "Plots showing Copy Number Alteration (CNA) across the whole genome for all solutions generated by ichorCNA.",
    ),
    "/ctdna/bias_qc_plots.pdf": FileDetails(
        "source",
        "readcount plots showing GC and mappability bias corrections",
        "Plots showing readcount corrections applied by ichorCNA for GC bias and Mappability bias.",
    ),
    "/ctdna/optimal_solution.zip": FileDetails(
        "source",
        "the optimal solution as calculated by ichorCNA",
        "",
    ),
    "/ctdna/other_solutions.zip": FileDetails(
        "source",
        "other possible solutions also calculated by ichorCNA",
        "",
    ),
    # Microbiome Assay
    "/microbiome/forward.fastq.gz": FileDetails(
        "source",
        "fastq file of raw forward read, compressed",
        "The gzipped, FASTQ file that represents the 5' read of paired sequencing. Generated by the sequencing machine defined by the Microbiome assay this file is associated with.",
    ),
    "/microbiome/forward_index.fastq.gz": FileDetails(
        "source",
        "fastq file of raw forward read index, compressed",
        "The gzipped, FASTQ file that contains the read index sequences for the forward read.",
    ),
    "/microbiome/reverse.fastq.gz": FileDetails(
        "source",
        "fastq file of raw reverse read, compressed",
        "The gzipped, FASTQ file that represents the 5' read of paired sequencing. Generated by the sequencing machine defined by the Microbiome assay this file is associated with.",
    ),
    "/microbiome/reverse_index.fastq.gz": FileDetails(
        "source",
        "fastq file of raw reverse read index, compressed",
        "The gzipped, FASTQ file that contains the read index sequences for the reverse read.",
    ),
    "/microbiome/otu_table.tsv": FileDetails(
        "miscellaneous",
        "Operational taxonomic unit matrix.",
        "A matrix that gives the number of reads per sample per OTU. ",
    ),
    "/microbiome/summary.pdf": FileDetails(
        "miscellaneous",
        "Plots showing analysis summary for microbiome.",
        "",
    ),
    # mibi
    "/mibi/metadata.tsv": FileDetails(
        "miscellaneous",
        "TSV-formatted table containing metadata about the run",
        "",
    ),
    "/mibi/multichannel_image.ome.tiff": FileDetails(
        "miscellaneous",
        "Analysis-ready multilayer OME-TIFF image file",
        "",
    ),
    "/mibi/cluster_labels.tif": FileDetails(
        "miscellaneous",
        "TIF-formatted whole cell segmentation masks for each multiplexed image",
        "",
    ),
    "/mibi/channel_names.csv": FileDetails(
        "miscellaneous",
        "CSV-formatted table of each channel and the corresponding mass",
        "",
    ),
    "/mibi/single_cell_table.csv": FileDetails(
        "miscellaneous",
        "Single cell data table containing eg integrated expression values, XY location",
        "",
    ),
}