CIMAC-CIDC/cidc-schemas

View on GitHub
benchmark.py

Summary

Maintainability
A
0 mins
Test Coverage
import os
import pstats
import cProfile
import argparse
from contextlib import contextmanager

from tqdm import tqdm

from cidc_schemas.template import Template
from cidc_schemas.template_reader import XlTemplateReader
from cidc_schemas.prism import (
    merge_clinical_trial_metadata,
    merge_artifacts,
    ArtifactInfo,
    prismify,
    set_prism_encrypt_key,
)


@contextmanager
def profiling(run_name: str, outdir: str = "benchmark"):
    """Profile the enclosed code with cProfile and write a text report to disk.

    The report is written to ``<outdir>/<run_name>.profile.txt`` with entries
    sorted by the "time" key. It is written whether or not the enclosed code
    raises; an exception raised by the enclosed code propagates to the caller
    once the report has been written.

    Args:
        run_name: label for this step, used in the console messages and as
            the stem of the report's file name.
        outdir: directory to write the report to (defaults to "benchmark").
            It is created, along with any missing parents, if it is absent.
    """
    # makedirs (unlike mkdir) copes with nested paths, and exist_ok removes
    # the check-then-create race of a separate isdir() test.
    os.makedirs(outdir, exist_ok=True)

    profiler = cProfile.Profile()
    profiler.enable()
    try:
        print(f"Running step '{run_name}'")
        yield
    finally:
        # Always stop the profiler and emit the report; an in-flight
        # exception resumes propagating when this block completes.
        profiler.disable()
        filename = os.path.join(outdir, f"{run_name}.profile.txt")
        with open(filename, "w") as outfile:
            outfile.write(f"[profiler output for '{run_name}']\n\n")
            ps = pstats.Stats(profiler, stream=outfile).sort_stats("time")
            ps.print_stats()
        print(f"Wrote profiler results to {filename}")


def run(ts_path: str, mif_path: str, he_path: str, outdir: str):
    """Run and profile a typical metadata validation and merging workload.

    Five steps are profiled separately, each under its own ``profiling``
    context writing to ``outdir``:

    1. prismify the tissue slide shipping manifest,
    2. prismify the mIF assay metadata spreadsheet,
    3. merge placeholder artifact records into the mIF metadata,
    4. merge the mIF metadata with the tissue slide metadata,
    5. merge H&E metadata into the combined metadata.

    Reading and prismifying the H&E spreadsheet happens between steps 4 and 5
    and is deliberately left unprofiled.

    Args:
        ts_path: path to a tissue slide metadata spreadsheet.
        mif_path: path to an mIF metadata spreadsheet.
        he_path: path to an H&E metadata spreadsheet.
        outdir: directory the per-step profiler reports are written to.
    """
    # Fixed dummy key; presumably prism needs a key set before prismify
    # is called -- confirm against cidc_schemas.prism.
    set_prism_encrypt_key("foobar")

    with profiling("1_prismify_tissue_slide_shipping_manifest", outdir):
        ts_template = Template.from_type("tissue_slide")
        ts_spreadsheet, _ = XlTemplateReader.from_excel(ts_path)
        ts_metadata, _, _ = prismify(ts_spreadsheet, ts_template)
        ts_metadata["allowed_cohort_names"] = ["Not_reported"]
        ts_metadata["allowed_collection_event_names"] = ["Baseline"]

    with profiling("2_prismify_mif_assay_metadata_spreadsheet", outdir):
        mif_template = Template.from_type("mif")
        mif_spreadsheet, _ = XlTemplateReader.from_excel(mif_path)
        mif_metadata, files, _ = prismify(mif_spreadsheet, mif_template)

    with profiling("3_merge_mif_assay_artifacts_into_mif_metadata_patch", outdir):
        # Build one placeholder ArtifactInfo per file reported by prismify;
        # all fields other than the upload placeholder and the URL derived
        # from it are fixed dummy values.
        # tqdm gives us a stdout progress indicator as prism iterates through the array
        artifact_info = tqdm(
            [
                ArtifactInfo(
                    f.upload_placeholder,
                    f"object/url/{f.upload_placeholder}",
                    "",
                    0,
                    "",
                    "abcd",
                )
                for f in files
            ]
        )
        mif_metadata, _ = merge_artifacts(mif_metadata, artifact_info)

    with profiling("4_merge_mif_metadata_with_tissue_slide_metadata", outdir):
        combined_metadata, _ = merge_clinical_trial_metadata(mif_metadata, ts_metadata)

    # Don't profile this a second time, since we're only interested
    # in how long it takes to merge the shipping manifest data into
    # existing trial metadata
    he_template = Template.from_type("h_and_e")
    he_spreadsheet, _ = XlTemplateReader.from_excel(he_path)
    he_metadata, _, _ = prismify(he_spreadsheet, he_template)

    with profiling("5_merge_h_and_e_metadata_into_trial", outdir):
        merge_clinical_trial_metadata(he_metadata, combined_metadata)


if __name__ == "__main__":
    # Command-line entry point: collect the three input spreadsheets and the
    # output directory, then hand them to run().
    cli = argparse.ArgumentParser(
        description="Run and profile a typical metadata validation and merging workload."
    )

    # The input spreadsheets are all mandatory; register them in one pass.
    required_inputs = (
        ("--ts-path", "path to a tissue slide metadata spreadsheet"),
        (
            "--mif-path",
            "path to an mif metadata spreadsheet with samples from the tissue slide manifest",
        ),
        ("--he-path", "path to an h&e metadata spreadsheet"),
    )
    for flag, description in required_inputs:
        cli.add_argument(flag, required=True, help=description)

    # The report directory is optional and falls back to "benchmark".
    cli.add_argument(
        "--out-dir",
        default="benchmark",
        required=False,
        help="root directory to write profile info to",
    )

    opts = cli.parse_args()
    run(
        ts_path=opts.ts_path,
        mif_path=opts.mif_path,
        he_path=opts.he_path,
        outdir=opts.out_dir,
    )