crimson/star.py

Summary

Maintainability
A
25 mins
Test Coverage
A
90%
"""Parser for STAR Log.final.out file"""

# Copyright (c) 2015-2022 Wibowo Arindrarto <contact@arindrarto.dev>
# SPDX-License-Identifier: BSD-3-Clause

import os
from typing import Any, Callable, Dict, Optional, TextIO, Tuple, Union

import click

from .utils import convert, get_handle, get_linesep

__all__ = ["parse"]


def _pct_convert(raw_str: str) -> Union[str, int, float]:
    """Converts the given number ending with '%' to a number."""
    if raw_str.endswith("%"):
        return convert(raw_str[:-1])
    return convert(raw_str)


_MAX_SIZE = 1024 * 10
# Mapping of STAR attribute names to python dictionary attribute names
# and the function used to parse it.
_PARSE_MAP: Dict[str, Tuple[str, Callable[[Any], Union[str, int, float]]]] = {
    "Started job on": ("timeJobStart", str),
    "Started mapping on": ("timeMappingStart", str),
    "Finished on": ("timeEnd", str),
    "Mapping speed, Million of reads per hour": ("mappingSpeed", convert),
    "Number of input reads": ("nInput", convert),
    "Average input read length": ("avgInputLength", convert),
    "Uniquely mapped reads number": ("nUniquelyMapped", convert),
    "Uniquely mapped reads %": ("pctUniquelyMapped", _pct_convert),
    "Average mapped length": ("avgMappedLength", convert),
    "Number of splices: Total": ("nSplicesTotal", convert),
    "Number of splices: Annotated (sjdb)": ("nSplicesAnnotated", convert),
    "Number of splices: GT/AG": ("nSplicesGTAG", convert),
    "Number of splices: GC/AG": ("nSplicesGCAG", convert),
    "Number of splices: AT/AC": ("nSplicesATAC", convert),
    "Number of splices: Non-canonical": ("nSplicesNonCanonical", convert),
    "Mismatch rate per base, %": ("rateMismatchPerBase", _pct_convert),
    "Deletion rate per base": ("rateDeletionPerBase", _pct_convert),
    "Deletion average length": ("avgDeletionLength", convert),
    "Insertion rate per base": ("rateInsertionPerBase", _pct_convert),
    "Insertion average length": ("avgInsertionLength", convert),
    "Number of reads mapped to multiple loci": ("nMappedMultipleLoci", convert),
    "% of reads mapped to multiple loci": ("pctMappedMultipleLoci", _pct_convert),
    "Number of reads mapped to too many loci": ("nMappedTooManyLoci", convert),
    "% of reads mapped to too many loci": ("pctMappedTooManyLoci", _pct_convert),
    "% of reads unmapped: too many mismatches": (
        "pctUnmappedForTooManyMismatches",
        _pct_convert,
    ),
    "% of reads unmapped: too short": ("pctUnmappedForTooShort", _pct_convert),
    "% of reads unmapped: other": ("pctUnmappedForOther", _pct_convert),
}


def parse(
    in_data: Union[str, os.PathLike, TextIO],
    input_linesep: Optional[str] = None,
) -> dict:
    """Parse the log of a STAR run.

    :param in_data: Input STAR-Fusion contents.
    :param input_linesep: Name of the operating system used for determining
        input line separator. Valid values are 'nt', 'posix', or None.
    :returns: Parsed values.

    """
    payload: dict = {}
    with get_handle(in_data) as src:
        contents = src.read(_MAX_SIZE)

    linesep = get_linesep(input_linesep)
    val: Union[str, int, float]
    for line in contents.split(linesep):
        # pass empty lines
        if line and not line.strip():
            continue
        line = line.strip()
        if "|" in line:
            ori_key, val = [x.strip() for x in line.split("|", 1)]
            if ori_key in _PARSE_MAP:
                key, func = _PARSE_MAP[ori_key]
                val = func(val)
                if key in payload:
                    raise click.BadParameter(
                        f"Unexpected duplicate key entry: {key} ({ori_key})."
                    )
                payload[key] = val

    if not payload:
        msg = "Unexpected file structure. No contents parsed."
        raise click.BadParameter(msg)

    return payload