ananse/view.py from vanheeringen-lab/ANANSE

ananse/view.py
Summary

Maintainability

1 hr
Test Coverage

85%
Issues
Coverage
import pandas as pd


def get_binding_tfs(binding, all_tfs=False):
    """
    Returns a list of unique TFs found in the ananse binding output file.

    The binding file contains multiple lists of TFs:
    - set(pd.read_hdf(binding, key="_factor_activity")["factor"])
    - set(x for x in dir(hdf.root) if not x.startswith("_"))
    The former contains all TFs from the motif2factors.txt, while the latter
    contains only those for which a binding probability was calculated.

    The outersection of these sets had an error in "predict_proba()".
    """
    if all_tfs:
        # all TFs (from the motifs2factors.txt used in ananse binding)
        tfs = set(pd.read_hdf(binding, key="_factor_activity")["factor"])
    else:
        # TFs with binding probabilities
        hdf = pd.HDFStore(binding, "r")
        # TODO: This is hacky (depending on "_"), however the hdf.keys() method is
        #       much slower. Currently all TF names do *not* start with "_"
        keys = hdf.root.__members__
        tfs = set(k for k in keys if not k.startswith("_"))
        hdf.close()
    return sorted(tfs)


def view_h5(
    fname,
    tfs=None,
    regions=None,
    fmt="wide",
    n=None,
    list_regions=False,
    list_tfs=False,
    activity=False,
):
    """Extract information from an ANANSE binding.h5 file.

    Parameters
    ----------
    fname : str
        File name (binding.h5).
    tfs : list, optional
        List of transcription factor names to extract. All TFs are shown
        by default.
    regions : list, optional
        List of regions to extract. All regions are shown by default.
    fmt : str, optional
        Return output in 'wide' or in 'long' format. Default is 'wide'.
    n : int, optional
        Return the first n regions and tfs. All are shown by default.
    list_regions : bool, optional
        Return a list of regions
    list_tfs : bool, optional
        Return a list of transcription factors
    activity : bool, optional
        Return activity scores of transcription factors

    Returns
    -------
    pandas.DataFrame
    """
    if n:
        n = int(n)

    if list_regions:
        reg = pd.read_hdf(fname, key="_index")
        df = pd.DataFrame({"region": sorted(reg.index.unique())})
        if n:
            return df.head(n)
        return df

    if tfs is None:
        tfs = get_binding_tfs(fname)
    tfs = list(tfs)
    if n:
        tfs = tfs[: min(len(tfs), n)]

    if list_tfs:
        return pd.DataFrame({"factor": tfs})

    if activity:
        df = pd.read_hdf(fname, key="_factor_activity").set_index("factor")
        if n:
            return df.head(n)
        return df

    if fmt not in ["wide", "long"]:
        raise ValueError("fmt should be either 'wide' or 'long'")

    with pd.HDFStore(fname, "r") as hdf:
        idx = hdf.get("_index").index
        if n:
            rows = [True for _ in range(n)] + [False for _ in range(n, len(idx))]
            idx = idx[: min(len(idx), n)]
        elif regions:
            rows = idx.isin(regions)
            idx = list(set(regions) & set(idx))
        else:
            rows = [True for _ in range(len(idx))]
        df = pd.DataFrame(index=idx)
        df.index.name = "loc"
        for tf in tfs:
            df[tf] = hdf.get(tf)[rows].values

    if fmt == "long":
        df = df.reset_index().melt(
            id_vars=["loc"], value_name="prob", var_name="factor"
        )

    return df