build/lib/epigenomic_dataset/load_tasks.py
"""Module providing straightforward methods to load the tasks."""
from typing import Tuple, Dict
import pandas as pd
from tqdm.auto import tqdm
from .load_epigenomes import load_epigenomes
def load_task(
    cell_line: str = "K562",
    assembly: str = "hg38",
    dataset: str = "fantom",
    metric: str = "mean",
    window_size: int = 256,
    root: str = "datasets",
    verbose: int = 2,
    only_active: bool = False,
    only_inactive: bool = False,
    load_promoters: bool = False,
    load_enhancers: bool = False,
    binarize: bool = False,
    min_active_tpm_value: float = 1,
    max_inactive_tpm_value: float = 1,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return epigenomic data and labels for the requested task.

    Without ``only_active``/``only_inactive`` the requested regions are
    loaded and stacked (promoters first, then enhancers) with the labels
    returned by ``load_epigenomes``.

    With ``only_active`` or ``only_inactive`` both region types are loaded,
    only the rows whose label equals 1 (active) or 0 (inactive) are kept,
    and the task becomes "promoter vs enhancer": promoters carry label 1
    and enhancers carry label 0.

    Parameters
    ----------------------------------------
    cell_line: str = "K562",
        Cell line to consider. By default K562.
        Currently available cell lines are
        listed in the repository README file.
    assembly: str = "hg38",
        The genomic assembly of the data to be retrieved.
    dataset: str = "fantom",
        Dataset to consider. By default fantom.
        Currently available datasets are
        listed in the repository README file.
    metric: str = "mean",
        The metric to load.
    window_size: int = 256,
        Window size to consider. By default 256.
        Currently available window sizes are
        listed in the repository README file.
    root: str = "datasets",
        Where to store the downloaded data.
    verbose: int = 2,
        Verbosity level.
    only_active: bool = False,
        Whether to keep only the active regions.
    only_inactive: bool = False,
        Whether to keep only the inactive regions.
    load_promoters: bool = False,
        Whether to load promoters.
    load_enhancers: bool = False,
        Whether to load enhancers.
    binarize: bool = False,
        Whether to binarize the TPM values.
        Binarization is always requested when ``only_active`` or
        ``only_inactive`` is set, because those filters compare the
        labels against 1 and 0.
    min_active_tpm_value: float = 1,
        Minimum TPM value.
    max_inactive_tpm_value: float = 1,
        Maximum TPM value.
        Values between the minimum and maximum will be dropped.

    Raises
    ----------------------------------------
    ValueError,
        If both ``only_active`` and ``only_inactive`` are requested.
    ValueError,
        If neither promoters nor enhancers are requested.
    ValueError,
        If an activity filter is requested without loading both
        promoters and enhancers.
    ValueError,
        If an activity filter is requested with different minimum and
        maximum TPM thresholds.

    Returns
    ----------------------------------------
    Return tuple with input and output DataFrames.
    """
    if only_active and only_inactive:
        raise ValueError(
            "It is not possible to require both only active and inactive."
        )
    if not (load_promoters or load_enhancers):
        raise ValueError(
            "You need to load either the promoters or the enhancers."
        )

    filtering = only_active or only_inactive
    if filtering and not (load_promoters and load_enhancers):
        raise ValueError(
            "When requiring to filter only active or inactive regions, "
            "you must load both enhancers and promoters."
        )
    # The original condition tested `only_active` twice, so the check was
    # silently skipped for `only_inactive`.
    if filtering and min_active_tpm_value != max_inactive_tpm_value:
        raise ValueError(
            "It does not make sense to threshold different cis-regulatory "
            "regions TPMs."
        )

    # Load the enabled regions, always in promoters -> enhancers order.
    loaded = {}
    for region, enabled in (
        ("promoters", load_promoters),
        ("enhancers", load_enhancers),
    ):
        if not enabled:
            continue
        loaded[region] = load_epigenomes(
            cell_line=cell_line,
            assembly=assembly,
            dataset=dataset,
            region=region,
            metric=metric,
            window_size=window_size,
            root=root,
            # The activity filters below compare labels with 1/0, which is
            # only meaningful on binarized labels.
            binarize=binarize or filtering,
            min_active_tpm_value=min_active_tpm_value,
            max_inactive_tpm_value=max_inactive_tpm_value,
            verbose=verbose
        )

    if filtering:
        wanted = 1 if only_active else 0
        parts = []
        # In the filtered tasks promoters are the positive class (1) and
        # enhancers the negative class (0).
        for task_label, region in ((1, "promoters"), (0, "enhancers")):
            region_epigenomes, region_labels = loaded[region]
            # NOTE(review): assumes a single label column (one cell line),
            # as the relabelling of `columns[0]` below already does.
            keep = region_labels.iloc[:, 0].to_numpy() == wanted
            # Explicit copy so that the relabelling below never writes
            # into a slice of the frame returned by `load_epigenomes`.
            kept_labels = region_labels.loc[keep].copy()
            if task_label != wanted:
                # Rows kept for this region carry `wanted`; overwrite them
                # with the region's task label.
                kept_labels[kept_labels.columns[0]] = task_label
            parts.append((region_epigenomes.loc[keep], kept_labels))
    else:
        parts = list(loaded.values())

    epigenomic_data = pd.concat([epigenomes for epigenomes, _ in parts])
    labels = pd.concat([region_labels for _, region_labels in parts])
    return epigenomic_data, labels
def active_promoters_vs_inactive_promoters(
    cell_line: str = "K562",
    assembly: str = "hg38",
    dataset: str = "fantom",
    metric: str = "mean",
    window_size: int = 256,
    root: str = "datasets",
    binarize: bool = False,
    min_active_tpm_value: float = 1,
    max_inactive_tpm_value: float = 1,
    verbose: int = 2,
):
    """Return epigenomic data and labels of the promoters-only task.

    Thin wrapper around `load_task` that loads promoters only and
    applies no active/inactive filtering.

    Parameters
    ----------------------------------------
    cell_line: str = "K562",
        Cell line to consider. By default K562.
        Currently available cell lines are
        listed in the repository README file.
    assembly: str = "hg38",
        The genomic assembly of the data to be retrieved.
    dataset: str = "fantom",
        Dataset to consider. By default fantom.
        Currently available datasets are
        listed in the repository README file.
    metric: str = "mean",
        The metric to load.
    window_size: int = 256,
        Window size to consider. By default 256.
        Currently available window sizes are
        listed in the repository README file.
    root: str = "datasets",
        Where to store the downloaded data.
    binarize: bool = False,
        Whether to binarize the TPM values.
    min_active_tpm_value: float = 1,
        Minimum TPM value.
    max_inactive_tpm_value: float = 1,
        Maximum TPM value.
        Values between the minimum and maximum will be dropped.
    verbose: int = 2,
        Verbosity level.

    Returns
    ----------------------------------------
    Return tuple with input and output DataFrames.
    """
    # Region selection that defines this task: promoters only, unfiltered.
    task_selection = {
        "only_active": False,
        "only_inactive": False,
        "load_enhancers": False,
        "load_promoters": True,
    }
    task_data = load_task(
        cell_line=cell_line,
        assembly=assembly,
        dataset=dataset,
        metric=metric,
        window_size=window_size,
        root=root,
        verbose=verbose,
        binarize=binarize,
        min_active_tpm_value=min_active_tpm_value,
        max_inactive_tpm_value=max_inactive_tpm_value,
        **task_selection
    )
    return task_data
def active_enhancers_vs_inactive_enhancers(
    cell_line: str = "K562",
    assembly: str = "hg38",
    dataset: str = "fantom",
    metric: str = "mean",
    window_size: int = 256,
    root: str = "datasets",
    binarize: bool = False,
    min_active_tpm_value: float = 1,
    max_inactive_tpm_value: float = 1,
    verbose: int = 2,
):
    """Return epigenomic data and labels of the enhancers-only task.

    Thin wrapper around `load_task` that loads enhancers only and
    applies no active/inactive filtering.

    Parameters
    ----------------------------------------
    cell_line: str = "K562",
        Cell line to consider. By default K562.
        Currently available cell lines are
        listed in the repository README file.
    assembly: str = "hg38",
        The genomic assembly of the data to be retrieved.
    dataset: str = "fantom",
        Dataset to consider. By default fantom.
        Currently available datasets are
        listed in the repository README file.
    metric: str = "mean",
        The metric to load.
    window_size: int = 256,
        Window size to consider. By default 256.
        Currently available window sizes are
        listed in the repository README file.
    root: str = "datasets",
        Where to store the downloaded data.
    binarize: bool = False,
        Whether to binarize the TPM values.
    min_active_tpm_value: float = 1,
        Minimum TPM value.
    max_inactive_tpm_value: float = 1,
        Maximum TPM value.
        Values between the minimum and maximum will be dropped.
    verbose: int = 2,
        Verbosity level.

    Returns
    ----------------------------------------
    Return tuple with input and output DataFrames.
    """
    # Region selection that defines this task: enhancers only, unfiltered.
    task_selection = {
        "only_active": False,
        "only_inactive": False,
        "load_enhancers": True,
        "load_promoters": False,
    }
    task_data = load_task(
        cell_line=cell_line,
        assembly=assembly,
        dataset=dataset,
        metric=metric,
        window_size=window_size,
        root=root,
        verbose=verbose,
        binarize=binarize,
        min_active_tpm_value=min_active_tpm_value,
        max_inactive_tpm_value=max_inactive_tpm_value,
        **task_selection
    )
    return task_data
def active_enhancers_vs_active_promoters(
    cell_line: str = "K562",
    assembly: str = "hg38",
    dataset: str = "fantom",
    metric: str = "mean",
    window_size: int = 256,
    root: str = "datasets",
    verbose: int = 2,
    **kwargs: Dict
):
    """Return epigenomic data and labels of the active regions task.

    Thin wrapper around `load_task` that loads both promoters and
    enhancers and asks for the active regions only.

    Parameters
    ----------------------------------------
    cell_line: str = "K562",
        Cell line to consider. By default K562.
        Currently available cell lines are
        listed in the repository README file.
    assembly: str = "hg38",
        The genomic assembly of the data to be retrieved.
    dataset: str = "fantom",
        Dataset to consider. By default fantom.
        Currently available datasets are
        listed in the repository README file.
    metric: str = "mean",
        The metric to load.
    window_size: int = 256,
        Window size to consider. By default 256.
        Currently available window sizes are
        listed in the repository README file.
    root: str = "datasets",
        Where to store the downloaded data.
    verbose: int = 2,
        Verbosity level.
    **kwargs: Dict,
        Additional keyword arguments.
        They are accepted and ignored.

    Returns
    ----------------------------------------
    Return tuple with input and output DataFrames.
    """
    # Region selection that defines this task: both regions, active only.
    task_selection = {
        "only_active": True,
        "only_inactive": False,
        "load_enhancers": True,
        "load_promoters": True,
    }
    task_data = load_task(
        cell_line=cell_line,
        assembly=assembly,
        dataset=dataset,
        metric=metric,
        window_size=window_size,
        root=root,
        verbose=verbose,
        **task_selection
    )
    return task_data
def inactive_enhancers_vs_inactive_promoters(
    cell_line: str = "K562",
    assembly: str = "hg38",
    dataset: str = "fantom",
    metric: str = "mean",
    window_size: int = 256,
    root: str = "datasets",
    verbose: int = 2,
    **kwargs: Dict
):
    """Return epigenomic data and labels of the inactive regions task.

    Thin wrapper around `load_task` that loads both promoters and
    enhancers and asks for the inactive regions only.

    Parameters
    ----------------------------------------
    cell_line: str = "K562",
        Cell line to consider. By default K562.
        Currently available cell lines are
        listed in the repository README file.
    assembly: str = "hg38",
        The genomic assembly of the data to be retrieved.
    dataset: str = "fantom",
        Dataset to consider. By default fantom.
        Currently available datasets are
        listed in the repository README file.
    metric: str = "mean",
        The metric to load.
    window_size: int = 256,
        Window size to consider. By default 256.
        Currently available window sizes are
        listed in the repository README file.
    root: str = "datasets",
        Where to store the downloaded data.
    verbose: int = 2,
        Verbosity level.
    **kwargs: Dict,
        Additional keyword arguments.
        They are accepted and ignored.

    Returns
    ----------------------------------------
    Return tuple with input and output DataFrames.
    """
    # Region selection that defines this task: both regions, inactive only.
    task_selection = {
        "only_active": False,
        "only_inactive": True,
        "load_enhancers": True,
        "load_promoters": True,
    }
    task_data = load_task(
        cell_line=cell_line,
        assembly=assembly,
        dataset=dataset,
        metric=metric,
        window_size=window_size,
        root=root,
        verbose=verbose,
        **task_selection
    )
    return task_data
def load_all_tasks(
    cell_line: str = "K562",
    assembly: str = "hg38",
    dataset: str = "fantom",
    metric: str = "mean",
    window_size: int = 256,
    root: str = "datasets",
    verbose: int = 2,
    leave: bool = False,
    binarize: bool = False,
    min_active_tpm_value: float = 1,
    max_inactive_tpm_value: float = 1,
):
    """Return generator with all the tasks.

    The tasks are executed lazily, one per iteration, in this order:
    active vs inactive enhancers, active vs inactive promoters,
    active enhancers vs active promoters and inactive enhancers vs
    inactive promoters.

    Parameters
    ----------------------------------------
    cell_line: str = "K562",
        Cell line to consider. By default K562.
        Currently available cell lines are
        listed in the repository README file.
    assembly: str = "hg38",
        The genomic assembly of the data to be retrieved.
    dataset: str = "fantom",
        Dataset to consider. By default fantom.
        Currently available datasets are
        listed in the repository README file.
    metric: str = "mean",
        The metric to load.
    window_size: int = 256,
        Window size to consider. By default 256.
        Currently available window sizes are
        listed in the repository README file.
    root: str = "datasets",
        Where to store the downloaded data.
    verbose: int = 2,
        Verbosity level.
        The loading bar is disabled when this is 0.
    leave: bool = False,
        Whether to leave the loading bar.
    binarize: bool = False,
        Whether to binarize the TPM values.
    min_active_tpm_value: float = 1,
        Minimum TPM value.
    max_inactive_tpm_value: float = 1,
        Maximum TPM value.
        Values between the minimum and maximum will be dropped.

    Returns
    ----------------------------------------
    Generator yielding, for each task, a tuple whose first element is the
    value returned by the task and whose second element is the name of
    the task function.
    """
    # Every task receives the very same keyword arguments.
    shared_parameters = dict(
        cell_line=cell_line,
        assembly=assembly,
        dataset=dataset,
        metric=metric,
        window_size=window_size,
        root=root,
        verbose=verbose,
        binarize=binarize,
        min_active_tpm_value=min_active_tpm_value,
        max_inactive_tpm_value=max_inactive_tpm_value,
    )
    all_tasks = (
        active_enhancers_vs_inactive_enhancers,
        active_promoters_vs_inactive_promoters,
        active_enhancers_vs_active_promoters,
        inactive_enhancers_vs_inactive_promoters,
    )
    # The loading bar is created right away, the tasks only run on iteration.
    loading_bar = tqdm(
        all_tasks,
        desc="Executing CRR prediction tasks",
        disable=verbose == 0,
        leave=leave
    )
    return (
        (current_task(**shared_parameters), current_task.__name__)
        for current_task in loading_bar
    )