gimmemotifs/report.py
# Copyright (c) 2009-2019 Simon van Heeringen <simon.vanheeringen@gmail.com>
#
# This module is free software. You can redistribute it and/or modify it under
# the terms of the MIT License, see the file COPYING included with this
# distribution.
"""Reports (graphical and text) for motifs statistics."""
import logging
import os
import re
import shutil
import sys
from datetime import datetime
from multiprocessing import Pool
import jinja2
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.io.formats.style import Styler
from pandas.io.formats.style_render import non_reducing_slice
from statsmodels.stats.multitest import multipletests
from gimmemotifs import __version__
from gimmemotifs.comparison import MotifComparer
from gimmemotifs.config import MotifConfig
from gimmemotifs.fasta import Fasta
from gimmemotifs.motif import read_motifs
from gimmemotifs.plot import roc_plot
from gimmemotifs.stats import add_star, calc_stats, write_stats
from gimmemotifs.utils import motif_localization
try:
import emoji # noqa: currently optional
except ImportError:
pass
# Module-level logger; all report functions in this file log through it.
logger = logging.getLogger("gimme.report")

# HTML snippet used as the display name of the "factors" column in the HTML
# reports (see the ``rename_columns`` mappings in ``maelstrom_html_report`` and
# ``roc_html_report``). The ``title`` attribute carries a tooltip that explains
# the difference between "direct" and "predicted" factor assignments; the
# visible text is a small legend using the same two colors.
# The trailing ``.replace`` removes every newline that is directly followed by
# a space from the literal.
FACTOR_TOOLTIP = """
<div title='
\"Direct\" means that there is direct evidence of binding or that this assignment
is based on curated information.
\n\n
\"Predicted\" means that the motif comes from a non-curated ChIP-seq experiment
or that the factor was computationally predicted to bind this motif based on its
DNA binding domain.
'>factors<br/>(<span style='color:black'>direct</span> or
<span style='color:#666666'>predicted</span>)</div>
""".replace(
    "\n ", ""
)
def _wrap_html_str(x):
if " " not in x:
return x
min_pos, max_pos = 0, len(x)
if ">" in x and "</" in x:
m = re.compile(r">[^<>]*<").search(x)
min_pos, max_pos = m.start(), m.end()
positions = [m.start() for m in re.compile(" ").finditer(x)]
positions = [p for p in positions if min_pos < p < max_pos]
if len(positions) == 0:
return x
pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0]
x = x[:pos] + "<br/>" + x[pos + 1 :]
return x
class ExtraStyler(Styler):
    """
    Extra styles for a DataFrame or Series based on pandas.styler using HTML and CSS.

    Two copies of the data are kept: ``self.data`` (the original values, used
    for numerical decisions such as colors) and ``self.display_data`` (the
    values that end up in the rendered table, which may contain HTML).
    Operations that rewrite cell contents to HTML (``add_circle``, ``wrap``,
    ``add_tooltip``, ``convert_to_image``) are queued and only applied when
    the table is rendered.
    """

    # add our own templates to those in Styler
    loader = jinja2.ChoiceLoader(
        [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader]
    )
    env = jinja2.Environment(loader=loader)
    template_html = env.get_template("table.tpl")  # sortable reports with slick theme

    def __init__(self, *args, **kwargs):
        # Queue of (method getter, args, kwargs) applied at render time.
        self._data_todo = []
        self.circle_styles = None
        self.palette_styles = None
        self.col_heading_style = {
            "name": "col_heading",
            "props": [("border-bottom", "1px solid #e0e0e0")],
        }
        super().__init__(*args, **kwargs)
        self.display_data = self.data.copy()
        self.font = "Nunito Sans"

    def set_font(self, font_name):
        """
        Set the font that will be used.

        Parameters
        ----------
        font_name : str
            Should be a font name available though the Google Font API.

        Returns
        -------
        self : ExtraStyler

        Notes
        -----
        ``font_name`` can contain spaces, eg. "Nunito Sans".

        Examples
        --------
        >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
        >>> ExtraStyler(df).set_font("Roboto")
        """
        self.font = font_name
        return self

    def _current_index(self, subset):
        """Translate a label-based ``subset`` into positional (row, column) indexers."""
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)
        selected = self.data.loc[subset]
        idx_slice = pd.IndexSlice[
            self.data.index.get_indexer(selected.index),
            self.data.columns.get_indexer(selected.columns),
        ]
        return idx_slice

    def _translate(self, *args, **kwargs):
        """Apply the queued data operations and add our extra template variables."""
        self._compute_data()
        d = super()._translate(*args, **kwargs)
        circle_styles = self.circle_styles or []
        palette_styles = self.palette_styles or []
        col_heading_style = self.col_heading_style or []
        d.update(
            {
                "font": self.font,
                "circle_styles": circle_styles,
                "palette_styles": palette_styles,
                "col_heading_style": col_heading_style,
            }
        )
        return d

    def _compute_data(self):
        """
        Run all queued display operations and make ``display_data`` the data
        that is rendered.
        """
        r = self
        for func, args, kwargs in self._data_todo:
            r = func(self)(*args, **kwargs)
        # The queued operations have now been baked into display_data. Clear
        # the queue so that rendering a second time does not re-apply them to
        # the already converted (HTML) values.
        self._data_todo = []
        r.data = r.display_data
        return r

    def _tooltip(self, tip, subset=None, part=None):
        """
        Wrap cells, column labels or index labels in a ``<div>`` with ``tip``
        as its ``title`` attribute.

        ``part`` is "data" (default), "columns" or "index"; any other value
        raises a ``ValueError``.
        """
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        if part is None:
            part = "data"

        if part == "data":
            self.display_data.loc[subset] = (
                "<div title='"
                + tip
                + "'>"
                + self.display_data.loc[subset].astype(str)
                + "</div>"
            )
        elif part == "columns":
            idx = self._current_index(subset)[1]
            rename = dict(
                zip(
                    self.display_data.columns[idx],
                    "<div title='"
                    + tip
                    + "'>"
                    + self.display_data.columns[idx].astype(str)
                    + "</div>",
                )
            )
            self.display_data.rename(columns=rename, inplace=True)
        elif part == "index":
            idx = self._current_index(subset)[0]
            rename = dict(
                zip(
                    self.display_data.index[idx],
                    "<div title='"
                    + tip
                    + "'>"
                    + self.display_data.index[idx].astype(str)
                    + "</div>",
                )
            )
            self.display_data.rename(index=rename, inplace=True)
        else:
            raise ValueError(f"unknown value for part: {part}")
        return self

    @staticmethod
    def _wrap_iterable(it):
        """Apply ``_wrap_html_str`` to every value of ``it``."""
        return [_wrap_html_str(val) for val in it]

    def _wrap(self, subset=None, axis=0):
        """
        Insert a line break in long labels.

        With ``axis`` 0 or "columns" the column labels selected by ``subset``
        are wrapped, with ``axis`` 1 or "index" the index labels. Any other
        value raises a ``ValueError``.
        """
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        if axis in [0, "columns"]:
            idx = self._current_index(subset)[1]
            rename = dict(
                zip(
                    self.display_data.columns[idx],
                    self._wrap_iterable(self.display_data.columns[idx]),
                )
            )
            self.display_data.rename(columns=rename, inplace=True)
        elif axis in [1, "index"]:
            idx = self._current_index(subset)[0]
            rename = dict(
                zip(
                    self.display_data.index[idx],
                    self._wrap_iterable(self.display_data.index[idx]),
                )
            )
            self.display_data.rename(index=rename, inplace=True)
        else:
            raise ValueError(f"unknown value for axis: {axis}")
        return self

    def _convert_to_image(self, subset=None, height=30):
        """Replace the selected cells (file names / URLs) by ``<img>`` tags."""
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        self.display_data.loc[subset] = (
            f'<div style="height:{height}px;object-fit:contain;"><img src="'
            + self.data.loc[subset].astype(str)
            + '" style="height:100%;width:100%;object-fit:contain;"/></div>'
        )
        return self

    @staticmethod
    def _border(idx, location="left", width="2px", style="solid", color="#444"):
        """Return one CSS border declaration per element of ``idx``."""
        return [f"border-{location}: {width} {style} {color};" for _ in idx]

    def border(
        self,
        subset=None,
        location="bottom",
        part="data",
        width="2px",
        style="solid",
        color="#444",
    ):
        """
        Add a border to data cells or to the column headings.

        Parameters
        ----------
        subset : IndexSlice, optional
            An argument to ``DataFrame.loc`` that restricts which data cells
            ``border`` is applied to. Only used when ``part`` is "data".
        location : str, optional
            Location of the border, default is "bottom". Can be "top", "bottom",
            "right" or "left".
        part : str, optional
            If ``part`` is "data" (default), the border will be applied to the
            data cells. Any other value (e.g. "columns") adds the border to
            the column heading style instead.
        width : str, int or float, optional
            Valid CSS value for border width.
        style : str, optional
            Valid CSS value for border style.
        color : str, optional
            Valid CSS value for border color.

        Returns
        -------
        self : ExtraStyler

        Examples
        --------
        >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
        >>> ExtraStyler(df).border(part="columns")
        """
        if part == "data":
            # width, style and color used to be silently ignored for data
            # cells; the defaults reproduce the old fixed "2px solid #444".
            self.apply(
                self._border,
                subset=subset,
                location=location,
                width=width,
                style=style,
                color=color,
            )
        else:
            self.col_heading_style["props"].append(
                (f"border-{location}", f"{width} {style} {color}")
            )
        return self

    @staticmethod
    def _align(idx, location="center"):
        """Return one CSS ``text-align`` declaration per element of ``idx``."""
        return [f"text-align:{location};" for _ in idx]

    def align(self, subset=None, location="center", axis=0):
        """
        Align text.

        Parameters
        ----------
        subset : IndexSlice, optional
            An argument to ``DataFrame.loc`` that restricts which elements
            ``align`` is applied to.
        location : str, optional
            "center", "left" or "right"
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Apply to each column (``axis=0`` or ``'index'``), to each row
            (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
            with ``axis=None``.

        Returns
        -------
        self : ExtraStyler
        """
        self.apply(self._align, subset=subset, location=location, axis=axis)
        return self

    def to_precision_str(self, subset=None, precision=0, include_zero=True):
        """
        Format numbers as strings with a fixed number of decimals.

        Values that are at most ``10**-precision`` are shown as
        ``<10**-precision`` (e.g. "<1" for ``precision=0``). The display data
        is updated immediately.

        Parameters
        ----------
        subset : IndexSlice, optional
            An argument to ``DataFrame.loc`` that restricts which elements
            are formatted.
        precision : int, optional
            Number of decimals.
        include_zero : bool, optional
            If False, only values greater than zero can be shown with the
            "<" notation.

        Returns
        -------
        self : ExtraStyler
        """
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        def precision_str(x, p=precision):
            if (include_zero or x > 0) and x <= 10**-p:
                return f"<{10**-p}"
            else:
                return f"{x:.{p}f}"

        self.display_data.loc[subset] = self.data.loc[subset].applymap(precision_str)
        return self

    def _circle(
        self,
        subset=None,
        show_text=True,
        color=None,
        cmap=None,
        vmax=None,
        scale=False,
        size=25,
        min_size=5,
        morph=False,
    ):
        """
        Draw the selected cells as colored circles.

        A single ``color`` gives every circle the same color. Otherwise the
        values are binned over the colors of ``cmap`` (a light blue palette of
        10 colors by default), where ``vmax`` (default: the maximum of the
        selected data) maps to the last color. With ``scale`` the circle size
        grows with the bin from ``min_size`` to ``size``, with ``morph`` the
        border radius decreases with the bin. ``show_text`` controls if the
        current cell text is shown inside the circle.
        """
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subslice = non_reducing_slice(subset)

        if color:
            palette = sns.color_palette([color])
        elif cmap is None:
            palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10)
        else:
            palette = sns.color_palette(cmap)
        n_colors = len(palette)

        # Make sure we don't select text columns
        if n_colors > 1:
            subslice = pd.IndexSlice[
                self.data.loc[subslice].index,
                self.data.loc[subslice].select_dtypes(exclude=["object"]).columns,
            ]
        idx = self._current_index(subslice)

        self.circle_styles = self.circle_styles or []
        circle_id = len(self.circle_styles) + 1

        props = [
            ("height", f"{size}px"),
            ("width", f"{size}px"),
            ("border-radius", "50%"),
            ("color", "#000"),
            ("line-height", f"{size}px"),
            ("display", "inline-block"),
            ("text-align", "center"),
            ("vertical-align", "middle"),
        ]
        self.circle_styles.append({"name": f"circle{circle_id}", "props": props})

        self.palette_styles = self.palette_styles or []
        # One CSS class per palette color: color<circle_id>_<bin>
        for i, hex_color in enumerate(palette.as_hex()):
            props = [("background-color", hex_color)]
            if scale:
                circle_size = min_size + ((size - min_size) / n_colors * (i + 1))
                props += [
                    ("height", f"{circle_size}px"),
                    ("width", f"{circle_size}px"),
                    ("line-height", f"{circle_size}px"),
                    ("text-align", "center"),
                ]
            if morph:
                props += [("border-radius", f"{50 - int(50 / n_colors) * i}%")]
            self.palette_styles.append(
                {"name": f"color{circle_id}_{i}", "props": props}
            )

        text = self.display_data.iloc[idx].astype(str) if show_text else ""
        if n_colors > 1:
            # The 1% margin makes sure that the maximum value itself still
            # falls into the last bin.
            vmax = (
                self.data.loc[subslice].max().max() * 1.01
                if vmax is None
                else vmax * 1.01
            )
            # Clip to the available palette classes, so that values above a
            # user-supplied vmax (or below zero) still get a color.
            color_bin = (
                (self.data.loc[subslice] / (vmax / n_colors))
                .astype(int)
                .clip(lower=0, upper=n_colors - 1)
                .astype(str)
            )
            self.display_data.iloc[idx] = (
                f"<div class='circle{circle_id} color{circle_id}_"
                + color_bin
                + "'>"
                + text
                + "</div>"
            )
        else:
            self.display_data.iloc[idx] = (
                f"<div class='circle{circle_id} color{circle_id}_0'>" + text + "</div>"
            )

        return self

    def add_circle(self, **kwargs):
        """Queue ``_circle(**kwargs)``; it is applied when the table is rendered."""
        self._data_todo.append((lambda instance: instance._circle, (), kwargs))
        return self

    def wrap(self, **kwargs):
        """Queue ``_wrap(**kwargs)``; it is applied when the table is rendered."""
        self._data_todo.append((lambda instance: instance._wrap, (), kwargs))
        return self

    def add_tooltip(self, tip, **kwargs):
        """Queue ``_tooltip(tip, **kwargs)``; it is applied when the table is rendered."""
        self._data_todo.append((lambda instance: instance._tooltip, (tip,), kwargs))
        return self

    def convert_to_image(self, **kwargs):
        """Queue ``_convert_to_image(**kwargs)``; it is applied when the table is rendered."""
        self._data_todo.append(
            (lambda instance: instance._convert_to_image, (), kwargs)
        )
        return self

    def rename(self, columns=None, index=None):
        """Rename columns and/or index labels of the displayed table."""
        self.display_data = self.display_data.rename(columns=columns, index=index)
        return self

    @staticmethod
    def _emojize(text):
        """
        Convert emoji aliases (":star:") in ``text`` to emoji characters.

        The ``use_aliases`` argument was replaced by ``language="alias"`` in
        newer releases of the emoji package, so try the new spelling first and
        fall back to the old one.
        """
        try:
            return emoji.emojize(text, language="alias")
        except (TypeError, KeyError):
            # Older emoji release: ``language`` is either not a parameter
            # (TypeError) or does not know "alias" (KeyError).
            return emoji.emojize(text, use_aliases=True)

    @staticmethod
    def _emoji_score(series, emoji_str=None, bins=None):
        """
        Convert values to a repeated emoji (":star:" by default).

        The values are cut into ``bins`` (default 3) and a value in the n-th
        bin is shown as n emojis.
        """
        if emoji_str is None:
            emoji_str = ":star:"
        if bins is None:
            bins = 3

        if isinstance(bins, int):
            labels = range(1, bins + 1)
        else:
            # bins is a sequence of edges: one label per interval
            labels = range(1, len(bins))

        return [
            ExtraStyler._emojize(emoji_str * val)
            for val in pd.cut(series, bins=bins, labels=labels)
        ]

    @staticmethod
    def _emoji_scale(series, emojis=None, bins=None):
        """
        Convert values to an emoji from an ordered scale.

        ``emojis`` is the name of the scale: "thumbs", "check", "smiley"
        (default), "black_square" or "white_square". By default there is one
        bin per emoji of the scale.

        Raises
        ------
        ValueError
            If ``emojis`` is not a known scale.
        """
        emoji_dict = {
            "thumbs": [":thumbsdown:", ":thumbsup:"],
            "check": [":cross_mark:", ":white_check_mark:"],
            "smiley": [
                ":crying_face:",
                ":slightly_frowning_face:",
                ":neutral_face:",
                ":slightly_smiling_face:",
                ":grin:",
            ],
            "black_square": [
                ":black_small_square:",
                ":black_medium_small_square:",
                ":black_medium_square:",
                ":black_large_square:",
            ],
            "white_square": [
                ":white_small_square:",
                ":white_medium_small_square:",
                ":white_medium_square:",
                ":white_large_square:",
            ],
        }

        if emojis is None:
            emojis = "smiley"

        if emojis not in emoji_dict:
            # Used to fail inside pd.cut with an unrelated message about bins
            # or labels.
            raise ValueError(
                f"unknown value for emojis: {emojis}, "
                f"should be one of {', '.join(emoji_dict)}"
            )
        labels = emoji_dict[emojis]

        if bins is None:
            bins = len(labels)

        return [
            ExtraStyler._emojize(val)
            for val in pd.cut(series, bins=bins, labels=labels)
        ]

    def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):
        """
        Show the selected values as emojis from an ordered scale (see
        ``_emoji_scale``) and center them. Applied immediately.
        """
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        idx = self._current_index(subset=subset)

        result = self.display_data.iloc[idx].apply(
            self._emoji_scale, axis=axis, result_type="expand", args=(emojis, bins)
        )
        self.display_data.iloc[idx] = result.values

        return self.align(subset=subset, location="center", axis=axis)

    def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):
        """
        Show the selected values as a number of repeated emojis (see
        ``_emoji_score``) and left-align them. Applied immediately.
        """
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        idx = self._current_index(subset=subset)
        result = self.display_data.iloc[idx].apply(
            self._emoji_score, axis=axis, result_type="expand", args=(emoji_str, bins)
        )
        self.display_data.iloc[idx] = result.values

        return self.align(subset=subset, location="left", axis=axis)

    def emojify(self, subset=None):
        """Run ``emoji.emojize`` on every selected cell of the display data."""
        subset = pd.IndexSlice[:, :] if subset is None else subset
        subset = non_reducing_slice(subset)

        idx = self._current_index(subset=subset)
        result = self.display_data.iloc[idx].applymap(emoji.emojize)
        self.display_data.iloc[idx] = result.values

        return self

    def scaled_background_gradient(
        self,
        subset=None,
        cmap="RdBu_r",
        low=0.0,
        high=0.0,
        center_zero=False,
        vmin=None,
        vmax=None,
    ):
        """
        Color the background in a gradient, like ``Styler.background_gradient``.

        Parameters
        ----------
        subset : IndexSlice, optional
            An argument to ``DataFrame.loc`` that restricts which elements
            are colored.
        cmap : str, optional
            Name of the colormap.
        low, high : float, optional
            Passed on to ``background_gradient``.
        center_zero : bool, optional
            If True, the color range is made symmetrical around zero:
            ``vmax`` becomes the largest absolute value of ``vmin`` and
            ``vmax`` and ``vmin`` becomes ``-vmax``. Where ``vmin`` or
            ``vmax`` is not specified it is taken from the selected data,
            ignoring infinite values.
        vmin, vmax : float, optional
            Limits of the color range.

        Returns
        -------
        self : ExtraStyler
        """
        if center_zero:
            sub = pd.IndexSlice[:, :] if subset is None else subset
            sub = non_reducing_slice(sub)

            vmax = (
                self.data.loc[sub]
                .replace({np.inf: np.nan, -np.inf: np.nan})
                .max(skipna=True)
                .max()
                if vmax is None
                else vmax
            )
            vmin = (
                self.data.loc[sub]
                .replace({np.inf: np.nan, -np.inf: np.nan})
                .min(skipna=True)
                .min()
                if vmin is None
                else vmin
            )
            vmax = max(abs(vmax), abs(vmin))
            vmin = -vmax

        r = self.background_gradient(
            subset=subset,
            cmap=cmap,
            vmin=vmin,
            vmax=vmax,
            low=low,
            high=high,
        )

        return r
def get_roc_values(motif, fg_file, bg_file, genome):
    """
    Compute the ROC curve coordinates of one motif (worker for the ROC plots).

    Returns a tuple ``(None, x, y)``; the first element is an error slot that
    is always ``None`` here. If anything fails, the motif and its id are
    logged and the exception is re-raised.
    """
    try:
        motif_stats = calc_stats(
            fg_file=fg_file,
            bg_file=bg_file,
            motifs=motif,
            genome=genome,
            stats=["roc_values"],
            ncpus=1,
        )
        # Only a single motif was scored: take the first (only) entry.
        first_entry = list(motif_stats.values())[0]
        roc_x, roc_y = first_entry["roc_values"]
        return None, roc_x, roc_y
    except Exception:
        for detail in (motif, motif.id):
            logger.error(detail)
        raise
def create_roc_plots(pfmfile, fgfa, background, outdir, genome):
    """
    Make ROC plots for all motifs.

    For every combination of motif and background a ROC curve is calculated
    in a worker process and saved as ``<outdir>/images/<motif>_roc.<bg>.png``.

    Parameters
    ----------
    pfmfile : str
        File with motifs.
    fgfa : str
        Foreground file, passed to ``get_roc_values`` as ``fg_file``.
    background : dict
        Background name as key and background file as value.
    outdir : str
        Output directory; the "images" subdirectory is created if needed.
    genome : str
        Passed on to ``get_roc_values``.
    """
    motifs = read_motifs(pfmfile, as_dict=True)
    ncpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=ncpus)
    try:
        jobs = {}
        for bg, fname in background.items():
            for m_id, motif in motifs.items():
                jobs[(m_id, bg)] = pool.apply_async(
                    get_roc_values, (motif, fgfa, fname, genome)
                )
        # No more work will be submitted
        pool.close()

        imgdir = os.path.join(outdir, "images")
        os.makedirs(imgdir, exist_ok=True)

        for (m_id, bg), job in jobs.items():
            error, x, y = job.get()
            if error:
                logger.error("Error in thread: %s", error)
                logger.error("Motif: %s", m_id)
                sys.exit(1)
            roc_img_file = os.path.join(imgdir, f"{m_id}_roc.{bg}.png")
            roc_plot(roc_img_file, x, y)
    finally:
        # Also reached when a job raised or on sys.exit: make sure that no
        # worker processes are left behind. On the normal path all results
        # have already been collected, so terminating is harmless.
        pool.terminate()
        pool.join()
def _create_text_report(inputfile, motifs, closest_match, stats, outdir):
    """
    Write the motif statistics, extended with the best database match, as text.

    For every motif and background the statistics are copied and the name and
    p-value of the closest database motif are added. Motifs that are missing
    from ``stats`` are reported in the log and get an empty entry. The result
    is handed to ``write_stats`` with ``<outdir>/stats.{}.txt`` as file name
    template.
    """
    report_stats = {}
    for motif in motifs:
        match = closest_match[motif.id]
        motif_key = str(motif)
        report_stats[motif_key] = {}

        # The backgrounds are taken from the first entry in stats
        for bg in list(stats.values())[0]:
            if motif_key in stats:
                report_stats[motif_key][bg] = stats[motif_key][bg].copy()
                bg_stats = report_stats[motif_key][bg]
                # Name of the match without its last "_"-separated field
                bg_stats["best_match"] = "_".join(match[0].split("_")[:-1])
                bg_stats["best_match_pvalue"] = match[1][-1]
                continue

            logger.error("####")
            logger.error(f"{motif_key} not found")
            for known in sorted(stats.keys()):
                logger.error(known)
            logger.error("####")

    header = f"# GimmeMotifs version {__version__}\n# Inputfile: {inputfile}\n"
    stats_template = os.path.join(outdir, "stats.{}.txt")
    write_stats(report_stats, stats_template, header=header)
def _create_graphical_report(inputfile, pwm, background, closest_match, outdir, stats):
    """
    Create main gimme_motifs output html report.

    For every motif a logo and the logo of the closest database motif are
    plotted in ``<outdir>/images``. Together with the statistics per
    background these are rendered with the "report_template.jinja.html"
    template to ``<outdir>/gimme.denovo.html``.

    Parameters
    ----------
    inputfile : str
        Name of the input file, shown in the report.
    pwm : str
        File with the motifs to report.
    background : dict
        Background types; only the keys are used here.
    closest_match : dict
        For every motif id a tuple, with the id of the best matching database
        motif as first element; the last value of the second element is
        shown as the p-value of the match.
    outdir : str
        Output directory.
    stats : dict
        Statistics per motif (``str(motif)``) and background. Missing
        entries are replaced by default values.
    """
    logger.debug("Creating graphical report")

    class ReportMotif(object):
        """Placeholder for motif stats."""

        pass

    config = MotifConfig()

    imgdir = os.path.join(outdir, "images")
    os.makedirs(imgdir, exist_ok=True)

    motifs = read_motifs(pwm, fmt="pfm")

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()
    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)

    report_motifs = []
    for motif in motifs:
        # Use empty defaults for motifs / backgrounds without statistics. The
        # original lookup returned None for a missing background and then
        # failed on ``None.get``.
        motif_stats = stats.get(str(motif), {})

        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": f"#{motif.id}"}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", f"{motif.id}.png")}
        motif.plot_logo(fname=os.path.join(imgdir, f"{motif.id}.png"))

        # TODO: fix best ID
        rm.best = "Gimme"  # best_id[motif.id]

        rm.consensus = motif.to_consensus()
        # Mean number of stars over all backgrounds, rounded half up
        rm.stars = int(
            np.mean([motif_stats.get(bg, {}).get("stars", 0) for bg in background])
            + 0.5
        )

        rm.bg = {}
        for bg in background:
            rm.bg[bg] = {}
            this_stats = motif_stats.get(bg, {})
            # TODO: fix these stats
            rm.bg[bg]["e"] = f"{this_stats.get('enr_at_fpr', 1.0):.2f}"
            rm.bg[bg]["p"] = f"{this_stats.get('phyper_at_fpr', 1.0):.2f}"
            rm.bg[bg]["auc"] = f"{this_stats.get('roc_auc', 0.5):.3f}"
            rm.bg[bg]["mncp"] = f"{this_stats.get('mncp', 1.0):.3f}"
            roc_img = os.path.join(
                "images", os.path.basename(f"{motif.id}_roc.{bg}") + ".png"
            )
            rm.bg[bg]["roc_img"] = {"src": roc_img}
            rm.bg[bg]["roc_img_link"] = {"href": roc_img}

        histogram_img = os.path.join("images", f"{motif.id}_histogram.svg")
        rm.histogram_img = {"data": histogram_img}
        rm.histogram_link = {"href": histogram_img}

        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].plot_logo(fname=os.path.join(imgdir, f"{match_id}.png"))

        rm.match_img = {"src": os.path.join("images", f"{match_id}.png")}
        rm.match_id = match_id
        rm.match_pval = f"{closest_match[motif.id][1][-1]:.2e}"

        report_motifs.append(rm)

    total_report = os.path.join(outdir, "gimme.denovo.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(imgdir, "star.png"))

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
        motifs=report_motifs,
        inputfile=inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
        bg_types=list(background.keys()),
    )

    with open(total_report, "wb") as f:
        f.write(result.encode("utf-8"))
def create_denovo_motif_report(
    inputfile, pfmfile, fgfa, background, locfa, outdir, params, stats=None
):
    """
    Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile : str
        Name of the input file, mentioned in the reports.
    pfmfile : str
        File with the motifs to report.
    fgfa : str
        Foreground FASTA file.
    background : dict
        Background name as key and background file as value.
    locfa : str
        FASTA file used for the motif localization plots.
    outdir : str
        Output directory.
    params : dict or None
        Parameters; "genome" and "cutoff_fpr" (default 0.9) are used.
    stats : dict, optional
        Pre-computed statistics per motif and background. If not specified
        they are calculated here.
    """
    logger.info("creating de novo reports")

    # The parameters are used from here on, so they need to be normalized
    # first (previously ``params["genome"]`` was read before this check, which
    # failed for ``params=None``).
    if not params:
        params = {}
    cutoff_fpr = params.get("cutoff_fpr", 0.9)

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): without a "genome" entry None is passed on; this assumes
    # that calc_stats accepts genome=None - confirm.
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs).items():
                if m not in stats:
                    stats[m] = {}
                stats[m][bg] = s
        stats = add_star(stats)

    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug(f" {motif.id} {motif}")
        outfile = os.path.join(outdir, "images", f"{motif.id}_histogram.svg")
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pfmfile, background, closest_match, outdir, stats
    )
def motif_to_factor_series(series, pfmfile=None, motifs=None):
    """
    Map motif ids to an HTML description of their factors.

    ``series`` is a ``pd.Series`` or ``pd.Index`` of motif ids. The motifs
    are looked up in ``motifs`` (a dict, read from ``pfmfile`` if not
    specified). The result is a string Series with the same index as
    ``series`` (or ``series`` itself as index, if it is an Index).
    """
    if motifs is None:
        motifs = read_motifs(pfmfile, as_dict=True)

    index = series if isinstance(series, pd.Index) else series.index

    factors = []
    for motif_id in series:
        factors.append(motifs[motif_id].format_factors(html=True))

    return pd.Series(data=factors, index=index, dtype=str)
def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"):
    """
    Plot motif logos and return the image file names.

    Parameters
    ----------
    series : pd.Series or pd.Index
        Motif ids.
    pfmfile : str, optional
        File with motifs, only read if ``motifs`` is not specified.
    motifs : dict, optional
        Motif id as key and motif as value.
    outdir : str, optional
        Output directory.
    subdir : str, optional
        Subdirectory of ``outdir`` where the logos are stored.

    Returns
    -------
    pd.Series
        File name of the logo of every motif, relative to ``outdir``, with
        the same index as ``series`` (or ``series`` itself, if it is an
        Index).

    Raises
    ------
    ValueError
        If a motif id is not present in ``motifs``.
    """
    if motifs is None:
        motifs = read_motifs(pfmfile, as_dict=True)

    # Also creates outdir itself if necessary
    os.makedirs(os.path.join(outdir, subdir), exist_ok=True)

    img_series = []
    for motif in series:
        if motif not in motifs:
            raise ValueError(f"Motif {motif} does not occur in motif database")
        # Only keep characters that are safe in a file name
        fname = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
        fname = os.path.join(subdir, f"{fname}.png")
        # The logo is stored in outdir, so that is where we have to look for
        # an existing file (the check used to be relative to the working
        # directory, so existing logos were never detected).
        target = os.path.join(outdir, fname)
        if not os.path.exists(target):
            motifs[motif].plot_logo(fname=target)
        img_series.append(fname)

    if isinstance(series, pd.Index):
        index = series
    else:
        index = series.index
    return pd.Series(data=img_series, index=index, dtype=str)
def maelstrom_html_report(
    outdir,
    infile,
    pfmfile=None,
    threshold=3,
    plot_all_motifs=False,
    plot_no_motifs=False,
):
    """
    Create the maelstrom HTML report from the maelstrom text report.

    The report is written to ``gimme.maelstrom.report.html`` in ``outdir``.

    Parameters
    ----------
    outdir : str
        Output directory, also used to store the motif logos.
    infile : str
        Tab-separated maelstrom text report, with the motif ids in the first
        column.
    pfmfile : str, optional
        File with motifs, passed on to ``motif_to_img_series`` and
        ``motif_to_factor_series``.
    threshold : float, optional
        Only motifs with an absolute value of at least ``threshold`` in one
        of the value columns are included.
    plot_all_motifs : bool, optional
        Plot the logos of all motifs in ``infile``, not only of the motifs
        that pass the threshold.
    plot_no_motifs : bool, optional
        Do not add a column with motif logos to the report.
    """
    # Read the maelstrom text report
    df = pd.read_table(infile, index_col=0)
    # Columns with maelstrom rank aggregation value
    value_cols = df.columns[
        ~df.columns.str.contains("corr") & ~df.columns.str.contains("% with motif")
    ]
    # Columns with correlation values
    corr_cols = df.columns[df.columns.str.contains("corr")]
    if plot_all_motifs:
        # Only called to create the logo files, the result is not needed
        _ = motif_to_img_series(
            df.index, pfmfile=pfmfile, outdir=outdir, subdir="logos"
        )
    # Keep the rows where any of the value columns reaches the threshold
    df = df[np.any(abs(df[value_cols]) >= threshold, 1)]
    if not plot_no_motifs:
        # Add motif logo's
        df.insert(
            0,
            "logo",
            motif_to_img_series(
                df.index, pfmfile=pfmfile, outdir=outdir, subdir="logos"
            ),
        )
    # Add factors that can bind to the motif
    df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
    # The "factors" header is replaced by a header with an explanatory tooltip
    rename_columns = {"factors": FACTOR_TOOLTIP}
    df_styled = (
        ExtraStyler(df)
        .format(precision=2)
        .pipe(
            lambda d: d
            if plot_no_motifs
            else d.convert_to_image(subset=["logo"], height=30)
        )
        .scaled_background_gradient(
            subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75
        )
        .border(subset=list(value_cols[:1]), location="left")
        .border(part="columns", location="bottom")
        .set_table_attributes('class="sortable-theme-slick" data-sortable')
        .align(subset=list(value_cols), location="center")
        .set_font("Nunito Sans")
        .rename(columns=rename_columns)
    )
    # The correlation columns get their own color map
    if len(corr_cols) > 0:
        df_styled = (
            df_styled.wrap(subset=list(corr_cols))
            .align(subset=list(corr_cols), location="center")
            .scaled_background_gradient(
                subset=corr_cols,
                cmap="PuOr_r",
                center_zero=True,
                low=1 / 1.75,
                high=1 / 1.75,
            )
        )
    # Columns with percentages are shown as colored circles
    for col in df.columns:
        if "% with motif" in col:
            df_styled = (
                df_styled.add_circle(subset=[col], cmap="Purples", vmax=100, size=30)
                .wrap(subset=[col])
                .align(subset=[col], location="center")
                .border(subset=[col], location="left")
                .to_precision_str(subset=[col])
            )
    # Wrap all column names and render the table to an HTML string
    df_styled = df_styled.wrap().render()
    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write(df_styled)
def roc_html_report(
    outdir,
    infile,
    pfmfile,
    outname="gimme.motifs.html",
    threshold=0.01,
    use_motifs=None,
    link_matches=False,
):
    """
    Create an HTML report of motif enrichment statistics.

    The P-values are corrected for multiple testing (Benjamini-Hochberg) and
    the motifs with a corrected P-value of at most ``threshold`` are written
    as a styled table, sorted by ROC AUC, to ``outname`` in ``outdir``. If no
    motifs are left, the report only mentions that no enriched motifs were
    found.

    Parameters
    ----------
    outdir : str
        Output directory, also used to store the motif logos.
    infile : str
        Tab-separated file with statistics, with the motif ids in the first
        column.
    pfmfile : str
        File with motifs.
    outname : str, optional
        File name of the report.
    threshold : float, optional
        Maximum corrected P-value of the motifs in the report.
    use_motifs : list, optional
        Only use these motifs. With an empty list the "no enriched motifs"
        report is written.
    link_matches : bool, optional
        Turn the "# matches" column into links to the files in
        "motif_scan_results".
    """
    df = pd.read_table(infile, index_col=0)
    df.rename_axis(None, inplace=True)
    motifs = read_motifs(pfmfile, as_dict=True)
    if use_motifs is not None and len(use_motifs) == 0:
        with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
            f.write("<body>No enriched motifs found.</body>")
        return
    if use_motifs is not None:
        motifs = {k: v for k, v in motifs.items() if k in use_motifs}
        idx = list(motifs.keys())
        df = df.loc[idx]
    try:
        df.insert(
            2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1]
        )
    except ZeroDivisionError:
        # Fall back to the uncorrected P-values
        logger.error(f"ZeroDivisionError when correcting {df['P-value']}")
        df.insert(2, "corrected P-value", df["P-value"])
    df.insert(3, "-log10 P-value", -np.log10(df["corrected P-value"]))
    df = df[df["corrected P-value"] <= threshold]
    # Columns in the report, in order of appearance
    cols = [
        "factors",
        "logo",
        "% matches input",
        "% matches background",
        "-log10 P-value",
        "ROC AUC",
        "PR AUC",
        "Enr. at 1% FPR",
        "Recall at 10% FDR",
    ]
    # NOTE(review): "# matches" is not part of ``cols``, so the links that are
    # created here are dropped by ``df = df[cols]`` below - confirm if the
    # column was meant to be included in the report.
    if link_matches:
        df["# matches"] = (
            "<a href=motif_scan_results/"
            + df.index.to_series().str.replace(r"[^a-zA-Z0-9\-]+", "_", regex=True)
            + ".matches.bed>"
            + df["# matches"].astype(str)
            + "</a>"
        )
    # Add motif logo's
    df.insert(
        0,
        "logo",
        motif_to_img_series(
            df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos"
        ),
    )
    # Add factors that can bind to the motif
    df.insert(
        0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs)
    )
    df = df[cols]
    # Columns with numbers, these are centered
    bar_cols = [
        "% matches input",
        "% matches background",
        "-log10 P-value",
        "ROC AUC",
        "PR AUC",
        "Enr. at 1% FPR",
        "Recall at 10% FDR",
    ]
    df["% matches input"] = df["% matches input"].astype(int)
    df["% matches background"] = df["% matches background"].astype(int)
    # The "factors" header is replaced by a header with an explanatory tooltip
    rename_columns = {"factors": FACTOR_TOOLTIP}
    df = df.sort_values("ROC AUC", ascending=False)
    with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
        if df.shape[0] > 0:
            f.write(
                ExtraStyler(df)
                .convert_to_image(
                    subset=["logo"],
                    height=30,
                )
                .add_circle(
                    subset=["% matches input", "% matches background"],
                    vmax=100,
                    cmap="Purples",
                )
                .scaled_background_gradient(
                    "-log10 P-value", vmin=0, high=0.3, cmap="Reds"
                )
                .scaled_background_gradient(
                    "ROC AUC", vmin=0.5, vmax=1, high=0.3, cmap="Reds"
                )
                .scaled_background_gradient(
                    "PR AUC", vmin=0, vmax=1, high=0.3, cmap="Reds"
                )
                .scaled_background_gradient(
                    "Enr. at 1% FPR", vmin=1, high=0.3, cmap="Reds"
                )
                .scaled_background_gradient(
                    "Recall at 10% FDR", vmin=0, vmax=1, high=0.7, cmap="Reds"
                )
                .format(precision=2)
                .set_table_attributes('class="sortable-theme-slick" data-sortable')
                .wrap(subset=cols)
                .align(subset=bar_cols, location="center")
                .rename(columns=rename_columns)
                .to_precision_str(subset=["% matches input", "% matches background"])
                .render()
            )
        else:
            f.write("<body>No enriched motifs found.</body>")