ananse/plot.py from vanheeringen-lab/ANANSE

ananse/plot.py
Summary

Maintainability

1 hr
Test Coverage

16%
Issues
Coverage
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from adjustText import adjust_text
from loguru import logger
from matplotlib.lines import Line2D
from sklearn.preprocessing import minmax_scale


def plot_influence(infile, outfile):
    """Plot TF influence score to expression."""

    df = pd.read_table(infile, sep="\t", index_col="factor")
    df = df.dropna()
    factors = list(df.sort_values("influence_score").tail(20).index)
    xcol = "factor_fc"
    plt.figure(figsize=(8, 6))
    sns.regplot(
        data=df,
        x=xcol,
        y="influence_score",
        fit_reg=False,
        scatter_kws={"s": df["direct_targets"] / 10, "alpha": 0.5},
    )
    x = df.loc[factors, xcol]
    y = df.loc[factors, "influence_score"]
    texts = []
    for s, xt, yt in zip(factors, x, y):
        texts.append(plt.text(xt, yt, s))
    adjust_text(texts, arrowprops=dict(arrowstyle="-", color="black"))
    plt.xlabel("Log2 fold change of TF")
    plt.ylabel("Influence score")
    plt.savefig(outfile, dpi=300)
    plt.clf()


def read_diff_network(diff_network_file, full_output):
    """read the differential network file outputed by the influence command."""
    data_columns = ["source", "target", "weight"]
    if full_output:
        data_columns = None  # all
        # data_columns = [
        #     "source",
        #     "target",
        #     "weight",
        #     "weight_target",
        #     "weight_source",
        #     "tf_expression_target",
        #     "tf_expression_source",
        #     "target_expression_target",
        #     "target_expression_source",
        #     "weighted_binding_target",
        #     "weighted_binding_source",
        #     "tf_activity_target",
        #     "tf_activity_source",
        # ]
    rnet = pd.read_csv(
        diff_network_file,
        sep="\t",
        usecols=data_columns,
        dtype="float64",
        converters={"source": str, "target": str},
    )
    grn = nx.from_pandas_edgelist(rnet, edge_attr=True, create_using=nx.DiGraph)
    return grn


def plot_grn(
    infile,
    fname,
    outfile,
    edge_info="weight",
    network_algorithm="neato",
    n_tfs=20,
    cmap="viridis",
    edge_min=0.1,
    full_output=False,
):
    """
    Plot the top differential TFs and their interactions with the highest score into a GRN network image.

    Parameters
    ----------
    infile: str
        influence output file
    fname: str
        diffnetwork text file
    outfile: str
        output file location
    edge_info: str, optional
        column to use for interaction weight, default is 'weight'. Full output diff options:
        weight, weight_target, weight_source, tf_expression_target, tf_expression_source,
        target_expression_target, target_expression_source, weighted_binding_target, weighted_binding_source,
        tf_activity_target, tf_activity_source.
    network_algorithm: str , optional
        pyviz cluster algorithm used for node placement, options include: neato, dot, fdp, twopi, sfdp, circo
    n_tfs: int, optional
        number of (significantly differential expressed) Tfs to visualize
    cmap: str, optional
        matplotlib colour pallet used
    edge_min: float, optional
        minimum value the selected edge value needs to have to be included in the GRN,
        0.1 seems like a good number for weight.
    full_output: bool, optional
        required to use edge_info other than the default 'weight'.
    """
    if full_output is False and edge_info != "weight":
        logger.info(
            "selected custom edge weight, but missing full output option, selecting 'weight' as edge weight instead."
        )
        edge_info = "weight"

    # select the top TFs:
    df = pd.read_table(
        infile,
        sep="\t",
        index_col="factor",
        usecols=["factor", "influence_score", "G_score_scaled"],
    )
    df = df[df.G_score_scaled > 0]  # plot only TFs that are differential
    top_factors = list(df.sort_values("influence_score").tail(n_tfs).index)

    if len(df) == 0:
        logger.warning(f"No differential TFs in {infile}!")
        return

    # read the diff_network file into a network
    grn = read_diff_network(fname, full_output)
    # filter the diffnetwork to only contain topTF-topTF
    tf_grn = nx.subgraph_view(grn, filter_node=lambda tf: tf in top_factors)
    # filter the diffnetwork to only contain edges above the cuttoff value
    tf_grn2 = nx.DiGraph(
        ((u, v, e) for u, v, e in tf_grn.edges(data=True) if e[edge_info] > edge_min)
    )
    # make the network directed again after filtering
    tf_grn2 = tf_grn2.to_directed()
    # remove TFs with no interactions
    tf_grn2.remove_nodes_from(list(nx.isolates(tf_grn2)))
    # load all edge info for scaling edge width
    edge_atribute = list(nx.get_edge_attributes(tf_grn2, edge_info).values())
    if len(edge_atribute) == 0:
        raise ValueError("Top factors do not share any connections.")
    edge_atribute_scaled = minmax_scale(
        edge_atribute, feature_range=(0, 1), axis=0, copy=True
    )
    # calculate edge weight quantile regions to dipslay 4 relevant numbers within the edge legend
    # normzalize the edge atributes (interaction scores) to be between 0 and 1 (where 0-1 corresponds to edge width)
    edges_norm_weight = [
        0,
        round((np.quantile(sorted(edge_atribute_scaled), 1) / 4), 3),
        round((np.quantile(sorted(edge_atribute_scaled), 1) / 2), 3),
        round(np.quantile(sorted(edge_atribute_scaled), 1), 3),
    ]
    # Also get the numbers of the unnormalized edge numbers to display within the legend next to the lines
    edges_weight = [
        0,
        round((np.quantile(sorted(edge_atribute), 1) / 4), 3),
        round((np.quantile(sorted(edge_atribute), 1) / 2), 3),
        round(np.quantile(sorted(edge_atribute), 1), 3),
    ]
    # lets calculate the nodes their outdegree (edges regulating other TFs):
    outdegree = pd.DataFrame(tf_grn2.out_degree(weight=edge_info))
    outdegree = outdegree[1]
    node_outdegree_size = 600 + outdegree * 100

    # Lets set some plotstuff
    colors = outdegree
    vmin = min(colors)
    vmax = max(colors)
    cmap = plt.get_cmap(cmap)
    plt.figure(figsize=(10, 10))
    # calculate node position of the graph
    pos = nx.drawing.nx_agraph.graphviz_layout(tf_grn2, prog=network_algorithm)
    # plot the TF nodes
    nx.draw_networkx_nodes(
        tf_grn2, pos, node_size=node_outdegree_size, node_color=outdegree, cmap=cmap
    )
    # plot the TF name labels:
    nx.draw_networkx_labels(
        tf_grn2,
        pos,
        font_color="black",
        bbox=dict(facecolor="white", alpha=0.5, pad=0),
    )
    # plot the node colour legend:
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    cbar = plt.colorbar(sm, ax=plt.gca())
    cbar.ax.set_ylabel("outdegree (regulation other TFs)", rotation=270, labelpad=25)
    # plot the TF-TF edges:
    nx.draw_networkx_edges(
        tf_grn2,
        pos,
        arrows=True,
        arrowstyle="->",
        arrowsize=20,
        width=edge_atribute_scaled,
        node_size=node_outdegree_size,
        connectionstyle="arc3, rad = 0.1",
    )
    # add edge width legend:
    lines = []
    for _i, width in enumerate(edges_norm_weight):
        lines.append(Line2D([], [], linewidth=width, color="black"))
    plt.legend(
        lines,
        edges_weight,
        bbox_to_anchor=(0, 0.5),
        frameon=False,
        title=f"{edge_info}",
    )
    # save the plot
    plt.savefig(outfile, dpi=300)
    plt.clf()