scripts/ananse
#!/usr/bin/env python
import os
import sys
import argparse
from genomepy import Genome
from loguru import logger
from ananse import commands, __version__, SEPARATOR
logger.remove()
logger.add(
sys.stderr, format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | {level} | {message}"
)
class NegateAction(argparse.Action):
def __call__(self, parser, ns, values, option): # noqa
setattr(ns, self.dest, "include" in option)
if __name__ == "__main__":
usage = "%(prog)s [-h] <command> [options]"
description = "ANANSE: ANalysis Algorithm for Networks Specified by Enhancers"
epilog = """
commands:
binding predict TF binding sites in cis-regulatory region
network infer gene regulatory network
influence prioritize transcription factors
plot plot influence result in dotplot and GRN network
view view binding file
"""
parser = argparse.ArgumentParser(
usage=usage,
description=description,
epilog=epilog,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"-v", "--version", action="version", version=f"%(prog)s v{__version__}"
)
subparser_dict = {}
subparsers = parser.add_subparsers()
# ananse binding
p = subparsers.add_parser(
"binding", add_help=False
) # help added manually (more control)
subparser_dict["binding"] = p
group = p.add_argument_group("Required arguments")
group.add_argument(
"-A",
"--atac-bams",
dest="atac_bams",
metavar="BAM",
help="ATAC-seq input BAM file(s) (or one counts table with reads per peak), "
"can be used alone or in combination with the -H option",
default=None,
nargs="+",
)
group.add_argument(
"-H",
"--histone-bams",
dest="histone_bams",
metavar="BAM",
help="H3K27ac ChIP-seq input BAM file(s) (or one counts table with reads per peak), "
"can be used alone or in combination with the -A option",
default=None,
nargs="+",
)
group.add_argument(
"-C",
"--cage-tpms",
dest="cage_tpms",
help="CAGE-seq bidirectional regions (TPM) generated with CAGEfightR, "
"cannot be used in combination with the -A and/or -H options",
metavar="TPM",
default=None,
)
group.add_argument(
"-g",
"--genome",
dest="genome",
help="Genome (genomepy name or FASTA file) used to align the BAMs and regions to (default: hg38)",
metavar="NAME",
default="hg38",
)
group = p.add_argument_group("Required arguments (optional for hg38)")
group.add_argument(
"-r",
"--regions",
dest="regions",
help="Regions to analyse. Can be "
"one or more BED format files (e.g. BED, narrowPeak, broadPeak) or "
"one file with one region per line (e.g. 'chr1:100-200') or "
"a space-separated list. Optional if a pfmscorefile is provided (used to filter those regions instead)",
metavar="FILE",
default=None,
nargs="*",
)
group.add_argument(
"-p",
"--pfmfile",
dest="pfmfile",
help="PFM file of the transcription factors to search for (default: gimme.vertebrate.v5.0)",
metavar="FILE",
default=None,
)
group = p.add_argument_group("Optional arguments")
group.add_argument(
"-o",
"--outdir",
dest="outdir",
help="Directory where you wish to store the output (default: ./ANANSE_binding)",
metavar="DIR",
default="./ANANSE_binding",
)
group.add_argument(
"-c",
"--columns",
dest="columns",
help="One or more (case insensitive) column names to extract "
"from the counts table(s) (default: all)",
metavar="COL",
default=None,
nargs="*",
)
group.add_argument(
"-R",
"--reference",
dest="reference",
help="Path to reference data directory",
metavar="DIR",
default=None,
)
group.add_argument(
"--pfmscorefile",
dest="pfmscorefile",
help="Use precomputed gimmemotifs scores (gimme scan -Tz --gc -g GENOME REGIONS > SCAN.tsv)",
metavar="FILE",
default=None,
)
group.add_argument(
"-t",
"--tfs",
dest="tfs",
help="Filter Transcription Factors to use (default: all in motif2factors.txt). "
"Either a space-separated list or one or more files with one TF per line",
metavar="TF",
default=None,
nargs="*",
)
group.add_argument(
"--jaccard-cutoff",
dest="jaccard_cutoff",
help="TFs with a jaccard motif similarity >= the cutoff can be used as backup model. "
"0: any similarity, 1: perfect similarity (default is 0.1)",
metavar="FLOAT",
type=float,
default=0.1,
)
group.add_argument(
"-n",
"--ncore",
dest="ncore",
help="Number of cores to use.",
metavar="INT",
type=int,
default=min(os.cpu_count(), 4),
)
group.add_argument(
"-h", "--help", action="help", help="show this help message and exit"
)
p.set_defaults(func=commands.binding)
# network.py
p = subparsers.add_parser("network", add_help=False)
subparser_dict["network"] = p
group = p.add_argument_group("required arguments")
group.add_argument(
# "-b",
# "--binding",
dest="binding",
help="TF binding prediction file (ANANSE binding output)",
metavar="FILE",
default=None,
# required=True,
)
group.add_argument(
"-e",
"--expression",
dest="fin_expression",
help="Gene expression file(s) with genes as first column and "
"expression column(s) in TPM values (column name(s) specified below). "
"Genes must be in HGNC symbols, unless a genomepy gene annotation is provided. "
"Files can include transcript-level TPMs "
"(the quant.sf from salmon or the abundances.tsv from kallisto), "
"or gene-level TPMs tables (summarized with e.g. tximeta). ",
metavar="FILE",
default=None,
# required=True,
nargs="+",
)
group = p.add_argument_group("Required arguments (optional for hg38)")
group.add_argument(
"-g",
"--genome",
dest="genome",
help="Genome (genomepy name or FASTA file) used to align the BAMs and regions to (default: hg38)",
metavar="NAME",
default="hg38",
)
group.add_argument(
"-a",
"--annotation",
dest="annotation",
help="Gene annotation (genomepy name or BED12 file) used to quantify expression levels. "
"Optional when a genomepy genome (with annotation) is providing.",
metavar="BED",
default=None,
)
group = p.add_argument_group("optional arguments")
group.add_argument(
"-o",
"--outfile",
dest="outfile",
help="Name of the output network file (default: ./ANANSE_network.tsv)",
metavar="FILE",
default="./ANANSE_network.tsv",
)
group.add_argument(
"-t",
"--tfs",
dest="tfs",
help="Filter Transcription Factors to use (default: all in motif2factors.txt). "
"Either a space-separated list or one or more files with one TF per line",
metavar="TF",
default=None,
nargs="*",
)
group.add_argument(
"-r",
"--regions",
dest="regions",
help="Filter regions to use (default: all in binding.h5). "
"Either one region/BED format file or a space-separated list.",
metavar="FILE",
default=None,
nargs="*",
)
group.add_argument(
"-c",
"--columns",
dest="column",
help="One or more (case insensitive) column names to extract "
"from the expression file(s) (default: tpm)",
metavar="COL",
default="tpm",
nargs="*",
)
group.add_argument(
"-f",
"--full-output",
dest="full_output",
help="Export the full GRN output to the output file",
action="store_true",
default=False,
)
group.add_argument(
"--include-promoter",
"--exclude-promoter",
default=True,
help="Include or exclude promoter peaks (<= TSS +/- 2kb) in network inference. "
"By default promoter peaks are included.",
dest="include_promoter",
action=NegateAction,
nargs=0,
)
group.add_argument(
"--include-enhancer",
"--exclude-enhancer",
default=True,
help="Include or exclude enhancer peaks (> TSS +/- 2kb) in network inference. "
"By default enhancer peaks are included.",
dest="include_enhancer",
action=NegateAction,
nargs=0,
)
group.add_argument(
"-n",
"--ncore",
dest="ncore",
help="Number of cores to use.",
type=int,
metavar="INT",
default=min(os.cpu_count(), 4),
)
group.add_argument(
"-h", "--help", action="help", help="show this help message and exit"
)
p.set_defaults(func=commands.network)
# ananse influence
p = subparsers.add_parser("influence", add_help=False)
subparser_dict["influence"] = p
group = p.add_argument_group("required arguments")
group.add_argument(
"-t",
"--target",
dest="target_file",
help="Network of target cell type.",
metavar="FILE",
default=None,
required=True,
)
group.add_argument(
"-d",
"--degenes",
dest="expression",
help="File with differential gene expression (DEseq2 output file). "
"Genes must be in HGNC symbols, unless a genomepy gene annotation is provided. ",
metavar="FILE",
required=True,
)
group = p.add_argument_group("optional arguments")
group.add_argument(
"-s",
"--source",
dest="source_file",
help="Network of source cell type.",
metavar="FILE",
default=None,
)
group.add_argument(
"-o",
"--outfile",
dest="outfile",
help="Name of the output influence file (default: ./ANANSE_influence.tsv)",
metavar="FILE",
default="./ANANSE_influence.tsv",
)
group.add_argument(
"-f",
"--full-output",
dest="full_output",
help="Export the full GRN output to the output file",
action="store_true",
default=False,
)
group.add_argument(
"-a",
"--annotation",
dest="gene_gtf",
help="Gene annotation (genomepy name or GTF file) used to quantify expression levels.",
metavar="GTF",
default=None,
)
group.add_argument(
"-i",
"--interactions",
dest="edges",
help="Number of top TF-gene interactions used (default: 500.000).",
type=int,
metavar="INT",
default=500_000,
)
group.add_argument(
"--select-after-join",
dest="select_after_join",
help="Select top interactions on differential network, instead of input networks.",
default=False,
action="store_true",
)
group.add_argument(
"-w",
"--whitelist",
dest="whitelist",
help="Include these genes/interactions after filtering top interactions. "
f"Either a space-separated list or a file with one TF/gene/TF{SEPARATOR}gene interaction per line.",
metavar="",
default=None,
nargs="*",
)
group.add_argument(
"-j",
"--padj",
dest="padj_cutoff",
help="Adjusted p-value below which genes classify as differential (default: 0.05).",
type=float,
metavar="FLOAT",
default=0.05,
)
group.add_argument(
"-c",
"--column",
dest="sort_column",
help="Column of the network file(s) to select top interactions (default: prob).",
default="prob",
metavar="STR",
)
group.add_argument(
"-n",
"--ncore",
dest="ncore",
help="Number of cores to use.",
type=int,
metavar="INT",
default=min(os.cpu_count(), 4),
)
group.add_argument(
"-h", "--help", action="help", help="show this help message and exit"
)
p.set_defaults(func=commands.influence)
# ananse View
p = subparsers.add_parser(
"view",
add_help=False,
description="Explore the contents of an ANANSE binding file.",
)
subparser_dict["view"] = p
group = p.add_argument_group("required arguments")
group.add_argument(
"infile",
help="TF binding prediction file (ANANSE binding output)",
metavar="FILE",
)
group = p.add_argument_group("optional arguments")
group.add_argument(
"-o",
"--outfile",
dest="outfile",
help="Output file (tab-separated text, default: stdout)",
metavar="FILE",
default=None,
)
group.add_argument(
"-t",
"--tfs",
dest="tfs",
help="Transcription factor(s) to display (default: all)",
metavar="TF",
default=None,
nargs="*",
)
group.add_argument(
"-r",
"--regions",
dest="regions",
help="Region(s) to display (default: all)",
metavar="REGION",
default=None,
nargs="*",
)
group.add_argument(
"-F",
"--format",
dest="format",
default="wide",
help="Display format: wide (n columns) or long (3 columns) (default: wide)",
)
group.add_argument(
"-n",
dest="n",
metavar="INT",
help="Number of regions and tfs to display (default: all)",
default=None,
)
group.add_argument(
"-lr",
"--list-regions",
dest="list_regions",
help="Return a list of regions",
action="store_true",
default=False,
)
group.add_argument(
"-lt",
"--list-tfs",
dest="list_tfs",
help="Return a list of transcription factors",
action="store_true",
default=False,
)
group.add_argument(
"-a",
"--activity",
dest="activity",
help="Return activity scores of transcription factors",
action="store_true",
default=False,
)
group.add_argument(
"-h", "--help", action="help", help="show this help message and exit"
)
p.set_defaults(func=commands.view)
# ananse plot
p = subparsers.add_parser("plot", add_help=False)
group = p.add_argument_group("required arguments")
group.add_argument(
# "-i",
# "--influence-file",
dest="infile",
help="TF influence file (ANANSE influence output)",
metavar="FILE",
# default=None,
# required=True,
)
group.add_argument(
"-d",
"--diff-network",
help="TF influence diffnetwork file (also ANANSE influence output)",
dest="GRN_file",
metavar="FILE",
# default=None,
)
group = p.add_argument_group("optional arguments")
group.add_argument(
"-o",
"--outdir",
dest="outdir",
help="Directory where you wish to store the output (default: ./ANANSE_plot)",
metavar="DIR",
default="./ANANSE_plot",
)
group.add_argument(
"--edge-info",
help="Column to use for edges of GRN, default: 'weight'. "
"When full_output is specified, options are 'wb_diff' ,'tf_act_diff', 'tf_expr_diff', 'tg_expr_diff'",
dest="edge_info",
type=str,
default="weight",
)
group.add_argument(
"--edge-min",
help="Minimum value for an edge to be included in the GRN image",
dest="edge_min",
type=float,
default=0.0,
)
group.add_argument(
"--node-placement",
help="pyviz cluster algorithm used for node placement, options include: neato, dot, fdp, twopi, sfdp, circo",
dest="network_algorithm",
type=str,
default="neato",
)
group.add_argument(
"--n-tfs",
help="Amount of TFs to plot in the GRN, default is top 20 differential TFs",
dest="n_TFs",
type=int,
default=20,
)
group.add_argument(
"-c",
"--cmap",
dest="cmap",
help="matlotlib colour library",
type=str,
default="viridis",
)
group.add_argument(
"-f",
"--full-output",
dest="full_output",
help="Select if the diffnetwork is a full output file",
action="store_true",
default=False,
)
group.add_argument(
"-t",
"--type",
dest="ftype",
help="Specify the output filetype (default: pdf)",
default="pdf",
)
group.add_argument(
"-h", "--help", action="help", help="show this help message and exit"
)
p.set_defaults(func=commands.plot)
if len(sys.argv) == 1:
parser.print_help()
elif len(sys.argv) == 2 and sys.argv[-1] in subparser_dict:
subparser_dict[sys.argv[-1]].print_help()
else:
args = parser.parse_args()
if hasattr(args, "genome"):
if args.genome is not None:
try:
Genome(args.genome)
except Exception as e: # noqa
logger.error(str(e))
logger.error(
" Have you installed your genome with genomepy? "
"See https://github.com/vanheeringen-lab/genomepy for details. "
)
logger.error(" Alternatively, you can specify a FASTA file.")
sys.exit(0)
if args.func.__name__.startswith("run_"):
args.func(**vars(args))
else:
args.func(args)