vanheeringen-lab/ANANSE

View on GitHub
scripts/ananse

Summary

Maintainability
Test Coverage
#!/usr/bin/env python

import os
import sys
import argparse
from genomepy import Genome
from loguru import logger

from ananse import commands, __version__, SEPARATOR

logger.remove()
logger.add(
    sys.stderr, format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | {level} | {message}"
)


class NegateAction(argparse.Action):
    def __call__(self, parser, ns, values, option):  # noqa
        setattr(ns, self.dest, "include" in option)


if __name__ == "__main__":
    usage = "%(prog)s [-h] <command> [options]"
    description = "ANANSE: ANalysis Algorithm for Networks Specified by Enhancers"
    epilog = """
    commands:
        binding     predict TF binding sites in cis-regulatory region
        network     infer gene regulatory network
        influence   prioritize transcription factors
        plot        plot influence result in dotplot and GRN network
        view        view binding file
        
    """
    parser = argparse.ArgumentParser(
        usage=usage,
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "-v", "--version", action="version", version=f"%(prog)s v{__version__}"
    )

    subparser_dict = {}

    subparsers = parser.add_subparsers()

    # ananse binding
    p = subparsers.add_parser(
        "binding", add_help=False
    )  # help added manually (more control)
    subparser_dict["binding"] = p
    group = p.add_argument_group("Required arguments")
    group.add_argument(
        "-A",
        "--atac-bams",
        dest="atac_bams",
        metavar="BAM",
        help="ATAC-seq input BAM file(s) (or one counts table with reads per peak), "
        "can be used alone or in combination with the -H option",
        default=None,
        nargs="+",
    )
    group.add_argument(
        "-H",
        "--histone-bams",
        dest="histone_bams",
        metavar="BAM",
        help="H3K27ac ChIP-seq input BAM file(s) (or one counts table with reads per peak), "
        "can be used alone or in combination with the -A option",
        default=None,
        nargs="+",
    )
    group.add_argument(
        "-C",
        "--cage-tpms",
        dest="cage_tpms",
        help="CAGE-seq bidirectional regions (TPM) generated with CAGEfightR, "
             "cannot be used in combination with the -A and/or -H options",
        metavar="TPM",
        default=None,
    )
    group.add_argument(
        "-g",
        "--genome",
        dest="genome",
        help="Genome (genomepy name or FASTA file) used to align the BAMs and regions to (default: hg38)",
        metavar="NAME",
        default="hg38",
    )
    group = p.add_argument_group("Required arguments (optional for hg38)")
    group.add_argument(
        "-r",
        "--regions",
        dest="regions",
        help="Regions to analyse. Can be "
        "one or more BED format files (e.g. BED, narrowPeak, broadPeak) or "
        "one file with one region per line (e.g. 'chr1:100-200') or "
        "a space-separated list. Optional if a pfmscorefile is provided (used to filter those regions instead)",
        metavar="FILE",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-p",
        "--pfmfile",
        dest="pfmfile",
        help="PFM file of the transcription factors to search for (default: gimme.vertebrate.v5.0)",
        metavar="FILE",
        default=None,
    )
    group = p.add_argument_group("Optional arguments")
    group.add_argument(
        "-o",
        "--outdir",
        dest="outdir",
        help="Directory where you wish to store the output (default: ./ANANSE_binding)",
        metavar="DIR",
        default="./ANANSE_binding",
    )
    group.add_argument(
        "-c",
        "--columns",
        dest="columns",
        help="One or more (case insensitive) column names to extract "
        "from the counts table(s) (default: all)",
        metavar="COL",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-R",
        "--reference",
        dest="reference",
        help="Path to reference data directory",
        metavar="DIR",
        default=None,
    )
    group.add_argument(
        "--pfmscorefile",
        dest="pfmscorefile",
        help="Use precomputed gimmemotifs scores (gimme scan -Tz --gc -g GENOME REGIONS > SCAN.tsv)",
        metavar="FILE",
        default=None,
    )
    group.add_argument(
        "-t",
        "--tfs",
        dest="tfs",
        help="Filter Transcription Factors to use (default: all in motif2factors.txt). "
        "Either a space-separated list or one or more files with one TF per line",
        metavar="TF",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "--jaccard-cutoff",
        dest="jaccard_cutoff",
        help="TFs with a jaccard motif similarity >= the cutoff can be used as backup model. "
        "0: any similarity, 1: perfect similarity (default is 0.1)",
        metavar="FLOAT",
        type=float,
        default=0.1,
    )
    group.add_argument(
        "-n",
        "--ncore",
        dest="ncore",
        help="Number of cores to use.",
        metavar="INT",
        type=int,
        default=min(os.cpu_count(), 4),
    )
    group.add_argument(
        "-h", "--help", action="help", help="show this help message and exit"
    )
    p.set_defaults(func=commands.binding)

    # network.py
    p = subparsers.add_parser("network", add_help=False)
    subparser_dict["network"] = p
    group = p.add_argument_group("required arguments")
    group.add_argument(
        # "-b",
        # "--binding",
        dest="binding",
        help="TF binding prediction file (ANANSE binding output)",
        metavar="FILE",
        default=None,
        # required=True,
    )
    group.add_argument(
        "-e",
        "--expression",
        dest="fin_expression",
        help="Gene expression file(s) with genes as first column and "
        "expression column(s) in TPM values (column name(s) specified below). "
        "Genes must be in HGNC symbols, unless a genomepy gene annotation is provided. "
        "Files can include transcript-level TPMs "
        "(the quant.sf from salmon or the abundances.tsv from kallisto), "
        "or gene-level TPMs tables (summarized with e.g. tximeta). ",
        metavar="FILE",
        default=None,
        # required=True,
        nargs="+",
    )
    group = p.add_argument_group("Required arguments (optional for hg38)")
    group.add_argument(
        "-g",
        "--genome",
        dest="genome",
        help="Genome (genomepy name or FASTA file) used to align the BAMs and regions to (default: hg38)",
        metavar="NAME",
        default="hg38",
    )
    group.add_argument(
        "-a",
        "--annotation",
        dest="annotation",
        help="Gene annotation (genomepy name or BED12 file) used to quantify expression levels. "
        "Optional when a genomepy genome (with annotation) is providing.",
        metavar="BED",
        default=None,
    )
    group = p.add_argument_group("optional arguments")
    group.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        help="Name of the output network file (default: ./ANANSE_network.tsv)",
        metavar="FILE",
        default="./ANANSE_network.tsv",
    )
    group.add_argument(
        "-t",
        "--tfs",
        dest="tfs",
        help="Filter Transcription Factors to use (default: all in motif2factors.txt). "
        "Either a space-separated list or one or more files with one TF per line",
        metavar="TF",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-r",
        "--regions",
        dest="regions",
        help="Filter regions to use (default: all in binding.h5). "
        "Either one region/BED format file or a space-separated list.",
        metavar="FILE",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-c",
        "--columns",
        dest="column",
        help="One or more (case insensitive) column names to extract "
        "from the expression file(s) (default: tpm)",
        metavar="COL",
        default="tpm",
        nargs="*",
    )
    group.add_argument(
        "-f",
        "--full-output",
        dest="full_output",
        help="Export the full GRN output to the output file",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "--include-promoter",
        "--exclude-promoter",
        default=True,
        help="Include or exclude promoter peaks (<= TSS +/- 2kb) in network inference. "
        "By default promoter peaks are included.",
        dest="include_promoter",
        action=NegateAction,
        nargs=0,
    )
    group.add_argument(
        "--include-enhancer",
        "--exclude-enhancer",
        default=True,
        help="Include or exclude enhancer peaks (> TSS +/- 2kb) in network inference. "
        "By default enhancer peaks are included.",
        dest="include_enhancer",
        action=NegateAction,
        nargs=0,
    )
    group.add_argument(
        "-n",
        "--ncore",
        dest="ncore",
        help="Number of cores to use.",
        type=int,
        metavar="INT",
        default=min(os.cpu_count(), 4),
    )
    group.add_argument(
        "-h", "--help", action="help", help="show this help message and exit"
    )
    p.set_defaults(func=commands.network)

    # ananse influence
    p = subparsers.add_parser("influence", add_help=False)
    subparser_dict["influence"] = p
    group = p.add_argument_group("required arguments")
    group.add_argument(
        "-t",
        "--target",
        dest="target_file",
        help="Network of target cell type.",
        metavar="FILE",
        default=None,
        required=True,
    )
    group.add_argument(
        "-d",
        "--degenes",
        dest="expression",
        help="File with differential gene expression (DEseq2 output file). "
        "Genes must be in HGNC symbols, unless a genomepy gene annotation is provided. ",
        metavar="FILE",
        required=True,
    )
    group = p.add_argument_group("optional arguments")
    group.add_argument(
        "-s",
        "--source",
        dest="source_file",
        help="Network of source cell type.",
        metavar="FILE",
        default=None,
    )
    group.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        help="Name of the output influence file (default: ./ANANSE_influence.tsv)",
        metavar="FILE",
        default="./ANANSE_influence.tsv",
    )
    group.add_argument(
        "-f",
        "--full-output",
        dest="full_output",
        help="Export the full GRN output to the output file",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-a",
        "--annotation",
        dest="gene_gtf",
        help="Gene annotation (genomepy name or GTF file) used to quantify expression levels.",
        metavar="GTF",
        default=None,
    )
    group.add_argument(
        "-i",
        "--interactions",
        dest="edges",
        help="Number of top TF-gene interactions used (default: 500.000).",
        type=int,
        metavar="INT",
        default=500_000,
    )
    group.add_argument(
        "--select-after-join",
        dest="select_after_join",
        help="Select top interactions on differential network, instead of input networks.",
        default=False,
        action="store_true",
    )
    group.add_argument(
        "-w",
        "--whitelist",
        dest="whitelist",
        help="Include these genes/interactions after filtering top interactions. "
             f"Either a space-separated list or a file with one TF/gene/TF{SEPARATOR}gene interaction per line.",
        metavar="",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-j",
        "--padj",
        dest="padj_cutoff",
        help="Adjusted p-value below which genes classify as differential (default: 0.05).",
        type=float,
        metavar="FLOAT",
        default=0.05,
    )
    group.add_argument(
        "-c",
        "--column",
        dest="sort_column",
        help="Column of the network file(s) to select top interactions (default: prob).",
        default="prob",
        metavar="STR",
    )
    group.add_argument(
        "-n",
        "--ncore",
        dest="ncore",
        help="Number of cores to use.",
        type=int,
        metavar="INT",
        default=min(os.cpu_count(), 4),
    )
    group.add_argument(
        "-h", "--help", action="help", help="show this help message and exit"
    )
    p.set_defaults(func=commands.influence)

    # ananse View
    p = subparsers.add_parser(
        "view",
        add_help=False,
        description="Explore the contents of an ANANSE binding file.",
    )
    subparser_dict["view"] = p
    group = p.add_argument_group("required arguments")
    group.add_argument(
        "infile",
        help="TF binding prediction file (ANANSE binding output)",
        metavar="FILE",
    )
    group = p.add_argument_group("optional arguments")
    group.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        help="Output file (tab-separated text, default: stdout)",
        metavar="FILE",
        default=None,
    )
    group.add_argument(
        "-t",
        "--tfs",
        dest="tfs",
        help="Transcription factor(s) to display (default: all)",
        metavar="TF",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-r",
        "--regions",
        dest="regions",
        help="Region(s) to display (default: all)",
        metavar="REGION",
        default=None,
        nargs="*",
    )
    group.add_argument(
        "-F",
        "--format",
        dest="format",
        default="wide",
        help="Display format: wide (n columns) or long (3 columns) (default: wide)",
    )
    group.add_argument(
        "-n",
        dest="n",
        metavar="INT",
        help="Number of regions and tfs to display (default: all)",
        default=None,
    )
    group.add_argument(
        "-lr",
        "--list-regions",
        dest="list_regions",
        help="Return a list of regions",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-lt",
        "--list-tfs",
        dest="list_tfs",
        help="Return a list of transcription factors",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-a",
        "--activity",
        dest="activity",
        help="Return activity scores of transcription factors",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-h", "--help", action="help", help="show this help message and exit"
    )
    p.set_defaults(func=commands.view)

    # ananse plot
    p = subparsers.add_parser("plot", add_help=False)
    group = p.add_argument_group("required arguments")
    group.add_argument(
        # "-i",
        # "--influence-file",
        dest="infile",
        help="TF influence file (ANANSE influence output)",
        metavar="FILE",
        # default=None,
        # required=True,
    )
    group.add_argument(
        "-d",
        "--diff-network",
        help="TF influence diffnetwork file (also ANANSE influence output)",
        dest="GRN_file",
        metavar="FILE",
        # default=None,
    )
    group = p.add_argument_group("optional arguments")
    group.add_argument(
        "-o",
        "--outdir",
        dest="outdir",
        help="Directory where you wish to store the output (default: ./ANANSE_plot)",
        metavar="DIR",
        default="./ANANSE_plot",
    )
    group.add_argument(
        "--edge-info",
        help="Column to use for edges of GRN, default: 'weight'. "
        "When full_output is specified, options are 'wb_diff' ,'tf_act_diff', 'tf_expr_diff', 'tg_expr_diff'",
        dest="edge_info",
        type=str,
        default="weight",
    )
    group.add_argument(
        "--edge-min",
        help="Minimum value for an edge to be included in the GRN image",
        dest="edge_min",
        type=float,
        default=0.0,
    )
    group.add_argument(
        "--node-placement",
        help="pyviz cluster algorithm used for node placement, options include: neato, dot, fdp, twopi, sfdp, circo",
        dest="network_algorithm",
        type=str,
        default="neato",
    )
    group.add_argument(
        "--n-tfs",
        help="Amount of TFs to plot in the GRN, default is top 20 differential TFs",
        dest="n_TFs",
        type=int,
        default=20,
    )
    group.add_argument(
        "-c",
        "--cmap",
        dest="cmap",
        help="matlotlib colour library",
        type=str,
        default="viridis",
    )
    group.add_argument(
        "-f",
        "--full-output",
        dest="full_output",
        help="Select if the diffnetwork is a full output file",
        action="store_true",
        default=False,
    )
    group.add_argument(
        "-t",
        "--type",
        dest="ftype",
        help="Specify the output filetype (default: pdf)",
        default="pdf",
    )
    group.add_argument(
        "-h", "--help", action="help", help="show this help message and exit"
    )
    p.set_defaults(func=commands.plot)

    if len(sys.argv) == 1:
        parser.print_help()
    elif len(sys.argv) == 2 and sys.argv[-1] in subparser_dict:
        subparser_dict[sys.argv[-1]].print_help()
    else:
        args = parser.parse_args()
        if hasattr(args, "genome"):
            if args.genome is not None:
                try:
                    Genome(args.genome)
                except Exception as e:  # noqa
                    logger.error(str(e))
                    logger.error(
                        "  Have you installed your genome with genomepy? "
                        "See https://github.com/vanheeringen-lab/genomepy for details. "
                    )
                    logger.error("  Alternatively, you can specify a FASTA file.")
                    sys.exit(0)

        if args.func.__name__.startswith("run_"):
            args.func(**vars(args))
        else:
            args.func(args)