lib/gamtools/main.py
"""
===============
The main module
===============
The `main` module creates the relatively complex argument parser used by the
GAMtools command line application, and the :func:`main` function which handles
dispatching command line arguments to the correct functions.
"""
# more or less everything in this module is in the global scope, so pylint
# thinks objects are constants that should be in capital letters
# pylint: disable=invalid-name
import sys
import os
import argparse
from wrapit.parser import add_doit_options
import pytest
from . import bias, cosegregation, matrix, call_windows, enrichment, radial_position, \
permutation, pipeline, select_samples, compaction, resolution, utils
def add_window_calling_options(base_parser):
"""
Add options related to window calling to an argparse parser.
"""
base_parser.add_argument(
'-f', '--fitting-folder', metavar='FITTING_FOLDER',
help='If specified, save the individual curve fittings to this folder')
base_parser.add_argument(
'-d', '--details-file',
help='If specified, write a table of fitting parameters to this path')
base_parser.add_argument(
'--min-read-threshold',
type=int,
default=None,
help='If specified, require at least this many reads to call a window as positive')
seg_method = base_parser.add_mutually_exclusive_group()
seg_method.add_argument(
'--orphan-window-bounds', metavar='LOW_PERCENTILE,HIGH_PERCENTILE',
dest='fitting_function',
type=call_windows.orphan_windows_fitting_func,
default='0,98',
help='Only accept thresholds within the given range of percentiles.')
seg_method.add_argument(
'-x', '--fixed-threshold', metavar='MIN_COVERAGE',
dest='fitting_function',
type=call_windows.fixed_threshold_fitting_func,
help='Instead of fitting each sample individually, use a fixed threshold '
'to determine which windows are positive.')
base_parser.set_defaults(
fitting_function=call_windows.orphan_windows_fitting_func('0,98'))
parser = argparse.ArgumentParser(
prog='gamtools',
description='GAMtools is a suite of tools for working with Genome'
' Architecture Mapping (GAM) data.')
subparsers = parser.add_subparsers(help='Please select a command:')
subparsers.required = True
subparsers.dest = 'command'
# Options for the 'bias' command
bias_parser = subparsers.add_parser(
'bias',
help='Calculate the bias in a set of matrices based on a genomic feature.')
bias_parser.add_argument(
'-m', '--matrix-paths', metavar='MATRIX_PATH', nargs='+', required=True,
help='Paths to matrix files for each chromosome')
bias_parser.add_argument(
'-f', '--feature-path', metavar='FEATURE_PATH', required=True,
help='Paths to bed file with one additional column giving the feature to \
use for calculating biases')
bias_parser.add_argument(
'-o', '--output-path', required=True,
help='Path to save output matrix')
bias_parser.set_defaults(
func=bias.calculate_bias_from_args)
# Options for the 'call_windows' command
call_windows_parser = subparsers.add_parser(
'call_windows',
help='Call positive windows for individual NPs')
call_windows_parser.add_argument(
'coverage_file', type=argparse.FileType('r'),
help='Input coverage file generated by bedtools coverage '
'(or "-" to read from stdin)')
call_windows_parser.add_argument(
'-o', '--output-file', type=argparse.FileType('w'),
default=sys.stdout,
help='Output segregation file to create '
'(or "-" to write to stdout)')
add_window_calling_options(call_windows_parser)
call_windows_parser.set_defaults(
func=call_windows.threshold_from_args)
# Options for the 'compaction' command
compaction_parser = subparsers.add_parser(
'compaction',
help='Get the compaction of each genomic window from a segmentation file')
compaction_parser.add_argument(
'-s', '--segregation-file', required=True,
type=argparse.FileType('r'),
help='A segregation file to use for calculating compaction'
'(or - to read from stdin)')
compaction_parser.add_argument(
'-o', '--output-file', required=True,
type=argparse.FileType('w'),
help='Output bedgraph file path (or - to write to stdout)')
compaction_parser.add_argument(
'-n', '--no-blanks', default=False,
action='store_true',
help='Do not include lines with no value in the output.')
compaction_parser.set_defaults(func=compaction.compaction_from_args)
# Options for the 'convert' command
convert_parser = subparsers.add_parser(
'convert',
help='Convert between different GAM matrix formats')
convert_parser.add_argument(
'input_file', metavar='INPUT_MATRIX',
help='Input matrix file to convert')
convert_parser.add_argument(
'output_file', metavar='OUTPUT_MATRIX',
help='Output matrix file to write to '
'(or "-" to write to stdout).')
convert_parser.add_argument(
'-i', '--input-format',
choices=matrix.INPUT_FORMATS,
help='Input matrix file format (choose from: {})'.format(
', '.join(matrix.INPUT_FORMATS.keys())))
convert_parser.add_argument(
'-o', '--output-format',
choices=matrix.OUTPUT_FORMATS,
help='Output matrix file format (choose from: {})'.format(
', '.join(matrix.OUTPUT_FORMATS.keys())))
convert_parser.add_argument(
'-t', '--thresholds-file', metavar='THRESHOLDS_FILE',
help='Thresholds file. If specified, any values lower than the specified'
' thresholds will be masked/excluded from the output file')
convert_parser.add_argument(
'-w', '--windows-file', metavar='WINDOWS_FILE',
help='File containing the genomic locations of matrix bins '
'(only required if not specified in input matrix file).')
convert_parser.add_argument(
'-r', '--region',
help='Region covered by the input matrix '
'required if -w/--windows-file is specified')
convert_parser.set_defaults(func=matrix.convert_from_args)
# Options for the 'enrichment' command
enrichment_parser = subparsers.add_parser(
'enrichment',
help='Calculate enrichments of SLICE interactions')
enrichment_parser.add_argument(
'-i', '--interactions-file', required=True,
help='A file containing all pairwise interactions between genomic windows')
enrichment_parser.add_argument(
'-c', '--classes-file', required=True,
help='A file containing classes assigned to each genomic window')
enrichment_parser.add_argument(
'-o', '--output-prefix', default='enrichment_results',
help='First part of the output file name')
# Either permuting the data or not permuting must be specified
perm_type = enrichment_parser.add_mutually_exclusive_group(required=True)
perm_type.add_argument(
'-p', '--permutations', default=10, type=int, dest='num_permutations',
help='Number of times to randomly permute the input file')
perm_type.add_argument(
'-n',
'--no-permute',
action='store_const',
const=0,
dest='num_permutations',
help='Do not permute the input file, instead calculate observed counts')
enrichment_parser.set_defaults(func=enrichment.enrichment_from_args)
# Options for the 'matrix' command
matrix_parser = subparsers.add_parser(
'matrix',
help='Generate a GAM matrix from a segregation file')
matrix_parser.add_argument(
'-r', '--regions', metavar='REGION', required=True, nargs='+',
help='Specific genomic regions to calculate matrices for. '
'If one region is specified, a matrix is calculated for that region '
'against itself. If more than one region is specified, a matrix is '
'calculated for each region against the other. Regions are specified '
'using UCSC browser syntax, i.e. "chr4" for the whole of chromosome '
'4 or "chr4:100000-200000" for a sub-region of the chromosome.')
matrix_parser.add_argument(
'-s', '--segregation_file', required=True,
help='A segregation file to use as input')
matrix_parser.add_argument(
'-f', '--output-format',
choices=matrix.OUTPUT_FORMATS,
help='Output matrix file format (choose from: {}, default is txt.gz)'.format(
', '.join(matrix.OUTPUT_FORMATS.keys())))
matrix_parser.add_argument(
'-t',
'--matrix-type',
default='npmi',
choices=cosegregation.MATRIX_TYPES,
help='Method used to calculate the interaction matrix (choose from: '
'{}, default is npmi)'.format(
', '.join(
cosegregation.MATRIX_TYPES.keys())))
matrix_parser.add_argument(
'-o', '--output-file', metavar='OUTPUT_FILE',
help='Output matrix file. If not specified, new file will have the '
'same name as the segregation file and an extension indicating the '
'genomic region(s) and the matrix method')
matrix_parser.set_defaults(func=cosegregation.matrix_from_args)
# Options for 'permute_segregation' command
permute_seg_parser = subparsers.add_parser(
'permute_segregation',
help='Circularly permute the columns of a GAM segregation file')
permute_seg_parser.add_argument(
'-s', '--segregation_file', required=True,
type=argparse.FileType('r'),
help='A segregation file to circularly permute '
'(or - to read from stdin)')
permute_seg_parser.add_argument(
'-o', '--output-file', required=True,
type=argparse.FileType('w'),
help='Output file path (or - to write to stdout)')
permute_seg_parser.set_defaults(func=permutation.permute_segregation_from_args)
# Options for 'process' command
process_parser = subparsers.add_parser(
'process_nps',
help='Run a pipeline for mapping raw GAM sequencing data and '
'calling positive windows')
process_parser.add_argument(
'input_fastqs', metavar='INPUT_FASTQ', nargs='+',
help='One or more input fastq files.')
process_parser.add_argument(
'-g', '--genome-file', metavar='GENOME_FILE', required=True,
help='File containing chromosome names and lengths')
process_parser.add_argument(
'-o', '--output_dir', metavar='OUPUT_DIRECTORY',
default=os.getcwd(),
help='Write segregation, matrix etc. to this directory')
process_parser.add_argument(
'-i', '--bigwigs', action='append_const',
dest='to_run', const='Converting bedgraph to bigwig',
help='Make bigWig files.')
process_parser.add_argument(
'-b', '--bigbeds', action='append_const',
dest='to_run', const='Getting segregation bigBed',
help='Make bed files of positive windows')
process_parser.add_argument(
'-c', '--do-qc', action='append_const',
dest='to_run', const='do_qc',
help='Perform sample quality control.')
process_parser.add_argument(
'-w', '--window-sizes', metavar='WINDOW_SIZE',
default=[50000], type=int, nargs='+',
help='One or more window sizes for calling positive windows')
process_parser.add_argument(
'-m', '--matrix-sizes', metavar='MATRIX_SIZE',
default=[], type=int, nargs='+',
help='Resolutions for which linkage matrices should be produced.')
process_parser.add_argument(
'--qc-window-size', type=int,
help='Use this window size for qc (default is median window size).')
process_parser.add_argument(
'--additional-qc-files', nargs='*', default=[],
help='Any additional qc files to filter on')
process_parser.add_argument(
'-q', '--minimum-mapq', metavar='MINIMUM_MAPQ', default=20, type=int,
help='Filter out any mapped read with a mapping quality less than x '
'(default is 20, use -q 0 for no filtering)')
# Add options for doit, the task runner engine (www.pydoit.org)
add_doit_options(process_parser,
['dep_file', 'backend', 'verbosity',
'reporter', 'num_process', 'par_type',
'reset_dep'])
add_window_calling_options(process_parser)
def get_script(script_name):
"""Get the absolute path to a script in the gamtools 'scripts' folder
:param str script_name: Name of the script file.
:returns: Absolute path to the script file
"""
return os.path.join(os.path.dirname(__file__),
'data',
'scripts',
script_name)
process_parser.set_defaults(func=pipeline.process_nps_from_args,
to_run=[
'Calling positive windows',
#'Filtering samples based on QC values',
],
mapping_stats_script=get_script(
'mapping_stats.sh'),
example_parameters_file=utils.get_example(
'qc_parameters.example.cfg'),
default_stats=['contamination_stats.txt', 'mapping_stats.txt',
'quality_stats.txt', 'segregation_stats.txt'],
)
# Options for the 'radial_pos' command
radial_pos_parser = subparsers.add_parser(
'radial_pos',
help='Get the radial position of each genomic window from a segmentation file')
radial_pos_parser.add_argument(
'-s', '--segregation-file', required=True,
type=argparse.FileType('r'),
help='A segregation file to use for calculating radial position'
'(or - to read from stdin)')
radial_pos_parser.add_argument(
'-o', '--output-file', required=True,
type=argparse.FileType('w'),
help='Output bedgraph file path (or - to write to stdout)')
radial_pos_parser.add_argument(
'-n', '--no-blanks', default=False,
action='store_true',
help='Do not include lines with no value in the output.')
radial_pos_parser.set_defaults(func=radial_position.radial_position_from_args)
# Options for 'resolution_qc' command
resolution_parser = subparsers.add_parser(
'resolution_qc',
help='Check a segregation table to ensure it is of sufficient quality.')
resolution_parser.add_argument(
'segregation_table', metavar='SEGREGATION_TABLE',
help='Segregation table to check.')
resolution_parser.add_argument(
'slice_thickness', metavar='SLICE_THICKNESS', type=float,
help='Thickness of NPs in this dataset (units must match those for nuclear radius).')
resolution_parser.add_argument(
'nuclear_radius', metavar='NUCLEAR_RADIUS', type=float,
help='Average radius of nuclei in this dataset (units must match those for slice thickness).')
resolution_parser.add_argument(
'-x', '--skip-chromosomes', metavar='CHROM', nargs='+',
default=[],
help='Exclude one or more chromosomes when calculating QC metrics.')
resolution_parser.add_argument(
'-g', '--genome-size', metavar='GENOME_SIZE', type=float,
help='Total size of the genome in bp. Use approximately 6.4e9 for human '
'or 5e9 for mouse. If not specified, the genome size is calculated from '
'the segregation table itself.')
resolution_parser.add_argument(
'-p', '--plexity', metavar='NPS_PER_SAMPLE',
type=int, default=1,
help='Number of NPs multiplexed together in each column of the segregation table.')
resolution_parser.add_argument(
'-i', '--include-invisible', action='store_false', dest='only_visible',
help='By default, we assume windows that are never detected are "invisible" '
'e.g. they are too repetitive to sequence, and we therefore ignore them. '
'This option turns off the default behaviour and includes all windows.')
resolution_parser.set_defaults(func=resolution.resolution_check_from_args)
# Options for 'select' command
select_parser = subparsers.add_parser(
'select',
help='Select only certain samples from a segregation file')
select_parser.add_argument(
'-s', '--segregation-file', required=True,
help='A file containing the segregation of all samples')
select_parser.add_argument(
'-d', '--drop-samples', action='store_true',
help='Discard the listed samples (default: discard samples not in the list)')
select_parser.add_argument(
'-n', '--sample-names', metavar='SAMPLE_NAME',
required=True, nargs='*', help='Names of the samples to remove')
select_parser.add_argument(
'-o', '--output-file', required=True,
type=argparse.FileType('w'),
help='Output file path (or - to write to stdout)')
select_parser.set_defaults(func=select_samples.select_samples_from_args)
# Options for 'test' command
test_parser = subparsers.add_parser(
'test',
help='Test that GAMtools is installed and configured correctly.')
test_directory = os.path.join(os.path.dirname(__file__), 'tests')
def test_function(args): #pylint: disable=unused-argument
"""Wrapper function to call py.test from arparse"""
# TODO: Fix passing additional arguments to py.test
test_args = sys.argv[2:]
sys.exit(pytest.main([test_directory] + test_args))
test_parser.set_defaults(func=test_function)
def main():
"""Main gamtools function that is called by the gamtools command line script."""
args = parser.parse_args()
args.func(args)
if __name__ == '__main__':
main()