lib/gamtools/resolution.py from pombo-lab/gamtools

lib/gamtools/resolution.py
Summary

Maintainability

4 hrs
Test Coverage

Issues
"""

The resolution module
======================

This module contains functions for estimating the best resolution to use
for analysing a GAM dataset.

Two metrics are used:
 * Detection efficiency
 * Window coverage

Detection efficiency
--------------------

Detection efficiency is an estimate of the percentage of DNA actually present
in each library that was detected after WGA and sequencing. For example if your
sample contains three NPs with a total of 10 million base pairs of DNA, but the
detected positive windows only cover 6 million base pairs of DNA, the detection
efficiency would be 60%. This value is estimated for a whole dataset and we
would recommend it should be 70% or higher. If detection efficiency is too low,
this can be a sign that your window size is too small (i.e. you are trying to
work at too high a resolution). Datasets with lower detection efficiency can
still be usable, but a greater number of libraries will need to be sequenced.

Window coverage
---------------

In order to have good statistical power, each genomic window should be observed
in the dataset with a sufficient frequency. We recommend that 80% of the windows
in the genome should be detected at least 20 times. If this number is too low,
increasing the window size may help a little but you may need to collect more
libraries.

"""

import numpy as np

from . import segregation


class SliceException(Exception):
    """Exception class for errors related to SLICE"""

def segregation_info(segregation_table, skip_chroms):
    """
    Given a segregation table, retrieve the number of tubes,
    the diploid genome size and the bin size.

    :param segregation_table: Input :ref:`segregation table <segregation_table>`
    :param list skip_chroms: List of chromosome names to exclude.
    :returns: Number of libraries, total size of the genome and size of each window
    """

    segregation_windows = segregation_table.reset_index()[['chrom', 'start', 'stop']]
    segregation_windows['size'] = segregation_windows.stop - segregation_windows.start
    genome_size = segregation_windows.loc[
        np.logical_not(segregation_windows.chrom.isin(skip_chroms)),
        'size'].sum() * 2

    bin_sizes = segregation_windows['size'].value_counts()
    frequent_sizes = [size for (size, freq)
                      in (bin_sizes / bin_sizes.sum()).items()
                      if freq > 0.95]
    if not frequent_sizes:
        raise SliceException('SLICE requires windows of even sizes')
    window_size = frequent_sizes[0]

    return genome_size, window_size


def compute_eff_slice_fraction(slice_thickness, nuclear_radius, window_fraction):
    """
    Compute the ratio between slice thickness and nuclear radius, taking into account
    the window size. This is important because a genomic window does not have to fall
    entirely within the NP to be detected. The larger the window size, the further DNA
    can fall outside of the physical section and still be 'effectively' within the NP.

    :param float slice_thickness: Thickness of the NP.
    :param float nuclear_radius: Average nuclear radius.
    :param float window_fraction: Genomic window size divided by total genome size.
    :returns: Effective slice thickness
    """

    return (slice_thickness/nuclear_radius) + (2 * (window_fraction ** (1/3)))


def get_detection_efficiency(segregation_table, slice_thickness, #pylint: disable=too-many-arguments
                             nuclear_radius, skip_chroms=None, genome_size=None,
                             plexity=1, only_visible=True):
    """
    Given a GAM segregation table, calculate the efficiency of detection
    of individual loci. For example, 0.8 detection efficiency indicates that
    80% of the loci actually intersected by a slice are detected in our
    sequencing data.

    :param segregation_table: Input :ref:`segregation table <segregation_table>`
    :param float slice_thickness: Thickness of the NP.
    :param float nuclear_radius: Average nuclear radius.
    :param list skip_chroms: List of chromosome names to exclude.
    :param float genome_size: Total size of the genome (if None, calculate from \
          the segregation table.
    :param int plexity: Number of nuclear profiles in each sample.
    :param bool only_visible: If true, don't include genomic windows that are \
          never detected across the dataset in calculation. This is the default \
          behaviour.
    :returns: Number of libraries, total size of the genome and size of each window
    """

    if skip_chroms is None:
        skip_chroms = []

    table_genome_size, window_size = segregation_info(segregation_table, skip_chroms)

    if genome_size is None:
        genome_size = table_genome_size

    if only_visible:
        visible = segregation_table.loc[segregation_table.sum(axis=1) > 0]
    else:
        visible = segregation_table

    avg_m1s_m =  visible.sum(axis=1).mean() / visible.shape[1]

    coef = 1 / plexity
    window_fraction = window_size / genome_size

    effective_slice_fraction = compute_eff_slice_fraction(
        slice_thickness, nuclear_radius, window_fraction)

    return ((2 + effective_slice_fraction) / effective_slice_fraction) * (1 -
            (((1 - avg_m1s_m) ** coef) ** (1/2)))


def get_segregation_coverage_qc(segregation_table, skip_chroms=None,
                                only_visible=True, quantile=0.2):
    """
    Given a GAM segregation table, calculate x, such that 80% of the
    windows in the segregation table are identified in an NP at least
    x times. For example, we recommend that in a "good quality" GAM
    dataset, 80% of the genomic windows will be detected at least 20
    times.

    :param segregation_table: Input :ref:`segregation table <segregation_table>`
    :param list skip_chroms: List of chromosome names to exclude.
    :param bool only_visible: If true, don't include genomic windows that are \
          never detected across the dataset in calculation. This is the default \
          behaviour.
    :param float quantile: The quantile at which to calculate window coverage. \
          The default value of 0.2 returns the coverage exceeded by 80% of \
          genomic windows.
    :returns: Number of times 80% of genomic windows are detected.
    """

    if skip_chroms is None:
        skip_chroms = []

    if only_visible:
        visible = segregation_table.loc[segregation_table.sum(axis=1) > 0]
    else:
        visible = segregation_table

    return visible.sum(axis=1).quantile(quantile)


def do_segregation_qc(segregation_table, slice_thickness, nuclear_radius, coverage_q=0.2, #pylint: disable=too-many-arguments
                      skip_chroms=None, genome_size=None, plexity=1, only_visible=True):
    """
    Check that a GAM dataset has sufficient quality and depth.  Specifically,
    check that 80% of genomic windows have been detected at least 20 times
    and that the detection efficiency is at least 70%.

    :param segregation_table: Input :ref:`segregation table <segregation_table>`
    :param float slice_thickness: Thickness of the NP.
    :param float nuclear_radius: Average nuclear radius.
    :param float coverage_q: The quantile at which to calculate window coverage. \
          The default value of 0.2 returns the coverage exceeded by 80% of \
          genomic windows.
    :param list skip_chroms: List of chromosome names to exclude.
    :param float genome_size: Total size of the genome (if None, calculate from \
          the segregation table.
    :param int plexity: Number of nuclear profiles in each sample.
    :param bool only_visible: If true, don't include genomic windows that are \
          never detected across the dataset in calculation. This is the default \
          behaviour.
    :returns: False if either of the checks fail, otherwise True.
    """

    if skip_chroms is None:
        skip_chroms = []

    pass_qc = True

    window_coverage = get_segregation_coverage_qc(segregation_table, skip_chroms,
                                only_visible, coverage_q)

    if window_coverage >= 20:
        print('Coverage check passed. 80% of genomic windows are detected at '
              'least {:.0f} times'.format(window_coverage))
    else:
        print('Coverage check failed. 80% of genomic windows are detected at '
              'least {:.0f} times. This value should be at least 20, try increasing '
              'the genomic window size for this dataset.'.format(window_coverage))
        pass_qc = False

    detection_efficiency = get_detection_efficiency(
        segregation_table, slice_thickness, nuclear_radius,
        skip_chroms, genome_size, plexity, only_visible)

    if detection_efficiency > 0.7:
        if detection_efficiency <= 1.02:
            print('Detection check passed. Genome-wide detection-efficiency '
                  '{:.1%}'.format(detection_efficiency))
        else:
            print('Detection check failed. Genome-wide detection-efficiency '
                  '{:.1%} - this cannot be higher than 100%, please check '
                  'your input parameters.'.format(
                   detection_efficiency))
            pass_qc = False
    else:
        print('Detection check failed. Genome-wide detection-efficiency '
              '{:.1%}. This value should be at least 70%.'.format(
               detection_efficiency))
        pass_qc = False

    return pass_qc

def resolution_check_from_args(args):
    """Extract parameters from an argparse namespace object and pass them
    to do_segregation_qc."""

    segregation_table = segregation.open_segregation(args.segregation_table)

    do_segregation_qc(segregation_table, args.slice_thickness, args.nuclear_radius,
                      skip_chroms=args.skip_chromosomes, genome_size=args.genome_size,
                      plexity=args.plexity, only_visible=args.only_visible)