KarrLab/datanator

View on GitHub
datanator/util/taxonomy_util.py

Summary

Maintainability
A
1 hr
Test Coverage
A
92%
""" Utilities for dealing with taxa

:Author: Yosef Roth <yosefdroth@gmail.com>
:Author: Jonathan <jonrkarr@gmail.com>
:Date: 2017-04-11
:Copyright: 2017, Karr Lab
:License: MIT
"""

from ete3 import NCBITaxa


def setup_database(force_update=False):
    """ Setup a local sqllite copy of the NCBI Taxonomy database. If :obj:`force_update` is `False`, then
    only download the content from NCBI and build the sqllite database, if a local database doesn't already
    exist. If :obj:`force_update` is `True`, then always download the content from NCBI and rebuild the
    sqllite copy of the database.

    Args:
        force_update (:obj:`bool`, optional):

            * :obj:`False`: only download the content for the database and build a local sqllite database
                if a local sqllite copy of the database doesn't already exist
            * :obj:`True`: always download the content for the database from NCBI and rebuild a local sqllite
                database
    """
    ncbi_taxa = NCBITaxa()
    if force_update:
        # force downloading of latest content from NCBI and (re)building of local sqllite database
        ncbi_taxa.update_taxonomy_database()
    else:
        # run an operation on the local sqllite database to trigger NCBITaxa to setup a local sqllite
        # database if one doesn't already exist
        ncbi_taxa.get_descendant_taxa('Homo')


class Taxon(object):
    """ Represents a taxon such as a genus, species, or strain

    Attributes:
        id (:obj:`str`): identifier
        name (:obj:`str`): name of the taxon
        id_of_nearest_ncbi_taxon (:obj:`int`): ID of the nearest parent taxon which is in the NCBI database
        distance_from_nearest_ncbi_taxon (:obj:`int`): distance from the taxon to its nearest parent which
            is in the NCBI database
        additional_name_beyond_nearest_ncbi_taxon (:obj:`str`): additional part of the taxon's beyond that of
            its nearest parent in the NCBI database
        cross_references (:obj:`list` of :obj:`CrossReference`): list of cross references
    """

    def __init__(self, id='', name='', ncbi_id=None, cross_references=None):
        """
        Args:
            id (:obj:`str`, optional): identifier
            name (:obj:`str`, optional): name
            ncbi_id (:obj:`int`, optional): NCBI identifier
            cross_references (:obj:`list` of :obj:`CrossReference`, optional): list of cross references
        """

        self.id = id
        self.name = name
        self.id_of_nearest_ncbi_taxon = None
        self.distance_from_nearest_ncbi_taxon = None
        self.additional_name_beyond_nearest_ncbi_taxon = None
        self.cross_references = cross_references or []

        ncbi_taxa = NCBITaxa()

        if ncbi_id:
            self.id_of_nearest_ncbi_taxon = ncbi_id
            self.distance_from_nearest_ncbi_taxon = 0
            self.additional_name_beyond_nearest_ncbi_taxon = ''
            self.name = ncbi_taxa.translate_to_names([ncbi_id])[0]
            if self.name == ncbi_id:
                raise ValueError('The NCBI taxonomy database does not contain a taxon with id {}'.format(ncbi_id))
        else:
            rank_names = name.split(' ')
            for i_rank in range(len(rank_names)):
                partial_name = ' '.join(rank_names[0:len(rank_names) - i_rank])
                result = ncbi_taxa.get_name_translator([partial_name])
                if result:
                    self.id_of_nearest_ncbi_taxon = result[partial_name][0]
                    self.distance_from_nearest_ncbi_taxon = i_rank
                    self.additional_name_beyond_nearest_ncbi_taxon = ''.join(' ' + n for n in rank_names[len(rank_names) - i_rank:])
                    self.name = ncbi_taxa.translate_to_names([self.id_of_nearest_ncbi_taxon])[0] \
                        + self.additional_name_beyond_nearest_ncbi_taxon
                    return

            self.name = name

    def get_ncbi_id(self):
        """ Get the ID of the taxon within the NCBI database

        Returns:
            :obj:`int` or :obj:`None`: ID of the taxon within the NCBI database or
                :obj:`None` if the taxon isn't in the NCBI database
        """
        if self.distance_from_nearest_ncbi_taxon == 0:
            return self.id_of_nearest_ncbi_taxon
        return None

    def get_parent_taxa(self):
        """ Get parent taxa

        Returns:
            :obj:`list` of :obj:`Taxon`: list of parent taxa
        """
        if self.id_of_nearest_ncbi_taxon is None:
            return None

        cls = self.__class__
        ncbi_taxa = NCBITaxa()
        lineage = [cls(ncbi_id=id) for id in ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)]

        if self.additional_name_beyond_nearest_ncbi_taxon:
            base_name = ncbi_taxa.translate_to_names([self.id_of_nearest_ncbi_taxon])[0]
            names = self.additional_name_beyond_nearest_ncbi_taxon[1:].split(' ')
            for i_rank, name, in enumerate(names):
                lineage.append(cls(name=base_name + ''.join(' ' + n for n in name[0:i_rank+1])))

        return lineage[0:-1]

    def get_rank(self):
        """ Get the rank of the taxon

        Returns:
            :obj:`str`: rank of the taxon
        """
        if self.distance_from_nearest_ncbi_taxon == 0:
            ncbi_taxa = NCBITaxa()
            rank = ncbi_taxa.get_rank([self.id_of_nearest_ncbi_taxon])[self.id_of_nearest_ncbi_taxon]
            if rank != 'no rank':
                return rank

        return None

    def get_common_ancestor(self, other):
        """ Get the lastest common ancestor of two taxa

        Args:
            other (:obj:`Taxon`): a second taxon

        Returns:
            :obj:`Taxon`: latest common ancestor
        """
        if self.id_of_nearest_ncbi_taxon is None:
            return id_of_nearest_ncbi_taxon

        ncbi_taxa = NCBITaxa()
        tree = ncbi_taxa.get_topology([self.id_of_nearest_ncbi_taxon, other.id_of_nearest_ncbi_taxon], intermediate_nodes=True)
        self_node = tree.search_nodes(name=str(self.id_of_nearest_ncbi_taxon))[0]
        other_node = tree.search_nodes(name=str(other.id_of_nearest_ncbi_taxon))[0]
        ancestor = tree.get_common_ancestor(self_node, other_node)
        cls = self.__class__
        return cls(ncbi_id=float(ancestor.name))

    def get_distance_to_common_ancestor(self, other):
        """ Calculate the number of links in the NCBI taxonomic tree between two taxa and their latest common ancestor

        Note: This distances depends on the granularity of the lineage of the taxon. For example, there are only 7 links
        between most bacteria species and the Bacteria superkingdom. However, there are 28 links between the Homo sapiens
        species and the Eukaryota superkingdom.

        Args:
            other (:obj:`Taxon`): a second taxon

        Returns:
             :obj:`int`: number of links between :obj:`self` and its latest common ancestor with :obj:`other` in the NCBI
                taxonomic tree
        """
        if self.id_of_nearest_ncbi_taxon is None:
            return id_of_nearest_ncbi_taxon

        ncbi_taxa = NCBITaxa()
        tree = ncbi_taxa.get_topology([self.id_of_nearest_ncbi_taxon, other.id_of_nearest_ncbi_taxon], intermediate_nodes=True)
        self_node = tree.search_nodes(name=str(self.id_of_nearest_ncbi_taxon))[0]
        other_node = tree.search_nodes(name=str(other.id_of_nearest_ncbi_taxon))[0]
        ancestor = tree.get_common_ancestor(self_node, other_node)
        return tree.get_distance(self_node, ancestor) + self.distance_from_nearest_ncbi_taxon

    def get_distance_to_root(self):
        """ Get the distance from the taxon to the root of the NCBI taxonomy tree

        Returns:
            :obj:`int`: distance from the taxon to the root
        """
        if self.id_of_nearest_ncbi_taxon is None:
            return id_of_nearest_ncbi_taxon

        ncbi_taxa = NCBITaxa()
        return len(ncbi_taxa.get_lineage(self.id_of_nearest_ncbi_taxon)) - 1 + self.distance_from_nearest_ncbi_taxon

    def get_max_distance_to_common_ancestor(self):
        """ Get the maximum distance from the taxon to a common ancestor with another taxon

        Returns:
            :obj:`int`: maximum distance from the taxon to a common ancestor with another taxon
        """
        return self.get_distance_to_root()