KarrLab/datanator

View on GitHub
datanator/core/data_model.py

Summary

Maintainability
F
4 days
Test Coverage
A
92%
"""
:Author: Jonathan Karr <jonrkarr@gmail.com>
:Author: Yosef Roth <yosefdroth@gmail.com>
:Date: 2017-04-10
:Copyright: 2017, Karr Lab
:License: MIT
"""

from datanator.util import molecule_util
import copy
import enum
import itertools
import numpy
import obj_tables.core
import obj_tables.bio.seq


class ConsensusMethod(enum.Enum):
    """ Represents the method by which a consensus was chosen """
    manual = 0
    mean = 1
    median = 2
    mode = 3
    weighted_mean = 4
    weighted_median = 5
    weighted_mode = 6


class Consensus(obj_tables.core.Model):
    """ Represents a consensus of one or more observed values of an attribute of a component of a model

    Attributes:
        observable (:obj:`Observable`): biological component that was estimated
        value (:obj:`float`): consensus value of the attribute of the model component
        error (:obj:`float`): uncertainty of the value of the attribute of the model component
        units (:obj:`str`): units of the value of the attribute of the model component
        evidence (:obj:`list` of :obj:`Evidence`): list of evidence which the consensus
            value is based on
        method (:obj:`ConsensusMethod`): method used to calculate the consensus value and error
        user (:obj:`str`): user who generated the consensus
        date (:obj:`datetime.datetime`): date and time when the consensus was generated
    """
    observable = obj_tables.core.ManyToOneAttribute('Observable', related_name='consensus')
    value = obj_tables.core.FloatAttribute()
    error = obj_tables.core.FloatAttribute()
    units = obj_tables.core.StringAttribute()
    evidence = obj_tables.core.ManyToManyAttribute('Evidence', related_name='consensus')
    method = obj_tables.core.EnumAttribute(ConsensusMethod)
    user = obj_tables.core.StringAttribute()
    date = obj_tables.core.DateTimeAttribute()


class Evidence(obj_tables.core.Model):
    """ Represents the observed values and their relevance which support a consensus

    Attributes:
        value (:obj:`ObservedValue`): observed value
        relevance (:obj:`float`): numeric score which indicates the relevance of the observed value to the
            consensus
    """
    value = obj_tables.core.ManyToOneAttribute('ObservedValue', related_name='evidence')
    relevance = obj_tables.core.FloatAttribute()


class ObservedResultMetadata(obj_tables.core.Model):
    """ Represents an observation (one or more observed values) about a biological system

    Attributes:
        genetics (:obj:`Genetics`): the taxon, and any genetic variation from the wildtype taxon, that the component was observed in
        environment (:obj:`Environment`): environment that the component was observed in
        values (:obj:`list` of :obj:`ObservedValue`): observed values
        reference (:obj:`Reference`): reference to the reference
    """
    genetics = obj_tables.core.ManyToOneAttribute('Genetics', related_name='observations')
    environment = obj_tables.core.ManyToOneAttribute('Environment', related_name='observations')
    cross_references = obj_tables.core.ManyToManyAttribute('Resource', related_name='observations')
    method = obj_tables.core.ManyToOneAttribute('Method', related_name='observations')
    synonym = obj_tables.core.ManyToManyAttribute('Synonym', related_name='observations')

class ObservedResult(obj_tables.core.Model):
    """ Represents a base dataset for a queried response

    Attributes:
        observation (:obj:`Observaton`): the collection of covariate observed values
        method (:obj:`str`): method that was used to make the observation
    """

    metadata = obj_tables.core.ManyToOneAttribute('ObservedResultMetadata', related_name='observed_result')


class ObservedInteraction(ObservedResult):
    """ Represents an observed interaction of a biological system

    Attributes:
        interaction (:obj:`Interaction`): observed interaction
    """

    interaction = obj_tables.core.ManyToOneAttribute('Interaction', related_name='observed_interaction')

class ObservedSpecie(ObservedResult):
    """ Represents an observed interaction of a biological system

    Attributes:
        specie (:obj:`Specie`): observed specie
    """

    specie = obj_tables.core.ManyToOneAttribute('Specie', related_name='observed_specie')

class ObservedValue(ObservedResult):
    """ Represents an observed value of a biological system

    Attributes:
        observable (:obj:`Observaton`): the observed interaction or specie for which the value corresponds
        value (:obj:`float`): observed value
        error (:obj:`float`): uncertainty of the observed value
        units (:obj:`units`): SI units of the observed value
    """
    observable = obj_tables.core.ManyToOneAttribute('Observable', related_name='observed_values')
    value = obj_tables.core.FloatAttribute()
    error = obj_tables.core.FloatAttribute()
    units = obj_tables.core.StringAttribute()


class Observable(obj_tables.core.Model):
    """ Represents an observable of a biological system

    Attributes:
        interaction (:obj:`Interaction`): observed interaction
        specie (:obj:`Specie`): observed species
        compartment (:obj:`Compartment`): compartment that the spcies/interaction was observed in
        property (:obj:`str`): property that was observed
    """
    interaction = obj_tables.core.ManyToOneAttribute('Interaction', related_name='observed_values')
    specie = obj_tables.core.ManyToOneAttribute('Specie', related_name='observed_values')
    compartment = obj_tables.core.ManyToOneAttribute('Compartment', related_name='observed_values')
    property = obj_tables.core.StringAttribute()


class EntityInteractionOrProperty(obj_tables.core.Model):
    """ Represents an observable of a biological system

    Attributes:
        id (:obj:`str`): identifier
        name (:obj:`str`): name
        cross_references (:obj:`list` of :obj:`Resource`): list of cross references to external resources
    """
    id = obj_tables.core.StringAttribute()
    name = obj_tables.core.StringAttribute()
    cross_references = obj_tables.core.ManyToManyAttribute('Resource', related_name='observables')


class Compartment(EntityInteractionOrProperty):
    """ Representes a compartment in a biological system """
    pass


class Specie(EntityInteractionOrProperty):
    """ Represents a molecular species in a biological system

    Attributes:
        structure (:obj:`str`): structure
    """
    structure = obj_tables.core.LongStringAttribute()

    def to_inchi(self, only_formula_and_connectivity=False):
        """ Get the structure in InChi format

        Args:
            only_formula_and_connectivity (:obj:`bool`): if :obj:`True`, return only the
                formula and connectivity layers

        Returns:
            :obj:`str`: structure in InChi format or just the formula and connectivity layers
                if :obj:`only_formula_and_connectivity` is :obj:`True`
        """
        inchi = molecule_util.Molecule(structure=self.structure).to_inchi()
        if only_formula_and_connectivity:
            return molecule_util.InchiMolecule(inchi).get_formula_and_connectivity()
        else:
            return inchi

    def to_mol(self):
        """ Get the structure in .mol format

        Returns:
            :obj:`str`: structure in .mol format
        """
        return molecule_util.Molecule(structure=self.structure).to_mol()

    def to_openbabel(self):
        """ Get the structure as a Open Babel molecule

        Returns:
            :obj:`openbabel.OBMol`: structure as a Open Babel molecule
        """
        return molecule_util.Molecule(structure=self.structure).to_openbabel()

    def to_pybel(self):
        """ Get the structure as a Pybel molecule

        Returns:
            :obj:`pybel.Molecule`: structure as a Pybel molecule
        """
        return molecule_util.Molecule(structure=self.structure).to_pybel()

    def to_smiles(self):
        """ Get the structure in canonical SMILES format

        Returns:
            :obj:`str`: structure in canonical SMILES format
        """
        return molecule_util.Molecule(structure=self.structure).to_smiles()

    def get_similarity(self, other, fingerprint_type='fp2'):
        """ Calculate the similarity with another species

        Args:
            other (:obj:`Specie`): a second species
            fingerprint_type (:obj:`str`, optional): fingerprint type to use to calculate similarity

        Returns:
            :obj:`float`: the similarity with the other molecule
        """
        self_mol = molecule_util.Molecule(structure=self.structure)
        other_mol = molecule_util.Molecule(structure=other.structure)
        return self_mol.get_similarity(other_mol, fingerprint_type=fingerprint_type)


class PolymerSpecie(Specie):
    """ Represents a polymer

    Attributes:
        sequence (:obj:`str`): sequence
    """
    sequence = obj_tables.core.LongStringAttribute()


class DnaSpecie(PolymerSpecie):
    """ Represents a DNA polymer

    Attributes:
        binding_matrix (:obj:`Bio.motifs.matrix.FrequencyPositionMatrix`): Binding motif
    """
    binding_matrix = obj_tables.bio.seq.FreqPosMatrixAttribute()


class RnaSpecie(PolymerSpecie):
    """ Represents a RNA polymer """
    pass


class ProteinSpecie(PolymerSpecie):
    """ Represents a protein polymer

    Attributes:
        uniprot_id (:obj:`str`): Uniprot Identifier
        entrez_id (:obj:`int`): Entrez Identifier
        gene_name (:obj:`str`): gene name from which protein stems
        length (:obj:`int`): Length of the amino acid sequence
        mass (:obj:`int`): Mass of the protein in KDa

    """
    uniprot_id  = obj_tables.core.StringAttribute()
    entrez_id = obj_tables.core.IntegerAttribute()
    gene_name = obj_tables.core.StringAttribute()
    length = obj_tables.core.IntegerAttribute()
    mass = obj_tables.core.IntegerAttribute()

class ProteinComplexSpecie(ProteinSpecie):
    """ Represents a protein interaction

        Attributes:

    """

    go_id = obj_tables.core.StringAttribute()
    go_dsc = obj_tables.core.StringAttribute()
    funcat_id = obj_tables.core.StringAttribute()
    funcat_dsc = obj_tables.core.StringAttribute()
    su_cmt = obj_tables.core.StringAttribute()
    complex_cmt = obj_tables.core.StringAttribute()
    disease_cmt = obj_tables.core.StringAttribute()
    class_name = obj_tables.core.StringAttribute()
    family_name = obj_tables.core.StringAttribute()
    molecular_weight = obj_tables.core.FloatAttribute()



class Interaction(EntityInteractionOrProperty):
    """ Represents an interaction

        Attributes:
            position (:obj:`int`): position at which interaction occurs
            score (:obj:`float`): ranking of the response of the interaction
    """
    #TODO: Assess difference between score and confidence
    position = obj_tables.core.IntegerAttribute()
    score = obj_tables.core.FloatAttribute(default=0)
    confidence = obj_tables.core.StringAttribute()

class SpecieInteraction(Interaction):
    """ Represents a protein interaction

    Attributes:
        specie_a (:obj:`str`):
        specie_b (:obj:`str`):
        stoichiometry_a (:obj:`int`):
        stoichiometry_b (:obj:`int`):
        loc_a (:obj:`str`):
        loc_b (:obj:`str`):
    """

    specie_a = obj_tables.core.OneToOneAttribute('Specie', related_name='specie_interaction')
    specie_b = obj_tables.core.OneToOneAttribute('Specie', related_name='specie_interaction')
    stoichiometry_a = obj_tables.core.IntegerAttribute()
    stoichiometry_b = obj_tables.core.IntegerAttribute()
    loc_a = obj_tables.core.StringAttribute()
    loc_b = obj_tables.core.StringAttribute()
    type_a = obj_tables.core.StringAttribute()
    type_b = obj_tables.core.StringAttribute()
    interaction_type = obj_tables.core.StringAttribute()



class Reaction(Interaction):
    """ Represents a reaction

    Attributes:
        participants (:obj:`list` of :obj:`ReactionParticipant`): list of participants
        reversible (:obj:`bool`): :obj:`True` if the reaction is reversible
    """
    kinetic_law_id = obj_tables.core.IntegerAttribute()
    participants = obj_tables.core.ManyToManyAttribute('ReactionParticipant', related_name='reactions')
    reversible = obj_tables.core.BooleanAttribute()

    def get_reactants(self):
        """ Get the reactants

        Returns:
            :obj:`list` of :obj:`ReactionParticipant`: list of reactants
        """
        return list(filter(lambda p: p.coefficient < 0, self.participants))

    def get_products(self):
        """ Get the products

        Returns:
            :obj:`list` of :obj:`ReactionParticipant`: list of products
        """
        return list(filter(lambda p: p.coefficient > 0, self.participants))

    def get_modifiers(self):
        """ Get the modifiers

        Returns:
            :obj:`list` of :obj:`ReactionParticipant`: list of modifiers
        """
        return list(filter(lambda p: p.coefficient == 0, self.participants))

    def get_ordered_participants(self, collapse_repeated=True):
        """ Get an ordered list of the participants

        Args:
            collapse_repeated (:obj:`bool`): if :obj:`True`, collapse any repeated participants

        Returns:
            :obj:`list` of :obj:`ReactionParticipant`: ordered list of reaction participants
        """

        # create copy of participants
        participants = copy.copy(list(self.participants))

        # collapse repeated participants
        if collapse_repeated:
            i_part = 0
            while i_part < len(participants):
                part = participants[i_part]
                if part.coefficient == 0:
                    participants.pop(i_part)
                    continue

                i_other_part = i_part + 1
                while i_other_part < len(participants):
                    other_part = participants[i_other_part]
                    if other_part.specie.structure == part.specie.structure and \
                            other_part.specie.id == part.specie.id and \
                            ((other_part.compartment is None and part.compartment is None) or
                                other_part.compartment.id == part.compartment.id) and \
                            numpy.sign(other_part.coefficient) == numpy.sign(part.coefficient):
                        part.coefficient += other_part.coefficient
                        participants.pop(i_other_part)
                    else:
                        i_other_part += 1

                i_part += 1

        # order participants
        participants.sort(key=lambda part: part.order if part.order is not None else 1e10)

        # return
        return participants

    def stringify(self):
        #TODO: Add the modifier

        participants = self.get_ordered_participants()
        result = ''
        for item in participants:
            if item.coefficient == -1 and len(result) < 1:
                result += str(item.specie.name)
            elif item.coefficient == -1 and len(result) > 1:
                result += ' + ' + str(item.specie.name)
            elif item.coefficient == 1 and '-->' not in result:
                result += ' --> ' + str(item.specie.name)
            elif item.coefficient == 1:
                result += ' + ' + str(item.specie.name)
            else:
                continue

        return result

    def get_reactant_product_pairs(self):
        """ Get list of pairs of similar reactants and products

        Note: This requires the modeler to have ordered the reactans and products by their similarity. The modeler is required to
        specify this pairing because it cannot easily be computed. In particular, we have tried to use Tanimoto similarity to
        predict reactant-product pairings, but this doesn't adequately capture reaction centers.

        Returns:
            :obj:`list` of :obj:`tuple` of obj:`ReactionParticipant`, :obj:`ReactionParticipant`: list of pairs of similar reactants and products
        """
        participants = self.get_ordered_participants()
        reactants = list(filter(lambda p: p.coefficient < 0, participants))
        products = list(filter(lambda p: p.coefficient > 0, participants))
        return list(itertools.zip_longest(reactants, products))

    def get_ec_numbers(self):
        """ Get the EC numbers from the list of cross references

        Returns:
            :obj:`list` of :obj:`str`: list of EC numbers
        """
        return list(filter(lambda xr: xr.namespace == 'ec-code', self.cross_references))

    def get_manual_ec_numbers(self):
        """ Get the manually assigned EC numbers from the list of cross references

        Returns:
            :obj:`list` of :obj:`str`: list of EC manually assigned numbers
        """
        return list(filter(lambda xr: xr.namespace == 'ec-code' and xr.assignment_method ==
                           ResourceAssignmentMethod.manual, self.cross_references))

    def get_predicted_ec_numbers(self):
        """ Get the predicted EC numbers from the list of cross references

        Returns:
            :obj:`list` of :obj:`str`: list of predicted EC numbers
        """
        return list(filter(lambda xr: xr.namespace == 'ec-code' and xr.assignment_method ==
                           ResourceAssignmentMethod.predicted, self.cross_references))

    def get_ec_number(self):
        """ Get the most relevant EC number from the list of cross references

        * If the reaction has a single manually-assigned EC number, return that
        * If the reaction has multiple manually-assigned EC numbers, return an error
        * Otherwise, return the most relevant predicted EC number

        Returns:
            :obj:`str`: most relevant EC number
        """

        # return
        manual_ec_numbers = list(filter(lambda xr: xr.namespace == 'ec-code' and xr.assignment_method ==
                                        ResourceAssignmentMethod.manual, self.cross_references))
        if len(manual_ec_numbers) == 1:
            return manual_ec_numbers[0].id
        elif len(manual_ec_numbers) > 1:
            raise ValueError('Reaction {} has multiple EC numbers: {}'.format(self.id, ', '.join(xr.id for xr in manual_ec_numbers)))

        # find most relevant predicted EC number
        max_relevance = float('-inf')
        max_id = ''
        for xr in self.cross_references:
            if xr.assignment_method == ResourceAssignmentMethod.predicted and xr.relevance > max_relevance:
                max_relevance = xr.relevance
                max_id = xr.id

        return max_id


class ReactionParticipant(obj_tables.core.Model):
    """ Represents a participant in a reaction

    Attributes:
        specie (:obj:`Specie`): molecular species
        compartment (:obj:`Compartment`): compartment
        coefficient (:obj:`float`): coefficient
        order (:obj:`int`): order in the list of participants
    """
    specie = obj_tables.core.ManyToOneAttribute(Specie, related_name='reaction_participants')
    compartment = obj_tables.core.ManyToOneAttribute(Compartment, related_name='reaction_participants')
    coefficient = obj_tables.core.FloatAttribute()
    order = obj_tables.core.IntegerAttribute()


class Genetics(obj_tables.core.Model):
    """ Represents a taxon

    Attributes:
        taxon (:obj:`str`): taxon name
        variation (:obj:`str`): the genetic variation from the wildtype taxon

        observations (:obj:`list` of :obj:`Observation`): list of observations
    """
    taxon = obj_tables.core.StringAttribute()
    variation = obj_tables.core.StringAttribute()

    def is_wildtype(self):
        """ Determine if the taxon is the wildtype taxon

        Returns:
            obj:`bool`: `True` if the taxon doesn't have any genetic perturbation(s)
        """
        return not self.variation

    def is_variant(self):
        """ Determine if the taxon is the wildtype stain

        Returns:
            :obj:`bool`: `True` if the taxon has at least one genetic perturbation
        """
        return not self.is_wildtype()


class Environment(obj_tables.core.Model):
    """ Represents the environment (temperature, pH, media chemical composition) of an observation

    Attributes:
        temperature (:obj:`float`): temperature in Celcius
        ph (:obj:`float`): pH
        media (:obj:`str`): the chemical composition of the environment

        observations (:obj:`list` of :obj:`Observation`): list of observations
    """
    temperature = obj_tables.core.FloatAttribute()
    ph = obj_tables.core.FloatAttribute()
    media = obj_tables.core.LongStringAttribute()
    growth_status = obj_tables.core.LongStringAttribute()
    growth_system = obj_tables.core.LongStringAttribute()


class Method(obj_tables.core.Model):
    """ Represents a method used to generate an observation

    Attributes:
        name (:obj:`str`): name
        description (:obj:`str`): description

        observations (:obj:`list` of :obj:`Observation`): list of observations
    """
    name = obj_tables.core.StringAttribute()
    description = obj_tables.core.LongStringAttribute()
    performer = obj_tables.core.StringAttribute()
    hardware = obj_tables.core.StringAttribute()
    software = obj_tables.core.StringAttribute()

class Synonym(obj_tables.core.Model):
    """
    Represents a synonym of a given physical entity or property

    Attributes:
        name (:obj:`str`): Name of the Synonym

    """
    name = obj_tables.core.StringAttribute()


class ExperimentalMethod(Method):
    """ Represents a experimental method used to generate an observation """
    pass


class ComputationalMethod(Method):
    """ Represents a computational method used to generate an observation

    Attributes:
        version (:obj:`str`): version
        arguments (:obj:`str`): string representation of the arguments to the method
    """
    version = obj_tables.core.StringAttribute()
    arguments = obj_tables.core.LongStringAttribute()


class ResourceAssignmentMethod(enum.Enum):
    """ Represents the method used to assign a cross reference to an observable """
    manual = 0
    predicted = 1


class Resource(obj_tables.core.Model):
    """ Represents an object in an external resource

    Attributes:
        namespace (:obj:`str`): namespace of the identifier (e.g. pubmed)
        id (:obj:`str`): identifier within :obj:`namespace` (e.g. PMID)
        relevance (:obj:`float`): numerical indicator relevance of the external resource to the observable
        assignment_method (:obj:`ResourceAssignmentMethod`): method used to assign the cross reference to the observable
    """
    namespace = obj_tables.core.StringAttribute()
    id = obj_tables.core.StringAttribute()
    relevance = obj_tables.core.FloatAttribute()
    assignment_method = obj_tables.core.EnumAttribute(ResourceAssignmentMethod)


class Reference(obj_tables.core.Model):
    """ Represent a reference for an observation

    Attributes:
        title (:obj:`str`): title
        editor (:obj:`str`): editor
        author (:obj:`str`): author
        year (:obj:`int`): year
        publication (:obj:`str`): publication
        volume (:obj:`str`): volume
        number (:obj:`str`): number
        chapter (:obj:`str`): chapter
        pages (:obj:`str`): pages
        url (:obj:`str`): url

        observations (:obj:`list` of :obj:`Observation`): list of observations
    """
    title = obj_tables.core.StringAttribute()
    editor = obj_tables.core.StringAttribute()
    author = obj_tables.core.StringAttribute()
    year = obj_tables.core.IntegerAttribute()
    publication = obj_tables.core.StringAttribute()
    volume = obj_tables.core.StringAttribute()
    number = obj_tables.core.StringAttribute()
    chapter = obj_tables.core.StringAttribute()
    pages = obj_tables.core.StringAttribute()
    url = obj_tables.core.StringAttribute()