KarrLab/datanator_query_python

View on GitHub
datanator_query_python/query/query_metabolites.py

Summary

Maintainability
A
3 hrs
Test Coverage
A
98%
""" Metabolite Query
:Author: Bilal Shaikh <bilalshaikh42@gmail.com>
        Zhouyang Lian <zhouyang.lian@familian.life>
:Date: 2019-08-01
:Copyright: 2019, Karr Lab
:License: MIT
"""

from datanator_query_python.util import mongo_util, chem_util, file_util
from . import query_metabolites_meta
from bson.objectid import ObjectId

class QueryMetabolites(mongo_util.MongoUtil):
    '''Queries specific to metabolites (ECMDB, YMDB) collection
    '''

    def __init__(self, cache_dirname=None, MongoDB=None, replicaSet=None, db=None,
                 verbose=True, max_entries=float('inf'), username=None,
                 password=None, authSource='admin', readPreference='nearest'):
        self.verbose = verbose
        super().__init__(cache_dirname=cache_dirname,
                         MongoDB=MongoDB,
                         replicaSet=replicaSet,
                         db=db,
                         verbose=verbose,
                         max_entries=max_entries,
                         username=username,
                         password=password,
                         authSource=authSource,
                         readPreference=readPreference)
        self.client_ecmdb, self.db_ecmdb, self.collection_ecmdb = self.con_db('ecmdb')
        self.client_ymdb, self.db_ymdb, self.collection_ymdb = self.con_db(
            'ymdb')
        self.metabolites_meta_manager = query_metabolites_meta.QueryMetabolitesMeta(
            cache_dirname=cache_dirname,
            MongoDB=MongoDB,
            replicaSet=replicaSet,
            db=db,
            collection_str='metabolites_meta',
            verbose=verbose,
            max_entries=max_entries,
            username=username,
            password=password,
            authSource=authSource,
            readPreference=readPreference)
        self.chem_manager = chem_util.ChemUtil()

    def get_conc_from_inchi(self, inchi, inchi_key=False, consensus=False, projection={'_id': 0}):
        ''' Given inchi, find the metabolite's concentration
            values.

            Args: 
                inchi (:obj:`str`): inchi or inchi key of metabolite.
                inchi_key (:obj:`bool`): input is InChI Key or not.
                consensus (`obj`: bool): whether to return consensus values or list of
                                        individual values.
            
            Return:
                (`obj`: list of `obj`: dict): concentration values separated by collections
                e.g. [{'ymdb': }, {'ecmdb': }]
        '''
        if not inchi_key:
            hashed_inchi = self.chem_manager.inchi_to_inchikey(inchi)
        else:
            hashed_inchi = inchi
        query = {'InChI_Key': inchi}
        result = []

        ids = self.metabolites_meta_manager.get_ids_from_hash(hashed_inchi)
        ecmdb_id = ids['m2m_id']
        ymdb_id = ids['ymdb_id']

        docs_ecmdb = self.collection_ecmdb.find_one(filter={'m2m_id': ecmdb_id}, projection=projection)
        docs_ymdb = self.collection_ymdb.find_one(filter={'ymdb_id': ymdb_id}, projection=projection)

        def calc_consensus(_list):
            conc_list = [float(x) for x in _list]
            cons_val = sum(conc_list) / len(conc_list)
            return cons_val

        def append_result(_dict):
            if _dict is not None:
                conc_ecmdb = _dict.get('concentrations', None)
                if consensus is True and conc_ecmdb is not None:
                    conc_list = _dict['concentrations']['concentration']
                    cons_val = calc_consensus(conc_list)
                    _dict['consensus_value'] = cons_val
                result.append(_dict)

        append_result(docs_ecmdb)
        append_result(docs_ymdb)

        if len(result) == 0:
            return [{
                'name': 'no available information',
                'species': 'no available information',
                'description': 'no available information',
                'inchikey': 'no available information'
            }]
        else:
            return result

    def get_meta_from_inchis(self, inchis, species, last_id='000000000000000000000000', page_size=20):
        ''' Get all information about metabolites given
            a list of inchi strings
            Args:
                inchis (`obj`: list of `obj`: str): list of inchi strings
                species (`obj`: str): name of species in which the metabolite resides
                last_id (`obj`: str): hex encoded version of ObjectId o, which is the last item of the previous page
                page_size (`obj`: int): number of items per page
            Return:
                result (`obj`: list of `obj`: dict): list of information
        '''
        if species == 'Escherichia coli':
            collection = self.collection_ecmdb
        elif species == 'Saccharomyces cerevisiae':
            collection = self.collection_ymdb
        else:
            return [{'name': 'Species name not supported yet.',
                    'inchikey': 'Species name not supported yet.',
                    'description': 'Species name not supported yet.'}] 

        result = []
        inchikeys = [self.chem_manager.inchi_to_inchikey(x) for x in inchis]
        last_id = ObjectId(last_id)
        query = {'$and': [{'inchikey': {'$in': inchikeys} },
                          {'_id': {'$gt': last_id} }]}
        projection = {}
        docs = collection.find(filter=query, limit=page_size)
        for doc in docs:
            doc['_id'] = str(doc['_id'])
            result.append(doc)
        return result

    def get_concentration_count(self):
        """Get number of metabolites with concentration values.

        Return:
            (:obj:`int`): Number of metabolites with concentrations.
        """
        query = {'concentrations.concentration': {'$exists': True}}
        projection = {'inchikey': 1}
        x = set()
        ecmdb_concentration = self.collection_ecmdb.find(filter=query, projection=projection)
        ymdb_concentration = self.collection_ymdb.find(filter=query, projection=projection)
        for m in ecmdb_concentration:
            x.add(m['inchikey'])
        for m in ymdb_concentration:
            x.add(m['inchikey'])
        return len(x)