KarrLab/datanator_query_python

View on GitHub
datanator_query_python/query/query_protein.py

Summary

Maintainability
F
2 wks
Test Coverage
D
69%
from datanator_query_python.util import mongo_util, file_util
from datanator_query_python.query import query_taxon_tree, query_kegg_orthology
from pymongo.collation import Collation, CollationStrength
from pymongo import ASCENDING
from collections import deque
import simplejson as json


class QueryProtein(mongo_util.MongoUtil):

    def __init__(self, username=None, password=None, server=None, authSource='admin',
                 database='datanator', max_entries=float('inf'), verbose=True, collection_str='uniprot',
                 readPreference='nearest', replicaSet=None):
        '''
            Open the MongoDB connection and prepare the collections and
            helper query managers used by the protein queries.

            Args:
                username (:obj:`str`): MongoDB user name, forwarded to every manager.
                password (:obj:`str`): MongoDB password, forwarded to every manager.
                server (:obj:`str`): MongoDB server, forwarded to every manager.
                authSource (:obj:`str`): authentication database, forwarded to every manager.
                database (:obj:`str`): database name, forwarded to the base class and
                    the taxon manager.
                max_entries (:obj:`float`): stored on the instance as ``max_entries``.
                verbose (:obj:`bool`): stored on the instance as ``verbose``.
                collection_str (:obj:`str`): name of the protein collection to query.
                readPreference (:obj:`str`): forwarded to the base class only.
                replicaSet (:obj:`str`): replica set name, forwarded to every manager.
        '''
        super().__init__(
            MongoDB=server,
            username=username,
            password=password,
            authSource=authSource,
            db=database,
            readPreference=readPreference,
            replicaSet=replicaSet)

        # Plain configuration values.
        self.collection_str = collection_str
        self.max_entries = max_entries
        self.verbose = verbose
        # English collation at SECONDARY strength, reused by the id lookups.
        self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)

        # Collections living on the database handle set up by the base class.
        self.collection = self.db_obj[collection_str]
        self.taxon_col = self.db_obj['taxon_tree']
        self.paxdb_collection = self.db_obj['pax']

        # Helper managers for taxonomy, KEGG orthology and generic utilities.
        self.taxon_manager = query_taxon_tree.QueryTaxonTree(
            MongoDB=server, username=username, password=password,
            authSource=authSource, db=database, replicaSet=replicaSet)
        self.kegg_manager = query_kegg_orthology.QueryKO(
            username=username, password=password, server=server,
            authSource=authSource, replicaSet=replicaSet)
        self.file_manager = file_util.FileUtil()

    def get_meta_by_id(self, _id):
        '''
            Get proteins' metadata given uniprot ids.

            Args:
                _id (:obj:`list` of :obj:`str`): list of uniprot id.

            Returns:
                (:obj:`list` of :obj:`dict`): one document per matching protein
                (without ``_id``, ancestor and kinetics fields); documents that
                carry a ``ko_number`` for which the KEGG manager reports a
                non-zero count also get a ``kegg_meta`` list.
                When nothing matches, a single placeholder :obj:`dict`
                (not wrapped in a list) is returned instead.
        '''
        query = {'uniprot_id': {'$in': _id}}
        projection = {'_id': 0, 'ancestor_name': 0, 'ancestor_taxon_id': 0,
                      'kinetics': 0}
        # Materialise the cursor once; this replaces a separate
        # count_documents round trip that only served the emptiness check.
        docs = list(self.collection.find(filter=query, projection=projection,
                                         collation=self.collation))
        if not docs:
            return {'uniprot_id': 'None',
                    'entry_name': 'None',
                    'gene_name': 'None',
                    'protein_name': 'None',
                    'canonical_sequence': 'None',
                    'length': 99999999,
                    'mass': '99999999',
                    'abundances': [],
                    'ncbi_taxonomy_id': 99999999,
                    'species_name': '99999999'}

        result = []
        for doc in docs:
            # JSON round trip with ignore_nan so NaN values do not leak out.
            doc = json.loads(json.dumps(doc, ignore_nan=True))
            ko_number = doc.get('ko_number')
            if ko_number is not None:
                kegg_docs, kegg_count = self.kegg_manager.get_meta_by_kegg_ids([ko_number])
                if kegg_count != 0:
                    doc['kegg_meta'] = list(kegg_docs)
            result.append(doc)
        return result

    def get_ortho_by_id(self, _id):
        '''
            Fetch the stored document (minus Mongo's ``_id``) of a single
            protein given its uniprot id.

            Args:
                _id (:obj:`str`): uniprot id.

            Returns:
                (:obj:`list` of :obj:`dict`): one-element list holding the
                protein's document; when the id is unknown a bare
                placeholder :obj:`dict` (not wrapped in a list) is returned.
        '''
        record = self.collection.find_one(filter={'uniprot_id': _id},
                                          projection={"_id": 0})
        if record is not None:
            # JSON round trip with ignore_nan so NaN values do not leak out.
            return [json.loads(json.dumps(record, ignore_nan=True))]

        placeholder = {
            'uniprot_id': 'None',
            'entry_name': 'None',
            'gene_name': 'None',
            'protein_name': 'None',
            'canonical_sequence': 'None',
            'length': 99999999,
            'mass': '99999999',
            'abundances': [],
            'ncbi_taxonomy_id': 99999999,
            'species_name': '99999999',
        }
        return placeholder

    def get_meta_by_name_taxon(self, name, taxon_id):
        '''
            Look up proteins by name within one species, keeping only
            documents that carry an ``abundances`` field.

            Args:
                name (:obj:`str`): protein's complete/partial name; it is wrapped in
                    double quotes before being handed to the ``$text`` search.
                taxon_id (:obj:`int`): protein's ncbi taxonomy id.

            Returns:
                (:obj:`list` of :obj:`dict`): protein's metadata (without
                ``_id``, ancestor and kinetics fields).
        '''
        phrase = ''.join(('"', name, '"'))
        criteria = {'$and': [
            {'$text': {'$search': phrase}},
            {'ncbi_taxonomy_id': taxon_id},
            {'abundances': {'$exists': True}},
        ]}
        fields = {'_id': 0, 'ancestor_name': 0, 'ancestor_taxon_id': 0, 'kinetics': 0}
        cursor = self.collection.find(filter=criteria, projection=fields)
        # JSON round trip with ignore_nan so NaN values do not leak out.
        return [json.loads(json.dumps(record, ignore_nan=True)) for record in cursor]

    def get_meta_by_name_name(self, protein_name, species_name):
        '''
            Look up proteins by protein name and species name, keeping
            only documents that carry an ``abundances`` field.

            Args:
                protein_name (:obj:`str`): name of the protein; it is wrapped in
                    double quotes before being handed to the ``$text`` search.
                species_name (:obj:`str`): complete/partial name of the organism,
                    resolved to taxonomy ids through the taxon manager.

            Returns:
                (:obj:`list` of :obj:`dict`): protein's metadata (without
                ``_id``, ancestor and kinetics fields).
        '''
        species_ids = self.taxon_manager.get_ids_by_name(species_name)
        phrase = ''.join(('"', protein_name, '"'))
        criteria = {'$and': [
            {'$text': {'$search': phrase}},
            {'ncbi_taxonomy_id': {'$in': species_ids}},
            {'abundances': {'$exists': True}},
        ]}
        fields = {'_id': 0, 'ancestor_name': 0, 'ancestor_taxon_id': 0, 'kinetics': 0}
        cursor = self.collection.find(filter=criteria, projection=fields)
        # JSON round trip with ignore_nan so NaN values do not leak out.
        return [json.loads(json.dumps(record, ignore_nan=True)) for record in cursor]

    def get_id_by_name(self, name):
        '''
            Text-search the protein collection for ``name`` and list the
            hits' ids and names.

            Args:
                name (:obj:`str`): complete/incomplete protein name; it is wrapped
                    in double quotes before being handed to the ``$text`` search.

            Returns:
                (:obj:`list` of :obj:`dict`): list of dictionary containing
                protein's uniprot_id and name (empty when nothing matches).
        '''
        phrase = ''.join(('"', name, '"'))
        cursor = self.collection.find(
            filter={'$text': {'$search': phrase}},
            projection={'_id': 0, 'uniprot_id': 1, 'protein_name': 1})
        return [{'uniprot_id': hit['uniprot_id'], 'protein_name': hit['protein_name']}
                for hit in cursor]

    def get_info_by_text(self, name):
        '''
            Text-search the protein collection for ``name`` and group the
            matching uniprot ids by kegg orthology number.

            Args:
                name (:obj:`str`): complete/incomplete protein name.

            Returns:
                (:obj:`list` of :obj:`dict`): one entry per kegg orthology
                number, in order of first appearance
                [{'ko_number': ... 'ko_name': ... 'uniprot_ids': []},
                 {'ko_number': ... 'ko_name': ... 'uniprot_ids': []}].
                Documents without kegg fields are grouped under
                'no number' / ['no name'].
        '''
        phrase = ''.join(('"', name, '"'))
        cursor = self.collection.find(
            filter={'$text': {'$search': phrase}},
            projection={'_id': 0, 'uniprot_id': 1, 'ko_number': 1, 'ko_name': 1})

        groups = []
        for record in cursor:
            number = record.get('ko_number', 'no number')
            label = record.get('ko_name', ['no name'])
            uid = record['uniprot_id']
            # Positions of already-collected groups with this ko_number.
            hits = self.file_manager.search_dict_index(groups, 'ko_number', number)
            if len(hits) != 1:
                groups.append({'ko_number': number, 'ko_name': label, 'uniprot_ids': [uid]})
                continue
            groups[hits[0]]['uniprot_ids'].append(uid)
        return groups

    def get_info_by_text_abundances(self, name):
        '''
            Text-search the protein collection for ``name``, group the
            matching uniprot ids by kegg orthology number and flag which
            of them carry abundance data.

            Args:
                name (:obj:`str`): complete/incomplete protein name.

            Returns:
                (:obj:`list` of :obj:`dict`): one entry per kegg orthology
                number, in order of first appearance
                [{'ko_number': ... 'ko_name': ... 'uniprot_ids': {'id0': True, 'id1': False}},
                 {'ko_number': ... 'ko_name': ... 'uniprot_ids': {'id0': True, 'id1': False}}]
                where the flag is True when the document has an
                ``abundances`` field. A missing or 'nan' ko_number is
                grouped under 'no number' / ['no name'].
        '''
        phrase = ''.join(('"', name, '"'))
        cursor = self.collection.find(
            filter={'$text': {'$search': phrase}},
            projection={'_id': 0, 'uniprot_id': 1, 'ko_number': 1, 'ko_name': 1, 'abundances': 1})

        groups = []
        for record in cursor:
            number = record.get('ko_number')
            label = record.get('ko_name')
            if number is None or number == 'nan':
                number, label = 'no number', ['no name']
            uid = record['uniprot_id']
            has_abundance = 'abundances' in record
            # Positions of already-collected groups with this ko_number.
            hits = self.file_manager.search_dict_index(groups, 'ko_number', number)
            if len(hits) != 1:
                groups.append({'ko_number': number, 'ko_name': label,
                               'uniprot_ids': {uid: has_abundance}})
                continue
            groups[hits[0]]['uniprot_ids'][uid] = has_abundance
        return groups

    def get_info_by_taxonid(self, _id):
        '''
            Group the uniprot ids of all proteins of one species by kegg
            orthology number.

            Args:
                _id (:obj:`int`): ncbi taxonomy id.

            Returns:
                (:obj:`list` of :obj:`dict`): one entry per kegg orthology
                number, in order of first appearance
                [{'ko_number': ... 'ko_name': ... 'uniprot_ids': []},
                 {'ko_number': ... 'ko_name': ... 'uniprot_ids': []}].
                Documents without kegg fields are grouped under
                'no number' / ['no name'].
        '''
        cursor = self.collection.find(
            filter={'ncbi_taxonomy_id': _id},
            projection={'_id': 0, 'uniprot_id': 1, 'ko_number': 1, 'ko_name': 1})

        groups = []
        for record in cursor:
            number = record.get('ko_number', 'no number')
            label = record.get('ko_name', ['no name'])
            uid = record['uniprot_id']
            # Positions of already-collected groups with this ko_number.
            hits = self.file_manager.search_dict_index(groups, 'ko_number', number)
            if len(hits) != 1:
                groups.append({'ko_number': number, 'ko_name': label, 'uniprot_ids': [uid]})
                continue
            groups[hits[0]]['uniprot_ids'].append(uid)
        return groups

    def get_info_by_taxonid_abundance(self, _id):
        '''
            Group the uniprot ids of all proteins of one species by kegg
            orthology number and flag which of them carry abundance data.

            Args:
                _id (:obj:`int`): ncbi taxonomy id.

            Returns:
                result (:obj:`list` of :obj:`dict`): one entry per kegg
                orthology number, in order of first appearance
                [{'ko_number': ... 'ko_name': ... 'uniprot_ids': {'id0': True, 'id1': False}},
                 {'ko_number': ... 'ko_name': ... 'uniprot_ids': {'id0': True, 'id1': False}}]
                where the flag is True when the document has an
                ``abundances`` field.
        '''
        cursor = self.collection.find(
            filter={'ncbi_taxonomy_id': _id},
            projection={'_id': 0, 'uniprot_id': 1, 'ko_number': 1, 'ko_name': 1, 'abundances': 1})

        groups = []
        for record in cursor:
            number = record.get('ko_number', 'no number')
            label = record.get('ko_name', ['no name'])
            uid = record['uniprot_id']
            has_abundance = 'abundances' in record
            # Positions of already-collected groups with this ko_number.
            hits = self.file_manager.search_dict_index(groups, 'ko_number', number)
            if len(hits) != 1:
                groups.append({'ko_number': number, 'ko_name': label,
                               'uniprot_ids': {uid: has_abundance}})
                continue
            groups[hits[0]]['uniprot_ids'][uid] = has_abundance
        return groups

    def get_info_by_ko(self, ko):
        '''
            Find all proteins with the same kegg orthology id.

            Args:
                ko (:obj:`str`): kegg orthology ID (upper-cased before querying).

            Returns:
                (:obj:`list` of :obj:`dict`): single-element list
                [{'ko_number': ..., 'uniprot_ids': [...], 'ko_name': ...}];
                the 'ko_name' key is only present when at least one
                protein matched.
        '''
        ko = ko.upper()
        entry = {'ko_number': ko, 'uniprot_ids': []}
        cursor = self.collection.find(
            filter={'ko_number': ko},
            projection={'uniprot_id': 1, '_id': 0, 'ko_name': 1, 'ko_number': 1})

        for record in cursor:
            # The name of the last document seen wins.
            entry['ko_name'] = record.get('ko_name', ['no name'])
            entry['uniprot_ids'].append(record.get('uniprot_id'))
        return [entry]

    def get_info_by_ko_abundance(self, ko):
        '''
            Find all proteins with the same kegg orthology id and flag
            which of them carry abundance data.

            Args:
                ko (:obj:`str`): kegg orthology ID (upper-cased before querying).

            Returns:
                (:obj:`list` of :obj:`dict`): single-element list
                [{'ko_number': ..., 'uniprot_ids': {'id0': True, 'id1': False}, 'ko_name': ...}]
                where the flag is True when the document has an
                ``abundances`` field; the 'ko_name' key is only present
                when at least one protein matched.
        '''
        ko = ko.upper()
        entry = {'ko_number': ko, 'uniprot_ids': {}}
        cursor = self.collection.find(
            filter={'ko_number': ko},
            projection={'uniprot_id': 1, '_id': 0, 'ko_name': 1, 'ko_number': 1, 'abundances': 1})

        for record in cursor:
            # The name of the last document seen wins.
            entry['ko_name'] = record.get('ko_name', ['no name'])
            has_abundance = 'abundances' in record
            entry['uniprot_ids'][record.get('uniprot_id')] = has_abundance
        return [entry]

    def get_kinlaw_by_id(self, _id):
        '''
            Get protein kinetic law information by uniprot_id.

            Args:
                _id (:obj:`list` of :obj:`str`): list of uniprot IDs.

            Returns:
                (:obj:`list` of `dict`): one entry per matching protein with
                keys 'uniprot_id', 'ncbi_taxonomy_id' (read from the
                document's 'taxon' field) and 'similar_functions' (read
                from the document's 'kinetics' field).
        '''
        # NOTE(review): other queries in this class read 'ncbi_taxonomy_id';
        # confirm the documents really carry a 'taxon' field.
        cursor = self.collection.find(
            filter={'uniprot_id': {'$in': _id}},
            projection={'_id': 0, 'kinetics': 1, 'taxon': 1, 'uniprot_id': 1},
            collation=self.collation)
        return [{'uniprot_id': record.get('uniprot_id'),
                 'ncbi_taxonomy_id': record.get('taxon'),
                 'similar_functions': record.get('kinetics')}
                for record in cursor]

    def get_kinlaw_by_name(self, name):
        '''
        Get protein kinetic law information by protein name.

        The name is first resolved to uniprot ids with get_id_by_name,
        then the ids are handed to get_kinlaw_by_id.

        Args:
            name: (:obj:`str`): protein's name.

        Returns:
            (:obj:`list` of :obj:`dict`): information.
        '''
        uniprot_ids = [hit['uniprot_id'] for hit in self.get_id_by_name(name)]
        return self.get_kinlaw_by_id(uniprot_ids)

    def get_abundance_by_id(self, _id):
        '''
            Get protein abundance information by uniprot_id.

            Only proteins that carry an ``abundances`` field are returned.

            Args:
                _id (:obj:`list` of :obj:`str`): list of uniprot_id.

            Returns:
                (:obj:`list` of `dict`): list of abundance information; when
                nothing matches, a one-element list with a placeholder
                record is returned.
        '''
        query = {'$and': [{'uniprot_id': {'$in': _id}}, {'abundances': {'$exists': True}}]}
        projection = {'abundances': 1, 'uniprot_id': 1, '_id': 0,
                      'protein_name': 1, 'gene_name': 1, 'species_name': 1,
                      "modifications": 1}
        # Materialise the cursor once; this replaces a separate
        # count_documents round trip that only served the emptiness check.
        result = list(self.collection.find(filter=query, projection=projection,
                                           collation=self.collation))
        if not result:
            return [{'abundances': [], 'uniprot_id': 'No proteins that match input',
                     "species_name": "No proteins that match input"}]
        return result

    def get_abundance_by_taxon(self, _id):
        '''
            Get the protein documents of one species.

            The query filters on the taxonomy id only, so proteins
            without an ``abundances`` field are returned as well.

            Args:
                _id (:obj:`str`): taxonomy id.

            Returns:
                (:obj:`list` of `dict`): matching documents without ``_id``,
                ancestor and ``ncbi_taxonomy_id`` fields.
        '''
        hidden = {'ancestor_name': 0, 'ancestor_taxon_id': 0, '_id': 0, 'ncbi_taxonomy_id': 0}
        cursor = self.collection.find(filter={'ncbi_taxonomy_id': _id}, projection=hidden)
        return list(cursor)

    def get_proximity_abundance_taxon(self, _id, max_distance=3):
        '''
            Get replacement abundance value by taxonomic distance
            with the same kegg_orthology number.

            The protein's nearest ``max_distance`` ancestor taxa are taken
            from its ``ancestor_taxon_id`` list; proteins with the same
            ko_number whose ``ncbi_taxonomy_id`` is one of those taxa are
            bucketed by how many steps up the list the taxon sits
            (the last ancestor in the list is distance 1).

            Args:
                _id (:obj:`str`): uniprot_id to query for
                max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                            proteins in results.

            Returns:
                (:obj:`list` of :obj:`dict`): list of result proteins and their info
                    [{'distance': 1, 'documents': [{}, {}, {} ...]},
                     {'distance': 2, 'documents': [{}, {}, {} ...]}, ...];
                    buckets that received documents also carry an
                    'ancestor_names' key.
                (:obj:`str`): an explanatory message when ``max_distance``
                    is not positive or the protein has no ancestor
                    information.

            Raises:
                KeyError: if the protein document has no ``ko_number``.
        '''
        if max_distance <= 0:
            return 'Please use get_abundance_by_id to check self abundance values'

        result = [{'distance': i + 1, 'documents': []} for i in range(max_distance)]

        query = {'$and': [{'uniprot_id': _id},
                          {'ancestor_taxon_id': {'$exists': True}}]}  # needs indexing
        projection = {'ko_number': 1, 'ancestor_taxon_id': 1, 'ancestor_name': 1}
        protein = self.collection.find_one(filter=query, projection=projection,
                                           collation=self.collation)
        if protein is None:
            return 'This protein has no ancestor information to base upon'

        # A negative slice already yields the whole list when it is shorter
        # than max_distance, so no length check is needed.
        close_relatives = (protein.get('ancestor_taxon_id') or [])[-max_distance:]
        close_names = (protein.get('ancestor_name') or [])[-max_distance:]

        ko_number = protein['ko_number']
        query = {'$and': [{'ko_number': ko_number},
                          {'ncbi_taxonomy_id': {'$in': close_relatives}}]}  # needs indexing
        projection = {'abundances': 1, 'ncbi_taxonomy_id': 1, 'species_name': 1,
                      'uniprot_id': 1, '_id': 0}
        docs = self.collection.find(filter=query, projection=projection,
                                    collation=self.collation)

        n_relatives = len(close_relatives)
        for doc in docs:
            # Measure from the end of the (possibly short) ancestor slice so
            # that the immediate parent is always distance 1.
            distance = n_relatives - close_relatives.index(doc['ncbi_taxonomy_id'])
            bucket = result[distance - 1]
            bucket['documents'].append(doc)
            bucket['ancestor_names'] = close_names

        return result

    def get_equivalent_protein(self, _id, max_distance, max_depth=float('inf')):
        '''
            Get replacement abundance value by taxonomic distance
            with the same kegg_orthology number.

            Args:
                _id (:obj:`str` or :obj:`list` of :obj:`str`): uniprot_id(s) to query
                    for; a single string is wrapped in a list because the
                    lookup uses ``$in``. The first matching protein is used.
                max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                            proteins in results.
                max_depth (:obj:`int`) max depth allowed from the common node
                    (None is treated as 1000).

            Returns:
                (:obj:`list` of :obj:`dict`): list of result proteins and their info
                    [{'distance': 1, 'documents': [{}, {}, {} ...]},
                     {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
                (:obj:`str`): an explanatory message when an argument is
                    out of range or no protein matches.

            Raises:
                KeyError: if the protein document has no ``ko_number`` or
                    ``ncbi_taxonomy_id``.
        '''
        if max_distance <= 0:
            return 'Please use get_abundance_by_id to check self abundance values'
        if max_depth is None:
            max_depth = 1000
        if max_depth <= 0:
            return 'Max_depth has to be greater than 0'

        result = [{'distance': i + 1, 'documents': []} for i in range(max_distance)]

        # $in needs an array; accept the documented single-string form too.
        uniprot_ids = [_id] if isinstance(_id, str) else _id
        query = {'uniprot_id': {'$in': uniprot_ids}}  # needs indexing
        projection = {'ko_number': 1, 'ancestor_taxon_id': 1, 'ancestor_name': 1, 'ncbi_taxonomy_id': 1}
        protein = self.collection.find_one(query, projection=projection, collation=self.collation)
        if protein is None:
            return 'This protein has no ancestor information to base upon'

        ko_number = protein['ko_number']
        # A document without ancestors simply yields empty buckets.
        ancestor_ids = protein.get('ancestor_taxon_id') or []
        levels = min(len(ancestor_ids), max_distance)
        checked_ids = [protein['ncbi_taxonomy_id']]

        projection = {'abundances': 1, 'ncbi_taxonomy_id': 1, 'species_name': 1,
                      'uniprot_id': 1, '_id': 0, 'ancestor_taxon_id': 1}
        for level in range(levels):
            cur_id = ancestor_ids[-(level + 1)]
            # Ancestors shared with relatives at this level: the full chain
            # first, then one fewer trailing ancestor per level.
            common_ancestors = ancestor_ids if level == 0 else ancestor_ids[:-level]
            length = len(common_ancestors)

            query = {'$and': [{'ancestor_taxon_id': {'$all': common_ancestors}},
                              {'ncbi_taxonomy_id': {'$nin': checked_ids}},
                              {'ancestor_taxon_id': {'$nin': checked_ids}},
                              {'ko_number': ko_number},
                              {'abundances': {'$exists': True}}]}

            for equivalent in self.collection.find(filter=query, projection=projection):
                depth = len(equivalent['ancestor_taxon_id']) - length
                if 0 <= depth < max_depth:
                    equivalent['depth'] = depth + 1
                    # The ancestor list was only needed to compute the depth.
                    del equivalent['ancestor_taxon_id']
                    result[level]['documents'].append(equivalent)
            # Exclude this ancestor's subtree from the wider searches.
            checked_ids.append(cur_id)

        return result

    def get_equivalent_protein_with_anchor(self, _id, max_distance, max_depth=float('inf')):
        '''
            Get replacement abundance value by taxonomic distance
            with the same kegg_orthology number, including the queried
            (anchor) protein itself when it has abundance data.

            Args:
                _id (:obj:`str`): uniprot_id to query for.
                max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                            proteins in results.
                max_depth (:obj:`int`) max depth allowed from the common node
                    (None is treated as 1000).

            Returns:
                (:obj:`list` of :obj:`dict`): list of result proteins and their info
                    [{'distance': 0, 'documents': [{}]}
                     {'distance': 1, 'documents': [{}, {}, {} ...]},
                     {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
                    ``[{'distance': -1, 'documents': []}]`` when the protein
                    is unknown and ``[{'distance': -2, 'documents': []}]``
                    when it has no ko_number.
                (:obj:`str`): an explanatory message when an argument is
                    out of range.

            Raises:
                KeyError: if the anchor has abundances but lacks
                    ``species_name``, ``ncbi_taxonomy_id`` or ``ko_name``.
        '''
        if max_distance <= 0:
            return 'Please use get_abundance_by_id to check self abundance values'
        if max_depth is None:
            max_depth = 1000
        if max_depth <= 0:
            return 'Max_depth has to be greater than 0'

        result = [{'distance': i, 'documents': []} for i in range(max_distance)]

        query = {'uniprot_id': _id}  # needs indexing
        projection = {
            'ko_number': 1,
            'ko_name': 1,
            'ancestor_taxon_id': 1,
            'ancestor_name': 1,
            'ncbi_taxonomy_id': 1,
            'abundances': 1,
            'species_name': 1,
            'uniprot_id': 1,
            '_id': 0,
        }
        protein = self.collection.find_one(query, projection=projection)
        if protein is None:
            return [{'distance': -1, 'documents': []}]
        if protein.get('ko_number') is None:
            return [{'distance': -2, 'documents': []}]
        if protein.get('abundances') is not None:
            # The anchor protein itself goes into the first bucket at depth 0.
            result[0]['documents'].append({
                'abundances': protein['abundances'],
                'ncbi_taxonomy_id': protein['ncbi_taxonomy_id'],
                'species_name': protein['species_name'],
                'uniprot_id': _id,
                'depth': 0,
                'ko_number': protein['ko_number'],
                'ko_name': protein['ko_name'],
            })

        # Relatives are matched on the upper-cased KO number.
        ko_number = protein['ko_number'].upper()
        # A document without ancestors simply yields no relatives.
        ancestor_ids = protein.get('ancestor_taxon_id') or []
        levels = min(len(ancestor_ids), max_distance)
        checked_ids = [protein['ncbi_taxonomy_id']]

        projection = {'abundances': 1, 'ncbi_taxonomy_id': 1, 'species_name': 1,
                      'uniprot_id': 1, '_id': 0, 'ancestor_taxon_id': 1, 'ko_number': 1,
                      'ko_name': 1}
        for level in range(levels):
            cur_id = ancestor_ids[-(level + 1)]
            # Ancestors shared with relatives at this level: the full chain
            # first, then one fewer trailing ancestor per level.
            common_ancestors = ancestor_ids if level == 0 else ancestor_ids[:-level]
            length = len(common_ancestors)

            query = {'$and': [{'ancestor_taxon_id': {'$all': common_ancestors}},
                              {'ncbi_taxonomy_id': {'$nin': checked_ids}},
                              {'ancestor_taxon_id': {'$nin': checked_ids}},
                              {'ko_number': ko_number},
                              {'abundances': {'$exists': True}}]}

            for equivalent in self.collection.find(filter=query, projection=projection):
                depth = len(equivalent['ancestor_taxon_id']) - length
                if 0 <= depth < max_depth:
                    equivalent['depth'] = depth + 1
                    # The ancestor list was only needed to compute the depth.
                    del equivalent['ancestor_taxon_id']
                    # NOTE(review): level-0 relatives land in result[0], the
                    # same bucket as the anchor protein; confirm this offset
                    # is intended before relying on 'distance'.
                    result[level]['documents'].append(equivalent)
            # Exclude this ancestor's subtree from the wider searches.
            checked_ids.append(cur_id)

        return result

    def get_uniprot_by_ko(self, ko):
        '''
            Find all proteins with the same kegg orthology id.

            Args:
                ko (:obj:`str`): kegg orthology ID (upper-cased before querying).

            Return:
                (:obj:`list` of :obj:`str`): list of uniprot_id, or the
                message 'No information available for this KO.' (a
                :obj:`str`) when nothing matches.
        '''
        query = {'ko_number': ko.upper()}
        projection = {'uniprot_id': 1, '_id': 0}
        # One query is enough: an empty result list already tells us nothing
        # matched, so no separate count_documents round trip is needed.
        result = [doc['uniprot_id']
                  for doc in self.collection.find(filter=query, projection=projection)]
        if not result:
            return 'No information available for this KO.'
        return result


    '''
        The methods below are "super" methods that try to predict some 
        commonly used functionalities for modelers
    '''


    def get_abundance_with_same_ko(self, _id):
        '''Find abundance information for proteins sharing the KO
            number of the given protein.

            Args:
                _id (:obj:`str`): uniprot ID.

            Returns:
                (:obj:`list` of :obj:`dict`): documents (without ``_id`` and
                ancestor fields) of every protein that has the same
                ko_number and an ``abundances`` field
                [{'uniprot_id': , 'abundances': }, {},...,{}].
                (:obj:`str`): an explanatory message when the protein is
                unknown or has no ko_number.
        '''
        anchor = self.collection.find_one(
            filter={'uniprot_id': _id},
            projection={'_id': 0, 'ko_number': 1, 'ncbi_taxonomy_id': 1, 'uniprot_id': 1},
            collation=self.collation)
        if anchor is None:
            return 'No such protein in the database.'

        shared_ko = anchor.get('ko_number')
        if shared_ko is None:
            return 'No kegg information available for this protein.'

        cursor = self.collection.find(
            filter={'$and': [{'ko_number': shared_ko}, {'abundances': {'$exists': True}}]},
            projection={'ancestor_name': 0, 'ancestor_taxon_id': 0, '_id': 0})
        return list(cursor)

    def get_abundance_by_ko(self, ko):
        ''' Get abundance information of proteins with
            the same KO.

            Args:
                ko (:obj:`str`): KO number (upper-cased before querying).

            Returns:
                (:obj:`list` of :obj:`dict`): information
                [{'uniprot_id': , 'abundances': }, {},...,{}];
                an empty list when no protein with that KO carries
                an ``abundances`` field.
        '''
        query = {'$and': [{'ko_number': ko.upper()}, {'abundances': {'$exists': True}}]}
        projection = {'abundances': 1, 'uniprot_id': 1, '_id': 0}
        # find() always hands back a cursor, never None, so the former
        # "docs == None" placeholder branch could not run and is dropped.
        return list(self.collection.find(filter=query, projection=projection))
    
    def get_kegg_orthology(self, uniprot_id):
        """Look up the kegg orthology number and names of one protein.

        Args:
            uniprot_id (:obj:`str`): protein's uniprot id.

        Returns:
            (:obj:`tuple`): tuple containing:
                (:obj:`str`): kegg orthology id, or None when the protein is
                unknown or has no ko_number;
                (:obj:`list` of :obj:`str`): list of kegg orthology descriptions
                (empty when unavailable).
        """
        record = self.collection.find_one(
            filter={'uniprot_id': uniprot_id},
            projection={'_id': 0, 'ko_number': 1, 'ko_name': 1})
        if record is None:
            return None, []
        return record.get('ko_number'), record.get('ko_name', [])

    def get_equivalent_kegg_with_anchor_obsolete(self, ko, anchor, max_distance, max_depth=float('inf')):
        '''
            Get replacement abundance value by taxonomic distance
            with the same kegg_orthology number, measured from an
            anchor species.

            Args:
                ko (:obj:`str`): kegg orthology id to query for (used as given,
                    without case normalisation).
                anchor (:obj:`str`): anchor species' name.
                max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                            proteins in results.
                max_depth (:obj:`int`) max depth allowed from the common node
                    (None is treated as 1000).

            Returns:
                (:obj:`list` of :obj:`dict`): list of result proteins and their info
                    [{'distance': 0, 'documents': [{}]}
                     {'distance': 1, 'documents': [{}, {}, {} ...]},
                     {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
                (:obj:`str`): an explanatory message when an argument is
                    out of range.
        '''
        if max_distance <= 0:
            return 'Please use get_abundance_by_id to check self abundance values'
        if max_depth is None:
            max_depth = 1000
        if max_depth <= 0:
            return 'Max_depth has to be greater than 0'

        result = [{'distance': i, 'documents': []} for i in range(max_distance)]

        ko_number = ko
        ancestor_ids, _ = self.taxon_manager.get_anc_by_name([anchor])
        ancestor_ids = ancestor_ids[0]
        ncbi_id = self.taxon_manager.get_ids_by_name(anchor)
        query = {'$and': [{'ko_number': ko_number},
                          {'ncbi_taxonomy_id': {'$in': ncbi_id}}]}
        projection = {
            'ko_number': 1,
            'ko_name': 1,
            'ancestor_name': 1,
            'ncbi_taxonomy_id': 1,
            'abundances': 1,
            'species_name': 1,
            'uniprot_id': 1,
            '_id': 0,
            'ancestor_taxon_id': 1,
            'protein_name': 1,
            'gene_name': 1
        }
        # Proteins of the anchor species itself go into the first bucket.
        docs = self.collection.find(filter=query, projection=projection, collation=self.collation)
        result[0]['documents'].extend(docs)

        levels = min(len(ancestor_ids), max_distance)
        # Work on a copy so the list handed out by the taxon manager is not
        # mutated by the appends below.
        checked_ids = list(ncbi_id)

        projection = {'abundances': 1, 'ncbi_taxonomy_id': 1, 'species_name': 1,
                      'uniprot_id': 1, '_id': 0, 'ancestor_taxon_id': 1, 'ko_number': 1,
                      'ko_name': 1, 'protein_name': 1, 'gene_name': 1}
        for level in range(levels):
            cur_id = ancestor_ids[-(level + 1)]
            # Ancestors shared with relatives at this level: the full chain
            # first, then one fewer trailing ancestor per level.
            common_ancestors = ancestor_ids if level == 0 else ancestor_ids[:-level]
            length = len(common_ancestors)

            query = {'$and': [{'ancestor_taxon_id': {'$all': common_ancestors}},
                              {'ncbi_taxonomy_id': {'$nin': checked_ids}},
                              {'ancestor_taxon_id': {'$nin': checked_ids}},
                              {'ko_number': ko_number},
                              {'abundances': {'$exists': True}}]}

            for equivalent in self.collection.find(filter=query, projection=projection):
                depth = len(equivalent['ancestor_taxon_id']) - length
                if 0 <= depth < max_depth:
                    equivalent['depth'] = depth + 1
                    # The ancestor list was only needed to compute the depth.
                    del equivalent['ancestor_taxon_id']
                    result[level]['documents'].append(equivalent)
            # Exclude this ancestor's subtree from the wider searches.
            checked_ids.append(cur_id)

        return result

    def get_unique_protein(self):
        """Get number of unique proteins in collection

        Return:
            (:obj:`int`): number of unique proteins.
        """
        return len(self.collection.distinct('uniprot_id', collation=self.collation))
    
    def get_unique_organism(self):
        """Get number of unique organisms in collection.

        Return:
            (:obj:`int`): number of unique organisms.
        """
        return len(self.collection.distinct('ncbi_taxonomy_id'))

    def get_all_kegg(self, ko, anchor, max_distance):
        '''Get replacement abundance value by taxonomic distance
            with the same kegg_orthology number.

        Args:
            ko (:obj:`str`): kegg orthology id to query for.
            anchor (:obj:`str`): anchor species' name.
            max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                        proteins in results.
            max_depth (:obj:`int`) max depth allowed from the common node.

        Returns:
            (:obj:`list` of :obj:`dict`): list of result proteins and their info 
            [
            {'distance': 1, 'documents': [{}, {}, {} ...]}, 
            {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
        '''
        if max_distance <= 0:
            return 'Please use get_abundance_by_id to check self abundance values'

        result = []
        for i in range(max_distance):
            result.append({'distance': i + 1, 'documents': []})

        projection = {
            'ko_number': 1,
            'ko_name': 1,
            'ancestor_name': 1,
            'ncbi_taxonomy_id': 1,
            'abundances': 1,
            'species_name': 1,
            'uniprot_id': 1,
            '_id': 0,
            'ancestor_taxon_id': 1,
            'protein_name': 1,
            'gene_name': 1,
            'modifications': 1
        }
        con_0 = {'ko_number': ko}
        con_1 = {'abundances': {'$exists': True}}
        query = {'$and': [con_0, con_1]}
        docs = self.collection.find(filter=query, projection=projection)
        queried = deque()
        names = {}
        for doc in docs:
            doc = json.loads(json.dumps(doc, ignore_nan=True))
            species = doc.get('species_name')
            if species is None and species not in queried:
                taxon_id = doc['ncbi_taxonomy_id']
                species = self.db_obj['taxon_tree'].find_one({"tax_id": taxon_id})['tax_name']
                queried.append(taxon_id)
                names[taxon_id] = species
            elif species is None and species in queried:
                species = names[doc['ncbi_taxonomy_id']]
            obj = self.taxon_manager.get_canon_common_ancestor(anchor, species, org_format='tax_name')
            distance = obj[anchor]            
            if distance != -1 and distance <= max_distance:
                species_canon_ancestor = obj[species+'_canon_ancestors']
                doc['canon_ancestors'] = species_canon_ancestor
                result[distance-1]['documents'].append(doc)
        return result

    def get_all_ortho(self, ko, anchor, max_distance):
        '''Get replacement abundance value by taxonomic distance
            with the same OrthoDB group number.

        Args:
            ko (:obj:`str`): OrthoDB group id to query for.
            anchor (:obj:`str`): anchor species' name.
            max_distance (:obj:`int`): max taxonomic distance from origin protein allowed for
                                        proteins in results.
            max_depth (:obj:`int`) max depth allowed from the common node.

        Returns:
            (:obj:`list` of :obj:`dict`): list of result proteins and their info 
            [
            {'distance': 1, 'documents': [{}, {}, {} ...]}, 
            {'distance': 2, 'documents': [{}, {}, {} ...]}, ...].
        '''
        if max_distance <= 0:
            return 'Please use get_abundance_by_id to check self abundance values'

        result = []
        for i in range(max_distance):
            result.append({'distance': i + 1, 'documents': []})

        projection = {
            'orthodb_id': 1,
            'orthodb_name': 1,
            'ncbi_taxonomy_id': 1,
            'abundances': 1,
            'species_name': 1,
            'uniprot_id': 1,
            '_id': 0,
            'protein_name': 1,
            'gene_name': 1,
            'modifications': 1,
            "canon_anc_ids": 1,
            "canon_anc_names": 1
        }
        con_0 = {'orthodb_id': ko}
        con_1 = {'abu_exist': True}
        query = {'$and': [con_0, con_1]}
        docs = self.collection.find(filter=query, projection=projection,
                                    hint=[("abu_exist", ASCENDING), ("orthodb_id", ASCENDING)],
                                    batch_size=100)
        queried = deque()
        distances = {}
        names = {}
        species_anc = {}
        canon_anc_anchor = self.taxon_col.find_one({"tax_name": anchor})['canon_anc_names']
        for doc in docs:
            doc = json.loads(json.dumps(doc, ignore_nan=True))
            species = doc.get('species_name')
            taxon_id = doc['ncbi_taxonomy_id']
            if species is None and taxon_id not in queried: # few documents don't have species_name field
                tmp = self.taxon_col.find_one({"tax_id": taxon_id})
                species = tmp['tax_name']
                canon_anc_species = tmp["canon_anc_names"]
                queried.append(taxon_id)
                names[taxon_id] = species
                species_anc[taxon_id] = canon_anc_species
                obj = self._get_common_canon_anc(anchor, species, canon_anc_anchor, doc["canon_anc_names"])
                distances[taxon_id] = obj
            elif species is not None and taxon_id not in queried:
                canon_anc_species = doc["canon_anc_names"]
                queried.append(taxon_id)
                names[taxon_id] = species
                species_anc[taxon_id] = canon_anc_species
                obj = self._get_common_canon_anc(anchor, species, canon_anc_anchor, doc["canon_anc_names"])
                distances[taxon_id] = obj                
            else:
                species = names[taxon_id]
                canon_anc_species = species_anc[taxon_id]
                obj = distances[taxon_id]            
            distance = obj[anchor]            
            if distance != -1 and distance <= max_distance:
                species_canon_ancestor = obj[species+'_canon_ancestors']
                doc['canon_ancestors'] = species_canon_ancestor
                result[distance-1]['documents'].append(doc)
        return result

    def _get_common_canon_anc(self, org1, org2, canon_anc_1, canon_anc_2):
        """Get canon common ancestors between species.

        Args:
            org1(:obj:`str`): Name of species 1.
            org2(:obj:`str`): Name of species 2.
            canon_anc_1(:obj:`list` of `str`): List of canon ancestor names of species 1.
            canon_anc_2(:obj:`list` of `str`): List of canon ancestor names of species 2.

        Return:
            (:obj:`Obj`): {str(org1): distance1, str(org2): distance2, str(org1)+'_canon_ancestors':canon_anc_1,
                           str(org2)+'_canon_ancestors':canon_anc_2}
        """
        if canon_anc_1 == canon_anc_2:
            return {str(org1): 0, str(org1)+'_canon_ancestors':canon_anc_1,
                    str(org2): 0, str(org2)+'_canon_ancestors':canon_anc_1}

        if canon_anc_1[-1] == org2:
            distance1 = 1
            distance2 = 0
        elif canon_anc_2[-1] == org1:
            distance1 = 0
            distance2 = 1
        else:
            distance1 = -1
            distance2 = -1            
        ancestor = self.file_manager.get_common(canon_anc_1, canon_anc_2)
        if ancestor == '':                
            return {str(org1): -1, str(org2): -1, 'reason': 'No common ancestor.'}

        idx_org1 = canon_anc_1.index(ancestor)
        idx_org2 = canon_anc_2.index(ancestor)

        if distance1 == -1:
            distance1 = len(canon_anc_1) - (idx_org1) 
        if distance2 == -1:
            distance2 = len(canon_anc_2) - (idx_org2)

        return {str(org1): distance1, str(org2): distance2, str(org1)+'_canon_ancestors':canon_anc_1,
        str(org2)+'_canon_ancestors':canon_anc_2}

    def get_info_by_orthodb(self, orthodb):
        '''
            Find all proteins with the same kegg orthology id.

            Args:
                orthodb(:obj:`str`): kegg orthology ID.

            Returns:
                (:obj:`list` of :obj:`dict`): list of dictionary containing 
                protein's uniprot_id and kegg information
                [{'orthodb_id': ... 'orthodb_name': ... 'uniprot_ids': []},
                 {'orthodb_id': ... 'orthodb_name': ... 'uniprot_ids': []}].
        '''
        ko = orthodb.lower()
        result = [{'orthodb_id': ko, 'uniprot_ids': []}]
        query = {'orthodb_id': ko}
        projection = {'uniprot_id': 1, '_id': 0, 'orthodb_name': 1, 'orthodb_id': 1}
        docs = self.collection.find(filter=query, projection=projection)

        for doc in docs:
            result[0]['orthodb_name'] = doc.get('orthodb_name', ['no name'])
            result[0]['uniprot_ids'].append(doc.get('uniprot_id'))
        return result