datanator/data_source/rna_halflife/doi_10_1186_s12864_016_3219_8.py from KarrLab/datanator

datanator/data_source/rna_halflife/doi_10_1186_s12864_016_3219_8.py
Summary

Maintainability

3 days
Test Coverage

74%
Issues
Coverage
import pandas as pd
from datanator.util import mongo_util
import json
import datetime
from pymongo import ASCENDING
from pymongo.collation import Collation, CollationStrength
import datanator.config.core
from datanator_query_python.query import query_uniprot
from datanator.data_source import uniprot_nosql


class Halflife(mongo_util.MongoUtil):

    def __init__(self, cache_dir=None, server=None, db=None, collection_str=None,
                authDB=None, readPreference=None, username=None, password=None,
                verbose=None, max_entries=None, uniprot_col_db=None):
        """Init
        
        Args:
            cache_dir (:obj:`str`, optional): Cache directory for logs. Defaults to None.
            server (:obj:`str`, optional): MongoDB server address. Defaults to None.
            db (:obj:`str`, optional): Database where initial uniprot collection resides. Defaults to None.
            collection_str (:obj:`str`, optional): name of collection. Defaults to None.
            authDB (:obj:`str`, optional): MongoDB authentication database. Defaults to None.
            readPreference (:obj:`str`, optional): MongoDB read preference. Defaults to None.
            username (:obj:`str`, optional): MongoDB username. Defaults to None.
            password (:obj:`str`, optional): MongoDB password. Defaults to None.
            verbose (:obj:`bool`, optional): Wheter to display verbose messages. Defaults to None.
            max_entries (:obj:`int`, optional): Number of records to be processed. Defaults to None.
            uniprot_col_db (:obj:`int`, optional): Database to which new uniprot records will be inserted. Defaults to None.
        """
        super(Halflife, self).__init__(MongoDB=server, db=uniprot_col_db, username=username,
                                        password=password, authSource=authDB,
                                        verbose=verbose)
        self.cache_dir = cache_dir
        self.client, self.db, self.collection = self.con_db(collection_str)
        self.url = "https://static-content.springer.com/esm/art%3A10.1186%2Fs12864-016-3219-8/MediaObjects/12864_2016_3219_MOESM5_ESM.xlsx"
        self.max_entries = max_entries
        self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
        self.uniprot_manager = query_uniprot.QueryUniprot(username=username, password=password,
                                            server=server, database=db, collection_str='uniprot')
        self.uniprot_col_manager = uniprot_nosql.UniprotNoSQL(MongoDB=server, db=uniprot_col_db, max_entries=max_entries,
                                                              username=username, password=password, collection_str='uniprot')
        self.username = username
        self.password = password
        self.server = server
        self.max_entries = max_entries
        self.db_str = uniprot_col_db
        self.verbose = verbose

    def download_xlsx(self, sheet_name):
        """Download supplementary xlsx file
        
        Args:
            sheet_name (:obj:`str`): name of sheet in xlsx
        
        Returns:
            (:obj:`pandas.DataFrame`): xlsx transformed to pandas.DataFrame
        """
        if self.max_entries == float('inf'):
            nrows = None
        else:
            nrows = self.max_entries
        data = pd.read_excel(self.url, sheet_name=sheet_name, nrows=nrows)
        columns = ['gene_fragment', 'cog_class', 'ar_cog', 'cog', 'function', 'gene_name', 'half_life', 'half_life_std', 'std_over_avg']
        data.columns = columns
        data['half_life'] = data['half_life'].apply(lambda x: x*60)
        data['half_life_std'] = data['half_life_std'].apply(lambda x: x*60)
        return data

    def load_halflife(self, df, growth_medium='MeOH', start=0):
        df_json = json.loads(df.to_json(orient='records'))
        row_count = len(df.index)
        for i, doc in enumerate(df_json[start:]):
            if i == self.max_entries:
                break
            if self.verbose and i % 100 == 0:
                print('Processing {} row {} out of {}'.format(growth_medium, i + start, row_count))
            doc['halflives'] = [{'halflife': doc['half_life'], 'std': doc['half_life_std'], 'std_over_avg': doc['std_over_avg'],
                                'unit': 's', 'reference': [{'doi': '10.1186/s12864-016-3219-8'}], 'growth_medium': growth_medium,
                                'ordered_locus_name': doc['gene_fragment'], 'ar_cog': doc['ar_cog'], 'cog_class': doc['cog_class'],
                                'cog': doc['cog'], 'species': 'Methanosarcina acetivorans', 'ncbi_taxonomy_id': 188937}]
            doc['modified'] = datetime.datetime.utcnow()
            del doc['half_life']
            del doc['half_life_std']
            del doc['std_over_avg']
            del doc['ar_cog']
            del doc['cog']
            del doc['cog_class']

            if doc['gene_name'] != '-':
                self.collection.update_one({'gene_name': doc['gene_name']},
                                    {'$set': doc}, upsert=True, collation=self.collation)
            elif doc['function'] != '-':
                self.fill_uniprot_by_oln(doc['gene_fragment'])
                self.collection.update_one({'function': doc['function']},
                                           {'$set': doc}, upsert=True, collation=self.collation)
            else:
                self.fill_uniprot_by_oln(doc['gene_fragment'])
                self.collection.update_one({'halflives.ordered_locus_name': doc['gene_fragment']},
                                        {'$set': doc}, upsert=True, collation=self.collation)
            if i == 0:
                self.collection.create_index([("gene_name", ASCENDING)], background=True,
                                            collation=self.collation)
                self.collection.create_index([("halflives.ordered_locus_name", ASCENDING)], background=True,
                                            collation=self.collation)
                self.collection.create_index([("function", ASCENDING)], background=True,
                                            collation=self.collation)

        self.collection.update_many({'gene_fragment':{'$exists': True}}, {'$unset': {'gene_fragment': ""}})

    def add_to_halflife(self, df, growth_medium='TMA'):
        """Add df to existing rna_halflife collection
        
        Args:
            df (:obj:`pandas.DataFrame`): dataframe to be added.
            growth_medium (:obj:`str`): medium in which the cells were grown. Defaults to TMA.
        """
        df_json = json.loads(df.to_json(orient='records'))
        row_count = len(df.index)
        for i, doc in enumerate(df_json):
            if i == self.max_entries:
                break
            if self.verbose and i % 100 == 0:
                print('Processing {} row {} out of {}'.format(growth_medium, i, row_count))
            to_add = {'halflife': doc['half_life'], 'std': doc['half_life_std'], 'std_over_avg': doc['std_over_avg'],
                    'unit': 's', 'reference': [{'doi': '10.1186/s12864-016-3219-8'}], 'growth_medium': growth_medium,
                    'ordered_locus_name': doc['gene_fragment'], 'ar_cog': doc['ar_cog'], 'cog_class': doc['cog_class'],
                    'cog': doc['cog'], 'species': 'Methanosarcina acetivorans', 'ncbi_taxonomy_id': 188937}
            if doc['gene_name'] != '-':
                self.collection.update_one({'gene_name': doc['gene_name']},
                                        {'$addToSet': {'halflives': to_add},
                                        '$set': {'modified': datetime.datetime.utcnow()}}, 
                                        upsert=True, collation=self.collation)
            elif doc['function'] != '-':
                self.collection.update_one({'function': doc['function']},
                                        {'$addToSet': {'halflives': to_add},
                                        '$set': {'modified': datetime.datetime.utcnow()}}, 
                                        upsert=True, collation=self.collation)
            else:
                query = {'halflives.ordered_locus_name': doc['gene_fragment']}
                result = self.collection.find_one(filter=query, collation=self.collation)
                if result is not None:
                    self.collection.update_one(query,
                                            {'$addToSet': {'halflives': to_add},
                                            '$set': {'modified': datetime.datetime.utcnow()}}, 
                                            upsert=True, collation=self.collation)
                else:
                    doc['halflives'] = [to_add]
                    doc['modified'] = datetime.datetime.utcnow()
                    del doc['half_life']
                    del doc['half_life_std']
                    del doc['std_over_avg']
                    del doc['ar_cog']
                    del doc['cog']
                    del doc['cog_class']
                    self.collection.update_one(query, {'$set': doc}, upsert=True, collation=self.collation)
        self.collection.update_many({'gene_fragment':{'$exists': True}}, {'$unset': {'gene_fragment': ""}})

    def fill_protein_name(self):
        """Create and fill 'protein_name' field for documents
        'gene_name' field with values other than '-'
        """
        con_0 = {'gene_name': {'$ne': '-'}}
        con_1 = {'gene_name': {'$exists': True}}
        query = {'$and': [con_0, con_1]}
        projection = {'gene_name': 1}
        docs = self.collection.find(filter=query, projection=projection, collation=self.collation)
        for doc in docs:
            gene_name = doc['gene_name']
            q = {'gene_name': gene_name}
            result = self.uniprot_manager.collection.find_one(filter=q, collation=self.collation, projection={'protein_name': 1})
            if result is not None:
                protein_name = result['protein_name']
            else:
                protein_name = ""
            self.collection.update_one({'_id': doc['_id']},
                                       {'$set': {'protein_name': protein_name}})

    def fill_gene_protein_name(self):
        """Fill 'gene_name' field where 'gene_name' has value of '-' and create
        'protein_name' field
        """
        query = {'gene_name': '-'}
        projection = {'gene_name': 1, 'halflives': 1}
        gene_name = ''
        protein_name = ''
        docs = self.collection.find(filter=query, projection=projection, collation=self.collation)
        uniprot_manager = query_uniprot.QueryUniprot(username=self.username, password=self.password,
                                            server=self.server, database=self.db_str, collection_str='uniprot')
        for doc in docs:
            oln = doc['halflives'][0]['ordered_locus_name']
            oln = 'MA_' + oln.split('MA')[1]  # "MA0002" to "MA_0002"
            gene_name, protein_name = uniprot_manager.get_gene_protein_name_by_oln(oln, species=[188937])
            self.collection.update_one({'_id': doc['_id']},
                                       {'$set': {'gene_name': gene_name,
                                                 'protein_name': protein_name}})

    def fill_uniprot_by_oln(self, oln):
        """Fill uniprot collection using ordered locus name
        
        Args:
            oln (:obj:`str`): Ordered locus name
        """
        gene_name, protein_name = self.uniprot_manager.get_gene_protein_name_by_oln(oln, species=[188937])
        if gene_name is None and protein_name is None: # no such entry in uniprot collection
            self.uniprot_col_manager.load_uniprot(query=True, msg=oln)
        else:
            return

    def uniprot_names(self, results, count):
        """Extract protein_name and gene_name from returned
        tuple of uniprot query function
        
        Args:
            results (:obj:`Iter`): pymongo cursor object.
            count (:obj:`int`): Number of documents found.

        Return:
            (:obj:`tuple` of :obj:`str`): gene_name and protein_name
        """
        if count == 0:
            return '', ''
        else:
            for result in results:
                gene_name = result['gene_name']
                protein_name = result['protein_name']
                return gene_name, protein_name

def main():
    src_db = 'datanator'
    des_db = 'datanator'
    collection_str = 'rna_halflife'
    username = datanator.config.core.get_config()[
        'datanator']['mongodb']['user']
    password = datanator.config.core.get_config(
    )['datanator']['mongodb']['password']
    server = datanator.config.core.get_config(
    )['datanator']['mongodb']['server']       
    src = Halflife(username=username, password=password, server=server, 
                    authDB='admin', db=src_db, uniprot_col_db=des_db,
                    verbose=True, collection_str=collection_str, max_entries=float('inf'))
    df = src.download_xlsx('MeOH')
    src.load_halflife(df, start=800)
    df = src.download_xlsx('TMA')
    src.add_to_halflife(df, growth_medium='TMA')
    df = src.download_xlsx('Acetate')
    src.add_to_halflife(df, growth_medium='Acetate')
    src.fill_protein_name()
    src.fill_gene_protein_name()

if __name__ == '__main__':
    main()