KarrLab/datanator

View on GitHub
datanator/data_source/rna_modification/rna_modification.py

Summary

Maintainability
B
6 hrs
Test Coverage
import pandas as pd
import numpy as np
import datetime
from datanator_query_python.util import mongo_util
import datanator.config.core
from pymongo.collation import Collation, CollationStrength


class RNAMod(mongo_util.MongoUtil):
    """Loader that writes RNA modification spreadsheets into MongoDB.

    Extends ``mongo_util.MongoUtil``. The ``db_obj`` and ``client``
    attributes used in ``__init__`` are presumably set up by that base
    class -- TODO confirm against ``datanator_query_python``.
    """

    def __init__(self, MongoDB=None, db=None, collection_str=None, username=None,
                 password=None, authSource='admin', readPreference='nearest',
                 verbose=True, max_entries=float('inf')):
        """Open the connection and select the target collections.

        Args:
            MongoDB (:obj:`str`, optional): Server address, forwarded to the base class.
            db (:obj:`str`, optional): Database name, forwarded to the base class.
            collection_str (:obj:`str`, optional): Name of the collection to fill.
            username (:obj:`str`, optional): Forwarded to the base class.
            password (:obj:`str`, optional): Forwarded to the base class.
            authSource (:obj:`str`, optional): Forwarded to the base class.
            readPreference (:obj:`str`, optional): Forwarded to the base class.
            verbose (:obj:`bool`, optional): Print progress while loading rows.
            max_entries (:obj:`float`, optional): Row index at which loading stops.
        """
        super().__init__(MongoDB=MongoDB, db=db, username=username, password=password,
                         authSource=authSource, readPreference=readPreference)
        # Collation passed to the lookups and upserts issued by this class.
        self.collation = Collation('en', strength=CollationStrength.SECONDARY)
        self.collection = self.db_obj[collection_str]
        self.taxon_collection = self.client['datanator']['taxon_tree']
        self.max_entries = max_entries
        self.verbose = verbose

    def _iter_rows(self, df, report_every):
        """Yield rows of ``df`` until ``max_entries``, printing progress if verbose.

        Args:
            df (:obj:`pandas.DataFrame`): Table to iterate over.
            report_every (:obj:`int`): Print a progress line every this many rows.

        Yields:
            :obj:`pandas.Series`: The next row of ``df``.
        """
        row_count = len(df.index)
        for i, row in df.iterrows():
            if i == self.max_entries:
                break
            if i % report_every == 0 and self.verbose:
                print("Processing locus {} out {}".format(i, row_count))
            yield row

    # NOTE: the list defaults below are only read, never mutated, so sharing
    # them across calls is safe; they are kept as-is for interface stability.
    def fill_trna_primary(self, file_location, sheet_name='tRNA', start_row=[0],
                        use_columns='A:H', column_names=['Amino_acid', 'aa_Code', 'aa_Name',
                        'kegg_orthology_id', 'kegg_gene_Name', 'Definition', 'kegg_Pathway_id',
                        'kegg_Pathway_name']):
        """Insert one document per row of an Excel sheet into the collection.

        Args:
            file_location (:obj:`str`): Path of the Excel workbook.
            sheet_name (:obj:`str`, optional): Name of the sheet to read.
            start_row (:obj:`list` of :obj:`int`, optional): Passed to pandas as ``skiprows``.
            use_columns (:obj:`str`, optional): Excel column letters/ranges, passed
                to pandas as ``usecols``.
            column_names (:obj:`list` of :obj:`str`, optional): Names assigned to the
                columns read; they are lower-cased before use.
        """
        df = pd.read_excel(file_location, sheet_name=sheet_name,
                           header=0, usecols=use_columns,
                           skiprows=start_row)
        df.columns = [name.lower() for name in column_names]
        docs = []
        for row in self._iter_rows(df, 50):
            doc = row.to_dict()
            # Not every sheet has a 'pathways' column (the default tRNA columns
            # do not), so dropping it must not fail when it is absent.
            doc.pop('pathways', None)
            docs.append(doc)
        # insert_many rejects an empty document list; skip the call instead.
        if docs:
            self.collection.insert_many(docs)

    def fill_trna_collection(self, file_location, start_row=0,
                            column_names=[],
                            reference={'doi': '10.1093/nar/gkx1030'},
                            query='amino_acid'):
        """Upsert rows of a tab-separated file as entries of ``modifications``.

        Each row is added (``$addToSet``) to the ``modifications`` array of the
        document whose ``query`` field matches the row's value for that column;
        the document is created when it does not exist.

        Args:
            file_location (:obj:`str`): Path of the tab-separated file.
            start_row (:obj:`int`, optional): Passed to pandas as ``skiprows``. Defaults to 0.
            column_names (:obj:`list` of :obj:`str`): Names assigned to the columns
                read; they are lower-cased before use. An ``organism`` column is
                required for the taxonomy lookup.
            reference (:obj:`Obj`): Reference information stored with every row.
            query (:obj:`str`): Column whose value selects the document to update.
        """
        df = pd.read_csv(file_location, skiprows=start_row, header=0,
                         engine='c', sep='\t').replace({np.nan: None})
        df.columns = [name.lower() for name in column_names]
        for row in self._iter_rows(df, 10):
            # Build the sub-document on a plain dict rather than enlarging the
            # pandas Series; the key order matches the previous implementation.
            obj = row.to_dict()
            obj['reference'] = reference
            obj['last_modified'] = datetime.datetime.now(datetime.timezone.utc)
            taxon = self.taxon_collection.find_one({'tax_name': obj['organism']},
                                                   projection={'tax_id': 1},
                                                   collation=self.collation)
            # find_one returns None when no taxon matches the organism name.
            obj['ncbi_taxonomy_id'] = taxon.get('tax_id') if taxon is not None else None
            # Read the lookup value before the keys below are dropped.
            key = obj[query]
            obj.pop('amino_acid', None)
            obj.pop('kegg_orthology_id', None)
            # NOTE(review): 'last_modified' differs on every run, so $addToSet
            # will presumably append a new entry when a file is re-loaded --
            # confirm whether that is intended.
            self.collection.update_one({query: key},
                                       {'$addToSet': {'modifications': obj}},
                                       collation=self.collation, upsert=True)


import datanator.config.core
from pathlib import Path

def main():
    """Create an ``RNAMod`` loader for the ``rna_modification`` collection.

    Connection settings are read from the ``datanator.mongodb`` section of
    the project configuration. The ingestion calls themselves are kept below
    as commented-out recipes and must be enabled by hand.
    """
    db = 'datanator'
    collection_str = 'rna_modification'
    # Load the configuration once rather than once per key.
    mongo_config = datanator.config.core.get_config()['datanator']['mongodb']
    username = mongo_config['user']
    password = mongo_config['password']
    MongoDB = mongo_config['server']
    manager = RNAMod(MongoDB=MongoDB, db=db, collection_str=collection_str,
                     username=username, password=password)

    # file_location = str(Path('~/karr_lab/datanator/docs/rna_modification/rna-ortholog-groups.xlsx').expanduser())
    # manager.fill_trna_primary(file_location)
    # column_names = ['kegg_orthology_id', 'kegg_orthology_name', 'definition',
    #                 'Mamallian_subunit', 'pathways', 'kingdom']
    # manager.fill_trna_primary(file_location, sheet_name='rRNA',
    #                           column_names=column_names, use_columns='A:F')

    # column_names = ['amino_acid', 'Anticodon', 'Organism', 'Organellum', 
    #                 'kegg_orthology_id', 'Sequence_MODOMICS', 'Sequence_BpForms',
    #                 'Sequence_IUPAC', 'Length',    'Number_of_modifications', 'Number_of_modified_A',
    #                 'Number_of_modified_C',    'Number_of_modified_G',    'Number_of_modified_U',    
    #                 'Formula', 'Molecular_weight', 'Charge', 'Canonical_formula',    
    #                 'Canonical_molecular_weight', 'Canonical_charge', 'Extra_formula',
    #                 'Extra_molecular_weight', 'Extra_charge', 'BpForms_errors']
    # file_location = Path('~/karr_lab/datanator/docs/rna_modification/modomics.trna.tsv').expanduser()
    # manager.fill_trna_collection(file_location, column_names=column_names,
    #                              start_row=None)

    # column_names = ['gen_bank', 'Organism', 'Organellum', 
    #                 'kegg_orthology_name', 'subunit', 'kegg_orthology_id', 
    #                 'sequence_modomics', 'Sequence_BpForms',
    #                 'Sequence_IUPAC', 'Length',    'Number_of_modifications', 'Number_of_modified_A',
    #                 'Number_of_modified_C',    'Number_of_modified_G',    'Number_of_modified_U',    
    #                 'Formula', 'Molecular_weight', 'Charge', 'Canonical_formula',    
    #                 'Canonical_molecular_weight', 'Canonical_charge', 'Extra_formula',
    #                 'Extra_molecular_weight', 'Extra_charge', 'BpForms_errors']
    # file_location = Path('~/karr_lab/datanator/docs/rna_modification/modomics.rrna.tsv').expanduser()
    # manager.fill_trna_collection(file_location, column_names=column_names,
    #                              start_row=None, query='kegg_orthology_name')

# Run the loader only when this file is executed directly as a script.
if __name__ == "__main__":
    main()