# datanator/data_source/protein_modification/10_1093_nar_gkw1075.py
"""Parse tsv file generated by datanator.data_source.protein_modification.pro
"""
import pandas as pd
from datanator_query_python.util import mongo_util
from pymongo.collation import Collation, CollationStrength
import numpy as np
class ProteinMod(mongo_util.MongoUtil):
    """Attach protein-modification records from a tab-separated file to MongoDB documents.

    Every usable row of the file is added (``$addToSet``) to the
    ``modifications`` array of each document in the target collection whose
    ``uniprot_id`` equals the first six characters of the row's UniProt id.
    """

    # Names assigned to the parsed table, in file order; they are lower-cased
    # before use.  The "Processsed" spelling is kept deliberately: these names
    # become keys of the stored records, so correcting them would change the
    # data written to the database.
    _COLUMNS = (
        'PRO_id', 'UniProt_id', 'Organism', 'Unmodified_sequence_IUBMB',
        'Processing', 'Deletions', 'Processsed_sequence_IUBMB', 'Processsed_formula',
        'Processsed_molecular_weight', 'Processsed_charge', 'Modifications',
        'Crosslinks', 'Modified_sequence_abbreviated_BpForms', 'Modified_sequence_BpForms',
        'concrete', 'Modified_formula', 'Modified_molecular_weight', 'Modified_charge',
        'Modifications_formula', 'Modifications_molecular_weight', 'Modifications_charge',
        'PRO_issues', 'Monomeric_form_issues',
    )
    # Columns removed from every record before it is stored.
    _DROPPED_COLUMNS = ('organism', 'unmodified_sequence_iubmb')
    # Reference attached to every stored record.
    _REFERENCE_DOI = '10.1093/nar/gkw1075'

    def __init__(self, file_location, MongoDB=None, db=None, collection_str=None, username=None,
                 password=None, authSource='admin', readPreference='nearest', verbose=True,
                 max_entries=float('inf')):
        """
        Args:
            file_location (:obj:`str`): location of the tab-separated file to be parsed.
            MongoDB (:obj:`str`, optional): forwarded to ``MongoUtil``.
            db (:obj:`str`, optional): forwarded to ``MongoUtil``.
            collection_str (:obj:`str`): name of collection in MongoDB to be filled.
            username (:obj:`str`, optional): forwarded to ``MongoUtil``.
            password (:obj:`str`, optional): forwarded to ``MongoUtil``.
            authSource (:obj:`str`, optional): forwarded to ``MongoUtil``.
            readPreference (:obj:`str`, optional): forwarded to ``MongoUtil``.
            verbose (:obj:`bool`, optional): print progress while filling the collection.
            max_entries (:obj:`float`, optional): row index at which processing stops.
        """
        super().__init__(MongoDB=MongoDB, db=db, username=username, password=password,
                         authSource=authSource, readPreference=readPreference)
        self.collection = self.db_obj[collection_str]
        self.verbose = verbose
        self.file_location = file_location
        self.max_entries = max_entries
        # Secondary strength ignores letter case when matching ``uniprot_id``.
        self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)

    def _read_table(self, start_row):
        """Read the file into a DataFrame with normalised, lower-case column names.

        Args:
            start_row (:obj:`int` or None): passed to ``pandas.read_csv`` as ``skiprows``.

        Returns:
            :obj:`pandas.DataFrame`: parsed table without the dropped columns.
        """
        options = dict(header=0, engine='c', sep='\t',
                       low_memory=False, skiprows=start_row)
        try:
            # pandas >= 1.3; 'warn' skips malformed lines and reports them,
            # which is what ``error_bad_lines=False`` did by default.
            df = pd.read_csv(self.file_location, on_bad_lines='warn', **options)
        except TypeError:
            # Older pandas does not know ``on_bad_lines``.
            df = pd.read_csv(self.file_location, error_bad_lines=False, **options)
        df.columns = [name.lower() for name in self._COLUMNS]
        return df.drop(columns=list(self._DROPPED_COLUMNS))

    def fill_collection(self, start_row=0):
        """
        Fill collection collection_str.

        Rows whose ``concrete`` value equals ``False`` and rows without a
        usable UniProt id are skipped.  Only existing documents are updated
        (``upsert=False``).

        Args:
            start_row (:obj:`int`, optional): Read from csv row. Defaults to 0.
        """
        df = self._read_table(start_row)
        for i, row in df.iterrows():
            if i == self.max_entries:
                break
            # NOTE(review): this condition used to also test
            # ``row['pro_issues'] == np.NAN`` and
            # ``row['monomeric_form_issues'] == np.NAN``.  A comparison with
            # NaN is always False, so those tests never filtered anything, and
            # ``np.NAN`` no longer exists in NumPy 2.0.  They are removed to
            # keep the effective behaviour; whether rows with (or without)
            # recorded issues should be skipped still needs to be decided.
            if row['concrete'] == False:  # noqa: E712 - keep the exact comparison
                continue
            if i % 50 == 0 and self.verbose:
                print('Processing row {}'.format(i))
            raw_id = row['uniprot_id']
            if not isinstance(raw_id, str):
                # A missing id is parsed as NaN (a float) and cannot be sliced.
                continue
            uniprot_id = raw_id[:6]
            record = row.to_dict()
            record['reference'] = {'doi': self._REFERENCE_DOI}
            self.collection.update_many({'uniprot_id': uniprot_id},
                                        {'$addToSet': {'modifications': record}},
                                        collation=self.collation, upsert=False)
import datanator.config.core
from pathlib import Path
def main():
    """Load the PRO tsv file into the ``uniprot`` collection of the ``datanator`` database.

    Server address and credentials are read from the ``datanator.mongodb``
    section of the datanator configuration.
    """
    # Read the configuration once instead of once per setting.
    mongo_config = datanator.config.core.get_config()['datanator']['mongodb']
    file_location = str(
        Path('~/karr_lab/datanator/docs/modified_protein_sequences/pro.out.tsv').expanduser())
    manager = ProteinMod(file_location,
                         MongoDB=mongo_config['server'],
                         db='datanator',
                         collection_str='uniprot',
                         username=mongo_config['user'],
                         password=mongo_config['password'])
    manager.fill_collection(start_row=None)
# Run the import only when this file is executed as a script.
if __name__ == '__main__':
    main()