# datanator/data_source/protein_localization/victoria_insert_experimental_protein.py
# (from KarrLab/datanator)
#
# Page headings captured alongside this file:
# Summary; Maintainability: 3 days; Test Coverage; Issues.
import pandas as pd
from pymongo import MongoClient
from datanator_query_python.util import mongo_util
from datanator_query_python.config import config

class insert_experimental_protein(mongo_util.MongoUtil):
    """Load experimental protein localization rows from a TSV file into MongoDB.

    Each row of ``Experimental-PSORTdb-v4.00.tsv`` is turned into three
    upserts: one in the ``entity`` collection, one in the ``identifier``
    collection and one in the ``observation`` collection.

    Columns are read by position. As used by the code below:
    0 -> ``uniprot_id``, 1 -> ``Refseq_Accession``, 2 -> ``Other_Accession``,
    3 -> comma-separated experimental localizations, 4 -> secondary
    localization, 5 -> ``MultipleSCL``, 6 -> protein name,
    9 -> NCBI taxonomy id, 10 -> taxon name, 13 -> cell type,
    17 -> ePSORTdb version.
    """

    # Text stored for an empty TSV cell (this is what ``str(None)`` yields).
    _MISSING = "None"

    def __init__(self,
                 MongoDB=None,
                 db=None,
                 collection=None,
                 max_entries=float('inf'),
                 username=None,
                 password=None,
                 authSource='admin',
                 readPreference='nearest'):
        """Connect through ``MongoUtil`` and bind the target collections.

        Args:
            MongoDB (:obj:`str`): server address, forwarded to ``MongoUtil``.
            db (:obj:`str`): database name, forwarded to ``MongoUtil``.
            collection (:obj:`str`): stored on the instance; not read by any
                method of this class.
            max_entries (:obj:`float`): stored on the instance; not read by
                any method of this class.
            username (:obj:`str`): forwarded to ``MongoUtil``.
            password (:obj:`str`): forwarded to ``MongoUtil``.
            authSource (:obj:`str`): forwarded to ``MongoUtil``.
            readPreference (:obj:`str`): forwarded to ``MongoUtil``.
        """
        super().__init__(MongoDB=MongoDB, db=db,
                         username=username,
                         password=password,
                         authSource=authSource,
                         readPreference=readPreference)
        self.max_entries = max_entries
        self.collection = collection
        # ``db_obj`` is presumably set by MongoUtil.__init__ -- TODO confirm.
        self.entity_col = self.db_obj["entity"]
        self.identifier_col = self.db_obj["identifier"]
        self.obs_col = self.db_obj["observation"]

    @classmethod
    def _cell_text(cls, cell):
        """Return ``str(cell)``, or the ``"None"`` placeholder for a null cell.

        Depending on the pandas version and column dtype an empty cell can
        survive as ``None``, ``NaN`` or ``pd.NA``; all of them are mapped to
        the same placeholder so that missing identifiers are detected
        reliably.
        """
        if pd.isnull(cell):
            return cls._MISSING
        return str(cell)

    @classmethod
    def _primary_identifier(cls, entity):
        """Pick the identifier used as the lookup key for an entity.

        The first identifier whose value is present wins, in the order in
        which ``build_entity`` lists them (uniprot_id, Refseq_Accession,
        Other_Accession). If none is present, fall back to
        ``("uniprot_id", "None")``.

        Args:
            entity (:obj:`dict`): object returned by ``build_entity``.

        Return:
            (:obj:`tuple` of :obj:`str`): ``(namespace, value)``.
        """
        for identifier in entity["identifiers"]:
            if identifier["value"] != cls._MISSING:
                return identifier["namespace"], identifier["value"]
        return "uniprot_id", cls._MISSING

    def build_entity(self, data, i):
        """Build entity object from obj.
        Go into entity collection

        Args:
            data (:obj:`Obj`): source object (pandas dataframe).
            i (:obj: `int`): positional row index of object in dataframe
        Return:
            (:obj:`Obj`), e.g.
            {
                "type": "protein",
                "name": "Cytoplasmic protein",
                "identifiers": [{}... {}]
            }
        """
        return {
            "type": "protein",
            "name": str(data.iloc[i, 6]),
            "identifiers": [
                {"namespace": "uniprot_id",
                 "value": self._cell_text(data.iloc[i, 0])},
                {"namespace": "Refseq_Accession",
                 "value": self._cell_text(data.iloc[i, 1])},
                {"namespace": "Other_Accession",
                 "value": self._cell_text(data.iloc[i, 2])},
            ],
        }

    def build_obs(self, data, i):
        """Build observation objects from obj.
        Go into observations collection.

        Looks up the row's taxon in the ``taxon_tree`` collection to fill in
        the canonical ancestors; when no document is found the ancestor list
        stays empty.

        Args:
            data (:obj:`Obj`): source object (pandas dataframe).
            i (:obj: `int`): positional row index of object in dataframe
        Return:
            obj(:obj:`Obj`)
            {
                "entity": {
                    "type": "protein",
                    "name": "Cytoplasmic protein",
                    "identifiers": [{}... {}]
                },
                "genotype": {},
                "values": [],
                "source": [],
                "schema_version": "2.0"
            }
        """
        # Same entity as stored in the entity collection; built by the shared
        # helper so the two can never drift apart.
        entity = self.build_entity(data, i)

        # One localization entry per comma-separated experimental location.
        values_p = [{"type": "localization", "value": location, "description": "experimental"}
                    for location in str(data.iloc[i, 3]).split(",")]
        values_p.append({"type": "localization",
                         "value": str(data.iloc[i, 4]),
                         "description": "secondary"})
        values_p.append({"type": "MultipleSCL", "value": str(data.iloc[i, 5])})

        taxon_id = int(data.iloc[i, 9])
        genotype = {"taxon": {"ncbi_taxonomy_id": taxon_id,
                              "name": str(data.iloc[i, 10]),
                              "canon_ancestors": []}}
        # NOTE(review): the taxonomy lookup is pinned to the "datanator-test"
        # database regardless of the ``db`` given to the constructor -- confirm
        # this is intended.
        doc = self.client["datanator-test"]["taxon_tree"].find_one(
            filter={"tax_id": taxon_id},
            projection={"_id": 0, "canon_anc_ids": 1, "canon_anc_names": 1})
        if doc is not None:
            genotype["taxon"]["canon_ancestors"] = [
                {"ncbi_taxonomy_id": anc_id, "name": anc_name}
                for anc_id, anc_name in zip(doc["canon_anc_ids"], doc["canon_anc_names"])]
        genotype["cellType"] = str(data.iloc[i, 13])

        source = [{"namespace": "ePSORTdb", "value": "Version " + str(data.iloc[i, 17])}]
        return {"entity": entity,
                "genotype": genotype,
                "values": values_p,
                "source": source,
                "schema_version": "2.0"}

    def process_docs(self, skip=0):
        """Read the TSV file and upsert every row into the three collections.

        Args:
            skip (:obj:`int`): accepted for interface compatibility; the
                method does not read it.
        """
        # read .tsv file
        data = pd.read_csv('Experimental-PSORTdb-v4.00.tsv', delimiter="\t")
        data = data.where(pd.notnull(data), None)

        for i in range(len(data)):
            entity = self.build_entity(data, i)
            # The same (namespace, value) pair keys all three upserts. It is
            # derived from the stringified identifiers, so a row without a
            # UniProt id is keyed by its RefSeq/other accession instead of
            # being merged into the shared uniprot_id == "None" entity.
            id_name, value = self._primary_identifier(entity)

            # update entity collection
            entity_filter = {"identifiers": {"$elemMatch": {"namespace": id_name,
                                                            "value": value}}}
            self.entity_col.update_one(entity_filter,
                                       {"$set": {"type": entity["type"],
                                                 "name": entity["name"],
                                                 "schema_version": "2.0"},
                                        "$addToSet": {"identifiers": {"$each": entity["identifiers"]}}},
                                       upsert=True)

            obs = self.build_obs(data, i)

            # update identifier collection
            identifier_filter = {"$and": [{"namespace": id_name}, {"value": value}]}
            self.identifier_col.update_one(identifier_filter,
                                           {"$set": {"namespace": id_name,
                                                     "value": value,
                                                     "description": str(entity["name"])}},
                                           upsert=True)

            # update observation collection
            # Equality on an embedded document is an exact match (fields and
            # order), so the filter must spell out the same sub-document that
            # ``$set`` stores; otherwise every run inserts a new observation.
            identifier_doc = {"namespace": id_name, "value": value}
            obs_filter = {"$and": [
                {"identifier": identifier_doc},
                {"source": {"$elemMatch": {"namespace": "ePSORTdb",
                                           "value": "Version " + str(data.iloc[i, 17])}}},
            ]}
            self.obs_col.update_one(obs_filter,
                                    {"$set": {"entity": obs["entity"],
                                              "genotype": obs["genotype"],
                                              "schema_version": "2.0",
                                              "identifier": identifier_doc},
                                     "$addToSet": {"values": {"$each": obs["values"]},
                                                   "source": {"$each": obs["source"]}}},
                                    upsert=True)
                       
def main():
    """Run the loader as a script.

    Reads the credentials from ``config.Victoria`` and the server address
    from ``config.Config``, then processes the TSV file into the
    "datanator-demo" database.
    """
    credentials = config.Victoria()
    server_settings = config.Config()
    loader = insert_experimental_protein(
        MongoDB=server_settings.SERVER,
        username=credentials.USERNAME,
        password=credentials.PASSWORD,
        collection="observation",
        db="datanator-demo",
    )
    loader.process_docs()


# Only run the import when executed directly, not when imported as a module.
if __name__ == '__main__':
    main()