KarrLab/datanator

View on GitHub
datanator/data_source/protein_localization/parse_psortdb_negative_wo_outer_membrane.py

Summary

Maintainability
C
1 day
Test Coverage
import json
import os

import pandas as pd

class ParsePsort:
    """Convert rows of the PSORTdb gram negative without outer membrane table to JSON.

    Every converted row becomes one JSON file (intended to conform to
    datanator_pattern/observation_compiled.json) with three top-level keys:
    ``entity``, ``value`` and ``source``.

    Args:
        max_entries (:obj:`int`): maximum number of leading data rows to convert.
            One JSON file is created per converted row; if the table holds fewer
            rows, all of them are converted.
        input_path (:obj:`str`, optional): tab-delimited file to read.
        output_dir (:obj:`str`, optional): directory receiving the JSON files;
            it is created when missing.
    """

    # (JSON key, zero-based column index) pairs copied into the "value" mapping,
    # in the order they are written to the output document.
    _VALUE_COLUMNS = (
        ("PPSVM_Localization", 1),
        ("Profile_Localization", 3),
        ("Signal_Localization", 5),
        ("SCL-BLASTe_Localization", 7),
        ("CMSVM_Localization", 9),
        ("SCL-BLAST_Localization", 11),
        ("OMPMotif_Localization", 13),
        ("OMSVM_Localization", 15),
        ("Motif_Localization", 17),
        ("CytoSVM_Localization", 19),
        ("CWSVM_Localization", 21),
        ("ModHMM_Localization", 23),
        ("ECSVM_Localization", 25),
        ("Cytoplasmic Membrane_Score", 27),
        ("Cellwall_Score", 28),
        ("Extracellular_Score", 29),
        ("Cytoplasmic_Score", 30),
        ("Final_Localization", 31),
        ("Final_Localization_2", 32),
        ("Secondary_Localization", 34),
        ("Final_Score", 35),
    )
    # Column whose content is reported as the source version.
    _VERSION_COLUMN = 36

    def __init__(self, max_entries,
                 input_path='Computed-Gram_negative_without_outer_membrane-PSORTdb-3.00.tab',
                 output_dir='Gram_Negative_WO_Outer_Membrane'):
        self.max_entries = max_entries
        self.input_path = input_path
        self.output_dir = output_dir

    @staticmethod
    def _json_value(cell):
        """Return ``cell`` as a JSON-serializable Python value (missing -> ``None``)."""
        if pd.isna(cell):
            return None
        # numpy scalars (e.g. numpy.int64) are rejected by json.dump; ``item()``
        # turns them into the equivalent built-in Python scalar.
        if hasattr(cell, "item"):
            return cell.item()
        return cell

    @staticmethod
    def _split_identifier(raw_id):
        """Split the first-column text into ``(sequence id, protein name)``.

        The sequence id is the text between ``"ref"`` plus one separator
        character and the last ``"|"``; the name is whatever follows the last
        ``"|"`` and one further character.
        """
        last_bar = raw_id.rfind("|")
        seq_id = raw_id[raw_id.find("ref") + 4:last_bar]
        name = raw_id[last_bar + 2:]
        return seq_id, name

    def parse_psortdb(self):
        """
        Parse the PSORTdb gram negative without outer membrane file and write
        one JSON file per row, named ``<sequence id>.json``, into the output
        directory.

        Return:
            ``None``
        """
        data = pd.read_csv(self.input_path, delimiter="\t", low_memory=False)

        # The original opened files in a directory that had to exist already.
        os.makedirs(self.output_dir, exist_ok=True)

        # Never index past the end of the table when it is shorter than requested.
        row_count = min(self.max_entries, len(data))

        for i in range(row_count):
            seq_id, name = self._split_identifier(str(data.iloc[i, 0]))

            record = {
                "entity": {
                    "type": "protein",
                    "name": name,
                    "synonyms": [],
                    "identifiers": [{"namespace": "Seq_ID", "value": seq_id}],
                },
                "value": {
                    key: self._json_value(data.iloc[i, column])
                    for key, column in self._VALUE_COLUMNS
                },
                "source": {
                    "namespace": "PSORT",
                    "value": "Version " + str(self._json_value(data.iloc[i, self._VERSION_COLUMN])),
                },
            }

            with open(os.path.join(self.output_dir, seq_id + ".json"), "w") as f:
                json.dump(record, f, indent=4)

if __name__ == "__main__":
    # Convert the first 10 rows only when this file is executed as a script,
    # so that importing the module does not read or write any files.
    p1 = ParsePsort(10)
    p1.parse_psortdb()