wc_model_gen/eukaryote/initialize_model.py from KarrLab/wc_model_gen

wc_model_gen/eukaryote/initialize_model.py
Summary

Maintainability

1 wk
Test Coverage

97%
Issues
Coverage
""" Initialize the construction of wc_lang-encoded models from wc_kb-encoded knowledge base.

:Author: Yin Hoon Chew <yinhoon.chew@mssm.edu>
:Date: 2019-01-09
:Copyright: 2019, Karr Lab
:License: MIT
"""

from wc_utils.util.chem import EmpiricalFormula, OpenBabelUtils
from wc_utils.util.chem.marvin import get_major_micro_species
from wc_onto import onto as wc_ontology
from wc_utils.util.units import unit_registry
from wc_utils.util import chem
import wc_model_gen.global_vars as gvar
import ete3
import math
import mendeleev
import numpy
import openbabel
import scipy.constants
import wc_kb
import wc_lang
import wc_model_gen


class InitializeModel(wc_model_gen.ModelComponentGenerator):
    """ Initialize model from knowledge base 
    
    Options:

    * culture_volume (:obj:`float`): volume of cell culture; default is 1.0 liter
    * cell_density(:obj:`float`): cell density; default is 1040 g/liter
    * membrane_density (:obj:`float`): membrane density; default is 1160 g/liter
    * cds (:obj:`bool`): True indicates mRNA sequence is a complete CDS; default is True
    * amino_acid_id_conversion (:obj:`dict`): a dictionary with amino acid standard ids
        as keys and amino acid metabolite ids as values
    * selenoproteome (:obj:`list`): list of IDs of genes that translate into 
        selenoproteins, default is an empty list  
    * environment (:obj:`dict`): dictionary with details for generating cell environment in the model 
    * ph (:obj:`float`): pH at which species will be protonated and reactions will be balanced; default is 7.4
    * media (:obj:`dict`): a dictionary with species type ids as keys and tuples of concentration (M) in the 
        media (extracellular space), `list` of `wc_lang.Reference`, and comments as values
    * rna_input_seq (:obj:`dict`, optional): a dictionary with RNA ids as keys and sequence strings as values
    * smiles_input (:obj:`dict`, optional): a dictionary with metabolite ids as keys and smiles strings as values     
    * check_reaction (:obj:`bool`): if True, reactions will be checked and corrected for proton and charge balance;
        default is True
    * gen_dna (:obj:`bool`): if True, DNA species types and species will be generated; 
        default is True
    * gen_transcripts (:obj:`bool`): if True, transcript species types and species will be generated; 
        default is True
    * gen_protein (:obj:`bool`): if True, protein species types and species will be generated; 
        default is True
    * gen_metabolites (:obj:`bool`): if True, metabolite species types and species will be generated; 
        default is True
    * gen_complexes (:obj:`bool`): if True, macromolecular complex species types and species will be generated; 
        default is True
    * gen_distribution_init_concentration (:obj:`bool`): if True, initial concentration of species will be generated; 
        default is True                    
    * gen_observables (:obj:`bool`): if True, observables will be generated; default is True    
    * gen_kb_reactions (:obj:`bool`): if True, reactions will be generated; default is True
    * gen_dfba_objective (:obj:`bool`): if True, a dfba objective function will be created; default is False 
    * gen_kb_rate_laws (:obj:`bool`): if True, rate laws will be generated; default is True
    * gen_environment (:obj:`bool`): if True, cell environment will be generated; default is True    
    """

    def run(self):
        """ Run all the components for initializing model from knowledge base """
        self.clean_and_validate_options()
        options = self.options

        print('Initialization is starting...')

        self.gen_taxon()
        self.gen_compartments()
        self.gen_parameters()
        self.global_vars_from_input()

        print('Taxon, compartments, and parameters have been initialized')

        if options['gen_metabolites']:
            self.gen_metabolites()
            print('All metabolite species types and species have been initialized')

        if options['gen_dna']:
            self.gen_dna()
            print('All DNA species types and species have been initialized')    

        if options['gen_transcripts']:
            self.gen_transcripts()
            print('All transcript species types and species have been initialized')    

        if options['gen_protein']:
            self.gen_protein()
            print('All protein species types and species have been initialized')                

        if options['gen_complexes']:
            self.gen_complexes()
            print('All complex species types and species have been initialized')    

        if options['gen_distribution_init_concentrations']:
            self.gen_distribution_init_concentrations()
            print('Concentrations of species have been initialized')    

        if options['gen_observables']:
            self.gen_observables()
            print('Model observables have been initialized')    

        if options['gen_kb_reactions']:
            self.gen_kb_reactions()
            print('Reactions in knowledge base have been initialized')    

        if options['gen_kb_rate_laws']:
            self.gen_kb_rate_laws()
            print('Rate laws in knowledge base have been initialized')    

        if options['gen_environment']:
            self.gen_environment()
            
        print('Model generator has been initialized')        

    def clean_and_validate_options(self):
        """ Apply default options and validate options """
        options = self.options

        culture_volume = options.get('culture_volume', 1.)
        assert(isinstance(culture_volume, float))
        options['culture_volume'] = culture_volume

        cell_density = options.get('cell_density', 1040.)
        assert(isinstance(cell_density, float))
        options['cell_density'] = cell_density

        membrane_density = options.get('membrane_density', 1160.)
        assert(isinstance(membrane_density, float))
        options['membrane_density'] = membrane_density

        cds = options.get('cds', True)
        assert(isinstance(cds, bool))
        options['cds'] = cds

        amino_acid_id_conversion = options.get('amino_acid_id_conversion', {})
        assert(isinstance(amino_acid_id_conversion, dict))
        options['amino_acid_id_conversion'] = amino_acid_id_conversion

        selenoproteome = options.get('selenoproteome', [])
        assert(isinstance(selenoproteome, list))
        options['selenoproteome'] = selenoproteome

        environment = options.get('environment', {})
        assert(isinstance(environment, dict))
        options['environment'] = environment

        ph = options.get('ph', 7.4)
        assert(isinstance(ph, float))
        options['ph'] = ph

        media = options.get('media', {})
        assert(isinstance(media, dict))
        options['media'] = media

        rna_input_seq = options.get('rna_input_seq', {})
        assert(isinstance(rna_input_seq, dict))
        options['rna_input_seq'] = rna_input_seq

        smiles_input = options.get('smiles_input', {})
        assert(isinstance(smiles_input, dict))
        options['smiles_input'] = smiles_input

        check_reaction = options.get('check_reaction', True)
        assert(isinstance(check_reaction, bool))
        options['check_reaction'] = check_reaction

        gen_metabolites = options.get('gen_metabolites', True)
        assert(isinstance(gen_metabolites, bool))
        options['gen_metabolites'] = gen_metabolites       

        gen_dna = options.get('gen_dna', True)
        assert(isinstance(gen_dna, bool))
        options['gen_dna'] = gen_dna

        gen_transcripts = options.get('gen_transcripts', True)
        assert(isinstance(gen_transcripts, bool))
        options['gen_transcripts'] = gen_transcripts

        gen_protein = options.get('gen_protein', True)
        assert(isinstance(gen_protein, bool))
        options['gen_protein'] = gen_protein

        gen_complexes = options.get('gen_complexes', True)
        assert(isinstance(gen_complexes, bool))
        options['gen_complexes'] = gen_complexes

        gen_distribution_init_concentrations = options.get('gen_distribution_init_concentrations', True)
        assert(isinstance(gen_distribution_init_concentrations, bool))
        options['gen_distribution_init_concentrations'] = gen_distribution_init_concentrations

        gen_observables = options.get('gen_observables', True)
        assert(isinstance(gen_observables, bool))
        options['gen_observables'] = gen_observables

        gen_kb_reactions = options.get('gen_kb_reactions', True)
        assert(isinstance(gen_kb_reactions, bool))
        options['gen_kb_reactions'] = gen_kb_reactions

        gen_dfba_objective = options.get('gen_dfba_objective', False)
        assert(isinstance(gen_dfba_objective, bool))
        options['gen_dfba_objective'] = gen_dfba_objective

        gen_kb_rate_laws = options.get('gen_kb_rate_laws', True)
        assert(isinstance(gen_kb_rate_laws, bool))
        options['gen_kb_rate_laws'] = gen_kb_rate_laws

        gen_environment = options.get('gen_environment', True)
        assert(isinstance(gen_environment, bool))
        options['gen_environment'] = gen_environment

    def gen_taxon(self):
        """ Generate taxon for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        ncbi_taxa = ete3.NCBITaxa()
        taxon_name = ncbi_taxa.get_taxid_translator([kb.cell.taxon])[kb.cell.taxon]
        taxon_rank = ncbi_taxa.get_rank([kb.cell.taxon])[kb.cell.taxon]
        model_taxon = wc_lang.core.Taxon(id='taxon', name=taxon_name, model=model, 
            rank=wc_lang.core.TaxonRank[taxon_rank]) 

    def gen_compartments(self):
        """ Generate compartments for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        if kb.cell.parameters.get_one(id='cell_volume'):
            mean_cell_volume = kb.cell.parameters.get_one(id='cell_volume').value
        else:
            raise ValueError('The cell object does not have the parameter cell_volume')        
        
        culture_volume = self.options['culture_volume']
        cell_density = self.options['cell_density']
        membrane_density = self.options['membrane_density']

        for comp in kb.cell.compartments:

            c = model.compartments.get_or_create(
                    id=comp.id, name=comp.name)

            c.init_density = model.parameters.create(
                id='density_' + c.id,
                units=unit_registry.parse_units('g l^-1'))

            if comp.id=='e':
                c.biological_type = wc_ontology['WC:extracellular_compartment']
                c.init_density.value = 1000.
                c.init_volume = wc_lang.core.InitVolume(distribution=wc_ontology['WC:normal_distribution'], 
                    mean=culture_volume, std=0)                
                
            elif '_m' in comp.id:
                c.physical_type = wc_ontology['WC:membrane_compartment']
                c.init_density.value = membrane_density
                organelle_fraction = kb.cell.compartments.get_one(id=comp.id[:comp.id.index('_')]).volumetric_fraction              
                c.init_volume = wc_lang.core.InitVolume(distribution=wc_ontology['WC:normal_distribution'], 
                    mean=4.836E-09*(mean_cell_volume*organelle_fraction)**(2/3), std=0)                

            else:
                c.init_density.value = cell_density
                organelle_fraction = kb.cell.compartments.get_one(id=comp.id).volumetric_fraction
                c.init_volume = wc_lang.core.InitVolume(distribution=wc_ontology['WC:normal_distribution'], 
                    mean=mean_cell_volume*organelle_fraction - 4.836E-09*(mean_cell_volume*organelle_fraction)**(2/3), std=0)

            volume = model.functions.create(id='volume_' + c.id, units=unit_registry.parse_units('l'))
            volume.expression, error = wc_lang.FunctionExpression.deserialize(f'{c.id} / {c.init_density.id}', {
                wc_lang.Compartment: {c.id: c},
                wc_lang.Parameter: {c.init_density.id: c.init_density},
                })
            assert error is None, str(error)           

    def gen_parameters(self):
        """ Generate parameters for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        Avogadro = model.parameters.create(id='Avogadro',
                                        type = None,
                                        value = scipy.constants.Avogadro,
                                        units = unit_registry.parse_units('molecule mol^-1'))       
      
        # Create parameters from kb
        for param in kb.cell.parameters:
            model_param = model.parameters.create(
                            id=param.id,
                            name=param.name,                            
                            value=param.value,
                            units=param.units)
            if 'K_m' in param.id:
                model_param.type = wc_ontology['WC:K_m']
            elif 'k_cat' in param.id:
                model_param.type = wc_ontology['WC:k_cat']
                model_param.units = unit_registry.parse_units('molecule^-1 s^-1')
            else:
                model_param.type = None

            if param.references:
                for ref in param.references:
                    ref_model = model.references.get_or_create(
                        __type=wc_lang.Reference,
                        author=ref.authors,
                        title=ref.title,
                        publication=ref.journal,
                        volume=ref.volume,
                        issue=ref.issue,
                        pages=ref.pages,
                        year=ref.year,
                        comments=ref.comments, 
                        type=wc_ontology['WC:article'])
                    if not ref_model.id:
                        ref_model.id = 'ref_'+str(len(model.references))
                    model_param.references.append(ref_model)

            if param.identifiers:
                for identifier in param.identifiers:
                    identifier_model = wc_lang.Identifier(
                        namespace=identifier.namespace, id=identifier.id)
                    model_param.identifiers.append(identifier_model)

        # Standardize the units of doubling time
        if model.parameters.get_one(id='mean_doubling_time'):
            model_doubling_time = model.parameters.get_one(id='mean_doubling_time')
        else:
            raise ValueError('The cell object does not have the parameter mean_doubling_time')

        expr = unit_registry.parse_expression(str(model_doubling_time.units))
        scale = expr.to(unit_registry.parse_units('second'))
        conversion_factor = scale.magnitude
        model_doubling_time.value *= conversion_factor
        model_doubling_time.units = unit_registry.parse_units('s')

    def global_vars_from_input(self):
        """ Populate global variable if input transcript sequences are provided in the options  
        """
        rna_input_seq = self.options['rna_input_seq']
        selenoproteome =self.options['selenoproteome']

        for Id, seq in rna_input_seq.items():
            gvar.transcript_ntp_usage[Id] = {
                'A': seq.upper().count('A'),
                'C': seq.upper().count('C'),
                'G': seq.upper().count('G'),
                'U': seq.upper().count('U'),
                'len': len(seq)
                }

    def gen_metabolites(self):
        """ Generate metabolites for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        kb_species_types = kb.cell.species_types.get(
            __type=wc_kb.core.MetaboliteSpeciesType)

        for kb_species_type in kb_species_types:
            self.gen_species_type(kb_species_type)                    

    def gen_dna(self):
        """ Generate DNAs for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        kb_species_types = kb.cell.species_types.get(
            __type=wc_kb.core.DnaSpeciesType)

        for kb_species_type in kb_species_types:
            if 'M' in kb_species_type.id:
                self.gen_species_type(kb_species_type, ['m'])
            else:
                self.gen_species_type(kb_species_type, ['n'])

    def gen_transcripts(self):
        """ Generate transcripts (mature RNAs) for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        kb_species_types = kb.cell.species_types.get(
            __type=wc_kb.eukaryote.TranscriptSpeciesType)

        count = 0
        for kb_species_type in kb_species_types:
            self.gen_species_type(kb_species_type)
            count += 1
            if count % 100 == 0:
                print('{}/{} of the transcripts have been initialized'.format(
                    count, len(kb_species_types)))

    def gen_protein(self):
        """ Generate proteins for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        kb_species_types = kb.cell.species_types.get(
            __type=wc_kb.eukaryote.ProteinSpeciesType)

        count = 0
        for kb_species_type in kb_species_types:
            self.gen_species_type(kb_species_type)
            count += 1
            if count % 100 == 0:
                print('{}/{} of the proteins have been initialized'.format(
                    count, len(kb_species_types)))

    def gen_complexes(self):
        """ Generate complexes for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        kb_species_types = kb.cell.species_types.get(
            __type=wc_kb.core.ComplexSpeciesType)

        for kb_species_type in kb_species_types:
            self.gen_species_type(kb_species_type)

    def gen_species_type(self, kb_species_type, extra_compartment_ids=None):
        """ Generate a model species type and species

        Args:
            kb_species_type (:obj:`wc_kb.SpeciesType`): knowledge base species type
            extra_compartment_ids (:obj:`list` of :obj:`str`, optional): compartment ids of
                additional species that should be created beyond those represented in the KB

        Returns:
            * :obj:`wc_lang.SpeciesType`: model species type
        """

        model = self.model
        ph = self.options['ph']
        selenoproteome = self.options['selenoproteome']        

        model_species_type = model.species_types.get_or_create(id=kb_species_type.id)
        model_species_type.name = kb_species_type.name
        model_species_type.structure = wc_lang.ChemicalStructure()
        model_species_type.comments = kb_species_type.comments

        if isinstance(kb_species_type, wc_kb.core.MetaboliteSpeciesType):
            model_species_type.type = wc_ontology['WC:metabolite'] # metabolite
            if kb_species_type.name == 'electron':
                model_species_type.structure.value = '[*-]'
                model_species_type.structure.format = wc_lang.ChemicalStructureFormat.SMILES
                model_species_type.structure.empirical_formula = EmpiricalFormula()
                model_species_type.structure.molecular_weight = 0.
                model_species_type.structure.charge = -1
            elif kb_species_type.name == 'proton' or kb_species_type.name == 'proton motive force':
                model_species_type.structure.value = '[1H+]'
                model_species_type.structure.format = wc_lang.ChemicalStructureFormat.SMILES
                model_species_type.structure.empirical_formula = EmpiricalFormula('H')
                model_species_type.structure.molecular_weight = 1.008
                model_species_type.structure.charge = 1
            elif kb_species_type.name == 'dihydrogen':
                model_species_type.structure.value = '[H][H]'
                model_species_type.structure.format = wc_lang.ChemicalStructureFormat.SMILES
                model_species_type.structure.empirical_formula = EmpiricalFormula('H2')
                model_species_type.structure.molecular_weight = 2.016
                model_species_type.structure.charge = 0    
            else:    
                inchi_str = kb_species_type.properties.get_one(property='structure')
                if inchi_str:
                    smiles, formula, charge, mol_wt = self.structure_to_smiles_and_props(
                        kb_species_type.id, inchi_str.get_value(), ph)
                    model_species_type.structure.value = smiles
                    model_species_type.structure.format = wc_lang.ChemicalStructureFormat.SMILES
                else:
                    formula = kb_species_type.get_empirical_formula()
                    charge = kb_species_type.get_charge()
                    mol_wt = kb_species_type.get_mol_wt()
                model_species_type.structure.empirical_formula = formula
                model_species_type.structure.molecular_weight = mol_wt
                model_species_type.structure.charge = charge 

        elif isinstance(kb_species_type, wc_kb.core.DnaSpeciesType):
            model_species_type.type = wc_ontology['WC:DNA'] # DNA
            model_species_type.structure.empirical_formula = kb_species_type.get_empirical_formula()
            model_species_type.structure.molecular_weight = kb_species_type.get_mol_wt()
            model_species_type.structure.charge = kb_species_type.get_charge()

        elif isinstance(kb_species_type, wc_kb.eukaryote.TranscriptSpeciesType):
            model_species_type.type = wc_ontology['WC:RNA'] # RNA
            if model_species_type.id in self.options['rna_input_seq']:
                seq = self.options['rna_input_seq'][model_species_type.id]
            else:
                seq = kb_species_type.get_seq()
                gvar.transcript_ntp_usage[model_species_type.id] = {
                    'A': seq.upper().count('A'),
                    'C': seq.upper().count('C'),
                    'G': seq.upper().count('G'),
                    'U': seq.upper().count('U'),
                    'len': len(seq)
                    }
            model_species_type.structure.empirical_formula = kb_species_type.get_empirical_formula(
                seq_input=seq)
            model_species_type.structure.molecular_weight = kb_species_type.get_mol_wt(
                seq_input=seq)
            model_species_type.structure.charge = kb_species_type.get_charge(
                seq_input=seq)

        elif isinstance(kb_species_type, wc_kb.eukaryote.ProteinSpeciesType):
            model_species_type.type = wc_ontology['WC:protein'] # protein
            table = 2 if 'M' in kb_species_type.transcript.gene.polymer.id else 1
            cds = self.options['cds']
            _, raw_seq, start_codon = kb_species_type.get_seq_and_start_codon(table=table, cds=cds)
            if kb_species_type.transcript.gene.id in selenoproteome:
                processed_seq = raw_seq[:-1] if raw_seq.endswith('*') else raw_seq
                protein_seq = ''.join(i if i!='*' else 'U' for i in processed_seq)
            else:                                            
                protein_seq = ''.join(i for i in raw_seq if i!='*')            
            self.populate_protein_aa_usage(model_species_type.id, protein_seq)
            gvar.protein_aa_usage[model_species_type.id]['start_aa'] = protein_seq[0]
            gvar.protein_aa_usage[model_species_type.id]['start_codon'] = str(start_codon).upper()                        
            _, _, _, determined = self.determine_protein_structure_from_aa(
                model_species_type.id, gvar.protein_aa_usage[model_species_type.id])
            if not determined:    
                model_species_type.structure.empirical_formula = kb_species_type.get_empirical_formula(
                    seq_input=protein_seq)
                model_species_type.structure.molecular_weight = kb_species_type.get_mol_wt(
                    seq_input=protein_seq)
                model_species_type.structure.charge = kb_species_type.get_charge(
                    seq_input=protein_seq)       

        elif isinstance(kb_species_type, wc_kb.core.ComplexSpeciesType):
            model_species_type.type = wc_ontology['WC:pseudo_species'] # pseudo specie
            formula = EmpiricalFormula()
            charge = 0
            weight = 0
            for subunit in kb_species_type.subunits:
                subunit_id = subunit.species_type.id
                coef = abs(max(1, subunit.coefficient))
                if isinstance(subunit.species_type, wc_kb.eukaryote.ProteinSpeciesType):
                    subunit_model_species_type = model.species_types.get_one(id=subunit_id)
                    if subunit_model_species_type:
                        formula += subunit_model_species_type.structure.empirical_formula * coef 
                        charge += subunit_model_species_type.structure.charge * coef
                        weight += subunit_model_species_type.structure.molecular_weight * coef
                    else:    
                        table = 2 if 'M' in subunit.species_type.transcript.gene.polymer.id else 1
                        cds = self.options['cds']
                        _, raw_seq, start_codon = subunit.species_type.get_seq_and_start_codon(table=table, cds=cds)
                        if subunit.species_type.transcript.gene.id in selenoproteome:
                            processed_seq = raw_seq[:-1] if raw_seq.endswith('*') else raw_seq
                            protein_seq = ''.join(i if i!='*' else 'U' for i in processed_seq)
                        else:                                            
                            protein_seq = ''.join(i for i in raw_seq if i!='*')
                        self.populate_protein_aa_usage(subunit_id, protein_seq)
                        gvar.protein_aa_usage[subunit_id]['start_aa'] = protein_seq[0]
                        gvar.protein_aa_usage[subunit_id]['start_codon'] = str(start_codon).upper()                        
                        sub_formula, sub_mol_wt, sub_charge, determined = \
                            self.determine_protein_structure_from_aa(
                            subunit_id, gvar.protein_aa_usage[subunit_id])
                        if not determined:
                            formula += subunit.species_type.get_empirical_formula(seq_input=protein_seq) * coef
                            charge += subunit.species_type.get_charge(seq_input=protein_seq) * coef
                            weight += subunit.species_type.get_mol_wt(seq_input=protein_seq) * coef
                        else:
                            formula += sub_formula * coef 
                            charge += sub_charge * coef
                            weight += sub_mol_wt * coef
                elif isinstance(subunit.species_type, wc_kb.eukaryote.TranscriptSpeciesType):
                    subunit_model_species_type = model.species_types.get_one(id=subunit_id)
                    if subunit_model_species_type:
                        formula += subunit_model_species_type.structure.empirical_formula * coef 
                        charge += subunit_model_species_type.structure.charge * coef
                        weight += subunit_model_species_type.structure.molecular_weight * coef
                    else:
                        if subunit_id not in gvar.transcript_ntp_usage:
                            seq = subunit.species_type.get_seq()
                            gvar.transcript_ntp_usage[subunit.species_type.id] = {
                                'A': seq.upper().count('A'),
                                'C': seq.upper().count('C'),
                                'G': seq.upper().count('G'),
                                'U': seq.upper().count('U'),
                                'len': len(seq)
                                }
                        else:
                            seq = self.options['rna_input_seq'][subunit_id]

                        formula += subunit.species_type.get_empirical_formula(seq_input=seq) * coef
                        weight += subunit.species_type.get_mol_wt(seq_input=seq) * coef
                        charge += subunit.species_type.get_charge(seq_input=seq) * coef

                else:
                    inchi_str = subunit.species_type.properties.get_one(property='structure')
                    if inchi_str:
                        _, sub_formula, sub_charge, sub_mol_wt = self.structure_to_smiles_and_props(
                            subunit.species_type.id, inchi_str.get_value(), ph)
                    else:
                        sub_formula = subunit.species_type.get_empirical_formula()
                        sub_charge = subunit.species_type.get_charge()
                        sub_mol_wt = subunit.species_type.get_mol_wt()    
                    
                    formula += sub_formula * coef
                    charge += sub_charge * coef
                    weight += sub_mol_wt * coef        
            
            model_species_type.structure.empirical_formula = formula
            model_species_type.structure.molecular_weight = weight
            model_species_type.structure.charge = charge

        else:
            raise ValueError('Unsupported species type: {}'.format(
                kb_species_type.__class__.__name__))

        # Create species
        if isinstance(kb_species_type, wc_kb.core.ComplexSpeciesType):
            subunit_compartments = [[s.compartment.id for s in sub.species_type.species]
                for sub in kb_species_type.subunits]
            if len(subunit_compartments) == 1:
                shared_compartments = set(subunit_compartments[0])
            else:
                shared_compartments = set([])
                for i in range(len(subunit_compartments)):
                    shared_compartments = (set(subunit_compartments[i])
                        if i==0 else shared_compartments).intersection(
                        set(subunit_compartments[i+1]) if i<(len(subunit_compartments)-1) else shared_compartments)
            # Combine compartments where all the subunits exist, where catalyzed reactions occur and the additionally defined extra
            compartment_ids = set(list(shared_compartments) + [s.compartment.id for s in kb_species_type.species] +
                              (extra_compartment_ids or []))
        else:
            compartment_ids = set([s.compartment.id for s in kb_species_type.species] +
                              (extra_compartment_ids or []))

        for compartment_id in compartment_ids:
            model_compartment = model.compartments.get_one(id=compartment_id)
            model_species = model.species.get_or_create(species_type=model_species_type, compartment=model_compartment)
            model_species.id = model_species.gen_id()

        return model_species_type

    def gen_distribution_init_concentrations(self):
        """ Generate concentrations for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model
        media = self.options['media']

        Avogadro = model.parameters.get_one(id='Avogadro')

        for conc in kb.cell.concentrations:
            species_comp_model = model.compartments.get_one(id=conc.species.compartment.id)

            species_type = model.species_types.get_or_create(id=conc.species.species_type.id)
            species = model.species.get_or_create(species_type=species_type, compartment=species_comp_model)
            species.id = species.gen_id()

            if conc.units == unit_registry.parse_units('molecule'):
                mean_concentration = conc.value
            elif conc.units == unit_registry.parse_units('M'):
                mean_concentration = conc.value * Avogadro.value * species_comp_model.init_volume.mean
            else:
                raise Exception('Unsupported units: {}'.format(conc.units.name))

            conc_model = model.distribution_init_concentrations.create(
                species=species,
                mean=mean_concentration,
                units=unit_registry.parse_units('molecule'),
                comments=conc.comments)
            conc_model.id = conc_model.gen_id()

            if conc.references:
                for ref in conc.references:
                    ref_model = model.references.get_or_create(
                        __type=wc_lang.Reference,
                        author=ref.authors,
                        title=ref.title,
                        publication=ref.journal,
                        volume=ref.volume,
                        issue=ref.issue,
                        pages=ref.pages,
                        year=ref.year,
                        comments=ref.comments, 
                        type=wc_ontology['WC:article'])
                    if not ref_model.id:
                        ref_model.id = 'ref_'+str(len(model.references))
                    conc_model.references.append(ref_model)

            if conc.identifiers:
                for identifier in conc.identifiers:
                    identifier_model = wc_lang.Identifier(
                        namespace=identifier.namespace, id=identifier.id)
                    conc_model.identifiers.append(identifier_model)

        for chromosome in kb.cell.species_types.get(__type=wc_kb.core.DnaSpeciesType):
            model_species_type = model.species_types.get_or_create(id=chromosome.id)
            model_species = model.species.get_or_create(species_type=model_species_type)
            conc_model = model.distribution_init_concentrations.create(
                species=model_species,
                mean=chromosome.ploidy, 
                units=unit_registry.parse_units('molecule'),
                )
            conc_model.id = conc_model.gen_id()

        for Id, (conc, refs, comments) in media.items():
            species_type = model.species_types.get_or_create(id=Id)
            species_comp_model = model.compartments.get_one(id='e')
            species = model.species.get_or_create(species_type=species_type, compartment=species_comp_model)
            species.id = species.gen_id()

            conc_model = model.distribution_init_concentrations.create(
                species=species,
                mean=conc * Avogadro.value * species_comp_model.init_volume.mean,
                units=unit_registry.parse_units('molecule'),
                comments=comments,
                )
            conc_model.id = conc_model.gen_id()

            if refs:
                for ref in refs:
                    ref_model = model.references.get_one(title=ref.title)
                    if ref_model:
                        conc_model.references.append(ref_model)
                    else:    
                        ref.model = model
                        ref.id = 'ref_'+str(len(model.references))
                        conc_model.references.append(ref)
            
    def gen_observables(self):
        """ Generate observables for the model from knowledge base """
        kb = self.knowledge_base
        model = self.model

        for kb_observable in kb.cell.observables:
            all_species = {}
            all_observables = {}

            for kb_species in kb_observable.expression.species:
                kb_species_type = kb_species.species_type
                kb_compartment = kb_species.compartment
                model_species_type = model.species_types.get_one(
                    id=kb_species_type.id)
                model_species = model_species_type.species.get_one(
                    compartment=model.compartments.get_one(id=kb_compartment.id))
                all_species[model_species.gen_id()] = model_species

            for kb_observable_observable in kb_observable.expression.observables:
                model_observable_observable = model.observables.get_or_create(
                    id=kb_observable_observable.id)
                all_observables[model_observable_observable.id] = model_observable_observable

            model_observable_expression, error = wc_lang.ObservableExpression.deserialize(
                kb_observable.expression.expression, {
                wc_lang.Species: all_species,
                wc_lang.Observable: all_observables,
                })
            assert error is None, str(error)

            model_observable = model.observables.get_or_create(
                id=kb_observable.id,
                name=kb_observable.name,
                expression=model_observable_expression)

    def gen_kb_reactions(self):
        """ Generate reactions encoded within KB
        """

        kb = self.knowledge_base
        model = self.model
        submodel = model.submodels.get_or_create(id='metabolism', 
            framework=wc_ontology['WC:dynamic_flux_balance_analysis'])              
 
        for kb_rxn in kb.cell.reactions:
            if len(kb_rxn.participants)==1 and kb_rxn.participants[0].species.compartment.id=='e' and \
                not model.distribution_init_concentrations.get_one(
                species=model.species.get_one(id=kb_rxn.participants[0].species.id())):
                pass
            else:    
                model_rxn = model.reactions.create(
                    submodel=submodel,
                    id=kb_rxn.id + '_kb',
                    name=kb_rxn.name,
                    reversible=kb_rxn.reversible,
                    comments=kb_rxn.comments)

                delta_formula = EmpiricalFormula()
                delta_charge = 0.
                proton_participant = 0
                for participant in kb_rxn.participants:
                    kb_species = participant.species
                    model_species_type = model.species_types.get_one(id=kb_species.species_type.id)
                    model_compartment = model.compartments.get_one(id=kb_species.compartment.id)
                    model_species = model_species_type.species.get_one(compartment=model_compartment)
                    
                    if model_species_type.name=='proton':
                        proton_participant += 1

                    if model_species is None:
                        model_species = self.gen_species_type(kb_species.species_type, model_compartment)

                    model_rxn.participants.add(
                        model_species.species_coefficients.get_or_create(coefficient=participant.coefficient))

                    # Check element and charge balance          
                    delta_formula += model_species_type.structure.empirical_formula * participant.coefficient
                    delta_charge += model_species_type.structure.charge * participant.coefficient

                # Correct proton and charge balance at the pH at which metabolite properties are determined
                if self.options['check_reaction']:
                    if delta_charge and len(delta_formula)==1 and delta_charge==delta_formula['H']:
                        proton_species_type = model.species_types.get_one(name='proton')
                        if proton_participant:
                            proton_coef = [i for i in model_rxn.participants if i.species.species_type==proton_species_type][0]
                            model_rxn.participants.discard(proton_coef)
                            model_rxn.participants.add(
                                proton_coef.species.species_coefficients.get_or_create(coefficient=proton_coef.coefficient-delta_charge))
                        else:
                            comp_id = set([i.species.compartment.id for i in model_rxn.participants]).pop()
                            proton_compartment = model.compartments.get_one(id=comp_id)
                            proton_species = model.species.get_or_create(species_type=proton_species_type, compartment=proton_compartment)
                            proton_species.id = proton_species.gen_id()
                            model_rxn.participants.add(
                                proton_species.species_coefficients.get_or_create(coefficient=-delta_charge))

        # For testing purpose
        if self.options['gen_dfba_objective']:
            submodel.dfba_obj = wc_lang.DfbaObjective(model=model)
            submodel.dfba_obj.id = submodel.dfba_obj.gen_id()
            obj_expression = model_rxn.id
            dfba_obj_expression, error = wc_lang.DfbaObjectiveExpression.deserialize(
                obj_expression, {wc_lang.Reaction: {model_rxn.id: model_rxn}})
            assert error is None, str(error)
            submodel.dfba_obj.expression = dfba_obj_expression            

    def gen_kb_rate_laws(self):
        """ Generate the rate laws for reactions encoded in the knowledge base """
        kb = self.knowledge_base
        model = self.model

        Avogadro = model.parameters.get_or_create(id='Avogadro')

        for kb_rxn in kb.cell.reactions:

            model_rxn = model.reactions.get_one(id=kb_rxn.id + '_kb')

            for kb_rate_law in kb_rxn.rate_laws:
                all_parameters = {}
                all_parameters[Avogadro.id] = Avogadro
                all_species = {}
                all_observables = {}
                all_volumes = {}
                
                model_expression = kb_rate_law.expression.expression

                for observable in kb_rate_law.expression.observables:
                    all_observables[observable.id] = model.observables.get_one(id=observable.id)

                for species in kb_rate_law.expression.species:
                    model_species_type = model.species_types.get_one(id=species.species_type.id)
                    model_compartment = model.compartments.get_one(id=species.compartment.id)
                    if model_compartment.init_density.function_expressions:
                        volume = model_compartment.init_density.function_expressions[0].function
                        all_volumes[volume.id] = volume
                    model_species = model_species_type.species.get_one(compartment=model_compartment)
                    all_species[model_species.gen_id()] = model_species                    

                for param in kb_rate_law.expression.parameters:
                    all_parameters[param.id] = model.parameters.get_one(id=param.id)
                    if 'K_m' in param.id:
                        model_compartment = model.compartments.get_one(id=param.id.split('_')[-1])
                        if model_compartment.init_density.function_expressions:
                            volume = model_compartment.init_density.function_expressions[0].function
                            unit_adjusted_term = '{} * {} * {}'.format(param.id, Avogadro.id, volume.id)
                        else:
                            volume = model_compartment.init_volume.mean    
                            unit_adjusted_term = '{} * {} * {}'.format(param.id, Avogadro.id, volume)
                        model_expression = model_expression.replace(param.id, unit_adjusted_term)            

                rate_law_expression, error = wc_lang.RateLawExpression.deserialize(
                    model_expression, {
                    wc_lang.Parameter: all_parameters,
                    wc_lang.Species: all_species,
                    wc_lang.Observable: all_observables,
                    wc_lang.Function: all_volumes,
                    })
                assert error is None, str(error)

                model_rate_law = model.rate_laws.create(
                    expression=rate_law_expression,
                    reaction=model_rxn,
                    direction=wc_lang.RateLawDirection[kb_rate_law.direction.name],
                    comments=kb_rate_law.comments)
                model_rate_law.id = model_rate_law.gen_id()

    def gen_environment(self):            
        """ Generate the environment, i.e. temperature, for the simulated cells """

        model = self.model
        environment = self.options['environment']

        if environment:
            wc_lang.Environment(
                id=environment['id'],
                name=environment['name'],
                model=model,
                temp=environment['temperature'],
                comments=environment['comments'])

    def structure_to_smiles_and_props(self, met_id, structure, ph):
        """ Convert InChI or SMILES string in the knowledge base 
            to a SMILES string at specific pH and calculate properties such
            as empirical formula, charge and molecular weight 

        Args:
            met_id (:obj:`str`): id of metabolite
            structure (:obj:`str`): InChI or SMILES string
            ph (:obj:`float`): pH at which the properties should be determined

        Returns:
            :obj:`str`: SMILES string
            :obj:`wc_utils.util.chem.core.EmpiricalFormula`: empirical formula
            :obj:`int`: charge
            :obj:`float`: molecular weight    
        """
        smiles_input = self.options['smiles_input']
        
        if met_id in smiles_input:
            smiles = smiles_input[met_id]
        else:    
            structure_type = 'inchi' if 'InChI=' in structure else 'smiles'
            smiles = get_major_micro_species(structure, structure_type, 'smiles', ph=ph)        
        
        mol = openbabel.OBMol()
        conv = openbabel.OBConversion()
        conv.SetInFormat('smi')
        conv.SetOptions('c', conv.OUTOPTIONS)
        conv.ReadString(mol, smiles)        
        empirical_formula = OpenBabelUtils.get_formula(mol)
        charge = mol.GetTotalCharge()
        mol_wt = empirical_formula.get_molecular_weight()
        
        return smiles, empirical_formula, charge, mol_wt

    def populate_protein_aa_usage(self, protein_id, seq):
        """ Populate a global variable dictionary of amino acid
            usage in a protein given its sequence

        Args:
            protein_id (:obj:`str`): protein ID
            seq (:obj:`Bio.Seq.Seq`): sequence    
        """
        gvar.protein_aa_usage[protein_id] = {
                'len': len(seq) - seq.count('*'),
                '*': seq.count('*'),  # Symbol used in Bio.Seq.Seq when cds is set to False  
                'A': seq.count('A'),  # Ala: Alanine (C3 H7 N O2)
                'R': seq.count('R'),  # Arg: Arginine (C6 H14 N4 O2)
                'N': seq.count('N'),  # Asn: Asparagine (C4 H8 N2 O3)
                'D': seq.count('D'),  # Asp: Aspartic acid (C4 H7 N O4)
                'C': seq.count('C'),  # Cys: Cysteine (C3 H7 N O2 S)
                'Q': seq.count('Q'),  # Gln: Glutamine (C5 H10 N2 O3)
                'E': seq.count('E'),  # Glu: Glutamic acid (C5 H9 N O4)
                'G': seq.count('G'),  # Gly: Glycine (C2 H5 N O2)
                'H': seq.count('H'),  # His: Histidine (C6 H9 N3 O2)
                'I': seq.count('I'),  # Ile: Isoleucine (C6 H13 N O2)
                'L': seq.count('L'),  # Leu: Leucine (C6 H13 N O2)
                'K': seq.count('K'),  # Lys: Lysine (C6 H14 N2 O2)
                'M': seq.count('M'),  # Met: Methionine (C5 H11 N O2 S)
                'F': seq.count('F'),  # Phe: Phenylalanine (C9 H11 N O2)
                'P': seq.count('P'),  # Pro: Proline (C5 H9 N O2)
                'S': seq.count('S'),  # Ser: Serine (C3 H7 N O3)
                'T': seq.count('T'),  # Thr: Threonine (C4 H9 N O3)
                'W': seq.count('W'),  # Trp: Tryptophan (C11 H12 N2 O2)
                'Y': seq.count('Y'),  # Tyr: Tyrosine (C9 H11 N O3)
                'V': seq.count('V'),  # Val: Valine (C5 H11 N O2)
                'U': seq.count('U'),  # Selcys: Selenocysteine (C3 H7 N O2 Se)
                'start_aa': seq[0] if seq[0]!='*' else seq[1] # Amino acid at start codon
            }

    def determine_protein_structure_from_aa(self, 
            polymer_id, count):
        """ Determine the empirical formula, molecular weight and charge of
            a protein based on the structural information of its metabolite
            amino acid monomers to ensure consistency with the pH

        Args:
            polymer_id (:obj:`str`): polymer ID
            count (:obj:`dict`): dictionary showing the count of each amino
                acid in the protein

        Returns:
            :obj:`wc_utils.util.chem.EmpiricalFormula`: protein empirical formula
            :obj:`float`: protein molecular weight
            :obj:`int`: protein charge
            :obj:`bool`: True if protein structure has been successfully determined
                from the metabolite monomer, else False             
        """                
        model = self.model
        amino_acid_id_conversion = self.options['amino_acid_id_conversion']
        
        total_empirical_formula = EmpiricalFormula()
        total_molecular_weight = 0
        total_charge = 0
        polymer_len = 0

        if not amino_acid_id_conversion:            
            return EmpiricalFormula(), 0, 0, False        
        
        for standard_id, met_id in amino_acid_id_conversion.items():
            if standard_id in count:
                monomer_species_type = model.species_types.get_one(id=met_id)
                if monomer_species_type:
                    total_empirical_formula += \
                        monomer_species_type.structure.empirical_formula * count[standard_id]
                    total_molecular_weight += \
                        monomer_species_type.structure.molecular_weight * count[standard_id]
                    total_charge += monomer_species_type.structure.charge * count[standard_id]
                    polymer_len += count[standard_id]    
                else:
                    return EmpiricalFormula(), 0, 0, False
            else:
                return EmpiricalFormula(), 0, 0, False        
        
        protein_empirical_formula = total_empirical_formula - \
            EmpiricalFormula('H{}O{}'.format(2 * (polymer_len - 1), polymer_len - 1))
        protein_molecular_weight = total_molecular_weight - \
            2 * (polymer_len - 1) * mendeleev.element('H').atomic_weight - \
            (polymer_len - 1) * mendeleev.element('O').atomic_weight 
        protein_charge = total_charge    
            
        model_species_type = model.species_types.get_one(id=polymer_id)
        if model_species_type:
            model_species_type.structure.empirical_formula = protein_empirical_formula
            model_species_type.structure.molecular_weight = protein_molecular_weight
            model_species_type.structure.charge = protein_charge
        
        return protein_empirical_formula, protein_molecular_weight, protein_charge, True