bpforms/alphabet/protein.py from KarrLab/bpforms

bpforms/alphabet/protein.py
Summary

Maintainability

2 wks
Test Coverage

34%
Issues
Coverage
""" Alphabet and BpForm to represent modified proteins

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2019-02-05
:Copyright: 2019, Karr Lab
:License: MIT
"""

from bpforms.core import (Alphabet, AlphabetBuilder, Monomer, MonomerSequence,
                          Bond, Atom, BpForm, Identifier, IdentifierSet, SynonymSet,
                          BpFormsWarning)
from bpforms.alphabet.core import download_pdb_ccd, parse_pdb_ccd
from bs4 import BeautifulSoup
from ftplib import FTP
from wc_utils.util.chem import EmpiricalFormula, OpenBabelUtils
from wc_utils.util.chem.marvin import get_major_micro_species
import bpforms.xlink.core
import glob
try:
    import jnius
except ModuleNotFoundError: # pragma: no cover
    pass # pragma: no cover
import openbabel
import os.path
import pandas
import pkg_resources
import re
import requests
import requests_cache
import warnings
import zipfile


filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'protein.yml'))
protein_alphabet = Alphabet().from_yaml(filename)
# :obj:`Alphabet`: Alphabet for protein amino acids

canonical_filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'protein.canonical.yml'))
canonical_protein_alphabet = Alphabet().from_yaml(canonical_filename)
# :obj:`Alphabet`: Alphabet for canonical protein amino acids


class ProteinAlphabetBuilder(AlphabetBuilder):
    """ Build protein alphabet from the `PDB Chemical Component Dictionary <http://www.wwpdb.org/data/ccd>` and
    `RESID <https://proteininformationresource.org/resid/>`_ """

    MAX_RETRIES = 5

    def run(self, ph=None, major_tautomer=False, dearomatize=False, path=filename):
        """ Build alphabet and, optionally, save to YAML file

        Args:
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
            path (:obj:`str`, optional): path to save alphabet

        Returns:
            :obj:`Alphabet`: alphabet
        """
        return super(ProteinAlphabetBuilder, self).run(ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize, path=path)

    def build(self, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet

        Args:
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        # initialize alphabet
        alphabet = Alphabet()

        # load canonical monomeric forms
        alphabet.from_yaml(canonical_filename)
        alphabet.id = 'protein'
        alphabet.name = 'Protein amino acids'
        alphabet.description = ('The canonical protein amino acids, plus non-canonical amino acids based on '
                                '<a href="http://www.wwpdb.org/data/ccd">PDB Chemical Component Dictionary</a> '
                                'and <a href="https://pir.georgetown.edu/resid">RESID</a>')

        # build from sources
        self.build_from_resid(alphabet, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
        self.build_from_pdb(alphabet, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
        self.build_from_mod(alphabet, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)

        # corrections
        alphabet.monomers.A.identifiers.add(Identifier('chebi', 'CHEBI:46217'))
        alphabet.monomers.C.identifiers.add(Identifier('chebi', 'CHEBI:29950'))
        alphabet.monomers.D.identifiers.add(Identifier('chebi', 'CHEBI:29958'))
        alphabet.monomers.E.identifiers.add(Identifier('chebi', 'CHEBI:29972'))
        alphabet.monomers.F.identifiers.add(Identifier('chebi', 'CHEBI:29997'))
        alphabet.monomers.G.identifiers.add(Identifier('chebi', 'CHEBI:29947'))
        alphabet.monomers.H.identifiers.add(Identifier('chebi', 'CHEBI:29979'))
        alphabet.monomers.I.identifiers.add(Identifier('chebi', 'CHEBI:30009'))
        alphabet.monomers.K.identifiers.add(Identifier('chebi', 'CHEBI:29967'))
        alphabet.monomers.L.identifiers.add(Identifier('chebi', 'CHEBI:30006'))
        alphabet.monomers.M.identifiers.add(Identifier('chebi', 'CHEBI:16044'))
        alphabet.monomers.N.identifiers.add(Identifier('chebi', 'CHEBI:50347'))
        alphabet.monomers.P.identifiers.add(Identifier('chebi', 'CHEBI:50342'))
        alphabet.monomers.Q.identifiers.add(Identifier('chebi', 'CHEBI:30011'))
        alphabet.monomers.R.identifiers.add(Identifier('chebi', 'CHEBI:29952'))
        alphabet.monomers.S.identifiers.add(Identifier('chebi', 'CHEBI:29999'))
        alphabet.monomers.T.identifiers.add(Identifier('chebi', 'CHEBI:30013'))
        alphabet.monomers.V.identifiers.add(Identifier('chebi', 'CHEBI:30015'))
        alphabet.monomers.W.identifiers.add(Identifier('chebi', 'CHEBI:29954'))
        alphabet.monomers.Y.identifiers.add(Identifier('chebi', 'CHEBI:46858'))
        alphabet.monomers.AA0431.identifiers.add(Identifier('mod', 'MOD:00160'))
        alphabet.monomers.AA0560.identifiers.remove(Identifier('pdb.ligand', 'NAG'))
        alphabet.monomers.AA0027.identifiers.remove(Identifier('cas', '17576'))
        alphabet.monomers.AA0230.identifiers.remove(Identifier('pdb.ligand', 'NO'))
        alphabet.monomers.AA0232.identifiers.remove(Identifier('pdb.ligand', 'OTD'))
        alphabet.monomers.D.base_monomers.clear()
        if 'AA0253' in alphabet.monomers and 'A' in alphabet.monomers and \
                alphabet.monomers.A in alphabet.monomers.AA0253.base_monomers:
            alphabet.monomers.AA0253.base_monomers.remove(alphabet.monomers.A)
        if 'SEC' in alphabet.monomers and 'C' in alphabet.monomers:
            alphabet.monomers.SEC.base_monomers.add(alphabet.monomers.C)
        for monomer in alphabet.monomers.values():
            for identifier in list(monomer.identifiers):
                if identifier.ns == 'pdb.ligand' and identifier.id == 'ACE':
                    monomer.identifiers.remove(identifier)
        for monomer in alphabet.monomers.values():
            for base_monomer in list(monomer.base_monomers):
                if base_monomer.id == 'AA0025':
                    monomer.base_monomers.remove(base_monomer)
                    monomer.base_monomers.add(alphabet.monomers.C)
        for monomer in alphabet.monomers.values():
            if monomer in monomer.base_monomers:
                monomer.base_monomers.remove(monomer)

        # remove crosslinks
        filename = pkg_resources.resource_filename('bpforms', os.path.join('xlink', 'xlink.xlsx'))
        data_frame = pandas.read_excel(filename)
        for _, row in data_frame.iterrows():
            if row['alphabet'] == 'protein' and row['alphabet_id'] in alphabet.monomers:
                monomer = alphabet.monomers.pop(row['alphabet_id'])
                for o_monomer in alphabet.monomers.values():
                    if monomer in o_monomer.base_monomers:
                        o_monomer.base_monomers.remove(monomer)

        # save report
        n_only = []
        c_only = []
        no_termini = []
        for code, monomer in alphabet.monomers.items():
            if monomer.l_bond_atoms and not monomer.r_bond_atoms:
                n_only.append((code, monomer.export('smiles', options=('c',))))
            elif not monomer.l_bond_atoms and monomer.r_bond_atoms:
                c_only.append((code, monomer.export('smiles', options=('c',))))
            elif not monomer.l_bond_atoms and not monomer.r_bond_atoms:
                no_termini.append((code, monomer.export('smiles', options=('c',))))
        n_only.sort()
        c_only.sort()
        no_termini.sort()

        filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'protein.report.txt'))
        with open(filename, 'w') as file:
            file.write(('{} monomers only have N termini:\n'
                        '  Code\tSMILES\n'
                        '  {}\n\n').format(
                len(n_only), '\n  '.join('\t'.join(txt) for txt in n_only)))

            file.write(('{} monomers only have C termini:\n'
                        '  Code\tSMILES\n'
                        '  {}\n\n').format(
                len(c_only), '\n  '.join('\t'.join(txt) for txt in c_only)))

            file.write(('{} monomers have no termini:\n'
                        '  Code\tSMILES\n'
                        '  {}\n\n').format(
                len(no_termini), '\n  '.join('\t'.join(txt) for txt in no_termini)))

        # transform from residue to amino acids

        # return alphabet
        return alphabet

    def build_from_resid(self, alphabet, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet from RESID

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
        """
        # get amino acid names from canonical list
        canonical_aas = {}
        for monomer in alphabet.monomers.values():
            canonical_aas[monomer.name] = monomer

        # create directory
        pdb_dir = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'resid'))
        if not os.path.isdir(pdb_dir):
            os.mkdir(pdb_dir)

        # download and unzip PDB files
        zip_filename = os.path.join(pdb_dir, 'models.zip')
        if not os.path.isfile(zip_filename):
            # retrieve files from ftp.ebi.ac.uk and save them to tmpdir
            ftp = FTP('ftp.ebi.ac.uk')
            ftp.login()
            ftp.cwd('pub/databases/RESID/')

            with open(zip_filename, 'wb') as file:
                ftp.retrbinary('RETR %s' % 'models.zip', file.write)

            # quit ftp
            ftp.quit()

            # extract pdbs from models.zip into tmp folder
            with zipfile.ZipFile(os.path.join(pdb_dir, 'models.zip'), 'r') as z:
                z.extractall(pdb_dir)

        # create session to get metadata
        cache_name = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'protein'))
        session = requests_cache.core.CachedSession(cache_name, backend='sqlite', expire_after=None)
        session.mount('http://', requests.adapters.HTTPAdapter(max_retries=self.MAX_RETRIES))

        # extract name of the molecule from pdb file
        base_monomers = {}
        monomer_ids = {}
        for file in glob.iglob(os.path.join(pdb_dir, '*.PDB')):
            id, _ = os.path.splitext(os.path.basename(file))
            names = []
            with open(file, 'r') as f:
                for line in f:
                    if re.match(r"^COMPND    ", line):
                        part1 = str(line[10:].strip())
                        names.append(part1)

                    # check if name is on two lines (when too long)
                    if re.match(r"^COMPND   1", line):
                        part2 = str(line[10:].strip())
                        names.append(part2)
            name = ''.join(names)

            result = self.get_resid_monomer_structure(name, file, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
            if result is None:
                warnings.warn('Ignoring monomeric form {} that has no structure'.format(id), BpFormsWarning)
                continue
            structure, index_n, index_c = result

            code, synonyms, identifiers, base_monomer_ids, comments = self.get_resid_monomer_details(id, session)
            if code is None or code in alphabet.monomers:
                code = id

            if name in canonical_aas:
                canonical_aa = canonical_aas[name]
                monomer_ids[id] = canonical_aa

                if name not in ['L-arginine', 'L-asparagine', 'L-valine']:
                    canonical_aa.structure = structure
                    canonical_aa.l_bond_atoms = []
                    canonical_aa.l_displaced_atoms = []
                    canonical_aa.r_bond_atoms = []
                    canonical_aa.r_displaced_atoms = []
                    self.set_termini(structure, canonical_aa, index_n, index_c)

                warnings.warn('Updated canonical monomeric form {}'.format(name), BpFormsWarning)
                continue

            monomer = Monomer(
                id=id,
                name=name,
                synonyms=synonyms,
                identifiers=identifiers,
                structure=structure,
                comments=comments,
            )

            self.set_termini(structure, monomer, index_n, index_c)

            alphabet.monomers[code] = monomer

            monomer_ids[id] = monomer
            base_monomers[monomer] = base_monomer_ids

        for monomer, base_monomer_ids in base_monomers.items():
            for base_monomer_id in base_monomer_ids:
                base_monomer = monomer_ids.get(base_monomer_id, None)
                if base_monomer == None:
                    warnings.warn('Base {} for {} is invalid'.format(base_monomer_id, monomer.id),
                                  BpFormsWarning)  # pragma: no cover # all parent entries are retained
                else:
                    monomer.base_monomers.add(base_monomer)

    def get_resid_monomer_structure(self, name, pdb_filename, ph=None, major_tautomer=False, dearomatize=False):
        """ Get the structure of an amino acid from a PDB file

        Args:
            name (:obj:`str`): name of monomeric form
            pdb_filename (:obj:`str`): path to PDB file with structure
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`openbabel.OBMol`: structure
            :obj:`int`: index of atom of N terminus
            :obj:`int`: index of atom of C terminus
        """
        pdb_mol = openbabel.OBMol()
        conv = openbabel.OBConversion()
        assert conv.SetInFormat('pdb'), 'Unable to set format to PDB'
        conv.ReadFile(pdb_mol, pdb_filename)

        # removing modified monomeric forms where metal present in structure because:
        # - inchi structure generated separates each non covalently bound parts of the monomeric form
        # - for many cases theses structures consist of a group of modified monomeric form coordinating
        # a metal, and not a single PTM monomeric form per se
        assert conv.SetOutFormat('inchi'), 'Unable to set format to InChI'
        inchi = conv.WriteString(pdb_mol)
        formula = inchi.split('/')[1]
        if '.' in formula:
            warnings.warn('Ignoring metal coordinated monomeric form {}'.format(name), BpFormsWarning)
            return None

        assert conv.SetOutFormat('smi')
        conv.SetOptions('c', conv.OUTOPTIONS)
        smiles = conv.WriteString(pdb_mol).partition('\t')[0]
        if ph is not None:
            if not get_major_micro_species:
                raise ImportError("Marvin must be installed to calculate the major microspecies")
            smiles = get_major_micro_species(smiles, 'smiles', 'smiles', ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
        smiles_mol = openbabel.OBMol()
        assert conv.SetInFormat('smi')
        conv.ReadString(smiles_mol, smiles)

        assert conv.SetOutFormat('smi'), 'Unable to set format to SMILES'
        conv.SetOptions('c', conv.OUTOPTIONS)
        smiles = conv.WriteString(smiles_mol).partition('\t')[0]
        assert conv.SetInFormat('smi'), 'Unable to set format to SMILES'
        conv.ReadString(smiles_mol, smiles)

        _, i_n, i_c = self.get_termini(smiles_mol, residue=False)
        if i_n is None or i_c is None:
            smiles_mol_2 = openbabel.OBMol()
            conv.ReadString(smiles_mol_2, smiles)
            _, i_n_2, i_c_2 = self.get_termini(smiles_mol_2)
            if ((i_n_2 is not None) + (i_c_2 is not None)) > ((i_n is not None) + (i_c is not None)):
                smiles = conv.WriteString(smiles_mol_2)

                smiles_mol = openbabel.OBMol()
                assert conv.SetInFormat('smi')
                conv.ReadString(smiles_mol, smiles)
                smiles = conv.WriteString(smiles_mol_2)

                smiles_mol = openbabel.OBMol()
                assert conv.SetInFormat('smi')
                conv.ReadString(smiles_mol, smiles)

                _, i_n, i_c = self.get_termini(smiles_mol, residue=False)

        return (smiles_mol, i_n, i_c)

    def build_from_pdb(self, alphabet, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet from `PDB Chemical Component Dictionary <http://www.wwpdb.org/data/ccd>`_

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
        """
        smiles_to_monomer = {}
        pdb_ligand_to_monomer = {}
        name_to_monomer = {}
        ambiguous_names = []
        for monomer in alphabet.monomers.values():
            smiles_to_monomer[monomer.export('smiles', options=('c',))] = monomer

            for identifier in monomer.identifiers:
                if identifier.ns == 'pdb.ligand':
                    pdb_ligand_to_monomer[identifier.id] = monomer
                    break

            if monomer.name is not None:
                if monomer.name in name_to_monomer:
                    ambiguous_names.append(monomer.name.lower())
                name_to_monomer[monomer.name.lower()] = monomer
            for synonym in monomer.synonyms:
                if synonym in name_to_monomer:
                    ambiguous_names.append(synonym.lower())
                name_to_monomer[synonym.lower()] = monomer

        for name in ambiguous_names:
            if name in name_to_monomer:
                name_to_monomer.pop(name)

        smiles_conv = openbabel.OBConversion()
        assert smiles_conv.SetInFormat('smi'), 'Unable to set format to SMILES'
        assert smiles_conv.SetOutFormat('smi'), 'Unable to set format to SMILES'
        smiles_conv.SetOptions('c', smiles_conv.OUTOPTIONS)

        n_monomers = 0
        base_monomers = {}
        same_structures = []
        same_pdb_ids = []
        same_names = []
        potential_incorrect_merges = []
        new_monomers = []
        pdb_ccd_id_to_monomers = {}

        filename = download_pdb_ccd()
        valid_types = ('L-peptide linking',
                       'L-peptide COOH carboxy terminus',
                       'L-peptide NH3 amino terminus')
        for pdb_monomer, base_monomer, smiles, _, _ in \
                parse_pdb_ccd(filename, valid_types, self._max_monomers):
            id = pdb_monomer.id
            name = pdb_monomer.name
            synonyms = pdb_monomer.synonyms
            identifiers = pdb_monomer.identifiers
            if base_monomer is not None:
                base_monomers[id] = base_monomer

            # correct to residue
            mol = openbabel.OBMol()
            smiles_conv.ReadString(mol, smiles)

            _, i_n, i_c = self.get_termini(mol, residue=False)
            if i_n is None or i_c is None:
                mol_2 = openbabel.OBMol()
                smiles_conv.ReadString(mol_2, smiles)
                _, i_n_2, i_c_2 = self.get_termini(mol_2)
                if ((i_n_2 is not None) + (i_c_2 is not None)) > ((i_n is not None) + (i_c is not None)):
                    mol = mol_2
                    i_n = i_n_2
                    i_c = i_c_2

            smiles = smiles_conv.WriteString(mol).partition('\t')[0]

            # correct structure for pH
            if ph:
                try:
                    smiles = get_major_micro_species(smiles, 'smiles', 'smiles', ph,
                                                     major_tautomer=major_tautomer, dearomatize=dearomatize)
                except jnius.JavaException:
                    continue

            mol = openbabel.OBMol()
            smiles_conv.ReadString(mol, smiles)
            smiles = smiles_conv.WriteString(mol).partition('\t')[0]

            mol = openbabel.OBMol()
            smiles_conv.ReadString(mol, smiles)
            smiles = smiles_conv.WriteString(mol).partition('\t')[0]

            # exclude from alphabet because entry minus O- is equivalent to another entry
            if id in ['ASA', 'GND']:
                continue

            # merge into alphabet
            monomer = None

            if id == 'ARG':
                monomer = alphabet.monomers.R
            elif id == 'HIS':
                monomer = alphabet.monomers.H
            elif id == 'LYS':
                monomer = alphabet.monomers.K
            elif id not in ['ABA']:
                monomer = smiles_to_monomer.get(smiles, None)

            if monomer is None:
                monomer = pdb_ligand_to_monomer.get(id, None)
                if monomer is None:
                    if id not in ['HSK', 'MTY', 'SUI']:
                        names = list(synonyms)
                        if name is not None:
                            names.append(name)

                        for n in names:
                            monomer = name_to_monomer.get(n.lower(), None)
                            if monomer is not None:
                                same_names.append((id, monomer.id, n))
                                break

                    if monomer is None:
                        new_monomers.append(id)
                else:
                    same_pdb_ids.append((id, smiles, monomer.export('smiles', options=('c',))))
            else:
                resid_id = ''
                pdb_ligand_id = ''
                for identifier in monomer.identifiers:
                    if identifier.ns == 'resid':
                        resid_id = identifier.id
                    if identifier.ns == 'pdb.ligand':
                        pdb_ligand_id = identifier.id
                same_structures.append((id, resid_id, pdb_ligand_id))

            n_monomers += 1
            if monomer is not None:
                monomer.synonyms.add(name)
                monomer.synonyms.update(synonyms)
                for identifier in monomer.identifiers:
                    if identifier.ns == 'pdb-ccd':
                        potential_incorrect_merges.append((id, identifier.id))
                monomer.identifiers.update(identifiers)
            else:
                monomer = Monomer(id=id, name=name, synonyms=synonyms,
                                  identifiers=identifiers, structure=smiles)

                _, i_n, i_c = self.get_termini(monomer.structure, residue=False)
                self.set_termini(mol, monomer, i_n, i_c)

                assert id not in alphabet.monomers
                alphabet.monomers[id] = monomer

            pdb_ccd_id_to_monomers[id] = monomer

        # set base monomers
        for monomer_id, base_id in base_monomers.items():
            monomer = pdb_ccd_id_to_monomers.get(monomer_id, None)
            base = pdb_ccd_id_to_monomers.get(base_id, None)
            if monomer is not None and base is not None:
                monomer.base_monomers.add(base)

        # save summary of merging
        filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'pdb', 'merge-report.txt'))
        with open(filename, 'w') as file:
            file.write(('{} monomers were merged into the alphabet because they have the same structures (SMILES):\n'
                        '  PDB-CCD ID\tRESID ID\tPDB Ligand RESID ID\n'
                        '  {}\n\n').format(
                len(same_structures), '\n  '.join(sorted('\t'.join(n) for n in same_structures))))

            file.write(('{} monomers were merged into the alphabet because they have the same PDB ids (ligand / CCD), '
                        'but different structures (e.g., different stereochemistry or bond order):\n'
                        '  PDB-CCD ID\tPDB-CCD SMILES\tRESID SMILES\n'
                        '  {}\n\n').format(
                len(same_pdb_ids), '\n  '.join(sorted('\t'.join(n) for n in same_pdb_ids))))

            file.write(('{} monomers were merged into the alphabet because they have the same names:\n'
                        '  PDB-CCD ID\tRESID ID\tName\n'
                        '  {}\n\n').format(
                len(same_names), '\n  '.join(sorted('\t'.join(n) for n in same_names))))

            file.write(('{} monomers were potentially merged incorrectly:\n'
                        '  PDB-CCD ID-1\tPDB-CCD ID-2\n'
                        '  {}\n\n').format(
                len(potential_incorrect_merges), '\n  '.join(sorted('\t'.join(n) for n in potential_incorrect_merges))))

            file.write('{} monomers were added to the alphabet:\n  {}\n'.format(
                len(new_monomers), '\n  '.join(sorted(new_monomers))))

    def build_from_mod(self, alphabet, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet from PSI-MI ontology

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
        """
        alphabet.monomers['SGergerOMeCys'] = Monomer(
            id='SGergerOMeCys',
            name='S-geranylgeranyl-L-cysteine methyl ester',
            identifiers=IdentifierSet([Identifier('mod', 'MOD:01119')]),
            base_monomers=[alphabet.monomers.C],
            structure='COC(=O)[C@H](CSC/C=C(/CC/C=C(/CC/C=C(/CCC=C(C)C)\C)\C)\C)[NH3+]',
            l_bond_atoms=[Atom(Monomer, element='N', position=29, charge=-1)],
            l_displaced_atoms=[Atom(Monomer, element='H', position=29),
                               Atom(Monomer, element='H', position=29, charge=1)])

    def set_termini(self, mol, monomer, i_n, i_c):
        """ Set the C and N terminal bond atoms of a monomer

        Args:
            mol (:obj:`openbabel.OBMol`): molecule
            monomer (:obj:`Monomer`): monomer
            i_n (:obj:`int`): index of N terminus
            i_c (:obj:`int`): index of C terminus
        """
        if i_c:
            c_atom = mol.GetAtom(i_c)
            i_o = None
            for bond in openbabel.OBAtomBondIter(c_atom):
                if bond.GetBondOrder() != 1:
                    continue

                o_atom = bond.GetBeginAtom()
                if o_atom.GetIdx() == c_atom.GetIdx():
                    o_atom = bond.GetEndAtom()

                if o_atom.GetAtomicNum() == 8:
                    other_atoms = sorted([other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(o_atom)])
                    if len(other_atoms) == 1 or other_atoms[-2] == 1:
                        i_o = o_atom.GetIdx()
                        break

            if i_o:
                monomer.r_bond_atoms.append(Atom(Monomer, element='C', position=i_c))
                if o_atom.GetFormalCharge() == 0:
                    monomer.r_displaced_atoms.append(Atom(Monomer, element='O', position=i_o))
                    monomer.r_displaced_atoms.append(Atom(Monomer, element='H', position=i_o))
                else:
                    monomer.r_displaced_atoms.append(Atom(Monomer, element='O', position=i_o, charge=-1))

        if i_n:
            atom = mol.GetAtom(i_n)
            other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(atom)]
            tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(atom)])
            other_atoms += [1] * (3 + atom.GetFormalCharge() - tot_bond_order)
            n_charge = atom.GetFormalCharge()
            n_h = other_atoms.count(1)

            monomer.l_bond_atoms.append(Atom(Monomer, element='N', position=i_n, charge=-n_charge))
            if n_charge >= 0 and n_h >= 1:
                monomer.l_displaced_atoms.append(Atom(Monomer, element='H', position=i_n))
            if n_charge >= 1 and n_h >= 2:
                monomer.l_displaced_atoms.append(Atom(Monomer, element='H', position=i_n, charge=1))

    def get_termini(self, mol, residue=True):
        """ Get indices of atoms of N and C termini

        Args:
            mol (:obj:`openbabel.OBMol`): molecule
            residue (:obj:`bool`, optional): if :obj:`True`, search for a residue (H instead of O- at C terminus)

        Returns:
            :obj:`openbabel.OBMol`: structure
            :obj:`int`: index of atom of N terminus
            :obj:`int`: index of atom of C terminus
        """
        # find N and C termini
        atom_ns = []
        atom_cs = []
        for i_atom in range(1, mol.NumAtoms() + 1):
            atom = mol.GetAtom(i_atom)
            if self.is_n_terminus(mol, atom):
                atom_ns.append(atom)
            elif self.is_c_terminus(mol, atom, residue=residue):
                atom_cs.append(atom)

        termini = []
        for atom_n in atom_ns:
            for atom_c in atom_cs:
                if self.is_terminus(atom_n, atom_c):
                    termini.append((atom_n, atom_c))

        if not termini:
            if atom_ns and atom_cs:
                termini.append((atom_ns[0], atom_cs[0]))
            elif atom_ns:
                termini.append((atom_ns[0], None))
            elif atom_cs:
                termini.append((None, atom_cs[0]))

        if termini:
            atom_n, atom_c = termini[0]

            if atom_n:
                idx_n = atom_n.GetIdx()
            else:
                idx_n = None

            if atom_c:
                idx_c = atom_c.GetIdx()
                if residue:
                    self.is_c_terminus(mol, atom_c, residue=residue, convert_to_aa=True)
            else:
                idx_c = None
        else:
            idx_n = None
            idx_c = None

        return (mol, idx_n, idx_c)

    def is_n_terminus(self, mol, atom):
        """ Determine if an atom is an N-terminus

        Args:
            mol (:obj:`openbabel.OBMol`): molecule
            atom (:obj:`openbabel.OBAtom`): atom

        Returns:
            :obj:`bool`: :obj:`True` if the atom is an N-terminus
        """
        if atom is None:
            return False

        # check atom is nitrogen
        if atom.GetAtomicNum() != 7:
            return False

        # check atom has charge 0 or +1
        if atom.GetFormalCharge() < 0:
            return False

        # check atom is single-bonded to at least 1 carbon
        has_single_c = False
        for bond in openbabel.OBAtomBondIter(atom):
            other_atom = bond.GetBeginAtom()
            if other_atom == atom:
                other_atom = bond.GetEndAtom()
            if bond.GetBondOrder() == 1 and other_atom.GetAtomicNum() == 6:
                has_single_c = True
        if not has_single_c:
            return False

        # check atom can form another bond
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(atom)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(atom)])
        other_atoms += [1] * (3 + atom.GetFormalCharge() - tot_bond_order)
        n_charge = atom.GetFormalCharge()
        n_h = other_atoms.count(1)
        if not ((n_charge == 1 and n_h >= 2) or
                (n_charge == 0 and (n_h >= 1 or len(other_atoms) <= 2))):
            return False

        # return True
        return True

    def is_c_terminus(self, mol, atom, residue=True, convert_to_aa=False):
        """ Determine if an atom is an C-terminus

        Args:
            mol (:obj:`openbabel.OBMol`): molecule
            atom (:obj:`openbabel.OBAtom`): atom
            residue (:obj:`bool`, optional): if :obj:`True`, search for a residue (H instead of O- at C terminus)
            convert_to_aa (:obj:`bool`, optional): if :obj:`True`, convert COH to COOH

        Returns:
            :obj:`bool`: :obj:`True` if the atom is an C-terminus
        """
        if atom is None:
            return False

        # check atom is carbon
        if atom.GetAtomicNum() != 6:
            return False

        # check atom has charge 0
        if atom.GetFormalCharge() != 0:
            return False

        # check atom is bonded to 1 oxygen, 1 carbon, and 1 hydrogen atom (residue=True) or 1 oxygen (residue=False) atom
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(atom)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(atom)])
        other_atoms += [1] * (4 + atom.GetFormalCharge() - tot_bond_order)
        other_atoms.sort()
        if set(other_atoms).difference(set([1, 6, 8])):
            return False
        if 6 not in other_atoms or 8 not in other_atoms:
            return False
        if len(other_atoms) != 3:
            return False
        if residue and other_atoms[0] != 1:
            return False
        if not residue and other_atoms[1] != 8:
            return False

        # check bonds to carbon and hydrogen are single bonds and bond to oxygen is a double bond
        o_single_bonds = []
        o_double_bonds = []
        for bond in openbabel.OBAtomBondIter(atom):
            other_atom = bond.GetBeginAtom()
            if other_atom == atom:
                other_atom = bond.GetEndAtom()
            if other_atom.GetAtomicNum() == 8:
                if bond.GetBondOrder() == 1:
                    o_single_bonds.append((bond, other_atom))
                elif bond.GetBondOrder() == 2:
                    o_double_bonds.append((bond, other_atom))
                else:
                    return False
            elif bond.GetBondOrder() != 1:
                return False

        if residue and (len(o_single_bonds) != 0 or len(o_double_bonds) != 1):
            return False
        if not residue and (len(o_single_bonds) != 1 or len(o_double_bonds) != 1):
            return False

        # if not residue, check that single oxygen is bound to H
        if not residue:
            oxygen_atom = o_single_bonds[0][1]
            for bond in openbabel.OBAtomBondIter(oxygen_atom):
                other_atom = bond.GetBeginAtom()
                if other_atom == oxygen_atom:
                    other_atom = bond.GetEndAtom()
                if other_atom.GetIdx() != atom.GetIdx() and other_atom.GetAtomicNum() != 1:
                    return False

        # correct residue to amino acid
        if convert_to_aa:
            h_atom = None
            for other_atom in openbabel.OBAtomAtomIter(atom):
                if other_atom.GetAtomicNum() == 1:
                    h_atom = other_atom
                    break

            if h_atom is not None:
                h_atom.SetAtomicNum(8)
            else:
                o_atom = mol.NewAtom()
                o_atom.SetAtomicNum(8)
                o_atom.SetFormalCharge(0)

                bond = openbabel.OBBond()
                bond.SetBegin(atom)
                bond.SetEnd(o_atom)
                bond.SetBondOrder(1)
                assert mol.AddBond(bond)

        # return True
        return True

    def is_terminus(self, atom_n, atom_c):
        """ Determine if a pair of atoms are N- and C-termini

        Args:
            atom_n (:obj:`openbabel.OBAtom`): potential N-terminus
            atom_c (:obj:`openbabel.OBAtom`): potential C-terminus

        Returns:
            :obj:`bool`: :obj:`True`, if the atoms are N- and C-termini
        """
        atom_n_neighbors = set((atom.GetIdx(), atom.GetAtomicNum()) for atom in openbabel.OBAtomAtomIter(atom_n))
        atom_c_neighbors = set((atom.GetIdx(), atom.GetAtomicNum()) for atom in openbabel.OBAtomAtomIter(atom_c))
        connecting_atoms = atom_n_neighbors.intersection(atom_c_neighbors)

        for idx, atomic_num in connecting_atoms:
            if atomic_num == 6:
                return True

        return False

    def get_resid_monomer_details(self, id, session):
        """ Get the CHEBI ID and synonyms of an amino acid from its RESID webpage

        Args:
            input_pdb (:obj:`str`): id of RESID entry

        Returns:
            :obj:`str`: code
            :obj:`SynonymSet`: set of synonyms
            :obj:`IdentifierSet`: set of identifiers
            :obj:`set` of :obj:`str`: ids of base monomeric forms
            :obj:`str`: comments
        """

        page = session.get('https://proteininformationresource.org/cgi-bin/resid?id=' + id)
        soup = BeautifulSoup(page.text, features="lxml")

        paragraphs = soup.select('p.annot')
        code = None
        synonyms = SynonymSet()
        identifiers = IdentifierSet()
        base_monomer_ids = set()
        comments = []
        for paragraph in paragraphs:
            text = paragraph.get_text()

            # code
            if 'Sequence code: ' in text and code is None:
                _, _, code = text.partition('Sequence code: ')
                code, _, _ = code.partition('#')
                code, _, _ = code.partition('\n')
                code = code.strip()

            # get synonyms
            if 'Alternate names:' in text:
                l = re.split("[:;]", text.strip())[1:]
                synonyms.update(map(lambda x: x.strip(), l))

            if 'Systematic name:' in text:
                _, _, systematic_name = text.partition('Systematic name:')
                systematic_name, _, _ = systematic_name.partition('\n')
                synonyms.add(systematic_name.strip())

            # ChEBI id and HETATM name
            identifiers.add(Identifier('resid', id))

            if 'Cross-references: ' in text:
                l = re.split("[:;]", text.strip())[1:]
                l2 = list(map(lambda x: x.strip(), l))
                if 'CAS' in l2:
                    identifiers.add(Identifier('cas', l2[l2.index('CAS')+1]))

                if 'ChEBI' in l2:
                    identifiers.add(Identifier('chebi', 'CHEBI:' + l2[l2.index('ChEBI')+1]))

                if 'PDBHET' in l2:
                    identifiers.add(Identifier('pdb.ligand', l2[l2.index('PDBHET')+1]))

                if 'PSI-MOD' in l2:
                    identifiers.add(Identifier('mod', 'MOD:' + l2[l2.index('PSI-MOD')+1]))

                if 'GO' in l2:
                    identifiers.add(Identifier('go', 'GO:' + l2[l2.index('GO')+1]))

            # base amino acid
            if 'Based on ' in text:
                base_monomer_ids.update(text.partition('Based on ')[2].split('+'))

            # comments
            if 'Comment: ' in text:
                comments.append(text.partition('Comment: ')[2].strip())

            if 'Generating Enzyme:' in text:
                _, _, comment = text.partition('Generating Enzyme:')
                comment = comment.strip()
                comment = 'Generating Enzyme: ' + comment
                if comment[-1] != '.':
                    comment += '.'
                comments.append(comment)

        if comments:
            comments = ' '.join(comments)
        else:
            comments = None
        return code, synonyms, identifiers, base_monomer_ids, comments


class ProteinForm(BpForm):
    """ Protein form """

    DEFAULT_FASTA_CODE = 'X'

    def __init__(self, seq=None, circular=False):
        """
        Args:
            seq (:obj:`MonomerSequence`, optional): sequence of monomeric forms of the protein
            circular (:obj:`bool`, optional): if :obj:`True`, indicates that the biopolymer is circular
        """
        super(ProteinForm, self).__init__(
            seq=seq, alphabet=protein_alphabet,
            backbone=None,
            bond=Bond(
                r_bond_atoms=[Atom(Monomer, element='C', position=None)],
                l_bond_atoms=[Atom(Monomer, element='N', position=None)],
                r_displaced_atoms=[Atom(Monomer, element='O', position=None, charge=-1),
                                   Atom(Monomer, element='H', position=None)],
                l_displaced_atoms=[Atom(Monomer, element='H', position=None),
                                   Atom(Monomer, element='H', position=None, charge=1)]),
            circular=circular)


class CanonicalProteinForm(BpForm):
    """ Canonical protein form """

    DEFAULT_FASTA_CODE = 'X'

    def __init__(self, seq=None, circular=False):
        """
        Args:
            seq (:obj:`MonomerSequence`, optional): sequence of monomeric forms of the protein
            circular (:obj:`bool`, optional): if :obj:`True`, indicates that the biopolymer is circular
        """
        super(CanonicalProteinForm, self).__init__(
            seq=seq, alphabet=canonical_protein_alphabet,
            backbone=None,
            bond=Bond(
                r_bond_atoms=[Atom(Monomer, element='C', position=None)],
                l_bond_atoms=[Atom(Monomer, element='N', position=None)],
                r_displaced_atoms=[Atom(Monomer, element='O', position=None, charge=-1),
                                   Atom(Monomer, element='H', position=None)],
                l_displaced_atoms=[Atom(Monomer, element='H', position=None),
                                   Atom(Monomer, element='H', position=None, charge=1)]),
            circular=circular)