bpforms/alphabet/rna.py from KarrLab/bpforms

bpforms/alphabet/rna.py
Summary

Maintainability

1 mo
Test Coverage

14%
Issues
Coverage
""" Alphabet and BpForm to represent modified RNA

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2019-02-05
:Copyright: 2019, Karr Lab
:License: MIT
"""

from bpforms.core import (Alphabet, AlphabetBuilder, Monomer, MonomerSequence, Backbone,
                          Bond, Atom, BpForm, Identifier, IdentifierSet, SynonymSet,
                          BpFormsWarning)
from bpforms.alphabet.core import (download_pdb_ccd, parse_pdb_ccd, get_pdb_ccd_open_babel_mol,
                                   get_can_smiles)
from wc_utils.util.chem import EmpiricalFormula
from wc_utils.util.chem.marvin import get_major_micro_species
import bs4
import csv
import io
import openbabel
import os.path
import pkg_resources
import re
import requests
import requests_cache
import warnings

filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'rna.yml'))
rna_alphabet = Alphabet().from_yaml(filename)
# :obj:`Alphabet`: Alphabet for RNA nucleotide monophosphates

canonical_filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'rna.canonical.yml'))
canonical_rna_alphabet = Alphabet().from_yaml(canonical_filename)
# :obj:`Alphabet`: Alphabet for canonical RNA nucleotide monophosphates


class RnaAlphabetBuilder(AlphabetBuilder):
    """ Build RNA alphabet from MODOMICS and the RNA Modification Database """

    MODOMICS_INDEX_ENDPOINT = 'http://modomics.genesilico.pl/modifications/'
    MODOMICS_INDEX_ASCII_ENDPOINT = 'http://modomics.genesilico.pl/modifications/?base=all&type=all&display_ascii=Display+as+ASCII'
    MODOMICS_ENTRY_ENDPOINT = 'http://modomics.genesilico.pl/modifications/{}/'
    RNA_MOD_DB_INDEX_ENDPOINT = 'https://mods.rna.albany.edu/mods/modifications/search'
    RNA_MOD_DB_ENTRY_ENDPOINT = 'https://mods.rna.albany.edu/mods/modifications/view/{}'
    MAX_RETRIES = 5

    def run(self, ph=None, major_tautomer=False, dearomatize=False, path=filename):
        """ Build alphabet and, optionally, save to YAML file

        Args:
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
            path (:obj:`str`, optional): path to save alphabet

        Returns:
            :obj:`Alphabet`: alphabet
        """
        return super(RnaAlphabetBuilder, self).run(ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize, path=path)

    def build(self, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet

        Args:
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        # initialize alphabet
        alphabet = Alphabet()
        alphabet.id = 'rna'
        alphabet.name = 'RNA nucleotide monophosphates'
        alphabet.description = ('The canonical RNA nucleotide monophosphates, '
                                'plus non-canonical RNA nucleotide monophosphates based on '
                                '<a href="http://modomics.genesilico.pl/modifications">MODOMICS</a> and '
                                'the <a href="https://mods.rna.albany.edu/mods/">RNA Modification Database</a>')

        # create requests session
        cache_name = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'rna'))
        session = requests_cache.core.CachedSession(cache_name, backend='sqlite', expire_after=None)
        session.mount('http://', requests.adapters.HTTPAdapter(max_retries=self.MAX_RETRIES))

        # build from databases
        self.build_modomics(alphabet, session, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
        self.build_rna_mod_db(alphabet, session, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)

        # add missing parent/child information
        if '10G' in alphabet.monomers and 'G' in alphabet.monomers:
            alphabet.monomers['10G'].base_monomers.add(alphabet.monomers['G'])
        if 'manQ' in alphabet.monomers and '10G' in alphabet.monomers:
            alphabet.monomers['manQ'].base_monomers.add(alphabet.monomers['10G'])
        if '102G' in alphabet.monomers and '10G' in alphabet.monomers:
            alphabet.monomers['102G'].base_monomers.add(alphabet.monomers['10G'])
        if '103G' in alphabet.monomers and 'G' in alphabet.monomers:
            alphabet.monomers['103G'].base_monomers.add(alphabet.monomers['G'])
        if '104G' in alphabet.monomers and '10G' in alphabet.monomers:
            alphabet.monomers['104G'].base_monomers.add(alphabet.monomers['10G'])
        if '106G' in alphabet.monomers and '10G' in alphabet.monomers:
            alphabet.monomers['106G'].base_monomers.add(alphabet.monomers['10G'])

        # convert to nucleosides to nucleotide monophosphates
        conv = openbabel.OBConversion()
        assert conv.SetInFormat('smi'), 'Unable to set format to SMILES'
        assert conv.SetOutFormat('smi'), 'Unable to set format to SMILES'
        conv.SetOptions('c', conv.OUTOPTIONS)

        pi_smiles = 'OP([O-])([O-])=O'

        for monomer in alphabet.monomers.values():
            # add PI to molecule
            monomer_structure = openbabel.OBMol(monomer.structure)
            n_monomer_atoms = monomer_structure.NumAtoms()

            pi = openbabel.OBMol()
            conv.ReadString(pi, pi_smiles)

            mol = openbabel.OBMol()
            mol += monomer_structure
            mol += pi

            # get atom references
            monomer_backbone_o = mol.GetAtom(monomer.backbone_bond_atoms[0].position)
            assert monomer_backbone_o.GetAtomicNum() == 8

            monomer_backbone_h = None
            for other_atom in openbabel.OBAtomAtomIter(monomer_backbone_o):
                if other_atom.GetAtomicNum() == 1:
                    monomer_backbone_h = other_atom
                    break

            pi_p = mol.GetAtom(n_monomer_atoms + 2)
            pi_o = mol.GetAtom(n_monomer_atoms + 4)
            pi_oh = mol.GetAtom(n_monomer_atoms + 1)

            # form bond with PI
            bond = openbabel.OBBond()
            bond.SetBegin(monomer_backbone_o)
            bond.SetEnd(pi_p)
            bond.SetBondOrder(1)
            assert mol.AddBond(bond)

            # remove displaced atoms
            mol.DeleteAtom(pi_oh, True)
            if monomer_backbone_h is not None:
                mol.DeleteAtom(monomer_backbone_h, True)

            # canonicalize atom indices
            smiles = conv.WriteString(mol).partition('\t')[0]

            mol2 = openbabel.OBMol()
            conv.ReadString(mol2, smiles)
            smiles = conv.WriteString(mol2).partition('\t')[0]

            mol2 = openbabel.OBMol()
            conv.ReadString(mol2, smiles)
            smiles = conv.WriteString(mol2).partition('\t')[0]

            # get indices of termini
            atom_ls = []
            atom_rs = []
            for i_atom in range(1, mol2.NumAtoms() + 1):
                atom = mol2.GetAtom(i_atom)
                if self.is_l_atom(atom):
                    atom_ls.append(atom)
                elif self.is_r_bond_atom(atom):
                    atom_rs.append(atom)

            i_left_p = None
            i_right_o = None
            i_left_o = None

            for atom_l in atom_ls:
                for atom_r in atom_rs:
                    if self.is_nucleotide_terminus(atom_l, atom_r):
                        i_left_p = atom_l.GetIdx()
                        i_right_o = atom_r.GetIdx()
                        break

            for other_atom in openbabel.OBAtomAtomIter(atom_l):
                if other_atom.GetFormalCharge() == -1:
                    i_left_o = other_atom.GetIdx()

            # update properties of monomer
            monomer.structure = smiles
            monomer.backbone_bond_atoms = []
            monomer.backbone_displaced_atoms = []
            monomer.l_bond_atoms = [Atom(Monomer, element='P', position=i_left_p)]
            monomer.l_displaced_atoms = [Atom(Monomer, element='O', position=i_left_o, charge=-1)]
            monomer.r_bond_atoms = [Atom(Monomer, element='O', position=i_right_o)]
            monomer.r_displaced_atoms = [Atom(Monomer, element='H', position=i_right_o)]

            assert monomer.structure.GetAtom(i_right_o).GetAtomicNum() == 8
            assert monomer.structure.GetAtom(i_left_p).GetAtomicNum() == 15
            assert monomer.structure.GetAtom(i_left_o).GetAtomicNum() == 8

        # build PDB
        self.build_pdb(alphabet, session, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)

        # return alphabet
        return alphabet

    def build_modomics(self, alphabet, session, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet from MODOMICS

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            session (:obj:`requests_cache.core.CachedSession`): request cache session
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
        """
        # get originating monomeric forms
        ascii_response = session.get(self.MODOMICS_INDEX_ASCII_ENDPOINT)
        ascii_response.raise_for_status()
        stream = io.StringIO(ascii_response.text)
        stream.readline()
        reader = csv.reader(stream, delimiter='\t', quoting=csv.QUOTE_NONE)
        base_monomer_short_names = {}
        for monomer in reader:
            for i in range(1, 4):
                short_name = monomer[i]
                if short_name:
                    break
            if short_name in ['A', 'C', 'G', 'U']:
                continue
            base_monomer_short_names[short_name] = monomer[i + 2]

        # get index of nucleosides
        response = session.get(self.MODOMICS_INDEX_ENDPOINT)
        response.raise_for_status()

        # get individual nucleosides and create monomeric forms
        doc = bs4.BeautifulSoup(response.text, 'html.parser')
        table = doc.find('table', {'class': 'datagrid'})
        tbody = table.find('tbody')
        mods = tbody.find_all('tr')
        monomer_short_names = {}
        invalid_nucleosides = []
        for i_mod, mod in enumerate(mods):
            if i_mod >= self._max_monomers:
                break

            cells = mod.find_all('td')
            new_nomenclature = cells[0].text
            name = cells[1].text
            short_name = cells[2].text
            abbrev = cells[3].text

            id = short_name

            chars = new_nomenclature
            if not chars:
                chars = id

            synonyms = SynonymSet()
            if abbrev:
                synonyms.add(abbrev)

            identifiers = IdentifierSet()
            identifiers.add(Identifier('modomics.short_name', short_name))
            if new_nomenclature:
                identifiers.add(Identifier('modomics.new_nomenclature', new_nomenclature))

            structure, more_identifiers = self.get_nucleoside_details_from_modomics(id, session)
            identifiers.update(more_identifiers)

            monomer = Monomer(
                id=chars,
                name=name,
                synonyms=synonyms,
                identifiers=identifiers,
                structure=structure,
            )

            if chars in ['N']:
                continue
            if not monomer.structure:
                continue
            if '*' in monomer.export('smiles', options=('c',)):
                continue
            if ' (base)' in new_nomenclature:
                continue
            if ' (cap)' in name or ' cap' in name:
                continue
            if '-CoA)' in name:
                continue
            if new_nomenclature and new_nomenclature[-1] == 'N':
                continue

            conv = openbabel.OBConversion()
            assert conv.SetOutFormat('smi'), 'Unable to set format to SMILES'
            assert conv.SetInFormat('smi'), 'Unable to set format to SMILES'
            conv.SetOptions('c', conv.OUTOPTIONS)

            smiles = conv.WriteString(monomer.structure).partition('\t')[0]
            if ph is not None:
                smiles = get_major_micro_species(smiles, 'smiles', 'smiles', ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
            smiles_mol = openbabel.OBMol()
            conv.ReadString(smiles_mol, smiles)

            smiles = conv.WriteString(smiles_mol).partition('\t')[0]
            smiles_mol2 = openbabel.OBMol()
            conv.ReadString(smiles_mol2, smiles)

            monomer.structure = smiles_mol2

            if not self.is_valid_nucleoside(monomer):
                invalid_nucleosides.append(chars)
                continue

            alphabet.monomers[chars] = monomer
            monomer_short_names[short_name] = monomer

        if invalid_nucleosides:
            warnings.warn('The following compounds were ignored because they do not appear to be nucleosides:\n- {}'.format(
                '\n- '.join(invalid_nucleosides)), BpFormsWarning)

        for short_name, monomer in monomer_short_names.items():
            base_monomer = monomer_short_names.get(base_monomer_short_names.get(short_name, None), None)
            if base_monomer:
                monomer.base_monomers.add(base_monomer)

        # return alphabet
        return alphabet

    def get_nucleoside_details_from_modomics(self, id, session):
        """ Get the structure of a nucleoside in the MODOMICS database

        Args:
            id (:obj:`str`): id of nucleoside in MODOMICS database

        Returns:
            :obj:`openbabel.OBMol`: structure
            :obj:`IdentifierSet`: identifiers
        """
        response = session.get(self.MODOMICS_ENTRY_ENDPOINT.format(id))
        response.raise_for_status()

        doc = bs4.BeautifulSoup(response.text, 'html.parser')

        table = doc.find(id='modification_details')
        tbody = table.find('tbody')
        rows = tbody.find_all('tr')
        mol = None
        identifiers = IdentifierSet()
        for row in rows:
            cells = row.find_all('td')
            if cells[0].text.startswith('SMILES'):
                smiles = cells[1].text
                if not smiles:
                    continue

                mol = openbabel.OBMol()
                conv = openbabel.OBConversion()
                assert conv.SetInFormat('smi')
                if not conv.ReadString(mol, smiles):
                    mol = None
                    continue

            elif cells[0].text.startswith('PubChem'):
                link = cells[1].find('a')
                identifiers.add(Identifier('pubchem.compound', link.text))

        return mol, identifiers

    def build_rna_mod_db(self, alphabet, session, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet from the RNA Modification Database

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            session (:obj:`requests_cache.core.CachedSession`): request cache session
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
        """
        ########################################
        # get information from the RNA Modification Database

        # parse index
        response = session.get(self.RNA_MOD_DB_INDEX_ENDPOINT)
        response.raise_for_status()
        doc = bs4.BeautifulSoup(response.text, 'html.parser')
        table = doc.find('table', {'class': 'searchresult'})
        entry_ids = []
        for row in table.find_all('tr')[1:]:
            entry_ids.append(row.find('td').find('a').get('href').partition('/mods/modifications/view/')[2])

        # parse entries
        monomers = []
        for entry_id in entry_ids:
            response = session.get(self.RNA_MOD_DB_ENTRY_ENDPOINT.format(entry_id))
            response.raise_for_status()

            id = None
            name = None
            identifiers = IdentifierSet([Identifier('rnamods', str(entry_id))])
            comments = []

            doc = bs4.BeautifulSoup(response.text, 'html.parser')
            div = doc.find('div', {'class': 'modView'})

            dl = div.find('dl')
            key = None
            for child in dl.children:
                if isinstance(child, bs4.element.Tag):
                    if child.name == 'dt':
                        key = child.text[0:-1]
                    elif child.name == 'dd':
                        if key == 'Symbol':
                            id = child.text
                        elif key == 'Common name':
                            name = child.text
                        elif key == 'CA registry numbers' and child.text.startswith('ribonucleoside '):
                            cas_id = child.text[len('ribonucleoside '):].strip()
                            if cas_id:
                                identifiers.add(Identifier('cas', cas_id))

            # comment
            for p in div.find_all('p'):
                label = p.find('span', {'class': 'refLabel'}).text
                if label == 'Comment:':
                    comments.append('<p>{}</p>'.format(re.sub(' \[\d+\]', '', p.text.partition('Comment:')[2].strip())))

            # phylogenetic prevalance
            table = div.find('table', {'class': 'phylotable'})
            if table:
                rows = table.find_all('tr')

                domains = []
                for cell in rows[1].find_all('th'):
                    domains.append(cell.text.strip())

                domain_dist = []
                for row in rows[2:]:
                    rna_type_obj = row.find('th')
                    if rna_type_obj:
                        rna_type = rna_type_obj.text.strip()
                        assert rna_type.endswith('RNA')
                        for domain, cell in zip(domains, row.find_all('td')):
                            if cell.text:
                                domain_dist.append(domain + ' ' + rna_type)

                comments.append('<p>Phylogenetic distribution<ul><li>{}</li></ul></p>'.format(
                    '</li><li>'.join(domain_dist)))

            monomers.append(Monomer(id=id, name=name, identifiers=identifiers,
                                    comments=' '.join(comments)))

        ########################################
        # corrections to the RNA Modification Database: incorrect or missing CAS ids
        for monomer in monomers:
            # incorrect ids
            for identifier in monomer.identifiers:
                if identifier.ns == 'cas':
                    if identifier.id == '577773-09-02':
                        identifier.id = '577773-09-2'

                    elif identifier.id == '-56973-12-7':
                        identifier.id = '56973-12-7'

                    elif monomer.id == 'C+':
                        identifier.id = '1221169-70-5'

            # missing ids
            if monomer.id == 'cnm5U':
                monomer.identifiers.add(Identifier('cas', '58479-73-5'))

            elif monomer.id == 'gcmnm5s2U':
                monomer.identifiers.add(Identifier('cas', '1401688-11-6'))

            elif monomer.id == 'gmnm5s2U':
                monomer.identifiers.add(Identifier('cas', '1401688-10-5'))

        ########################################
        # get structures for entries from SciFinder

        # save list of CAS ids to search for in SciFinder
        rnamods_dirname = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'rnamods'))
        if not os.path.isdir(rnamods_dirname):
            os.makedirs(rnamods_dirname)  # pragma: no cover # already created to store .mol files

        with open(os.path.join(rnamods_dirname, 'index.tsv'), 'w') as file:
            csv_writer = csv.writer(file, dialect='excel-tab', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(['RNA Modification Database Id', 'RNA Modification Database Symbol', 'Name', 'CAS Id', 'Comments'])
            for monomer in monomers:
                rnamods_id = ''
                cas_id = ''
                for identifier in monomer.identifiers:
                    if identifier.ns == 'rnamods':
                        rnamods_id = identifier.id
                    elif identifier.ns == 'cas':
                        cas_id = identifier.id
                csv_writer.writerow([rnamods_id, monomer.id, monomer.name, cas_id, monomer.comments])

        # Get structures from SciFinder and save to file (done manually at https://scifinder.cas.org)
        # 1. Use `Substance Identifier` to find compounds by their CAS id
        # 2. Select `Explore by Structure` > `Chemical Structure` in the context menu for the compound
        # 3. Open the `Non-Java` structure viewer
        # 4. Click the save icon
        # 5. Save the structure in .mol format with the name equal to the CAS id

        # Read structures from file
        mol_conv = openbabel.OBConversion()
        assert mol_conv.SetInFormat('mol'), 'Unable to set format to MOL'

        smiles_conv = openbabel.OBConversion()
        assert smiles_conv.SetInFormat('smi'), 'Unable to set format to SMILES'
        assert smiles_conv.SetOutFormat('smi'), 'Unable to set format to SMILES'
        smiles_conv.SetOptions('c', smiles_conv.OUTOPTIONS)

        for monomer in monomers:
            for identifier in monomer.identifiers:
                if identifier.ns == 'cas':
                    cas_id = identifier.id
                    break

            # read .mol file
            mol_filename = os.path.join(rnamods_dirname, cas_id + '.mol')
            assert os.path.isfile(mol_filename)
            mol = openbabel.OBMol()
            mol_conv.ReadFile(mol, mol_filename)
            smiles = smiles_conv.WriteString(mol)

            # get major microspecies
            if ph:
                smiles = get_major_micro_species(smiles, 'smiles', 'smiles', ph, major_tautomer=major_tautomer, dearomatize=dearomatize)

            # sanitize SMILES
            mol = openbabel.OBMol()
            smiles_conv.ReadString(mol, smiles)
            smiles = smiles_conv.WriteString(mol)

            mol = openbabel.OBMol()
            smiles_conv.ReadString(mol, smiles)
            smiles = smiles_conv.WriteString(mol)

            # set structure
            monomer.structure = smiles

        ########################################
        # Determine valid monomers and find 3' and 5' sites
        for monomer in list(monomers):
            assert self.is_valid_nucleoside(monomer), "Monomer {} does not appear to be a valid nucleoside".format(monomer.id)

        ########################################
        # merge monomers with alphabet
        additional_monomers = []
        for monomer in monomers:
            inchi = monomer.export('inchi').partition('/h')[0]
            same_monomer = None
            for test_monomer in alphabet.monomers.values():
                if monomer.id == 'manQ':
                    break

                modomics_short_name = None
                for identifier in test_monomer.identifiers:
                    if identifier.ns == 'modomics.short_name':
                        modomics_short_name = identifier.id
                        break
                if monomer.id == modomics_short_name:
                    same_monomer = test_monomer
                    break

                if test_monomer.export('inchi').partition('/h')[0] == inchi:
                    same_monomer = test_monomer
                    break

            if same_monomer:
                if monomer.id not in [same_monomer.id, same_monomer.name]:
                    same_monomer.synonyms.add(monomer.id)

                if monomer.name not in [same_monomer.id, same_monomer.name]:
                    same_monomer.synonyms.add(monomer.name)

                same_monomer.identifiers.update(monomer.identifiers)

                assert not same_monomer.comments, "Comments must be merged, which isn't implemented"
                same_monomer.comments = monomer.comments
            else:
                code = monomer.id
                assert code not in alphabet.monomers, "Code already used. Another code must be chosen."
                alphabet.monomers[code] = monomer
                additional_monomers.append(code)

            if additional_monomers:
                print('{} monomeric forms were added to the alphabet:\n  {}'.format(
                    len(additional_monomers), '\n  '.join(additional_monomers)))

    def build_pdb(self, alphabet, session, ph=None, major_tautomer=False, dearomatize=False):
        """ Build monomeric forms from PDB CCD

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            session (:obj:`requests_cache.core.CachedSession`): request cache session
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        smiles_to_monomer = {get_can_smiles(monomer.structure): monomer for monomer in alphabet.monomers.values()}
        monomer_codes = {code: monomer for code, monomer in alphabet.monomers.items()}
        monomer_lc_codes = {code.lower(): code for code in alphabet.monomers.keys()}
        monomer_to_codes = {monomer: code for code, monomer in alphabet.monomers.items()}

        base_monomers = {}
        pdb_id_to_monomer = {}

        same_structures = []
        same_ids = []
        same_ids_case_insensitive = []

        replaced_monomers = []

        filename = download_pdb_ccd()
        valid_types = ('RNA linking',
                       'RNA OH 5 prime terminus',
                       'RNA OH 3 prime terminus')
        for pdb_monomer, base_monomer, smiles, pdb_structure, atoms in \
                parse_pdb_ccd(filename, valid_types, self._max_monomers):
            if pdb_monomer.id == 'N':
                continue

            structure = get_pdb_ccd_open_babel_mol(pdb_structure)
            if structure is None:
                continue

            # set structure, atoms
            mol = openbabel.OBMol()
            mol += structure

            if "P" in atoms:
                atom = mol.GetAtom(atoms["P"]['position'])
                assert atom.GetAtomicNum() == 15
                atom.SetIsotope(1)

            ops = []
            for i_o in range(1, 4):
                id_o = 'OP' + str(i_o)
                id_h = 'H' + id_o
                if id_o in atoms and id_h in atoms:
                    atom_o = mol.GetAtom(atoms[id_o]['position'])
                    assert atom_o.GetAtomicNum() == 8
                    atom_o.SetIsotope(len(ops))

                    atom_h = mol.GetAtom(atoms[id_h]['position'])
                    assert atom_h.GetAtomicNum() == 1

                    ops.append((atom_o, atom_h))
                    if len(ops) == 2:
                        break

            if "O3'" in atoms:
                atom = mol.GetAtom(atoms["O3'"]['position'])
                assert atom.GetAtomicNum() == 8
                atom.SetIsotope(2)

            for op, hop in ops:
                assert mol.DeleteAtom(hop, True)
                op.SetFormalCharge(-1)

            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            isotope_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, isotope_smiles)
            isotope_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, isotope_smiles)
            i_left_p = None
            i_right_o = None
            i_left_o = None
            for atom in openbabel.OBMolAtomIter(mol):
                if atom.GetAtomicNum() == 15 and atom.GetIsotope() == 1:
                    i_left_p = atom.GetIdx()
                    atom.SetIsotope(0)
                elif atom.GetAtomicNum() == 8 and atom.GetIsotope() == 1:
                    i_left_o = atom.GetIdx()
                    atom.SetIsotope(0)
                elif atom.GetAtomicNum() == 8 and atom.GetIsotope() == 2:
                    i_right_o = atom.GetIdx()
                    atom.SetIsotope(0)
            can_smiles = conv.WriteString(mol, True)

            atom_map = {}
            for atom in openbabel.OBMolAtomIter(mol):
                if atom.GetAtomicNum() > 1:
                    atom_map[atom.GetIdx()] = len(atom_map) + 1

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)

            atom_map_2 = {}
            for atom in openbabel.OBMolAtomIter(mol):
                if atom.GetAtomicNum() > 1:
                    atom_map_2[len(atom_map_2) + 1] = atom.GetIdx()

            if i_left_p is not None:
                i_left_p = atom_map_2[atom_map[i_left_p]]
                assert mol.GetAtom(i_left_p).GetAtomicNum() == 15
            if i_left_o is not None:
                i_left_o = atom_map_2[atom_map[i_left_o]]
                assert mol.GetAtom(i_left_o).GetAtomicNum() == 8
            if i_right_o is not None:
                i_right_o = atom_map_2[atom_map[i_right_o]]
                assert mol.GetAtom(i_right_o).GetAtomicNum() == 8

            pdb_monomer.structure = mol
            if i_left_p is not None and i_left_o is not None and \
                    ('HOP3' in atoms or mol.GetAtom(i_left_o).GetFormalCharge() == -1):
                pdb_monomer.l_bond_atoms = [Atom(Monomer, element='P', position=i_left_p)]
                pdb_monomer.l_displaced_atoms = [
                    Atom(Monomer, element='O', position=i_left_o, charge=-1)]
            if i_right_o is not None and "HO3'" in atoms:
                pdb_monomer.r_bond_atoms = [Atom(Monomer, element='O', position=i_right_o)]
                pdb_monomer.r_displaced_atoms = [Atom(Monomer, element='H', position=i_right_o)]

            # get canonical SMILES for entry
            can_smiles = smiles

            if ph:
                can_smiles = get_major_micro_species(
                    can_smiles, 'smiles', 'smiles', ph=ph,
                    major_tautomer=major_tautomer,
                    dearomatize=dearomatize)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = get_can_smiles(mol)

            # determine if the entry is already represented in the alphabet
            merge_with_new = False

            monomer = None

            if pdb_monomer.id not in ['A5O', 'AP7', 'CAR', 'GAO', 'UAR']:
                monomer = smiles_to_monomer.get(can_smiles, None)
                if monomer is not None:
                    for identifier in monomer.identifiers:
                        if identifier.ns == 'pdb-ccd':
                            monomer = None
                            break
                if monomer is not None and (\
                    (monomer.l_bond_atoms and not pdb_monomer.l_bond_atoms) or \
                    (monomer.r_bond_atoms and not pdb_monomer.r_bond_atoms)):
                    monomer = None

            if monomer is not None:
                merge_with_new = True
                same_structures.append((pdb_monomer.id, monomer_to_codes[monomer]))
            if pdb_monomer.id in ['A', 'C', 'G', 'U']:
                merge_with_new = False
                monomer = alphabet.monomers.get(pdb_monomer.id, None)
            if monomer is None:
                monomer = monomer_codes.get(pdb_monomer.id, None)
                if monomer is not None:
                    same_ids.append(pdb_monomer.id)
            if monomer is None:
                monomer_diff_case = monomer_lc_codes.get(pdb_monomer.id.lower(), None)
                if monomer_diff_case is not None:
                    same_ids_case_insensitive.append((pdb_monomer.id, monomer_diff_case))

            # add/merge the entry with the alphabet
            if monomer is None:
                # add the entry to the alphabet
                alphabet.monomers[pdb_monomer.id] = pdb_monomer
                if base_monomer is not None:
                    base_monomers[pdb_monomer] = base_monomer
                pdb_id_to_monomer[pdb_monomer.id] = pdb_monomer

            elif merge_with_new:
                # merge an existing monomer with the PDB entry
                alphabet.monomers[monomer_to_codes[monomer]] = pdb_monomer

                pdb_monomer.synonyms.update(monomer.synonyms)
                pdb_monomer.synonyms.add(pdb_monomer.id)
                pdb_monomer.synonyms.add(pdb_monomer.name)
                pdb_monomer.id = monomer.id or pdb_monomer.id
                pdb_monomer.name = monomer.name or pdb_monomer.name
                pdb_monomer.synonyms.discard(pdb_monomer.id)
                pdb_monomer.synonyms.discard(pdb_monomer.name)

                pdb_monomer.identifiers.update(monomer.identifiers)
                pdb_monomer.base_monomers.update(monomer.base_monomers)
                pdb_monomer.comments = monomer.comments

                if base_monomer is not None:
                    base_monomers[pdb_monomer] = base_monomer
                pdb_id_to_monomer[pdb_monomer.id] = pdb_monomer

                replaced_monomers.append((monomer, pdb_monomer))

            else:
                # merge the entry with an existing monomer
                if pdb_monomer.id not in [monomer.id, monomer.name]:
                    monomer.synonyms.add(pdb_monomer.id)
                if pdb_monomer.name not in [monomer.id, monomer.name]:
                    monomer.synonyms.add(pdb_monomer.name)
                monomer.synonyms.update(pdb_monomer.synonyms)
                monomer.identifiers.update(pdb_monomer.identifiers)

                if base_monomer is not None:
                    base_monomers[monomer] = base_monomer
                pdb_id_to_monomer[pdb_monomer.id] = monomer

        # set base monomers
        for monomer, base_monomer in base_monomers.items():
            if base_monomer in pdb_id_to_monomer:
                monomer.base_monomers.add(pdb_id_to_monomer[base_monomer])

        for old_monomer, new_monomer in replaced_monomers:
            for o_monomer in alphabet.monomers.values():
                if old_monomer in o_monomer.base_monomers:
                    o_monomer.base_monomers.remove(old_monomer)
                    o_monomer.base_monomers.add(new_monomer)

        # print summary
        print('{} entries with the similar structures were joined:\n  {}\t{}\n  {}'.format(
            len(same_structures), 'PDB CCD id', 'Alphabet code',
            '\n  '.join(sorted('\t'.join(ids) for ids in same_structures))))
        print('{} entries with the same ids were joined:\n  {}'.format(
            len(same_ids), '\n  '.join(sorted(same_ids))))
        print('{} entries with similar ids potentially should be joined:\n  {}\t{}\n  {}'.format(
            len(same_ids_case_insensitive), 'PDB CCD id', 'Alphabet code',
            '\n  '.join(sorted('\t'.join(ids) for ids in same_ids_case_insensitive))))

    def is_valid_nucleoside(self, monomer):
        """ Determine if nucleoside should be included in alphabet

        Args:
            monomer (:obj:`Monomer`): monomeric form

        Returns:
            :obj:`bool`: :obj:`True` if the monomeric form is a valid nucleoside
        """
        formula = monomer.get_formula()
        if formula.C < 9 or formula.O < 4 or formula.N < 2:
            return False

        atom_bs = []
        atom_rs = []
        for i_atom in range(1, monomer.structure.NumAtoms() + 1):
            atom = monomer.structure.GetAtom(i_atom)
            if self.is_backbone_atom(atom):
                atom_bs.append(atom)
            elif self.is_r_bond_atom(atom):
                atom_rs.append(atom)

        termini = []
        for atom_b in atom_bs:
            for atom_r in atom_rs:
                if self.is_terminus(atom_b, atom_r):
                    termini.append((atom_b, atom_r))

        if termini:
            atom_b_idx = termini[0][0].GetIdx()
            atom_r_idx = termini[0][1].GetIdx()
        else:  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
            if atom_bs and not atom_rs:
                atom_b_idx = atom_bs[0].GetIdx()
                atom_r_idx = None
            elif atom_rs and not atom_bs:
                atom_b_idx = None
                atom_r_idx = atom_rs[0].GetIdx()
            else:
                atom_b_idx = None
                atom_r_idx = None

        if atom_b_idx:
            monomer.backbone_bond_atoms = [Atom(Monomer, 'O', atom_b_idx)]
            monomer.backbone_displaced_atoms = [Atom(Monomer, 'H', atom_b_idx)]
        if atom_r_idx:
            monomer.r_bond_atoms = [Atom(Monomer, 'O', atom_r_idx)]
            monomer.r_displaced_atoms = [Atom(Monomer, 'H', atom_r_idx)]

        return True

    def is_terminus(self, b_atom, r_atom):
        """ Determine if a pair of atoms is a valid pair of bonding sites

        Args:
            b_atom (:obj:`openbabel.OBAtom`): potential backbone atom
            r_atom (:obj:`openbabel.OBAtom`): potential right bond atom

        Returns:
            :obj:`bool`: :obj:`True`, if the atoms are a valid pair of bonding sites
        """
        r_atom_2 = self.is_backbone_atom(b_atom)
        if not r_atom_2:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database

        if not self.is_r_bond_atom(r_atom):
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database

        return r_atom_2.GetIdx() == r_atom.GetIdx()

    def is_nucleotide_terminus(self, l_atom, r_atom):
        """ Determine if a pair of atoms is a valid pair of bonding sites

        Args:
            l_atom (:obj:`openbabel.OBAtom`): potential left atom
            r_atom (:obj:`openbabel.OBAtom`): potential right bond atom

        Returns:
            :obj:`bool`: :obj:`True`, if the atoms are a valid pair of bonding sites
        """
        r_atom_2 = self.is_l_atom(l_atom)
        if not r_atom_2:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database

        if not self.is_r_bond_atom(r_atom):
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database

        return r_atom_2.GetIdx() == r_atom.GetIdx()

    def is_backbone_atom(self, b_atom):
        """ Determine if an atom is a valid backbone bonding site

        Args:
            b_atom (:obj:`openbabel.OBAtom`): potential backbone atom

        Returns:
            :obj:`bool`: :obj:`True`, if the atom is a valid backbone bonding site
        """
        if b_atom.GetAtomicNum() != 8:
            return False
        if b_atom.GetFormalCharge() != 0:
            return False

        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(b_atom)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(b_atom)])
        other_atoms += [1] * (2 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 6]:
            return False

        # get first C
        for other_atom in openbabel.OBAtomAtomIter(b_atom):
            if other_atom.GetAtomicNum() == 6:
                c_1 = other_atom
        if c_1.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(c_1)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(c_1)])
        other_atoms += [1] * (4 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 1, 6, 8]:
            return False

        # get second C
        for other_atom in openbabel.OBAtomAtomIter(c_1):
            if other_atom.GetAtomicNum() == 6:
                c_2 = other_atom
        if c_2.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(c_2)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(c_2)])
        other_atoms += [1] * (4 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 6, 6, 8]:
            return False

        # get third C
        for other_atom in openbabel.OBAtomAtomIter(c_2):
            if other_atom.GetAtomicNum() == 6 and other_atom.GetIdx() != c_1.GetIdx():
                c_3 = other_atom
        if c_3.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(c_3)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(c_3)])
        other_atoms += [1] * (4 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 6, 6, 8]:
            return False

        # get second O
        for other_atom in openbabel.OBAtomAtomIter(c_3):
            if other_atom.GetAtomicNum() == 8:
                r_atom_2 = other_atom
        if r_atom_2.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database

        return r_atom_2

    def is_l_atom(self, l_atom):
        """ Determine if an atom is a valid left bonding site

        Args:
            l_atom (:obj:`openbabel.OBAtom`): potential left atom

        Returns:
            :obj:`bool`: :obj:`True`, if the atom is a valid left bonding site
        """
        if l_atom.GetAtomicNum() != 15:
            return False
        if l_atom.GetFormalCharge() != 0:
            return False

        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(l_atom)]
        if other_atoms != [8, 8, 8, 8]:
            return False
        bonds = sorted([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(l_atom)])
        if bonds != [1, 1, 1, 2]:
            return False
        c_1 = None
        o_minus = 0
        for bond in openbabel.OBAtomBondIter(l_atom):
            other_atom = bond.GetBeginAtom()
            if other_atom == l_atom:
                other_atom = bond.GetEndAtom()

            if other_atom.GetFormalCharge() == -1 and len(list(openbabel.OBAtomAtomIter(other_atom))) == 1:
                o_minus += 1
                continue

            for other_other_atom in openbabel.OBAtomAtomIter(other_atom):
                if other_other_atom.GetAtomicNum() == 6:
                    c_1 = other_other_atom
                    break
        if o_minus != 2:
            return False
        if c_1 is None:
            return False

        # get first C
        if c_1.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(c_1)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(c_1)])
        other_atoms += [1] * (4 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 1, 6, 8]:
            return False

        # get second C
        for other_atom in openbabel.OBAtomAtomIter(c_1):
            if other_atom.GetAtomicNum() == 6:
                c_2 = other_atom
        if c_2.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(c_2)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(c_2)])
        other_atoms += [1] * (4 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 6, 6, 8]:
            return False

        # get third C
        for other_atom in openbabel.OBAtomAtomIter(c_2):
            if other_atom.GetAtomicNum() == 6 and other_atom.GetIdx() != c_1.GetIdx():
                c_3 = other_atom
        if c_3.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(c_3)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(c_3)])
        other_atoms += [1] * (4 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 6, 6, 8]:
            return False

        # get second O
        for other_atom in openbabel.OBAtomAtomIter(c_3):
            if other_atom.GetAtomicNum() == 8:
                r_atom_2 = other_atom
        if r_atom_2.GetFormalCharge() != 0:
            return False  # pragma no cover: case not used by MODOMICS or the RNA Modification Database

        return r_atom_2

    def is_r_bond_atom(self, r_atom):
        """ Determine if an atom is a valid right bond bonding site

        Args:
            b_atom (:obj:`openbabel.OBAtom`): potential right bond atom

        Returns:
            :obj:`bool`: :obj:`True`, if the atom is a valid right bond bonding site
        """
        other_atoms = [other_atom.GetAtomicNum() for other_atom in openbabel.OBAtomAtomIter(r_atom)]
        tot_bond_order = sum([bond.GetBondOrder() for bond in openbabel.OBAtomBondIter(r_atom)])
        other_atoms += [1] * (2 - tot_bond_order)
        other_atoms = sorted(other_atoms)
        if other_atoms != [1, 6]:
            return False

        return True


class RnaForm(BpForm):
    """ RNA form """

    DEFAULT_FASTA_CODE = 'N'

    def __init__(self, seq=None, circular=False):
        """
        Args:
            seq (:obj:`MonomerSequence`, optional): sequence of monomeric forms of the RNA
            circular (:obj:`bool`, optional): if :obj:`True`, indicates that the biopolymer is circular
        """
        super(RnaForm, self).__init__(
            seq=seq, alphabet=rna_alphabet,
            backbone=None,
            bond=Bond(
                r_bond_atoms=[Atom(Monomer, element='O', position=None)],
                l_bond_atoms=[Atom(Monomer, element='P', position=None)],
                r_displaced_atoms=[Atom(Monomer, element='H', position=None)],
                l_displaced_atoms=[Atom(Monomer, element='O', position=None, charge=-1)]),
            circular=circular)


class CanonicalRnaForm(BpForm):
    """ Canonical RNA form """

    DEFAULT_FASTA_CODE = 'N'

    def __init__(self, seq=None, circular=False):
        """
        Args:
            seq (:obj:`MonomerSequence`, optional): sequence of monomeric forms of the RNA
            circular (:obj:`bool`, optional): if :obj:`True`, indicates that the biopolymer is circular
        """
        super(CanonicalRnaForm, self).__init__(
            seq=seq, alphabet=canonical_rna_alphabet,
            backbone=None,
            bond=Bond(
                r_bond_atoms=[Atom(Monomer, element='O', position=None)],
                l_bond_atoms=[Atom(Monomer, element='P', position=None)],
                r_displaced_atoms=[Atom(Monomer, element='H', position=None)],
                l_displaced_atoms=[Atom(Monomer, element='O', position=None, charge=-1)]),
            circular=circular)