bpforms/alphabet/dna.py from KarrLab/bpforms

bpforms/alphabet/dna.py
Summary

Maintainability

2 wks
Test Coverage

95%
Issues
Coverage
""" Alphabet and BpForm to represent modified DNA

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2019-02-05
:Copyright: 2019, Karr Lab
:License: MIT
"""

from bpforms.core import (Alphabet, AlphabetBuilder, Monomer, MonomerSequence, BpForm,
                          Bond, Atom, Identifier, IdentifierSet, SynonymSet,
                          BpFormsWarning)
from bpforms.alphabet.core import (download_pdb_ccd, parse_pdb_ccd, get_pdb_ccd_open_babel_mol,
                                   get_can_smiles)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from wc_utils.util.chem import EmpiricalFormula
from wc_utils.util.chem.marvin import get_major_micro_species
import csv
import math
import openbabel
import os
import pkg_resources
import requests
import sqlalchemy
import warnings

dna_mod_filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'DNAmod.sqlite'))


def get_dnamod(filename):
    if not os.path.isfile(filename):
        response = requests.get('https://hoffmanlab.org/proj/dnamod/DNAmod.sqlite')
        with open(filename, 'wb') as file:
            file.write(response.content)


get_dnamod(dna_mod_filename)

filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'dna.yml'))
dna_alphabet = Alphabet().from_yaml(filename)
# :obj:`Alphabet`: Alphabet for DNA nucleotide monophosphates

canonical_filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'dna.canonical.yml'))
canonical_dna_alphabet = Alphabet().from_yaml(canonical_filename)
# :obj:`Alphabet`: Alphabet for canonical DNA nucleotide monophosphates

engine = sqlalchemy.create_engine('sqlite:///' + dna_mod_filename)
DeclarativeBase = declarative_base(engine)


class DnaAlphabetBuilder(AlphabetBuilder):
    """ Build DNA alphabet from MODOMICS """

    INVALID_IDS = (
        # nucleosides
        "2'-deoxyinosine",
        "2'-deoxyuridine",
        "2'-deoxy-5-(4,5-dihydroxypentyl)uridine",
        "5-(2-aminoethoxy)methyl-2'-deoxyuridine",
        "5,6-dihydroxy-2'-deoxyuridine",
        "5-(2-aminoethyl)-2'-deoxyuridine",
        "7-amido-7-deazaguanosine",
        "(beta-D-glucopyranosyloxymethyl)deoxyuridine 5'-monophosphate",
        "N(2),N(2)-dimethylguanosine",
    )

    class Names(DeclarativeBase):
        """"""
        __tablename__ = 'names'
        __table_args__ = {'autoload': True}

    class ExpandedAlphabet(DeclarativeBase):
        """"""
        __tablename__ = 'expanded_alphabet'
        __table_args__ = {'autoload': True}
        nameid = sqlalchemy.Column(primary_key=True)

    class ModBase(DeclarativeBase):
        """"""
        __tablename__ = 'modbase'
        __table_args__ = {'autoload': True, 'extend_existing': True}
        nameid = sqlalchemy.Column(primary_key=True)

    class ModBaseParents(DeclarativeBase):
        """"""
        __tablename__ = 'modbase_parents'
        __table_args__ = {'autoload': True, 'extend_existing': True}
        nameid = sqlalchemy.Column(primary_key=True)

    class CovMod(DeclarativeBase):
        """"""
        __tablename__ = 'covmod'
        __table_args__ = {'autoload': True, 'extend_existing': True}
        cmodid = sqlalchemy.Column(primary_key=True)

    def load_session(self):
        """ loads an SQLAlchemy session """
        metadata = DeclarativeBase.metadata
        Session = sessionmaker(bind=engine)
        session = Session()
        return session

    def run(self, ph=None, major_tautomer=False, dearomatize=False, path=filename):
        """ Build alphabet and, optionally, save to YAML file

        Args:
            ph (:obj:`float`, optional): pH at which calculate major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule
            path (:obj:`str`, optional): path to save alphabet

        Returns:
            :obj:`Alphabet`: alphabet
        """
        return super(DnaAlphabetBuilder, self).run(ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize, path=path)

    def build(self, ph=None, major_tautomer=False, dearomatize=False):
        """ Build alphabet

        Args:
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        # initialize alphabet
        alphabet = Alphabet()

        # create canonical monomeric forms
        alphabet.from_yaml(canonical_filename)
        alphabet.id = 'dna'
        alphabet.name = 'DNA nucleotide monophosphates'
        alphabet.description = ('The canonical DNA nucleotide monophosphates, '
                                'plus the non-canonical DNA nucleotide monophosphates based on '
                                '<a href="https://hoffmanlab.org/proj/dnamod">DNAmod</a> and '
                                '<a href="http://repairtoire.genesilico.pl/damage/">REPAIRtoire</a>')

        # build from sources
        self.build_dnamod(alphabet, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
        self.build_repairtoire(alphabet, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)
        self.build_pdb_ccd(alphabet, ph=ph, major_tautomer=major_tautomer, dearomatize=dearomatize)

        # corrections
        if 'dI' in alphabet.monomers and 'DI' in alphabet.monomers:
            dI = alphabet.monomers.dI
            alphabet.monomers.dI = alphabet.monomers.pop('DI')
            alphabet.monomers.dI.synonyms.add("2'-deoxyinosine, 9-[(2R,4S,5R)-4-hydroxy-5-"
                                              "(hydroxymethyl)tetrahydrofuran-2-yl]-9H-purin-6-ol")
            alphabet.monomers.dI.synonyms.add('deoxyinosine')
            alphabet.monomers.dI.synonyms.add('9-(2-deoxy-beta-d-erythro-pentofuranosyl)-9h-purin-6-ol')
            alphabet.monomers.dI.identifiers.add(Identifier('dnamod', "2'-deoxyinosine"))
            alphabet.monomers.dI.identifiers.add(Identifier('chebi', 'CHEBI:28997'))
            alphabet.monomers.dI.comments = ("A purine 2'-deoxyribonucleoside that is inosine in which the hydroxy "
                                             "group at position 2' is replaced by a hydrogen.")
            for monomer in alphabet.monomers.values():
                if dI in monomer.base_monomers:
                    monomer.base_monomers.remove(dI)
                    monomer.base_monomers.add(alphabet.monomers.dI)

        if 'dU' in alphabet.monomers:
            alphabet.monomers.dU.synonyms.add("2'-deoxyuridine")
            alphabet.monomers.dU.synonyms.add('1-(2-deoxy-beta-D-erythro-pentofuranosyl)uracil')
            alphabet.monomers.dU.synonyms.add("2'-deoxyuridine-5'-monophosphate")
            alphabet.monomers.dU.synonyms.add("dU")
            alphabet.monomers.dU.synonyms.add("deoxyuridine")
            alphabet.monomers.dU.synonyms.add("2-deoxyuridine")
            alphabet.monomers.dU.identifiers.add(Identifier('dnamod', "2'-deoxyuridine"))
            alphabet.monomers.dU.identifiers.add(Identifier('chebi', 'CHEBI:16450'))
            alphabet.monomers.dU.identifiers.add(Identifier('repairtoire', 'dU'))
            alphabet.monomers.dU.comments = (
                "A pyrimidine 2'-deoxyribonucleoside having uracil as the nucleobase. Cysteine "
                "spontaneously loses an amine group, which is replaced by a keto group at the "
                "C4-atom. This reaction occurs spontaneously and all the time, and is the reason "
                "why T is used in DNA. This way, dU and T can be distinguished from each other.")

        # return alphabet
        return alphabet

    def build_dnamod(self, alphabet, ph=None, major_tautomer=False, dearomatize=False):
        """ Build monomeric forms from DNAmod

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'dnamod.csv'))
        monomers = {}
        with open(filename, 'r') as file:
            reader = csv.DictReader(file, dialect='excel')
            for row in reader:
                monomers[row['Id']] = {
                    'nucleobase': row['Nucleobase'],
                    'nucleotide': row['Nucleotide'],
                    'l_bond_atom': int(float(row['Left bond atom (P)'])),
                    'l_displaced_atom': int(float(row['Left displaced atom (O-)'])),
                    'r_bond_atom': int(float(row['Right bond atom (O)'])),
                }

        # get individual nucleobases and create monomeric forms
        session = self.load_session()
        with_smiles = session.query(self.Names).filter(self.Names.smiles != '[]')
        if not math.isinf(self._max_monomers):
            with_smiles = with_smiles.limit(self._max_monomers)

        monomer_nameids = {}
        all_base_monomers_codes = {}
        all_base_monomers_ids = {}
        invalid_nucleobases = []
        uncatalogued_monomers = []
        for item in with_smiles.all():
            row = session.query(self.ExpandedAlphabet).filter(self.ExpandedAlphabet.nameid == item.nameid).first()
            if row is None:
                continue
            if row.Symbol:
                chars = row.Symbol
            else:
                chars = row.Abbreviation

            id = item.chebiname

            name = item.iupacname[1:-1]

            synonyms = SynonymSet()
            other_names = item.othernames[1:-1].split(', ')
            if other_names != ['']:
                for other_name in other_names:
                    synonyms.add(other_name[1:-1])

            identifiers = IdentifierSet()
            identifiers.add(Identifier('chebi', item.nameid))
            identifiers.add(Identifier('dnamod', id))

            smiles = item.smiles.strip('[]')
            inchi = item.inchi.strip('[]')

            cmodid = None
            verified_status = False
            base_monomer_codes = set()
            for base_monomer_code in session.query(self.ModBase).filter(self.ModBase.nameid == item.nameid).all():
                cmodid = base_monomer_code.cmodid
                verified_status = base_monomer_code.verifiedstatus
                if self.ModBase.baseid != 'O':
                    base_monomer_codes.add(base_monomer_code.baseid)

            if not verified_status:
                continue

            base_monomer_ids = set()
            for base_monomer_id in session.query(self.ModBaseParents).filter(self.ModBaseParents.nameid == item.nameid).all():
                base_monomer_ids.add(base_monomer_id.parentid)

            comments = session.query(self.CovMod).filter(self.CovMod.cmodid == cmodid).first().definition

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            if not conv.ReadString(mol, smiles):
                assert conv.SetInFormat('inchi')
                assert conv.ReadString(mol, inchi)
            assert conv.SetOutFormat('smiles')
            conv.SetOptions('c', conv.OUTOPTIONS)
            smiles = conv.WriteString(mol, True)

            assert conv.SetInFormat('smiles')

            if ph:
                smiles = get_major_micro_species(smiles, 'smiles', 'smiles', ph=ph,
                                                 major_tautomer=major_tautomer, dearomatize=dearomatize)

            mol = openbabel.OBMol()
            conv.ReadString(mol, smiles)
            smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv.ReadString(mol, smiles)
            smiles = conv.WriteString(mol, True)

            monomer = alphabet.monomers.get(chars, Monomer())
            monomer.id = id
            monomer.name = name
            monomer.synonyms = synonyms
            monomer.identifiers = identifiers
            monomer.structure = smiles
            monomer.comments = comments

            if not self.is_nucleobase_valid(monomer):
                invalid_nucleobases.append(id)
                continue

            if monomer.id in monomers:
                if smiles != monomers[monomer.id]['nucleobase']:
                    raise Exception('Structure and atom indices may need to be updated for {}\n  {}\n  {}'.format(
                        monomer.id, smiles, monomers[monomer.id]['nucleobase']))

                smiles = monomers[monomer.id]['nucleotide']
                if ph:
                    smiles = get_major_micro_species(smiles, 'smiles', 'smiles', ph=ph,
                                                     major_tautomer=major_tautomer, dearomatize=dearomatize)
                mol = openbabel.OBMol()
                conv.ReadString(mol, smiles)
                smiles = conv.WriteString(mol, True)
                mol = openbabel.OBMol()
                conv.ReadString(mol, smiles)
                smiles = conv.WriteString(mol, True)

                if smiles != monomers[monomer.id]['nucleotide']:
                    raise Exception('Structure and atom indices may need to be updated for {}\n  {}\n  {}'.format(
                        monomer.id, smiles, monomers[monomer.id]['nucleotide']))

                monomer.structure = smiles
                monomer.l_bond_atoms = [Atom(Monomer, element='P', position=monomers[monomer.id]['l_bond_atom'])]
                monomer.l_displaced_atoms = [Atom(Monomer, element='O', position=monomers[monomer.id]
                                                  ['l_displaced_atom'], charge=-1)]
                monomer.r_bond_atoms = [Atom(Monomer, element='O', position=monomers[monomer.id]['r_bond_atom'])]
                monomer.r_displaced_atoms = [Atom(Monomer, element='H', position=monomers[monomer.id]['r_bond_atom'])]
            else:
                uncatalogued_monomers.append(monomer.id)                

            alphabet.monomers[chars] = monomer

            monomer_nameids[item.nameid] = monomer
            all_base_monomers_codes[monomer] = base_monomer_codes
            all_base_monomers_ids[monomer] = base_monomer_ids

        if uncatalogued_monomers:
            raise Exception('Structure and atom indices need to be cataloged for {}'.format(', '.join(uncatalogued_monomers)))

        for monomer, base_monomer_codes in all_base_monomers_codes.items():
            for base_monomer_code in base_monomer_codes:
                base_monomer = alphabet.monomers.get(base_monomer_code, None)
                if base_monomer:
                    monomer.base_monomers.add(base_monomer)
        for monomer, base_monomer_ids in all_base_monomers_ids.items():
            for base_monomer_id in base_monomer_ids:
                base_monomer = monomer_nameids.get(base_monomer_id, None)
                if base_monomer:
                    monomer.base_monomers.add(base_monomer)

        if invalid_nucleobases:
            warnings.warn('The following compounds were ignored because they do not appear to be nucleobases:\n- {}'.format(
                '\n- '.join(invalid_nucleobases)), BpFormsWarning)

    def build_repairtoire(self, alphabet, ph=None, major_tautomer=False, dearomatize=False):
        """ Build monomeric forms from DNAmod

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        monomer_smiles = {}
        for monomer in alphabet.monomers.values():
            monomer_smiles[monomer.export('smiles', options=('c',))] = monomer

        filename = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'repairtoire.csv'))
        conv = openbabel.OBConversion()
        assert conv.SetInFormat('smiles')
        assert conv.SetOutFormat('smiles')
        conv.SetOptions('c', conv.OUTOPTIONS)
        merged_monomers = []
        new_monomers = []
        with open(filename, 'r') as file:
            reader = csv.DictReader(file, dialect='excel')
            for row in reader:
                id = row['Id']
                name = row['Name']
                smiles = row['Nucleotide monophosphate (2-) (cleaned)']
                i_l_bond_atom = int(float(row['Left bond atom (P)']))
                i_l_displaced_atom = int(float(row['Left displaced atom (O-)']))
                i_r_bond_atom = int(float(row['Right bond atom (O)']))
                comments = row['Comments']

                smiles2 = smiles
                if ph:
                    smiles2 = get_major_micro_species(smiles2, 'smiles', 'smiles', ph=ph,
                                                      major_tautomer=major_tautomer, dearomatize=dearomatize)
                mol = openbabel.OBMol()
                conv.ReadString(mol, smiles2)
                smiles2 = conv.WriteString(mol, True)
                mol = openbabel.OBMol()
                conv.ReadString(mol, smiles2)
                smiles2 = conv.WriteString(mol, True)
                if smiles2 != smiles:
                    raise Exception('Structure and atom indices may need to be updated for {}'.format(id))

                monomer = monomer_smiles.get(smiles, None)
                if monomer is None:
                    monomer = alphabet.monomers.get(id, None)

                if monomer is not None:
                    monomer.synonyms.add(id)
                    monomer.synonyms.add(name)
                    monomer.identifiers.add(Identifier('repairtoire', id))
                    if comments:
                        if monomer.comments:
                            monomer.comments += ' ' + comments
                        else:
                            monomer.comments = comments

                    merged_monomers.append((id, monomer.id))

                else:
                    monomer = Monomer(
                        id=id, name=name,
                        identifiers=[Identifier('repairtoire', id)],
                        structure=smiles,
                        l_bond_atoms=[Atom(Monomer, element='P', position=i_l_bond_atom)],
                        l_displaced_atoms=[Atom(Monomer, element='O', position=i_l_displaced_atom, charge=-1)],
                        r_bond_atoms=[Atom(Monomer, element='O', position=i_r_bond_atom)],
                        r_displaced_atoms=[Atom(Monomer, element='H', position=i_r_bond_atom)],
                        comments=comments or None)
                    alphabet.monomers[id] = monomer

                    new_monomers.append(id)

        print('Merged monomers\n  {}'.format('\n  '.join('\t'.join(ids) for ids in merged_monomers)))
        print('New monomers\n  {}'.format('\n  '.join(new_monomers)))

    def is_nucleobase_valid(self, monomer):
        """ Determine if monomeric form should be included in alphabet

        Args:
            monomer (:obj:`Monomer`): monomeric form

        Returns:
            :obj:`bool`: :obj:`True` if monomeric form should be included in alphabet
        """
        for invalid_id in self.INVALID_IDS:
            if invalid_id in monomer.id:
                return False

        return True

    def build_pdb_ccd(self, alphabet, ph=None, major_tautomer=False, dearomatize=False):
        """ Build monomeric forms from PDB CCD

        Args:
            alphabet (:obj:`Alphabet`): alphabet
            ph (:obj:`float`, optional): pH at which to calculate the major protonation state of each monomeric form
            major_tautomer (:obj:`bool`, optional): if :obj:`True`, calculate the major tautomer
            dearomatize (:obj:`bool`, optional): if :obj:`True`, dearomatize molecule

        Returns:
            :obj:`Alphabet`: alphabet
        """
        smiles_to_monomer = {get_can_smiles(monomer.structure): monomer for monomer in alphabet.monomers.values()}
        monomer_codes = {code: monomer for code, monomer in alphabet.monomers.items()}
        monomer_lc_codes = {code.lower(): code for code in alphabet.monomers.keys()}
        monomer_to_codes = {monomer: code for code, monomer in alphabet.monomers.items()}

        base_monomers = {}
        pdb_id_to_monomer = {}

        same_structures = []
        same_ids = []
        same_ids_case_insensitive = []

        replaced_monomers = []

        filename = download_pdb_ccd()
        valid_types = ('DNA linking',
                       'DNA OH 5 prime terminus',
                       'DNA OH 3 prime terminus')
        for pdb_monomer, base_monomer, smiles, pdb_structure, atoms in \
                parse_pdb_ccd(filename, valid_types, self._max_monomers):
            structure = get_pdb_ccd_open_babel_mol(pdb_structure)
            if structure is None:
                continue

            # set structure, atoms
            mol = openbabel.OBMol()
            mol += structure

            if "P" in atoms:
                atom = mol.GetAtom(atoms["P"]['position'])
                assert atom.GetAtomicNum() == 15
                atom.SetIsotope(1)

            ops = []
            for i_o in range(1, 4):
                id_o = 'OP' + str(i_o)
                id_h = 'H' + id_o
                if id_o in atoms and id_h in atoms:
                    atom_o = mol.GetAtom(atoms[id_o]['position'])
                    assert atom_o.GetAtomicNum() == 8
                    atom_o.SetIsotope(len(ops))

                    atom_h = mol.GetAtom(atoms[id_h]['position'])
                    assert atom_h.GetAtomicNum() == 1

                    ops.append((atom_o, atom_h))
                    if len(ops) == 2:
                        break

            if "O3'" in atoms:
                atom = mol.GetAtom(atoms["O3'"]['position'])
                assert atom.GetAtomicNum() == 8
                atom.SetIsotope(2)

            for op, hop in ops:
                assert mol.DeleteAtom(hop, True)
                op.SetFormalCharge(-1)

            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            isotope_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, isotope_smiles)
            isotope_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, isotope_smiles)
            i_left_p = None
            i_right_o = None
            i_left_o = None
            for atom in openbabel.OBMolAtomIter(mol):
                if atom.GetAtomicNum() == 15 and atom.GetIsotope() == 1:
                    i_left_p = atom.GetIdx()
                    atom.SetIsotope(0)
                elif atom.GetAtomicNum() == 8 and atom.GetIsotope() == 1:
                    i_left_o = atom.GetIdx()
                    atom.SetIsotope(0)
                elif atom.GetAtomicNum() == 8 and atom.GetIsotope() == 2:
                    i_right_o = atom.GetIdx()
                    atom.SetIsotope(0)
            can_smiles = conv.WriteString(mol, True)

            atom_map = {}
            for atom in openbabel.OBMolAtomIter(mol):
                if atom.GetAtomicNum() > 1:
                    atom_map[atom.GetIdx()] = len(atom_map) + 1

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)

            atom_map_2 = {}
            for atom in openbabel.OBMolAtomIter(mol):
                if atom.GetAtomicNum() > 1:
                    atom_map_2[len(atom_map_2) + 1] = atom.GetIdx()

            if i_left_p is not None:
                i_left_p = atom_map_2[atom_map[i_left_p]]
                assert mol.GetAtom(i_left_p).GetAtomicNum() == 15
            if i_left_o is not None:
                i_left_o = atom_map_2[atom_map[i_left_o]]
                assert mol.GetAtom(i_left_o).GetAtomicNum() == 8
            if i_right_o is not None:
                i_right_o = atom_map_2[atom_map[i_right_o]]
                assert mol.GetAtom(i_right_o).GetAtomicNum() == 8

            pdb_monomer.structure = mol
            if i_left_p is not None and i_left_o is not None and \
                    ('HOP3' in atoms or mol.GetAtom(i_left_o).GetFormalCharge() == -1):
                pdb_monomer.l_bond_atoms = [Atom(Monomer, element='P', position=i_left_p)]
                pdb_monomer.l_displaced_atoms = [
                    Atom(Monomer, element='O', position=i_left_o, charge=-1)]
            if i_right_o is not None and "HO3'" in atoms:
                pdb_monomer.r_bond_atoms = [Atom(Monomer, element='O', position=i_right_o)]
                pdb_monomer.r_displaced_atoms = [Atom(Monomer, element='H', position=i_right_o)]

            # get canonical SMILES for entry
            can_smiles = smiles

            if ph:
                can_smiles = get_major_micro_species(
                    can_smiles, 'smiles', 'smiles', ph=ph,
                    major_tautomer=major_tautomer,
                    dearomatize=dearomatize)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = conv.WriteString(mol, True)

            mol = openbabel.OBMol()
            conv = openbabel.OBConversion()
            assert conv.SetInFormat('smi')
            assert conv.SetOutFormat('smi')
            conv.SetOptions('c', conv.OUTOPTIONS)
            conv.ReadString(mol, can_smiles)
            can_smiles = get_can_smiles(mol)

            # determine if the entry is already represented in the alphabet
            merge_with_new = False

            monomer = None

            if pdb_monomer.id not in ['A3A', 'DNR']:
                monomer = smiles_to_monomer.get(can_smiles, None)
                if monomer is not None:
                    for identifier in monomer.identifiers:
                        if identifier.ns == 'pdb-ccd':
                            monomer = None
                            break
                if monomer is not None and (
                    (monomer.l_bond_atoms and not pdb_monomer.l_bond_atoms) or
                        (monomer.r_bond_atoms and not pdb_monomer.r_bond_atoms)):
                    monomer = None

            if monomer is not None:
                merge_with_new = True
                same_structures.append((pdb_monomer.id, monomer_to_codes[monomer]))
            if pdb_monomer.id in ['DA', 'DC', 'DG', 'DT']:
                merge_with_new = False
                monomer = alphabet.monomers.get(pdb_monomer.id[1], None)
            if monomer is None:
                monomer = monomer_codes.get(pdb_monomer.id, None)
                if monomer is not None:
                    same_ids.append(pdb_monomer.id)
            if monomer is None:
                monomer_diff_case = monomer_lc_codes.get(pdb_monomer.id.lower(), None)
                if monomer_diff_case is not None:
                    same_ids_case_insensitive.append((pdb_monomer.id, monomer_diff_case))

            # add/merge the entry with the alphabet
            if monomer is None:
                # add the entry to the alphabet
                alphabet.monomers[pdb_monomer.id] = pdb_monomer
                if base_monomer is not None:
                    base_monomers[pdb_monomer] = base_monomer
                pdb_id_to_monomer[pdb_monomer.id] = pdb_monomer

            elif merge_with_new:
                # merge an existing monomer with the PDB entry
                alphabet.monomers[monomer_to_codes[monomer]] = pdb_monomer

                pdb_monomer.synonyms.update(monomer.synonyms)
                pdb_monomer.synonyms.add(pdb_monomer.id)
                pdb_monomer.synonyms.add(pdb_monomer.name)
                pdb_monomer.id = monomer.id or pdb_monomer.id
                pdb_monomer.name = monomer.name or pdb_monomer.name
                pdb_monomer.synonyms.discard(pdb_monomer.id)
                pdb_monomer.synonyms.discard(pdb_monomer.name)

                pdb_monomer.identifiers.update(monomer.identifiers)
                pdb_monomer.base_monomers.update(monomer.base_monomers)
                pdb_monomer.comments = monomer.comments

                if base_monomer is not None:
                    base_monomers[pdb_monomer] = base_monomer
                pdb_id_to_monomer[pdb_monomer.id] = pdb_monomer

                replaced_monomers.append((monomer, pdb_monomer))

            else:
                # merge the entry with an existing monomer
                if pdb_monomer.id not in [monomer.id, monomer.name]:
                    monomer.synonyms.add(pdb_monomer.id)
                if pdb_monomer.name not in [monomer.id, monomer.name]:
                    monomer.synonyms.add(pdb_monomer.name)
                monomer.synonyms.update(pdb_monomer.synonyms)
                monomer.identifiers.update(pdb_monomer.identifiers)

                if base_monomer is not None:
                    base_monomers[monomer] = base_monomer
                pdb_id_to_monomer[pdb_monomer.id] = monomer

        # set base monomers
        for monomer, base_monomer in base_monomers.items():
            if base_monomer in pdb_id_to_monomer:
                monomer.base_monomers.add(pdb_id_to_monomer[base_monomer])

        for old_monomer, new_monomer in replaced_monomers:
            for o_monomer in alphabet.monomers.values():
                if old_monomer in o_monomer.base_monomers:
                    o_monomer.base_monomers.remove(old_monomer)
                    o_monomer.base_monomers.add(new_monomer)

        # print summary
        print('{} entries with the similar structures were joined:\n  {}\t{}\n  {}'.format(
            len(same_structures), 'PDB CCD id', 'Alphabet code',
            '\n  '.join(sorted('\t'.join(ids) for ids in same_structures))))
        print('{} entries with the same ids were joined:\n  {}'.format(
            len(same_ids), '\n  '.join(sorted(same_ids))))
        print('{} entries with similar ids potentially should be joined:\n  {}\t{}\n  {}'.format(
            len(same_ids_case_insensitive), 'PDB CCD id', 'Alphabet code',
            '\n  '.join(sorted('\t'.join(ids) for ids in same_ids_case_insensitive))))


class DnaForm(BpForm):
    """ DNA form """

    DEFAULT_FASTA_CODE = 'N'

    def __init__(self, seq=None, circular=False):
        """
        Args:
            seq (:obj:`MonomerSequence`, optional): monomeric forms of the DNA form
            circular (:obj:`bool`, optional): if :obj:`True`, indicates that the biopolymer is circular
        """
        super(DnaForm, self).__init__(
            seq=seq, alphabet=dna_alphabet,
            backbone=None,
            bond=Bond(
                r_bond_atoms=[Atom(Monomer, element='O', position=None)],
                l_bond_atoms=[Atom(Monomer, element='P', position=None)],
                r_displaced_atoms=[Atom(Monomer, element='H', position=None)],
                l_displaced_atoms=[Atom(Monomer, element='O', position=None, charge=-1)]),
            circular=circular)


class CanonicalDnaForm(BpForm):
    """ Canonical DNA form """

    DEFAULT_FASTA_CODE = 'N'

    def __init__(self, seq=None, circular=False):
        """
        Args:
            seq (:obj:`MonomerSequence`, optional): monomeric forms of the DNA form
            circular (:obj:`bool`, optional): if :obj:`True`, indicates that the biopolymer is circular
        """
        super(CanonicalDnaForm, self).__init__(
            seq=seq, alphabet=canonical_dna_alphabet,
            backbone=None,
            bond=Bond(
                r_bond_atoms=[Atom(Monomer, element='O', position=None)],
                l_bond_atoms=[Atom(Monomer, element='P', position=None)],
                r_displaced_atoms=[Atom(Monomer, element='H', position=None)],
                l_displaced_atoms=[Atom(Monomer, element='O', position=None, charge=-1)]),
            circular=circular)