bpforms/alphabet/core.py from KarrLab/bpforms

bpforms/alphabet/core.py
Summary

Maintainability

1 day
Test Coverage

95%
Issues
Coverage
""" Code to help build alphabets

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2019-08-14
:Copyright: 2019, Karr Lab
:License: MIT
"""

from bpforms.core import (cache, Monomer, Identifier, IdentifierSet, SynonymSet)
from xml.etree.ElementTree import ElementTree
import io
import mendeleev
import openbabel
import os
import pkg_resources
import requests
import tarfile


def download_pdb_ccd():
    """ Download PDB CCD

    Returns:
        :obj:`str`: path to tar.gz file for the PDB CCD
    """
    dirname = pkg_resources.resource_filename('bpforms', os.path.join('alphabet', 'pdb'))
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    filename = os.path.join(dirname, 'components-pub-xml.tar.gz')
    if not os.path.isfile(filename):
        response = requests.get('http://ligand-expo.rcsb.org/dictionaries/components-pub-xml.tar.gz')
        response.raise_for_status()
        with open(filename, 'wb') as file:
            file.write(response.content)

    return filename


@cache.memoize(typed=False, expire=30 * 24 * 60 * 60)  # , filename_args=[0])
def parse_pdb_ccd(filename, valid_types, max_monomers):
    """ Parse entries out of the PDB CCD

    Args:
        filename (:obj:`str`): path to tar.gz file for PDB CCD
        valid_types (:obj:`tuple` of :obj:`str`): list
            of types of entries to retrieve
        max_monomers (:obj:`float`): maximum number of
            entries to process

    Returns:
        :obj:`list` of :obj:`tuple`:  list of metadata and
            structures of the entries
    """
    entries = []
    with tarfile.open(filename, 'r:gz') as tar_file:
        i_file = 0
        n_files = len(tar_file.getmembers())
        for file_info in tar_file:
            i_file += 1
            if i_file % 1000 == 1:
                print('Processing file {} of {}'.format(i_file, n_files))

            if os.path.splitext(file_info.name)[-1] != '.xml':
                continue

            xml_file = tar_file.extractfile(file_info)
            entry = parse_pdb_ccd_entry(
                xml_file, valid_types)
            if entry is not None:
                entries.append(entry)
            if max_monomers is not None and len(entries) >= max_monomers:
                break
    return entries


def parse_pdb_ccd_entry(xml_file, valid_types):
    """ Parse an entry of the PDB CCD

    Args:
        xml_file (:obj:`io.BufferedReader`): XML file
            that defines an entry of the PDB CCD
        valid_types (:obj:`list` of :obj:`str`): list
            of types of entries to retrieve

    Returns:
        :obj:`tuple`:

            * :obj:`Monomer`: metadata about the entry
            * :obj:`str`: id of base monomer
            * :obj:`str`: SMILES-encoded structure of the entry
            * :obj:`dict`: structure of the entry
            * :obj:`dict`: dictionary that maps atom ids to their
                coordinates
    """
    ns = '{http://pdbml.pdb.org/schema/pdbx-v40.xsd}'

    xml_root = ElementTree().parse(xml_file)
    xml_group = xml_root.find(ns + 'chem_compCategory')
    if xml_group is None:
        return  # pragma: no cover # element is always present

    # get id
    xml_comp = xml_group.find(ns + 'chem_comp')
    if xml_comp is None:
        return  # pragma: no cover # element is always present
    id = xml_comp.get('id')
    identifiers = IdentifierSet([Identifier('pdb-ccd', id)])

    # check that compound has been released, is an amino acid, and is no ambiguous
    xml_el = xml_comp.find(ns + 'pdbx_release_status')
    if xml_el is None or xml_el.text != 'REL':
        return

    xml_el = xml_comp.find(ns + 'type')
    if xml_el is None or xml_el.text not in valid_types:
        return

    xml_el = xml_comp.find(ns + 'pdbx_ambiguous_flag')
    if xml_el is None or xml_el.text != 'N':
        return  # pragma: no cover # element is always present

    # get name
    xml_el = xml_comp.find(ns + 'name')
    if xml_el is None:
        name = None  # pragma: no cover # element is always present
    else:
        name = xml_el.text.lower()

    # retrieve synonyms
    synonyms = SynonymSet()
    xml_el = xml_comp.find(ns + 'one_letter_code')
    if xml_el is not None:
        synonyms.add(xml_el.text)

    for xml_group in xml_root.findall(ns + 'pdbx_chem_comp_identifierCategory'):
        for xml_subgroup in xml_group.findall(ns + 'pdbx_chem_comp_identifier'):
            if xml_subgroup.get('type') == 'SYSTEMATIC NAME':
                synonyms.add(xml_subgroup.find(ns + 'identifier').text)

    xml_el = xml_comp.find(ns + 'mon_nstd_parent_comp_id')
    if xml_el is not None:
        base_monomer = xml_el.text
    else:
        base_monomer = None

    # retrieve structure
    smiles = None
    for xml_group in xml_root.findall(ns + 'pdbx_chem_comp_descriptorCategory'):
        for xml_subgroup in xml_group.findall(ns + 'pdbx_chem_comp_descriptor'):
            if xml_subgroup.get('type') == 'SMILES_CANONICAL' and \
                    xml_subgroup.get('program') == 'OpenEye OEToolkits':
                smiles = xml_subgroup.find(ns + 'descriptor').text
    if smiles is None:
        return  # pragma: no cover # element is always present

    # discard entries with coordinating metals
    mol = openbabel.OBMol()
    inchi_conv = openbabel.OBConversion()
    assert inchi_conv.SetInFormat('smi'), 'Unable to set format to SMILES'
    assert inchi_conv.SetOutFormat('inchi'), 'Unable to set format to InChI'
    inchi_conv.ReadString(mol, smiles)
    inchi = inchi_conv.WriteString(mol)
    formula = inchi.split('/')[1]
    if '.' in formula:
        return

    mol = {
        'complete': True,
        'atoms': [],
        'bonds': [],
    }
    atoms = {}

    xml_group = xml_root.find(ns + 'chem_comp_atomCategory')
    xml_atoms = xml_group.findall(ns + 'chem_comp_atom')
    for xml_atom in xml_atoms:
        if xml_atom.get('comp_id') != id:
            continue

        atom_id = xml_atom.get('atom_id')
        charge = int(float(xml_atom.find(ns + 'charge').text))
        element = xml_atom.find(ns + 'type_symbol').text
        element = element[0] + element[1:].lower()

        mol['atoms'].append({
            'atomic_num': getattr(mendeleev, element).atomic_number,
            'charge': charge,
        })

        atoms[atom_id] = {
            'position': int(float(xml_atom.find(ns + 'pdbx_ordinal').text)),
            'element': element,
            'charge': charge,
        }

    xml_group = xml_root.find(ns + 'chem_comp_bondCategory')
    xml_bonds = xml_group.findall(ns + 'chem_comp_bond')
    for xml_bond in xml_bonds:
        if xml_bond.get('comp_id') != id:
            continue

        i_atom_1 = atoms[xml_bond.get('atom_id_1')]['position']
        i_atom_2 = atoms[xml_bond.get('atom_id_2')]['position']

        order_str = xml_bond.find(ns + 'value_order').text
        if order_str == 'sing':
            order = 1
        elif order_str == 'doub':
            order = 2
        elif order_str == 'trip':
            order = 3
        elif order_str == 'quad':
            order = 4
        elif order_str == 'arom':
            order = 5
        else:  # ['delo', 'pi', 'poly']
            mol['complete'] = False

        mol['bonds'].append({
            'begin': i_atom_1,
            'end': i_atom_2,
            'order': order,
        })

    return (Monomer(id=id, name=name, synonyms=synonyms, identifiers=identifiers),
            base_monomer, smiles, mol, atoms)


def get_pdb_ccd_open_babel_mol(pdb_mol):
    """ Generate an Open Babel representation of a PDB CCD entry

    Args:
        pdb_mol (:obj:`dict`): structure of a entry

    Returns:
        :obj:`openbabel.OBMol`: structure of a entry
    """
    if not pdb_mol['complete']:
        return None

    mol = openbabel.OBMol()

    for pdb_atom in pdb_mol['atoms']:
        atom = openbabel.OBAtom()
        atom.SetAtomicNum(pdb_atom['atomic_num'])
        atom.SetFormalCharge(pdb_atom['charge'])
        mol.AddAtom(atom)

    for pdb_bond in pdb_mol['bonds']:
        bond = openbabel.OBBond()
        bond.SetBegin(mol.GetAtom(pdb_bond['begin']))
        bond.SetEnd(mol.GetAtom(pdb_bond['end']))
        bond.SetBondOrder(pdb_bond['order'])
        assert mol.AddBond(bond)

    return mol


def get_can_smiles(mol):
    """ Get the canonical SMILES representation of a molecule without its stereochemistry

    Args:
        mol (:obj:`openbabel.OBMol`): molecule

    Returns:
        :obj:`str`: SMILES representation of a molecule without its stereochemistry
    """
    conv = openbabel.OBConversion()
    assert conv.SetInFormat('smi')
    assert conv.SetOutFormat('smi')
    conv.SetOptions('c', conv.OUTOPTIONS)
    smiles = conv.WriteString(mol, True)
    mol = openbabel.OBMol()
    conv.ReadString(mol, smiles)

    for atom in openbabel.OBMolAtomIter(mol):
        atom.UnsetStereo()

    for bond in openbabel.OBMolBondIter(mol):
        bond.UnsetHash()
        bond.UnsetWedge()
        bond.UnsetUp()
        bond.UnsetDown()

    mol.DeleteData(openbabel.StereoData)

    conv = openbabel.OBConversion()
    assert conv.SetInFormat('smi')
    assert conv.SetOutFormat('smi')
    conv.SetOptions('c', conv.OUTOPTIONS)
    smiles = conv.WriteString(mol, True)

    conv = openbabel.OBConversion()
    assert conv.SetInFormat('smi')
    assert conv.SetOutFormat('smi')
    conv.SetOptions('c', conv.OUTOPTIONS)
    mol = openbabel.OBMol()
    conv.ReadString(mol, smiles)
    smiles = conv.WriteString(mol, True)

    return smiles