choderalab/protons

View on GitHub
protons/app/amber_parser.py

Summary

Maintainability
F
1 wk
Test Coverage
#!/usr/bin/env python
"""
Adapted from openmoltools: https://github.com/choderalab/openmoltools.

Original license and copyright:
Copyright 2017 Kyle A. Beauchamp, John D. Chodera, David L. Mobley

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
import sys
import math
import simtk.openmm.app.element as element
import simtk.unit as unit
import subprocess
import datetime
from six.moves import cStringIO
import mdtraj as md
import logging

logger = logging.getLogger(__name__)


def fix(atomClass):
    if atomClass == "X":
        return ""
    return atomClass


elements = {}
for elem in element.Element._elements_by_symbol.values():
    num = elem.atomic_number
    if num not in elements or elem.mass < elements[num].mass:
        elements[num] = elem

OTHER = 0
ATOMS = 1
CONNECT = 2
CONNECTIVITY = 3
RESIDUECONNECT = 4
section = OTHER

charge14scale = 1.0 / 1.2
epsilon14scale = 0.5

skipResidues = [
    "CIO",
    "IB",
]  # "Generic" ions defined by Amber, which are identical to other real ions
skipClasses = ["OW", "HW"]  # Skip water atoms, since we define these in separate files


class AmberParser(object):
    def __init__(self, override_mol2_residue_name=None):
        """Create an AmberParser object for converting amber force field files to XML format.

        Parameters
        ----------
        override_mol2_residue_name : str, default=None
            If given, use this name to override mol2 residue names.
            Useful to ensure that multiple ligands have unique residue
            names, as required by the OpenMM ffXML parser.
        """

        self.override_mol2_residue_name = override_mol2_residue_name
        self.current_mol2 = 0

        self.residueAtoms = {}
        self.residueBonds = {}
        self.residueConnections = {}

        self.types = []
        self.type_names = []
        self.masses = {}
        self.resAtomTypes = {}
        self.vdwEquivalents = {}
        self.vdw = {}
        self.charge = {}
        self.bonds = []
        self.angles = []
        self.torsions = []
        self.impropers = []

        self.set_provenance()

    def addAtom(
        self, residue, atomName, atomClass, element, charge, use_numeric_types=True
    ):
        """Add an atom to the database of FF data.

        Notes
        -----

        use_numeric_types was not originally present in the OpenMM AMBER
        parsers.  It was added so that we can have atom types of the form
        "RES-X", where RES is the name of the molecule or residue and X
        is the atom numbering within that molecule.  use_numeric_types is
        set to False when processing mol2 files--e.g. for ligands.
        """
        if residue is None:
            return
        type_id = len(self.types)
        self.residueAtoms[residue].append([atomName, type_id])
        self.types.append((atomClass, element, charge))
        if use_numeric_types:
            self.type_names.append("%d" % (type_id))
        else:
            self.type_names.append("%s-%s" % (residue, atomName))

    def addBond(self, residue, atom1, atom2):
        """Add a bond to the database of FF data."""
        if residue is None:
            return
        self.residueBonds[residue].append((atom1, atom2))

    def addExternalBond(self, residue, atom):
        """Add an external bond to the database of FF data."""
        if residue is None:
            return
        if atom != -1:
            self.residueConnections[residue] += [atom]

    def process_mol2_file(self, inputfile):
        """Process an AMBER GAFF-compatible mol2 file.

        Parameters
        ----------
        inputfile : str
            filename of an .mol2 file

        Notes
        -----

        Antechamber is known to produce NONSTANDARD mol2 files.  This function
        is designed to work with those nonstandard mol2 files, not
        Tripos standard mol2 files.  We are forced to live with the poor
        decisions of our predecessors...

        """
        atoms, bonds = md.formats.mol2.mol2_to_dataframes(inputfile)

        if self.override_mol2_residue_name is None:
            residue_name = atoms.resName[1]  # To Do: Add check for consistency
        else:
            residue_name = self.override_mol2_residue_name

        # Give each mol2 file a unique numbering to avoid conflicts.
        residue_name = "%s-%d" % (residue_name, self.current_mol2)
        self.current_mol2 += 1

        self.residueAtoms[residue_name] = []
        self.residueBonds[residue_name] = []
        self.residueConnections[residue_name] = []

        for (i0, i1, name, x, y, z, atype, code, resname, charge) in atoms.itertuples(
            index=True
        ):
            # i0 and i1 are zero-based and one-based indices, respectively
            full_name = residue_name + "_" + name
            element_symbol = md.formats.mol2.gaff_elements[atype]
            e = element.Element.getBySymbol(element_symbol)
            self.addAtom(
                residue_name, name, atype, e, charge, use_numeric_types=False
            )  # use_numeric_types set to false to use string-based atom names, rather than numbers
            self.vdwEquivalents[full_name] = atype

        for (id0, id1, bond_type) in bonds.itertuples(False):
            i = id0 - 1  # Subtract 1 for zero based indexing in OpenMM???
            j = id1 - 1  # Subtract 1 for zero based indexing in OpenMM???
            self.addBond(residue_name, i, j)

    def process_library_file(self, inputfile):
        """Process an AMBER .lib file.

        Parameters
        ----------
        inputfile : str
            filename of an .lib file

        """
        for line in open(inputfile):
            if line.startswith("!entry"):
                fields = line.split(".")
                residue = fields[1]
                if residue in skipResidues:
                    residue = None
                    continue
                key = fields[3].split()[0]
                if key == "atoms":
                    section = ATOMS
                    self.residueAtoms[residue] = []
                    self.residueBonds[residue] = []
                    self.residueConnections[residue] = []
                elif key == "connect":
                    section = CONNECT
                elif key == "connectivity":
                    section = CONNECTIVITY
                elif key == "residueconnect":
                    section = RESIDUECONNECT
                else:
                    section = OTHER
            elif section == ATOMS:
                fields = line.split()
                atomName = fields[0][1:-1]
                atomClass = fields[1][1:-1]
                if fields[6] == "-1":
                    # Workaround for bug in some Amber files.
                    if atomClass[0] == "C":
                        elem = elements[6]
                    elif atomClass[0] == "H":
                        elem = elements[1]
                    else:
                        raise ValueError("Illegal atomic number: " + line)
                else:
                    elem = elements[int(fields[6])]
                self.charge = float(fields[7])
                self.addAtom(residue, atomName, atomClass, elem, self.charge)
            elif section == CONNECT:
                self.addExternalBond(residue, int(line) - 1)
            elif section == CONNECTIVITY:
                fields = line.split()
                self.addBond(residue, int(fields[0]) - 1, int(fields[1]) - 1)
            elif section == RESIDUECONNECT:
                # Some Amber files have errors in them, incorrectly listing atoms that should not be
                # connected in the first two positions.  We therefore rely on the "connect" section for
                # those, using this block only for other external connections.
                for atom in [int(x) - 1 for x in line.split()[2:]]:
                    self.addExternalBond(residue, atom)

    def process_dat_file(self, inputfile):
        """Process an AMBER .dat file.

        Parameters
        ----------
        inputfile : str
            filename of an .dat file

        """
        block = 0
        continueTorsion = False
        for line in open(inputfile):
            # Use to detect blank lines
            line_length = len(line.strip())
            if block == 0:  # Title
                block += 1
            elif block == 1:  # Mass
                if line_length == 0:
                    block += 1
                else:
                    params = self._parse_dat_atom_symbols_and_masses(line)
                    self.masses[params["kndsym"]] = float(params["amass"])
            elif block == 2:  # Hydrophilic atoms
                block += 1
            elif block == 3:  # Bonds
                if line_length == 0:
                    block += 1
                else:
                    params = self._parse_dat_bond_length_parameters(line)
                    self.bonds.append(
                        [params["ibt"], params["jbt"], params["rk"], params["req"]]
                    )
            elif block == 4:  # Angles
                if line_length == 0:
                    block += 1
                else:
                    params = self._parse_dat_bond_angle_parameters(line)
                    self.angles.append(
                        [
                            params["itt"],
                            params["jtt"],
                            params["ktt"],
                            params["tk"],
                            params["teq"],
                        ]
                    )
            elif block == 5:  # Torsions
                if line_length == 0:
                    block += 1
                else:
                    params = self._parse_dat_dihedral_parameters(line)
                    # Periodicity parameter pn is an int stored as a float,
                    # and a negative sign indicates additional dihedral terms are added on the next line
                    pn = int(float(params["pn"]))
                    pk_over_idivf = float(params["pk"]) / float(params["idivf"])
                    if continueTorsion:
                        self.torsions[-1] += [pk_over_idivf, params["phase"], abs(pn)]
                    else:
                        self.torsions.append(
                            [
                                params["ipt"],
                                params["jpt"],
                                params["kpt"],
                                params["lpt"],
                                pk_over_idivf,
                                params["phase"],
                                abs(pn),
                            ]
                        )
                    continueTorsion = pn < 0
            elif block == 6:  # Improper torsions
                if line_length == 0:
                    block += 1
                else:
                    params = self._parse_dat_improper_dihedral_parameters(line)
                    self.impropers.append(
                        (
                            params["ipt"],
                            params["jpt"],
                            params["kpt"],
                            params["lpt"],
                            params["pk"],
                            params["phase"],
                            params["pn"],
                        )
                    )
            elif block == 7:  # 10-12 hbond potential
                if line_length == 0:
                    block += 1
            elif block == 8:  # VDW equivalents
                if line_length == 0:
                    block += 1
                else:
                    symbols = self._parse_dat_6_12_equivalence_symbols(line)
                    for atom in symbols["ieqv"]:
                        self.vdwEquivalents[atom] = symbols["iorg"]
            elif block == 9:  # VDW type
                block += 1
                spec = self._parse_dat_6_12_potential_kind(line)
                self.vdwType = spec["kindnb"].upper()
                if self.vdwType not in ["RE", "AC"]:
                    raise ValueError("Nonbonded type (KINDNB) must be RE or AC")
            elif block == 10:  # VDW parameters
                if line_length == 0:
                    block += 1
                else:
                    params = self._parse_dat_6_12_nb_parameters(line, spec["kindnb"])
                    if self.vdwType == "RE":
                        self.vdw[params["ltynb"]] = (params["r"], params["edep"])
                    elif self.vdwType == "AC":
                        self.vdw[params["ltynb"]] = (params["a"], params["c"])

    @staticmethod
    def _parse_dat_atom_symbols_and_masses(line):
        """
        Parse a line in a parm.dat file using atom symbol and mass specification.

        Parameters
        ----------
        line : str
            Single line string containing atom symbol and mass parameters in a parm.dat file.

        Returns
        -------
        dict containing
        kndsym : str
        amass : str
        atpol : str
        line :str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

        - 2 -      ***** INPUT FOR ATOM SYMBOLS AND MASSES *****

                    KNDSYM , AMASS, ATPOL

                        FORMAT(A2,2X,F10.2x,f10.2)

         KNDSYM     The unique atom symbol used in the system.

         AMASS      Atomic mass of the center having the symbol "KNDSYM".

         ATPOL      The atomic polarizability for each atom (in A**3)
                    This is the type of polarizability used in sander
                    and gibbs. No parameters are supplied for this since
                    the feature is still in development (Amber 4.1).

              NOTE: All the unique atomic symbols and their masses must
                    be read.  The input is terminated by a blank card.

        Examples
        --------
        c3 12.01         0.878               Sp3 C
        ca 12.01         0.360               Sp2 C in pure aromatic systems

        """
        kndsym = line[0:2].strip()
        amass = line[4:14].strip()

        # prevent potential IndexError from line being too short
        try:
            apol = line[14:24].split()[0].strip()
        except IndexError:
            apol = line[14:-1].split()[0].strip()
        return locals()

    @staticmethod
    def _parse_dat_bond_length_parameters(line):
        """
        Parse a line in a parm.dat file using bond length format specification.

        Parameters
        ----------
        line : str
            Single line string containing bond length parameters in a parm.dat file.

        Returns
        -------
        dict containing
        ibt : str
        jbt : str
        rk : str
        req : str
        line : str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

        - 4 -      ***** INPUT FOR BOND LENGTH PARAMETERS *****

                    IBT , JBT , RK , REQ

                        FORMAT(A2,1X,A2,2F10.2)

         IBT,JBT    Atom symbols for the two bonded atoms.

         RK         The harmonic force constant for the bond "IBT"-"JBT".
                    The unit is kcal/mol/(A**2).

         REQ        The equilibrium bond length for the above bond in Angstroms

                    The input is terminated by a blank card.

        Examples
        --------

        n -os  395.0    1.4103       SOURCE4      30    0.0112
        no-s4  143.0    1.9960       SOURCE3       3    0.0313
        no-s6  149.6    1.9760       SOURCE3       3    0.0520
        """

        ibt = line[0:2].strip()
        jbt = line[3:5].strip()
        rk = line[5:15].strip()
        # prevent potential IndexError from line being too short
        try:
            req = line[15:25].split()[0].strip()
        except IndexError:
            req = line[15:-1].split()[0].strip()

        return locals()

    @staticmethod
    def _parse_dat_bond_angle_parameters(line):
        """
        Parse a line in a parm.dat file using bond angle format specification.
        Parameters
        ----------
        line : str
            Single line string containing bond angle parameters in a parm.dat file.

        Returns
        -------
        dict containing
        itt : str
        jtt : str
        ktt : str
        tk : str
        teq : str
        line : str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

        - 5 -      ***** INPUT FOR BOND ANGLE PARAMETERS *****


                    ITT , JTT , KTT , TK , TEQ

                        FORMAT(A2,1X,A2,1X,A2,2F10.2)

         ITT,...    The atom symbols for the atoms making an angle.

         TK         The harmonic force constants for the angle "ITT"-"JTT"-
                    "KTT" in units of kcal/mol/(rad**2) (radians are the
                    traditional unit for angle parameters in force fields).

         TEQ        The equilibrium bond angle for the above angle in degrees.

                    The input is terminated by a blank card.

        Examples
        --------
        n3-c3-n3   69.61      109.59    SOURCE4           27    1.8125
        n3-c3-nc   68.79      113.29    SOURCE3            1    0.0000
        n3-c3-nd   68.79      113.29    SOURCE3            1    same_as_n3-c3-nc
        c1-sh-hs   48.23       95.99    calculated_based_on_C#C-SH       0
        """
        itt = line[0:2].strip()
        jtt = line[3:5].strip()
        ktt = line[6:8].strip()
        tk = line[8:18].strip()

        # prevent potential IndexError from line being too short
        try:
            teq = line[18:28].split()[0].strip()
        except IndexError:
            teq = line[18:-1].split()[0].strip()
        return locals()

    @staticmethod
    def _parse_dat_dihedral_parameters(line):
        """
        Parse a line in a parm.dat file using dihedral format specification.
        Parameters
        ----------
        line : str
            Single line string containing dihedral parameters in a parm.dat file.

        Returns
        -------
        dict containing
        ipt : str
        jpt : str
        kpt : str
        lpt : str
        idivf : str
        pk : str
        phase : str
        pn : str
        line : str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

        - 6 -      ***** INPUT FOR DIHEDRAL PARAMETERS *****

                    IPT , JPT , KPT , LPT , IDIVF , PK , PHASE , PN

                        FORMAT(A2,1X,A2,1X,A2,1X,A2,I4,3F15.2)

         IPT, ...   The atom symbols for the atoms forming a dihedral
                    angle.  If IPT .eq. 'X ' .and. LPT .eq. 'X ' then
                    any dihedrals in the system involving the atoms "JPT" and
                    and "KPT" are assigned the same parameters.  This is
                    called the general dihedral type and is of the form
                    "X "-"JPT"-"KPT"-"X ".

         IDIVF      The factor by which the torsional barrier is divided.
                    Consult Weiner, et al., JACS 106:765 (1984) p. 769 for
                    details. Basically, the actual torsional potential is

                           (PK/IDIVF) * (1 + cos(PN*phi - PHASE))

         PK         The barrier height divided by a factor of 2.

         PHASE      The phase shift angle in the torsional function.

                    The unit is degrees.

         PN         The periodicity of the torsional barrier.
                    NOTE: If PN .lt. 0.0 then the torsional potential
                          is assumed to have more than one term, and the
                          values of the rest of the terms are read from the
                          next cards until a positive PN is encountered.  The
                          negative value of pn is used only for identifying
                          the existence of the next term and only the
                          absolute value of PN is kept.

            The input is terminated by a blank card.

        Examples
        --------
        X -c -cy-X    6    0.000       180.000           2.000      JCC, 7, (1986), 230
        X -c -ca-X    4    4.000       180.000           2.000      optimized by Junmei Wang, Jan-2013
        X -c -cc-X    4   11.500       180.000           2.000      statistic value


        """

        ipt = line[0:2].strip()
        jpt = line[3:5].strip()
        kpt = line[6:8].strip()
        lpt = line[9:11].strip()
        idivf = line[11:15].strip()
        pk = line[15:30].strip()
        phase = line[30:45].strip()
        # prevent potential IndexError from line being too short
        try:
            pn = line[45:60].split()[0].strip()
        except IndexError:
            pn = line[45:-1].split()[0].strip()

        return locals()

    @staticmethod
    def _parse_dat_improper_dihedral_parameters(line):
        """
        Parse a line in a parm.dat file using improper dihedral format specification.
        Parameters
        ----------
        line : str
            Single line string containing dihedral parameters in a parm.dat file.

        Returns
        -------
        dict containing
        ipt : str
        jpt : str
        kpt : str
        lpt : str
        idivf : str
        pk : str
        phase : str
        pn : str
        line : str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

         - 7 -      ***** INPUT FOR IMPROPER DIHEDRAL PARAMETERS *****

                    IPT , JPT , KPT , LPT , IDIVF , PK , PHASE , PN

                        FORMAT(A2,1X,A2,1X,A2,1X,A2,I4,3F15.2)

                    The input is the same as in for the dihedrals except that
                    the torsional barrier height is NOT divided by the factor
                    idivf.  The improper torsions are defined between any four
                    atoms not bonded (in a successive fashion) with each other
                    as in the case of "regular" or "proper" dihedrals.  Improper
                    dihedrals are used to keep certain groups planar and to
                    prevent the racemization of certain centers in the united
                    atom model.  Consult the above reference for details.

             Important note: all general type improper dihedrals
                             (e.g. x -x -ct-hc) should appear before all
                             specifics (ct-ct-ct-hc) in the parm list.
                             Otherwise the generals will override the
                             specific with no warning.

             The input is terminated by a blank card.

        Examples
        --------
        X -o -c -o          1.1          180.          2.           JCC,7,(1986),230
        X -X -c -o          10.5         180.          2.           JCC,7,(1986),230


        """

        ipt = line[0:2].strip()
        jpt = line[3:5].strip()
        kpt = line[6:8].strip()
        lpt = line[9:11].strip()
        idivf = line[11:15].strip()
        pk = line[15:30].strip()
        phase = line[30:45].strip()
        # prevent potential IndexError from line being too short
        try:
            pn = line[45:60].split()[0].strip()
        except IndexError:
            pn = line[45:-1].split()[0].strip()

        return locals()

    @staticmethod
    def _parse_dat_6_12_equivalence_symbols(line):
        """
        Parse a line in a parm.dat file using equivalencing symbols for non-bonded 6-12 potential specification.

        Parameters
        ----------
        line : str
            Single line string containing equivalencing symbols for 6-12 non-bonded parameters in a parm.dat file.

        Returns
        -------
        dict containing
        iorg : str
        ieqv : list of str
        line : str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

        - 9 -      ***** INPUT FOR EQUIVALENCING ATOM SYMBOLS FOR
                          THE NON-BONDED 6-12 POTENTIAL PARAMETERS *****

                          IORG , IEQV(I) , I = 1 , 19

                              FORMAT(20(A2,2X))

         IORG        The atom symbols to which other atom symbols are to be
                     equivalenced in generating the 6-12 potential parameters.

         IEQV(I)     The atoms symbols which are to be equivalenced to the
                     atom symbol "IORG".  If more than 19 atom symbols have
                     to be equivalenced to a given atom symbol they can be
                     included as extra cards.

                     It is advisable not to equivalence any hydrogen bond
                     atom type atoms with any other atom types.

          NOTE: The input is terminated by a blank card.

        """
        ieqv = list()
        iorg = line[0:2].strip()
        # continue adding names till line runs out,
        # or reaches 19 which is the maximum according to format
        try:
            for n in range(1, 20):
                ieqv.append(line[4 * n : 4 * n + 2].split()[0].strip())
        except IndexError:
            pass

        return dict(ieqv=ieqv, iorg=iorg, line=line)

    @staticmethod
    def _parse_dat_6_12_potential_kind(line):
        """
        Parse a line in a parm.dat file using input for non-bonded 6-12 potential specification.

        Parameters
        ----------
        line : str
            Single line string containing the input format for 6-12 non-bonded parameters in a parm.dat file.

        Returns
        -------
        dict containing
        label : str
        kindb : str
        line : str

        Notes
        -----
        Original format specification http://ambermd.org/formats.html#parm.dat

        - 10 -      ***** INPUT FOR THE 6-12 POTENTIAL PARAMETERS *****

                     LABEL , KINDNB

                         FORMAT(A4,6X,A2)

         LABEL       The name of the non-bonded input parameter to be
                     used.  It has to be matched with "NAMNB" read through
                     unit 5.  The program searches the file to load the
                     the required non-bonded parameters.  If that name is
                     not found the run will be terminated.

         KINDNB      Flag for the type of 6-12 parameters.

          'SK'       Slater-Kirkwood parameters are input.
                     see "caution" below.

          'RE'       van der Waals radius and the potential well depth
                     parameters are read.

          'AC'       The 6-12 potential coefficients are read.

             NOTE: All the non equivalenced atoms' parameters have to
                   be given.

           The input is terminated when label .eq. 'END'

        Examples
        --------
        MOD4      RE

        """

        label = line[0:4].strip()
        kindnb = line[10:12].strip()

        if kindnb not in ["SK", "RE", "AC"]:
            raise ValueError(
                "Unsupported 6-12 potential format {kindb}".format(**locals())
            )

        return locals()

    @staticmethod
    def _parse_dat_6_12_nb_parameters(line, kindnb):
        """
        Parse a line in a parm.dat file using RE format for 6-12 potential specification.

        Parameters
        ----------
        line : str
            Single line string containing equivalencing symbols for 6-12 non-bonded parameters in a parm.dat file.

        kindnb : str
            The kind of format for the nonbonded parameter line ("SK", "RE", or "AC")

        Returns
        -------
        dict containing
        lytnb : str
        line : str
        kindnb : str

        and for SK

        pol : str
        xneff : str
        rmin :str

        or for RE

        r : str
        edep : str

        or for AC
        a : str
        c : str


        Notes
        -----

        This code assumes the format is   FORMAT(2X,A2,6X,2F10.6) for 10B and 10C

        Original format specification http://ambermd.org/formats.html#parm.dat
         - 10A -     ***** ONLY IF KINDNB .EQ. 'SK' *****

                     LTYNB , POL , XNEFF , RMIN

                         FORMAT(2X,A2,6X,3F10.6)

         LTYNB       Atom symbol.

         POL         Atomic polarizability for the atom centers having the
                     the above symbol.

         XNEFF       Effective number of electrons on the atom centers having
                     the above symbol.

         RMIN        van der Waals radius of the atom center having the above
                     symbol.


        - 10B -     ***** ONLY IF KINDNB .EQ. 'RE' *****

                     LTYNB , R , EDEP

         LTYNB       Atom symbol.

         R           The van der Waals radius of the atoms having the symbol
                     "LTYNB"  (Angstoms)

         EDEP        The 6-12 potential well depth. (kcal/mol)

        ------------------------------------------------------------------------

         - 10C -     ***** ONLY IF KINDNB .EQ. 'AC' *****

                     LTYNB , A , C

         LTYNB       Atom symbol.

         A           The coefficient of the 12th power term (A/r**12).

         C           The coefficient of the 6th power term (-C/r**6).

        Examples
        --------
          c1          1.9080  0.2100             cp C DLM 11/2007 well depth from OPLS replacing 0.0860
          c2          1.9080  0.0860             sp2 atom in the middle of C=CD-CD=C

        """
        ltynb = line[2:4].strip()
        if kindnb.upper() == "SK":
            pol = line[10:20].strip()
            xneff = line[20:30].strip()

            # prevent IndexError from line being too short
            try:
                rmin = line[30:40].split()[0].strip()
            except IndexError:
                rmin = line[30:-1].split()[0].strip()
        elif kindnb.upper() == "RE":
            r = line[10:20].strip()
            # prevent IndexError from line being too short
            try:
                edep = line[20:30].split()[0].strip()
            except IndexError:
                edep = line[20:-1].split()[0].strip()
        elif kindnb.upper() == "AC":
            a = line[10:20].strip()
            # prevent IndexError from line being too short
            try:
                c = line[20:30].split()[0].strip()
            except IndexError:
                c = line[20:-1].split()[0].strip()
        else:
            raise ValueError("Unsupported NB format {nbformat}".format(**locals()))

        return locals()

    def process_frc_file(self, inputfile):
        """Process an AMBER .frc file.

        Parameters
        ----------
        inputfile : str
            filename of an .frc file

        """
        block = ""
        continueTorsion = False
        first = True
        for line in open(inputfile):
            line = line.strip()
            if len(line) == 0 or first:
                block = None
                first = False
            elif block is None:
                block = line
            elif block.startswith("MASS"):
                fields = line.split()
                self.masses[fields[0]] = float(fields[1])
            elif block.startswith("BOND"):
                fields = line[5:].split()
                self.bonds.append(
                    (line[:2].strip(), line[3:5].strip(), fields[0], fields[1])
                )
            elif block.startswith("ANGL"):
                fields = line[8:].split()
                self.angles.append(
                    (
                        line[:2].strip(),
                        line[3:5].strip(),
                        line[6:8].strip(),
                        fields[0],
                        fields[1],
                    )
                )
            elif block.startswith("DIHE"):
                fields = line[11:].split()
                periodicity = int(float(fields[3]))
                if continueTorsion:
                    self.torsions[-1] += [
                        float(fields[1]) / float(fields[0]),
                        fields[2],
                        abs(periodicity),
                    ]
                else:
                    self.torsions.append(
                        [
                            line[:2].strip(),
                            line[3:5].strip(),
                            line[6:8].strip(),
                            line[9:11].strip(),
                            float(fields[1]) / float(fields[0]),
                            fields[2],
                            abs(periodicity),
                        ]
                    )
                continueTorsion = periodicity < 0
            elif block.startswith("IMPR"):
                fields = line[11:].split()
                self.impropers.append(
                    (
                        line[:2].strip(),
                        line[3:5].strip(),
                        line[6:8].strip(),
                        line[9:11].strip(),
                        fields[0],
                        fields[1],
                        fields[2],
                    )
                )
            elif block.startswith("NONB"):
                fields = line.split()
                self.vdw[fields[0]] = (fields[1], fields[2])

    def generate_xml(self):
        """Return the processed forcefield files as an XML stream.

        Returns
        -------
        stream : cStringIO
            The text of the output XML forcefield data.

        Notes
        -----

        The stream can be written to disk via:

        outfile = open("my_forcefield.xml", 'w')
        outfile.write(stream.read())
        outfile.close()

        """
        stream = cStringIO()
        write_stream = lambda x: stream.write(x + "\n")
        write_stream(self.provenance)
        write_stream("<ForceField>")
        write_stream(" <AtomTypes>")
        for index, type in enumerate(self.types):
            write_stream(
                """  <Type name="%s" class="%s" element="%s" mass="%s"/>"""
                % (
                    self.type_names[index],
                    type[0],
                    type[1].symbol,
                    type[1].mass.value_in_unit(unit.amu),
                )
            )
        write_stream(" </AtomTypes>")
        write_stream(" <Residues>")
        for res in sorted(self.residueAtoms):
            write_stream("""  <Residue name="%s">""" % res)
            for atom in self.residueAtoms[res]:
                atom_name, type_id = tuple(atom)
                atom_type = self.type_names[type_id]
                write_stream('   <Atom name="%s" type="%s"/>' % (atom_name, atom_type))
            if res in self.residueBonds:
                for bond in self.residueBonds[res]:
                    write_stream("""   <Bond from="%d" to="%d"/>""" % bond)
            if res in self.residueConnections:
                for bond in self.residueConnections[res]:
                    write_stream("""   <ExternalBond from="%d"/>""" % bond)
            write_stream("  </Residue>")
        write_stream(" </Residues>")
        write_stream(" <HarmonicBondForce>")
        processed = set()
        for bond in self.bonds:
            signature = (bond[0], bond[1])
            if signature in processed:
                continue
            if any([c in skipClasses for c in signature]):
                continue
            processed.add(signature)
            length = float(bond[3]) * 0.1
            k = float(bond[2]) * 2 * 100 * 4.184
            write_stream(
                """  <Bond class1="%s" class2="%s" length="%s" k="%s"/>"""
                % (bond[0], bond[1], str(length), str(k))
            )
        write_stream(" </HarmonicBondForce>")
        write_stream(" <HarmonicAngleForce>")
        processed = set()
        for angle in self.angles:
            signature = (angle[0], angle[1], angle[2])
            if signature in processed:
                continue
            if any([c in skipClasses for c in signature]):
                continue
            processed.add(signature)
            theta = float(angle[4]) * math.pi / 180.0
            k = float(angle[3]) * 2 * 4.184
            write_stream(
                """  <Angle class1="%s" class2="%s" class3="%s" angle="%s" k="%s"/>"""
                % (angle[0], angle[1], angle[2], str(theta), str(k))
            )
        write_stream(" </HarmonicAngleForce>")
        write_stream(" <PeriodicTorsionForce>")
        processed = set()
        for tor in reversed(self.torsions):
            signature = (fix(tor[0]), fix(tor[1]), fix(tor[2]), fix(tor[3]))
            if signature in processed:
                continue
            if any([c in skipClasses for c in signature]):
                continue
            processed.add(signature)
            tag = (
                '  <Proper class1="%s" class2="%s" class3="%s" class4="%s"' % signature
            )
            i = 4
            while i < len(tor):
                index = i / 3
                periodicity = int(float(tor[i + 2]))
                phase = float(tor[i + 1]) * math.pi / 180.0
                k = tor[i] * 4.184
                tag += ' periodicity%d="%d" phase%d="%s" k%d="%s"' % (
                    index,
                    periodicity,
                    index,
                    str(phase),
                    index,
                    str(k),
                )
                i += 3
            tag += "/>"
            write_stream(tag)
        processed = set()
        for tor in reversed(self.impropers):
            signature = (fix(tor[2]), fix(tor[0]), fix(tor[1]), fix(tor[3]))
            if signature in processed:
                continue
            if any([c in skipClasses for c in signature]):
                continue
            processed.add(signature)
            tag = (
                '  <Improper class1="%s" class2="%s" class3="%s" class4="%s"'
                % signature
            )
            i = 4
            while i < len(tor):
                index = i / 3
                periodicity = int(float(tor[i + 2]))
                phase = float(tor[i + 1]) * math.pi / 180.0
                k = float(tor[i]) * 4.184
                tag += ' periodicity%d="%d" phase%d="%s" k%d="%s"' % (
                    index,
                    periodicity,
                    index,
                    str(phase),
                    index,
                    str(k),
                )
                i += 3
            tag += "/>"
            write_stream(tag)
        write_stream(" </PeriodicTorsionForce>")
        write_stream(
            """ <NonbondedForce coulomb14scale="%g" lj14scale="%s">"""
            % (charge14scale, epsilon14scale)
        )
        sigmaScale = 0.1 * 2.0 / (2.0 ** (1.0 / 6.0))
        for index, type in enumerate(self.types):
            atomClass = type[0]
            q = type[2]
            if atomClass in self.vdwEquivalents:
                atomClass = self.vdwEquivalents[atomClass]
            if atomClass in self.vdw:
                params = [float(x) for x in self.vdw[atomClass]]
                if self.vdwType == "RE":
                    sigma = params[0] * sigmaScale
                    epsilon = params[1] * 4.184
                else:
                    sigma = (params[0] / params[1]) ** (1.0 / 6.0)
                    epsilon = 4.184 * params[1] * params[1] / (4 * params[0])
            else:
                sigma = 1.0
                epsilon = 0
            if q != 0 or epsilon != 0:
                write_stream(
                    """  <Atom type="%s" charge="%s" sigma="%s" epsilon="%s"/>"""
                    % (self.type_names[index], q, sigma, epsilon)
                )
        write_stream(" </NonbondedForce>")
        write_stream("</ForceField>")
        stream.seek(0)

        return stream

    def parse_filenames(self, filenames):
        """Process a list of filenames according to their filetype suffixes

        Parameters
        ----------
        filenames : list (of strings)
            List of filenames of type (lib, off, dat, or mol2)

        Notes
        -----
        When parameterizing small molecules, the correct order of inputs is:

        $AMBER_LIB_PATH/gaff.dat ligand_name.mol2 ligand_name.frcmod

        """
        for inputfile in filenames:
            if inputfile.endswith(".lib") or inputfile.endswith(".off"):
                self.process_library_file(inputfile)
            elif inputfile.endswith(".dat"):
                self.process_dat_file(inputfile)
            elif inputfile.endswith("mol2"):
                self.process_mol2_file(inputfile)
            else:
                self.process_frc_file(inputfile)

        self.reduce_atomtypes()

    def reduce_atomtypes(self, symmetrize_protons=False):
        """Reduce the list of atom self.types.

        Parameters
        ----------
        symmetrize_protons : bool, default=False
            if True, multiple hydrogens bound to the same heavy atom
            should all use the same type.

        Notes
        -----

        The default behavior of symmetrize_protons differs from the
        original OpenMM version of this script.  For arbitrary small
        molecules, we can not assume symmetric protons.
        """

        removeType = [False] * len(self.types)
        for res in self.residueAtoms:
            if res not in self.residueBonds:
                continue
            atomBonds = [[] for atom in self.residueAtoms[res]]
            for bond in self.residueBonds[res]:
                atomBonds[bond[0]].append(bond[1])
                atomBonds[bond[1]].append(bond[0])
            if symmetrize_protons is True:
                for index, atom in enumerate(self.residueAtoms[res]):
                    hydrogens = [
                        x
                        for x in atomBonds[index]
                        if self.types[self.residueAtoms[res][x][1]][1]
                        == element.hydrogen
                    ]
                    for h in hydrogens[1:]:
                        removeType[self.residueAtoms[res][h][1]] = True
                        self.residueAtoms[res][h][1] = self.residueAtoms[res][
                            hydrogens[0]
                        ][1]
        newTypes = []
        replaceWithType = [0] * len(self.types)
        for i in range(len(self.types)):
            if not removeType[i]:
                newTypes.append(self.types[i])
            replaceWithType[i] = len(newTypes) - 1
        self.types = newTypes
        for res in self.residueAtoms:
            for atom in self.residueAtoms[res]:
                atom[1] = replaceWithType[atom[1]]

    def set_provenance(self):
        """Set the provenance attribute with information about the current python session."""
        self.provenance = []
        line = """<!-- %s -->\n""" % "Time and parameters of origin:"
        self.provenance.append(line)
        now = datetime.datetime.now()
        line = """<!-- %s -->\n""" % str(now)
        self.provenance.append(line)
        cmd_string = subprocess.list2cmdline(sys.argv[1:])
        cmd_string = cmd_string.replace(
            "-", " "
        )  # Replace XML specific characters that can break some XML parsers
        cmd_string = cmd_string.replace(">", " ")  #
        cmd_string = cmd_string.replace("<", " ")  #
        line = """<!-- %s -->\n""" % cmd_string
        self.provenance.append(line)
        self.provenance = "".join(self.provenance)