// packages/miew/src/io/parsers/CMLParser.js
//
// Code-quality report summary (as captured with this file):
//   Maintainability: F (1 wk)
//   Test Coverage: (no value captured)
import Parser from './Parser'
import chem from '../../chem'
import { isString } from 'lodash'
import { Vector3 } from 'three'

// Model classes used by this parser, taken from the shared `chem` namespace.
const Complex = chem.Complex
const Element = chem.Element
const SGroup = chem.SGroup
const Bond = chem.Bond

/**
 * Numeric bond order for each letter accepted in a bond's `order` attribute.
 * The parser additionally marks bonds whose order is `A` as aromatic.
 */
const cOrderCharCodes = { A: 0, S: 1, D: 2, T: 3 }

// Heuristic used by `CMLParser.canProbablyParse`: matches an XML declaration
// (`<?xml ... ?>`) followed, after optional whitespace, by a `<cml` or
// `<molecule` opening tag. Matching is case-insensitive and not anchored to
// the start of the text.
const cmlStartRegexp = /\s*<\?xml\b[^?>]*\?>\s*<(?:cml|molecule)\b/i

/**
 * Parser for Chemical Markup Language (CML) text.
 *
 * The text is parsed with `DOMParser`, converted into a tree of plain objects
 * (see `_traverseData`), and every `molecule` node that owns an
 * `atomArray/atom` is converted into a `Complex`. When several molecules are
 * found the resulting complexes are joined into one.
 */
class CMLParser extends Parser {
  /**
   * @param {string} data - CML source text.
   * @param {object} options - Options forwarded to the base `Parser`.
   */
  constructor(data, options) {
    super(data, options)
    this._complex = null
    this._residue = null
    this._serialAtomMap = null
    this._modelId = 1
    this._lastMolId = -1
    this._readOnlyOneMolecule = false
    this._options.fileType = 'cml'
  }

  /**
   * Cheap format probe.
   *
   * @param {*} data - Candidate input.
   * @returns {boolean} `true` when `data` is a string matching `cmlStartRegexp`.
   */
  static canProbablyParse(data) {
    return isString(data) && cmlStartRegexp.test(data)
  }

  /**
   * Resolves the atom ids listed in each bond's `atomRefs2` attribute into
   * indices of `atoms`, stored on the bond as `start` and `end`.
   *
   * A reference that matches no atom leaves the corresponding property
   * untouched; a bond without a string `atomRefs2` is skipped. When several
   * atoms share an id the last one wins.
   *
   * @param {object[]} atoms - Atom nodes, each identified by `id`.
   * @param {object[]} bonds - Bond nodes; mutated in place.
   */
  _rebuidBondIndexes(atoms, bonds) {
    // One pass over the atoms instead of re-scanning all bonds per atom.
    const indexById = new Map()
    atoms.forEach((atom, index) => {
      if (typeof atom.id === 'string') {
        indexById.set(atom.id, index)
      }
    })

    bonds.forEach((bond) => {
      if (typeof bond.atomRefs2 !== 'string') {
        return // malformed bond: stays unresolved and is ignored by callers
      }
      // Tolerate any run of whitespace between the two references.
      const [startId, endId] = bond.atomRefs2.trim().split(/\s+/)
      if (indexById.has(startId)) {
        bond.start = indexById.get(startId)
      }
      if (indexById.has(endId)) {
        bond.end = indexById.get(endId)
      }
    })
  }

  /**
   * Builds an `SGroup` from a nested molecule node and appends it to
   * `moleculeArr`.
   *
   * The group is constructed from the node's `id`, `fieldData`, a position
   * made of `x`/`y` (z is 0), `atomRefs` and the node itself. Afterwards:
   * - `placement === 'Relative'` resets `_center` to the origin;
   * - `fieldName === 'MDLBG_FRAGMENT_CHARGE'` stores `fieldData` as an integer
   *   in `_charge` (0 when it does not parse);
   * - `fieldName === 'MDLBG_FRAGMENT_COEFFICIENT'` stores `fieldData` as an
   *   integer in `_repeat` (1 when it does not parse or is 0).
   *
   * @param {object} molecule - Nested molecule node.
   * @param {object[]} moleculeArr - Destination array; mutated.
   */
  _createSGroup(molecule, moleculeArr) {
    const newGroup = new SGroup(
      molecule.id,
      molecule.fieldData,
      new Vector3(parseFloat(molecule.x), parseFloat(molecule.y), 0),
      molecule.atomRefs,
      molecule
    )
    if (molecule.placement === 'Relative') {
      newGroup._center = new Vector3(0, 0, 0)
    }
    if (molecule.fieldName === 'MDLBG_FRAGMENT_CHARGE') {
      newGroup._charge = parseInt(molecule.fieldData, 10) || 0
    }
    if (molecule.fieldName === 'MDLBG_FRAGMENT_COEFFICIENT') {
      newGroup._repeat = parseInt(molecule.fieldData, 10) || 1
    }
    moleculeArr.push(newGroup)
  }

  /**
   * Recursively collects SGroups from a nested molecule node or an array of
   * such nodes. Groups of deeper nodes are emitted before the group of the
   * node that contains them.
   *
   * @param {object|object[]} molecule - Node(s) to walk; falsy yields nothing.
   * @param {object[]} [moleculeArr] - Accumulator; a new array is used when
   *   this is not an array.
   * @returns {object[]} The accumulator holding all collected groups.
   */
  _extractSGroup(molecule, moleculeArr) {
    const groups = Array.isArray(moleculeArr) ? moleculeArr : []
    if (!molecule) {
      return groups
    }

    const nodes = Array.isArray(molecule) ? molecule : [molecule]
    nodes.forEach((node) => {
      if (node.molecule) {
        // Children first, written straight into the shared accumulator.
        this._extractSGroup(node.molecule, groups)
      }
      this._createSGroup(node, groups)
    })
    return groups
  }

  /**
   * Creates the SGroups described by nested molecule nodes and links them
   * with the atom nodes.
   *
   * For every group:
   * - the group is appended to `sgroupRef` of each atom whose id equals the
   *   first id in the group's reference list;
   * - `_atoms` (a whitespace separated id list when this method starts) is
   *   replaced by the array of referenced atom nodes that exist in `atoms`;
   * - unless `_center` is `null`, `_center` is set to the middle of the
   *   bounding box of the referenced atoms.
   *
   * Atom coordinates are read from `x3`/`y3`/`z3`, falling back to `x2`/`y2`
   * and to 0 for z.
   *
   * @param {object|object[]} molecule - Nested molecule node(s).
   * @param {object[]} atoms - Atom nodes of the owning molecule; mutated.
   */
  _extractSGroups(molecule, atoms) {
    const sgroups = this._extractSGroup(molecule)

    // Split every reference list once; a missing list counts as empty.
    const groupRefs = sgroups.map((sgroup) =>
      typeof sgroup._atoms === 'string' ? sgroup._atoms.trim().split(/\s+/) : []
    )

    // Attach each group to the atom named first in its reference list.
    const groupsByFirstRef = new Map()
    sgroups.forEach((sgroup, index) => {
      const firstRef = groupRefs[index][0]
      if (firstRef === undefined) {
        return
      }
      if (!groupsByFirstRef.has(firstRef)) {
        groupsByFirstRef.set(firstRef, [])
      }
      groupsByFirstRef.get(firstRef).push(sgroup)
    })
    atoms.forEach((atom) => {
      const owned = groupsByFirstRef.get(atom.id)
      if (!owned) {
        return
      }
      if (!atom.sgroupRef) {
        atom.sgroupRef = []
      }
      owned.forEach((sgroup) => {
        atom.sgroupRef.push(sgroup)
      })
    })

    // Coordinates and node of every atom, keyed by id. A Map keeps ids such
    // as "constructor" from resolving to Object.prototype members.
    const entryById = new Map()
    atoms.forEach((atom) => {
      entryById.set(atom.id, {
        x: parseFloat(atom.x3 || atom.x2),
        y: parseFloat(atom.y3 || atom.y2),
        z: parseFloat(atom.z3 || '0.0'),
        a: atom
      })
    })

    // build sGroups centers
    const limit = 100000000
    const low = new Vector3(limit, limit, limit)
    const high = new Vector3(-limit, -limit, -limit)

    sgroups.forEach((sgroup, index) => {
      const members = []
      groupRefs[index].forEach((ref) => {
        const entry = entryById.get(ref)
        if (entry) {
          members.push(entry)
        }
      })
      sgroup._atoms = members.map((entry) => entry.a)

      if (sgroup._center !== null) {
        low.set(limit, limit, limit)
        high.set(-limit, -limit, -limit)
        members.forEach((entry) => {
          low.set(
            Math.min(low.x, entry.x),
            Math.min(low.y, entry.y),
            Math.min(low.z, entry.z)
          )
          high.set(
            Math.max(high.x, entry.x),
            Math.max(high.y, entry.y),
            Math.max(high.z, entry.z)
          )
        })
        sgroup._center.addVectors(low, high)
        sgroup._center.multiplyScalar(0.5)
      }
    })
  }

  /**
   * Converts a DOM document into a tree of plain objects.
   *
   * Every node becomes an object holding its attributes as string properties,
   * the DOM node itself under `xmlNode`, and its children under their
   * `nodeName` (a single object, or an array when the name repeats).
   * Whitespace-only text nodes are dropped.
   *
   * The walk starts at the document's root element, so comments, doctypes or
   * processing instructions placed before the root do not hide it.
   *
   * @param {Document} dom - Parsed XML document.
   * @returns {object} Object with the converted root stored under its name,
   *   or an empty object when the document has no nodes.
   */
  _traverseData(dom) {
    const hasOwn = Object.prototype.hasOwnProperty

    const parseNode = (xmlNode, parent) => {
      const name = xmlNode.nodeName
      if (name === '#text' && xmlNode.nodeValue.trim() === '') {
        return
      }

      const jsonNode = { xmlNode }
      const existing = hasOwn.call(parent, name) ? parent[name] : undefined
      if (!existing) {
        parent[name] = jsonNode
      } else if (Array.isArray(existing)) {
        existing.push(jsonNode)
      } else {
        parent[name] = [existing, jsonNode]
      }

      const { attributes, childNodes } = xmlNode
      if (attributes) {
        for (let i = 0; i < attributes.length; i++) {
          const attribute = attributes[i]
          jsonNode[attribute.nodeName] = attribute.nodeValue
        }
      }
      for (let i = 0; i < childNodes.length; i++) {
        parseNode(childNodes[i], jsonNode)
      }
    }

    const result = {}
    const root = dom.documentElement || dom.childNodes[0]
    if (root) {
      parseNode(root, result)
    }
    return result
  }

  /**
   * Walks the object tree and collects every molecule that has atoms.
   *
   * A single `molecule` child with `atomArray.atom` contributes its parent
   * object; each element of a `molecule` array with `atomArray.atom`
   * contributes a `{ molecule }` wrapper. Molecule nodes themselves are not
   * descended into; all other object-valued properties (except `xmlNode`) are.
   *
   * @param {object} data - Tree node to inspect.
   * @param {object[]} molSet - Receives objects exposing a `molecule` property.
   */
  _findSuitableMolecule(data, molSet) {
    const hasAtoms = (mol) =>
      Boolean(mol && mol.atomArray && mol.atomArray.atom)

    Object.keys(data).forEach((key) => {
      if (key === 'xmlNode') {
        return
      }
      const value = data[key]
      if (key === 'molecule') {
        if (Array.isArray(value)) {
          value.forEach((mol) => {
            if (hasAtoms(mol)) {
              molSet.push({ molecule: mol })
            }
          })
        } else if (hasAtoms(value)) {
          molSet.push(data)
        }
        return
      }
      if (value !== null && typeof value === 'object') {
        this._findSuitableMolecule(value, molSet)
      }
    })
  }

  /**
   * Parses the CML text and prepares one component description per molecule
   * that has at least one atom.
   *
   * @param {string} text - CML source text.
   * @returns {object[]} Component descriptions as built by
   *   `_prepareComponent`.
   */
  _selectComponents(text) {
    const doc = new DOMParser().parseFromString(text, 'application/xml')
    const traversed = this._traverseData(doc)
    const rawData = traversed.cml ? traversed.cml : traversed

    const candidates = []
    this._findSuitableMolecule(rawData, candidates)
    if (this._readOnlyOneMolecule && candidates.length > 1) {
      candidates.splice(1)
    }

    const components = []
    candidates.forEach((candidate) => {
      const component = this._prepareComponent(candidate, doc)
      if (component.atoms.length > 0) {
        components.push(component)
      }
    })
    return components
  }

  /**
   * Builds the intermediate description of one molecule: its atom and bond
   * nodes, per-atom adjacency (`edges`), SGroups and connectivity labels.
   *
   * @param {object} data - Object exposing the molecule node as `molecule`.
   * @param {Document} doc - Source document, stored as `originalCML`.
   * @returns {{atoms: object[], bonds: object[], labels: number[],
   *   count: number, curr: number, originalCML: Document}} Component
   *   description; `atoms` is empty when there is no molecule.
   */
  _prepareComponent(data, doc) {
    const toArray = (value) => (Array.isArray(value) ? value : [value])
    const { molecule } = data

    if (!molecule) {
      // Empty but well-formed, so callers can safely test `atoms.length`.
      return {
        atoms: [],
        bonds: [],
        labels: [],
        count: 0,
        curr: -1,
        originalCML: doc
      }
    }

    const atoms =
      molecule.atomArray && molecule.atomArray.atom
        ? toArray(molecule.atomArray.atom)
        : []

    if (molecule.molecule) {
      this._extractSGroups(molecule.molecule, atoms)
    }

    atoms.forEach((atom) => {
      atom.edges = []
    })

    const bonds =
      molecule.bondArray && molecule.bondArray.bond
        ? toArray(molecule.bondArray.bond)
        : []
    this._rebuidBondIndexes(atoms, bonds)

    bonds.forEach((bond) => {
      const startAtom = atoms[bond.start]
      const endAtom = atoms[bond.end]
      if (!startAtom || !endAtom) {
        // ignore invalid bond; neither endpoint's adjacency is touched
        return
      }
      startAtom.edges.push(bond.end)
      endAtom.edges.push(bond.start)

      const orderAttr = bond.xmlNode.getAttribute('order')
      const numericOrder = parseInt(orderAttr, 10)
      // the default bond order is unknown
      bond.order = 0
      bond.type = Bond.BondType.UNKNOWN
      // NOTE(review): a numeric order of "1" is not > 1 and is not a key of
      // cOrderCharCodes, so it stays 0/UNKNOWN while "S" yields 1 - confirm
      // this is intended.
      if (numericOrder > 1) {
        bond.order = numericOrder
      } else {
        // another option - bond order is a string
        const order = cOrderCharCodes[orderAttr]
        if (order !== undefined) {
          bond.order = order
          if (orderAttr === 'A') {
            bond.type = Bond.BondType.AROMATIC
          }
        }
      }
    })

    atoms.forEach((atom) => {
      // Numeric comparator: the default sort would order 10 before 2.
      atom.edges.sort((a, b) => a - b)
    })

    const labels = this._breadWidthSearch(atoms, 0) // for now

    return {
      atoms,
      bonds,
      labels: labels.atomLabels,
      count: Math.min(1, labels.labelsCount), // for now
      curr: -1,
      originalCML: doc
    }
  }

  /**
   * Packs a molecule id and a component id into one number.
   *
   * @param {number} compId - Component id (low 16 bits).
   * @param {number} molId - Molecule id (shifted left by 16 bits).
   * @returns {number} `(molId << 16) + compId`.
   */
  _packLabel(compId, molId) {
    const shift = 16
    return (molId << shift) + compId
  }

  /**
   * Inverse of `_packLabel`.
   *
   * @param {number} l - Packed label.
   * @returns {{molId: number, compId: number}} Upper bits and low 16 bits.
   */
  _unpackLabel(l) {
    const shift = 16
    const mask = (1 << shift) - 1
    return { molId: l >>> shift, compId: l & mask }
  }

  /**
   * Labels connected components of the atom graph using breadth-first search
   * over each atom's `edges` (indices into `atoms`).
   *
   * Every atom receives `_packLabel(componentId, molID)` where component ids
   * start at 1 and follow the order in which components are first met.
   * Edge entries that do not address an atom are ignored.
   *
   * @param {object[]} atoms - Atom nodes with an `edges` array.
   * @param {number} molID - Molecule id packed into every label.
   * @returns {{atomLabels: number[], labelsCount: number}} Labels per atom
   *   index and the number of components found.
   */
  _breadWidthSearch(atoms, molID) {
    const unlabeled = this._packLabel(0, molID)
    const atomLabels = new Array(atoms.length)
    for (let id = 0; id < atomLabels.length; id++) {
      atomLabels[id] = unlabeled
    }

    let componentID = 0
    for (let startID = 0; startID < atomLabels.length; startID++) {
      if (atomLabels[startID] !== unlabeled) {
        continue
      }
      componentID++
      const label = this._packLabel(componentID, molID)
      atomLabels[startID] = label

      // Breadth first search; `head` walks the queue so nothing is shifted.
      const queue = [startID]
      for (let head = 0; head < queue.length; head++) {
        const curr = atoms[queue[head]]
        if (!curr || !curr.edges) {
          continue
        }
        for (let i = 0; i < curr.edges.length; i++) {
          const neighbour = curr.edges[i]
          // Out-of-range or undefined indices read as `undefined` here and
          // are therefore skipped.
          if (atomLabels[neighbour] === unlabeled) {
            atomLabels[neighbour] = label
            queue.push(neighbour)
          }
        }
      }
    }

    return { atomLabels, labelsCount: componentID }
  }

  /**
   * Adds a bond between two atom serials to the complex being built, smaller
   * serial first. Nothing is added unless both serials are non-negative
   * numbers (this also rejects `NaN`).
   *
   * @param {number} eAtom - Serial of one endpoint.
   * @param {number} mainAtom - Serial of the other endpoint.
   * @param {number} order - Bond order.
   * @param {*} type - Bond type, one of `Bond.BondType`.
   */
  _parseBond(eAtom, mainAtom, order, type) {
    if (eAtom >= 0 && mainAtom >= 0) {
      const left = Math.min(eAtom, mainAtom)
      const right = Math.max(eAtom, mainAtom)
      this._complex.addBond(left, right, order, type, true)
    }
  }

  /**
   * Replaces the atom serials stored in every bond's `_left`/`_right` with
   * the atoms themselves (`null` when the serial is unknown) and stores the
   * serial-to-atom map in `_serialAtomMap`.
   */
  _fixBondsArray() {
    const serialAtomMap = (this._serialAtomMap = {})
    const complex = this._complex

    const atoms = complex._atoms
    for (let i = 0, ni = atoms.length; i < ni; ++i) {
      const atom = atoms[i]
      serialAtomMap[atom.serial] = atom
    }

    const bonds = complex._bonds
    const { logger } = this
    for (let j = 0, nj = bonds.length; j < nj; ++j) {
      const bond = bonds[j]
      if (bond._right < bond._left) {
        logger.debug('_fixBondsArray: Logic error.')
      }
      bond._left = serialAtomMap[bond._left] || null
      bond._right = serialAtomMap[bond._right] || null
    }
  }

  /**
   * Derives a numeric serial from an atom id by keeping only its digits,
   * e.g. `a12` and `atom12` both give 12.
   *
   * @param {string} id - Atom id attribute.
   * @returns {number} The serial, or `NaN` when the id holds no digit.
   */
  _atomSerial(id) {
    // The `g` flag matters: without it only the first non-digit is removed
    // and ids with a longer prefix parse as NaN.
    return parseInt(String(id).replace(/[^0-9]/g, ''), 10)
  }

  /**
   * Builds a `Complex` from one component description.
   *
   * Only atoms and bonds whose label carries the same molecule id as
   * `varData.curr` are used. Atoms without `x3`/`x2` are not added to the
   * complex. Atom nodes are mutated: 2D coordinates are moved to
   * `x3`/`y3`, a missing `z3` becomes `'0.0'`. SGroup `_atoms` entries are
   * replaced by the created complex atoms.
   *
   * Side effect: an element type missing from `Element.ByName` is registered
   * there as a copy of the last registered element with `number` incremented
   * and `fullName` set to `'Unknown'`.
   *
   * @param {object} varData - Component description (`atoms`, `bonds`,
   *   `labels`, `curr`, `originalCML`).
   * @returns {Complex} The finalized complex.
   */
  _parseSet(varData) {
    const complex = (this._complex = new Complex())
    const data = varData
    const { atoms, labels } = data
    const count = atoms.length
    const currentMolId = this._unpackLabel(data.curr).molId
    const inCurrentMolecule = (index) =>
      this._unpackLabel(labels[index]).molId === currentMolId

    // parse atoms in label order
    const order = []
    for (let i = 0; i < count; i++) {
      order.push(i)
    }
    order.sort((a, b) => labels[a] - labels[b])

    // use ' ' by default instead of synthetic creation of chain names
    const chainID = ' '
    const iCode = ' '
    const atomCharge = 0
    let chain = null

    order.forEach((index) => {
      if (!inCurrentMolecule(index)) {
        return
      }
      const atom = atoms[index]

      if (atom.sgroupRef) {
        atom.sgroupRef.forEach((sgroup) => {
          complex._sgroups.push(sgroup)
        })
      }

      if (!atom.x3 && !atom.x2) {
        return // no coordinates: the atom is not added to the complex
      }

      // One residue per connected component, named N<two-digit component id>.
      const resSeq = this._unpackLabel(labels[index]).compId
      let strLabel = resSeq.toString()
      if (strLabel.length === 1) {
        strLabel = `0${strLabel}`
      }
      const resName = `N${strLabel}`

      if (!chain || chain.getName() !== chainID) {
        chain = complex.getChain(chainID) || complex.addChain(chainID)
        this._residue = null
      }

      let residue = this._residue
      if (
        !residue ||
        residue.getSequence() !== resSeq ||
        residue.getICode() !== iCode
      ) {
        residue = chain.addResidue(resName, resSeq, iCode)
        this._residue = residue
      }

      let xyz
      if (atom.x3) {
        // A missing z3 is read as 0 instead of producing NaN.
        xyz = new Vector3(
          parseFloat(atom.x3),
          parseFloat(atom.y3),
          parseFloat(atom.z3 || '0.0')
        )
      } else {
        xyz = new Vector3(parseFloat(atom.x2), parseFloat(atom.y2), 0)
      }

      const elementName = atom.elementType.toUpperCase()
      let element = Element.ByName[elementName]
      if (!element) {
        const knownNames = Object.keys(Element.ByName)
        const lastKnown = Element.ByName[knownNames[knownNames.length - 1]]
        element = JSON.parse(JSON.stringify(lastKnown))
        element.number += 1
        element.name = elementName
        element.fullName = 'Unknown'
        Element.ByName[elementName] = element
      }

      const added = residue.addAtom(
        atom.elementType,
        element,
        xyz,
        Element.Role.SG,
        true,
        this._atomSerial(atom.id),
        ' ',
        1.0,
        0.0,
        atomCharge
      )
      if (atom.hydrogenCount) {
        added.hydrogenCount = parseInt(atom.hydrogenCount, 10)
      }
      if (atom.mrvValence) {
        added.valence = parseInt(atom.mrvValence, 10)
      }

      // Cross-link the complex atom and its source node, and normalize the
      // node so that coordinates live in x3/y3/z3 only.
      added.xmlNodeRef = atom
      if (atom.x2) {
        atom.x3 = atom.x2
        delete atom.x2
      }
      if (atom.y2) {
        atom.y3 = atom.y2
        delete atom.y2
      }
      if (!atom.z3) {
        atom.z3 = '0.0'
      }
      atom.complexAtom = added
    })

    data.bonds.forEach((cb) => {
      if (!inCurrentMolecule(cb.start) || !inCurrentMolecule(cb.end)) {
        return
      }
      const startAtom = atoms[cb.start]
      const endAtom = atoms[cb.end]
      if (!startAtom || !endAtom) {
        return // skip invalid
      }
      this._parseBond(
        this._atomSerial(startAtom.id),
        this._atomSerial(endAtom.id),
        cb.order,
        cb.type
      )
    })

    // SGroups hold atom nodes so far; swap them for the complex atoms.
    for (let i = 0; i < complex.getSGroupCount(); i++) {
      const sGrp = complex.getSGroups()[i]
      for (let j = 0; j < sGrp._atoms.length; j++) {
        sGrp._atoms[j] = sGrp._atoms[j].complexAtom
      }
    }

    // Drop the temporary back-references from the atom nodes.
    for (let i = 0; i < count; i++) {
      if (inCurrentMolecule(i)) {
        const atom = atoms[i]
        atom.complexAtom = null
        delete atom.complexAtom
      }
    }

    complex.originalCML = data.originalCML
    this._fixBondsArray()
    complex.finalize({
      needAutoBonding: false,
      detectAromaticLoops: this.settings.now.aromatic,
      enableEditing: this.settings.now.editing,
      serialAtomMap: this._serialAtomMap
    })
    this._serialAtomMap = null
    this._complex = null
    return complex
  }

  /**
   * Parses `this._data` and returns the resulting complex.
   *
   * @returns {Complex} The only complex, or a new complex joining all parsed
   *   complexes when there are several.
   * @throws {Error} When no atom was parsed.
   */
  parseSync() {
    const complexes = []
    const components = this._selectComponents(this._data)
    components.forEach((molSet) => {
      if (molSet.count === 0) {
        molSet.count = 1
      }
      for (let i = 0; i < molSet.count; i++) {
        molSet.curr = i + 1
        complexes.push(this._parseSet(molSet))
      }
    })

    let totalAtomsParsed = 0
    complexes.forEach((c) => {
      totalAtomsParsed += c.getAtomCount()
    })
    if (totalAtomsParsed <= 0) {
      throw new Error('The data does not contain valid atoms')
    }

    if (complexes.length > 1) {
      const joinedComplex = new Complex()
      joinedComplex.joinComplexes(complexes)
      joinedComplex.originalCML = complexes[0].originalCML
      return joinedComplex
    }
    if (complexes.length === 1) {
      return complexes[0]
    }
    return new Complex()
  }
}

// Format names and file extensions attached to the parser class as static
// properties.
Object.assign(CMLParser, {
  formats: ['cml'],
  extensions: ['.cml']
})

export default CMLParser