packages/miew/src/io/parsers/CIFParser.js

Summary

Maintainability
F
3 days
Test Coverage
import Parser from './Parser'
import chem from '../../chem'
import StructuralElement from '../../chem/StructuralElement'
import readCIF from './readCIF'
import { isArray, isString } from 'lodash'
import { Matrix4, Vector3 } from 'three'

const { Complex, Element, Helix, Sheet, Strand, Assembly, Molecule } = chem

const cRequiredAtomFields = [
  'auth_seq_id',
  'Cartn_x',
  'Cartn_y',
  'Cartn_z',
  'label_atom_id'
]

const cSecondaryCoding = {
  helx: 'helix',
  turn: 'turn',
  strn: 'strand'
}

function getTypeFromId(string) {
  const typeId = /[A-Za-z]+/.exec(string)
  if (!typeId) {
    return null
  }

  return cSecondaryCoding[typeId[0].toLowerCase()]
}

/**
 * Make valid object an array
 * @param arrayLikeObject
 * @return {array, object} array or object
 */
function arrize(arrayLikeObject) {
  if (
    arrayLikeObject === null ||
    arrayLikeObject === undefined ||
    isArray(arrayLikeObject)
  ) {
    return arrayLikeObject
  }
  return [arrayLikeObject]
}

function nameToElement(name) {
  // http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
  //
  // http://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html#note1
  //
  // Atom names start with element symbols right-justified in columns 13-14
  // as permitted by the length of the name. For example, the symbol FE for
  // iron appears in columns 13-14, whereas the symbol C for carbon appears
  // in column 14 (see Misaligned Atom Names). If an atom name has four
  // characters, however, it must start in column 13 even if the element
  // symbol is a single character (for example, see Hydrogen Atoms).

  const veryLong = name.trim().length === 4
  return name.slice(0, veryLong ? 1 : 2).trim()
}

class AtomDataError extends Error {
  constructor(message) {
    super()
    this.name = 'AtomDataError'
    this.message = message
  }
}

function _getOperations(operList) {
  if (!operList) {
    return null
  }
  const idc = arrize(operList.id)
  const { matrix, vector } = operList
  if (!idc || !matrix || !vector) {
    return null
  }

  const ops = []
  for (let i = 0, n = idc.length; i < n; ++i) {
    const mtx = new Matrix4()
    const { elements } = mtx

    for (let row = 0; row < 3; ++row) {
      const matrixData = matrix[row + 1]
      elements[row] = arrize(matrixData[1])[i]
      elements[row + 4] = arrize(matrixData[2])[i]
      elements[row + 8] = arrize(matrixData[3])[i]
      elements[row + 12] = arrize(vector[row + 1])[i]
    }
    ops[idc[i]] = mtx
  }
  return ops
}

function _extractOperations(assemblyGen, opsDict) {
  assemblyGen = isString(assemblyGen) ? assemblyGen : `${assemblyGen}`
  const l = assemblyGen.replace(/\)\s*\(/g, '!').replace(/[()']/g, '')
  const groupStr = l.split('!')
  const gps = []

  for (let grIdx = 0, grCount = groupStr.length; grIdx < grCount; ++grIdx) {
    const gr = groupStr[grIdx].split(',')
    const gp = []
    let idx = 0
    for (let i = 0, n = gr.length; i < n; ++i) {
      const s = gr[i]
      if (s.includes('-')) {
        const es = s.split('-')
        let j = parseInt(es[0], 10)
        const m = parseInt(es[1], 10)
        for (; j <= m; ++j) {
          gp[idx++] = opsDict[j]
        }
      } else {
        gp[idx++] = opsDict[s]
      }
    }
    gps.push(gp)
  }

  // traverse all groups from the end of array and make all mults
  const matrices = []
  let cnt = 0
  function traverse(level, mtx) {
    for (let ii = 0, nn = gps[level].length; ii < nn; ++ii) {
      const newMtx = mtx ? mtx.clone() : new Matrix4()
      newMtx.multiplyMatrices(gps[level][ii], newMtx)
      if (level === 0) {
        matrices[cnt++] = newMtx
      } else {
        traverse(level - 1, newMtx)
      }
    }
  }
  traverse(gps.length - 1)
  return matrices
}

class CIFParser extends Parser {
  constructor(data, options) {
    super(data, options)
    this.asymDict = {}
    this.molecules = []
    this._options.fileType = 'cif'
  }

  static canProbablyParse(data) {
    return isString(data) && /^\s*data_/i.test(data)
  }

  parseSync() {
    this.logger.info('Parsing CIF file..')
    const data = readCIF(this._data)
    return this._toComplex(data)
  }

  /**
   * Convert intermediate structure into our valid Complex object
   * @param cifData intermediate CIF object
   * @returns {Complex} complex
   * @private
   */
  _toComplex(cifData) {
    const complex = new Complex()
    const complexData = cifData[Object.keys(cifData)[0]]
    this._extractAtoms(complex, complexData)
    this._extractSecondary(complex, complexData)
    this._extractAssemblies(complex, complexData)
    this._extractMolecules(complex, complexData)
    this._extractMetadata(complex, complexData)
    complex.finalize({
      needAutoBonding: true,
      detectAromaticLoops: this.settings.now.aromatic,
      enableEditing: this.settings.now.editing
    })

    return complex
  }

  /**
   * Extract metadata
   * @param complex structure to fill
   * @param complexData complex data from CIF file
   * @private
   */

  _extractMetadata(complex, complexData) {
    const { metadata } = complex
    metadata.id = complexData.entry.id
    metadata.classification = complexData.struct_keywords.pdbx_keywords
    const databaserev = complexData.database_PDB_rev
    metadata.date =
      databaserev && databaserev.date_original ? databaserev.date_original : ''
    metadata.format = 'cif'
    metadata.title = []
    metadata.title[0] = complexData.struct.title
  }

  /**
   * Extract molecules information from CIF structure (should be called strictly after _extractAtoms)
   * @param complexData complex data from CIF file
   * @private
   */
  _extractMolecules(complex, complexData) {
    const molData = complexData.entity
    const names = arrize(molData.pdbx_description)
    const count = names.length
    let i

    // molecules names from cif
    for (i = 0; i < count; i++) {
      if (this.molecules[i]) {
        // molecule was created during atoms processing
        this.molecules[i].name = names[i]
      } else {
        // molecule wasn't created, because there is no atom which is contained
        this.molecules[i] = { name: names[i], residues: [] }
      }
    }

    // reorganize molecules for complex and check chains
    const molecules = complex.getMolecules()
    for (i = 0; i < count; i++) {
      const molecule = this.molecules[i]
      molecules[i] = new Molecule(complex, molecule.name, i + 1)
      molecules[i].residues = molecule.residues
    }
  }

  /**
   * Extract atom information from CIF structure and fill complex
   * @param {Complex} complex
   * @param complexData complex data from CIF file
   * @private
   */
  _extractAtoms(complex, complexData) {
    const atomData = complexData.atom_site
    if (!atomData) {
      throw new AtomDataError('CIF parsing error: atom_site is not specified!')
    }

    for (let f = 0, n = cRequiredAtomFields.length; f < n; ++f) {
      if (!atomData[cRequiredAtomFields[f]]) {
        throw new AtomDataError(
          `CIF parsing error: requires field ${cRequiredAtomFields[f]} not found!`
        )
      }
    }

    const { asymDict } = this
    // required fields
    const resIdc = arrize(atomData.auth_seq_id)
    const x = arrize(atomData.Cartn_x)
    const y = arrize(atomData.Cartn_y)
    const z = arrize(atomData.Cartn_z)
    const names = arrize(atomData.label_atom_id)
    const count = names.length
    // optional fields
    const group = arrize(atomData.group_PDB) || []
    const chainIdc = arrize(atomData.auth_asym_id) || []
    const chainLabelIdc = arrize(atomData.label_asym_id) || []
    const serials = arrize(atomData.id) || []
    const iCodes = arrize(atomData.pdbx_PDB_ins_code) || []
    const resNames = arrize(atomData.label_comp_id) || []
    const elements = arrize(atomData.type_symbol) || []
    const tempFactors = arrize(atomData.B_iso_or_equiv) || []
    const occupancies = arrize(atomData.occupancy) || []
    const charges = arrize(atomData.pdbx_formal_charge) || []
    const altLocs = arrize(atomData.label_alt_id) || []
    const models = arrize(atomData.pdbx_PDB_model_num) || []
    const molecules = arrize(atomData.label_entity_id) || []

    let chain = null
    let residue = null
    for (let i = 0; i < count; ++i) {
      const model = models[i] || 1
      if (model !== 1) {
        continue
      }
      const chainID = String(chainIdc[i] || ' ')

      if (!chain || chain.getName() !== chainID) {
        chain = complex.getChain(chainID) || complex.addChain(chainID)
      }
      asymDict[String(chainLabelIdc[i] || ' ')] = chainID
      const resSeq = resIdc[i]
      const iCode = String(iCodes[i] || ' ')
      const resName = String(resNames[i] || '')
      if (
        !residue ||
        residue.getSequence() !== resSeq ||
        residue.getICode() !== iCode
      ) {
        residue = chain.addResidue(resName, resSeq, iCode)

        // store residue in appropriate molecule
        const moleculeIdx = molecules[i] - 1
        let entity = this.molecules[moleculeIdx]
        if (!entity) {
          // create new molecule if it hasn't been created
          this.molecules[moleculeIdx] = { name: '', residues: [] }
          entity = this.molecules[moleculeIdx]
        }
        entity.residues.push(residue)
      }

      const name = names[i]
      const element = elements[i] || nameToElement(name)
      const type = Element.getByName(element)
      const role = Element.Role[name.trim()]
      const xyz = new Vector3(x[i], y[i], z[i])
      const het = group[i] === 'HETATM' || false
      const serial = serials[i] || i
      const tempFactor = tempFactors[i] || 0.0
      const occupancy = occupancies[i] || 0.0
      const altLoc = String(altLocs[i] || '')
      const charge = charges[i] || 0

      residue.addAtom(
        name,
        type,
        xyz,
        role,
        het,
        serial,
        altLoc,
        occupancy,
        tempFactor,
        charge
      )
    }
  }

  /**
   * Extracts secondary structure information from CIF intermediate data
   * and adds it into complex
   * @param {Complex} complex - complex to fill
   * @param complexData - CIF complex data
   * @private
   */
  _extractSecondary(complex, complexData) {
    if (complexData.struct_conf) {
      this._extractConfs(complex, complexData.struct_conf)
    }
    if (complexData.struct_sheet_range) {
      this._extractSheets(complex, complexData.struct_sheet_range)
    }
  }

  /**
   * Extracts sheets information from CIF intermediate data
   * and adds it into complex
   * @param {Complex} complex
   * @param sheetData
   * @private
   */
  _extractSheets(complex, sheetData) {
    const { asymDict } = this
    if (
      !sheetData.sheet_id ||
      !sheetData.id ||
      !sheetData.beg_label_seq_id ||
      !sheetData.end_label_seq_id ||
      !sheetData.beg_label_asym_id
    ) {
      return
    }
    // Strand(sheet, start, end, sense, cur, prev)
    const sheets = complex._sheets

    function getSheet(name) {
      const n = sheets.length
      for (let i = 0; i < n; ++i) {
        if (sheets[i]._name === name) {
          return sheets[i]
        }
      }
      sheets[n] = new Sheet(name, 0)
      return sheets[n]
    }

    const sheetNames = arrize(sheetData.sheet_id)
    const strandNames = arrize(sheetData.id)
    const starts = arrize(sheetData.beg_auth_seq_id)
    const ends = arrize(sheetData.end_auth_seq_id)
    const chains = arrize(sheetData.beg_label_asym_id)
    const stICodes = arrize(sheetData.pdbx_beg_PDB_ins_code) || []
    const endICodes = arrize(sheetData.pdbx_end_PDB_ins_code) || []

    for (let i = 0, n = strandNames.length; i < n; ++i) {
      const chain = complex.getChain(asymDict[chains[i]])
      const sheet = getSheet(sheetNames[i])
      const startIdx = starts[i]
      const endIdx = ends[i]
      const startICode = stICodes[i] || ' '
      const endICode = endICodes[i] || ' '

      const start = chain.findResidue(startIdx, startICode)
      const end = chain.findResidue(endIdx, endICode)

      // TODO think about last condition
      if (!start || !end) {
        continue
      }

      const strand = new Strand(sheet, start[0], end[0], 0, null, null)
      const residues = chain.getResidues()
      for (let r = start[1]; r <= end[1]; ++r) {
        residues[r]._secondary = strand
      }
      sheet.addStrand(strand)
      complex.structures.push(strand)
    }
  }

  /**
   * Extracts helix/turn/strand(?) information from CIF intermediate data
   * and adds it into complex
   * @param {Complex} complex
   * @param helicesData
   * @private
   */
  _extractConfs(complex, helicesData) {
    const { asymDict } = this
    if (
      !helicesData.conf_type_id ||
      !helicesData.beg_label_seq_id ||
      !helicesData.end_label_seq_id ||
      !helicesData.beg_label_asym_id
    ) {
      return
    }

    const types = arrize(helicesData.conf_type_id)
    const starts = arrize(helicesData.beg_auth_seq_id)
    const stICodes = arrize(helicesData.pdbx_beg_PDB_ins_code) || []
    const ends = arrize(helicesData.end_auth_seq_id)
    const endICodes = arrize(helicesData.pdbx_end_PDB_ins_code) || []
    const comments = arrize(helicesData.details) || []
    const lengths = arrize(helicesData.pdbx_PDB_helix_length) || []
    const helixClasses = arrize(helicesData.pdbx_PDB_helix_class) || []
    const names = arrize(helicesData.id) || []
    const chains = arrize(helicesData.beg_label_asym_id)

    for (let i = 0, n = types.length; i < n; ++i) {
      const type = getTypeFromId(types[i])
      if (!type) {
        continue
      }
      const name = names[i] || types[i]
      const chain = complex.getChain(asymDict[chains[i]])

      const startIdx = starts[i]
      const endIdx = ends[i]
      const startICode = stICodes[i] || ' '
      const endICode = endICodes[i] || ' '

      const start = chain.findResidue(startIdx, startICode)
      const end = chain.findResidue(endIdx, endICode)

      // TODO think about last condition
      if (!start || !end) {
        continue
      }
      const comment = comments[i] || ''
      const length = lengths[i] || 0
      const helixClass = helixClasses[i] || ' '
      let struct
      if (type === 'helix') {
        const idx = complex._helices.length
        struct = new Helix(
          helixClass,
          start[0],
          end[0],
          idx,
          name,
          comment,
          length
        )
        complex.addHelix(struct)
        complex.structures.push(struct)
      } else if (type === 'turn') {
        struct = new StructuralElement(
          StructuralElement.Type.TURN,
          start[0],
          end[0]
        )
        complex.structures.push(struct)
      } else {
        struct = null
      }
      if (!struct) {
        continue
      }
      const residues = chain.getResidues()
      for (let r = start[1]; r <= end[1]; ++r) {
        residues[r]._secondary = struct
      }
    }
  }

  /**
   * Extract biological assemblies information from CIF structure and fill complex
   * @param {Complex} complex
   * @param complexData complex data from CIF file
   * @private
   */
  _extractAssemblies(complex, complexData) {
    const { asymDict } = this
    const asmGen = complexData.pdbx_struct_assembly_gen
    if (!asmGen) {
      return
    }

    const asmIdx = arrize(asmGen.assembly_id)
    const asmOper = arrize(asmGen.oper_expression)
    const asmList = arrize(asmGen.asym_id_list)
    if (!asmIdx || !asmOper || !asmList) {
      return
    }

    const operList = _getOperations(complexData.pdbx_struct_oper_list)
    if (!operList) {
      return
    }

    for (let i = 0, n = asmIdx.length; i < n; ++i) {
      const asm = new Assembly(complex)
      const assemblyOps = _extractOperations(asmOper[i], operList)
      const entries = asmList[i].split(',')
      for (let ii = 0, nn = entries.length; ii < nn; ++ii) {
        const chain = entries[ii].trim()
        if (chain.length > 0) {
          asm.addChain(asymDict[chain])
        }
      }
      asm.matrices = assemblyOps
      complex.units.push(asm)
    }
  }
}

CIFParser.formats = ['cif', 'mmcif']
CIFParser.extensions = ['.cif', '.mmcif']

export default CIFParser