documentation/pipeline.md
> This code has not been updated. So usage may differ.
Here is an example of a pipeline to iterate across all the compounds in a list, merge them, score then and upload them.
For MPro, see the MPro.
## Prerequisite
### Template
I made an energy minimised template.
Technically, it need not be substrate bound, but having a closed active site is good.
Also it actually does not need to be energy minimised.
pdbcode = '6WOJ' # ADP bound.
from fragmenstein import Victor, Igor
Igor.download_map(pdbcode, pdbcode+'.ccp4')
# topology
from rdkit_to_params import Params
p = Params.from_smiles_w_pdbfile(pdb_file='mono.pdb',
smiles='Nc1ncnc2n(cnc12)[C@@H]3O[C@H](CO[P@@]([O-])(=O)O[P@@]([O-])(=O)OC[C@H]4O[C@@H](O)[C@H](O)[C@@H]4O)[C@@H](O)[C@H]3O',
name='APR',
proximityBonding=False)
p.dump('APR.params')
# start
import pyrosetta
pyrosetta.init(extra_options='-no_optH false -mute all -ex1 -ex2 -ignore_unrecognized_res false -load_PDB_components false -ignore_waters false')
#import nglview
#nglview.show_rosetta(p.test())
params_file = 'APR.params'
pdbfile = 'mono.pdb' # 6WOJ manually inspected.
pose = pyrosetta.Pose()
params_paths = pyrosetta.rosetta.utility.vector1_string()
params_paths.extend([params_file])
pyrosetta.generate_nonstandard_residue_set(pose, params_paths)
pyrosetta.rosetta.core.import_pose.pose_from_file(pose, pdbfile)
Igor.relax_with_ED(pose=pose, ccp4_file=pdbcode+'.ccp4')
pose.dump_pdb('mono.r.pdb')
# this part makes no sense, but is just an example —in reality checking the result visually would make more sense.
# or it could be done in Pyrosetta.
import pymol2
with pymol2.PyMOL() as pymol:
pymol.cmd.load('mono.r.pdb')
pymol.cmd.remove('resn APR')
pymol.cmd.save('template.pdb')
### Compound extraction
XChem did not used to provide extracted compounds. These compounds below however preserve covalent bonding
by using the */R atom as the attachment. Say `CC[Cl]` reacted with `[S-]CC`, the former would be `CC*`, where `*` has the place of `S`.
import os
from rdkit import Chem
from shutil import copyfile
hits = {}
# os.mkdir('comprimenda')
masterfolder = '/Users/matteo/Desktop/NSP3-macrodomain/mArh/aligned'
for subfolder in os.listdir(masterfolder):
molfile = os.path.join(masterfolder, subfolder, f'{subfolder}.mol')
if os.path.exists(molfile):
mol = Chem.MolFromMolFile(molfile)
mol.SetProp('_Name', subfolder)
hits[subfolder] = mol
copyfile(molfile, os.path.join('comprimenda', f'{subfolder}.mol'))
## Compound merger generation
It runs on a node on N processes and save the data as it goes along to a sqlite file to prevent and issues.
project = 'mergers'
nput_foldername = 'input'
##############################################
cores = 20
out_path = f'{project}'
db_name = f'{project}.sqlite'
##############################################
Touch the directory and DB file
import os, re
from sqlitedict import SqliteDict
import json
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
definite the process task
def process(x):
project = 'mergers'
db_name = f'{project}.sqlite'
import pyrosetta, logging
pyrosetta.distributed.maybe_init(extra_options='-no_optH false -mute all -ignore_unrecognized_res true -load_PDB_components false')
from fragmenstein import Victor
Victor.work_path = f'{project}' # db_name
Victor.fragmenstein_throw_on_discard= True
Victor.fragmenstein_joining_cutoff = 5 # 10
Victor.quick_renanimation = False
Victor.error_to_catch = Exception
#Victor.enable_stdout(logging.ERROR)
Victor.enable_logfile(f'{project}.log', logging.INFO)
Victor.log_errors()
from sqlitedict import SqliteDict
import json, logging
from fragmenstein.mpro import MProVictor
print('NEW', x)
try:
from rdkit import Chem
def loadmol(file):
mol = Chem.MolFromMolFile(file)
if mol.GetProp('_Name') == '':
mol.SetProp('_Name', file.split('/')[-1].replace('.mol',''))
return mol
frags = [loadmol(file) for file in x]
v = Victor(pdb_filename='input/template.pdb',
covalent_resi='81A', # a random residue is still required for the constaint ref atom.
covalent_resn='CYS')
v.combine(hits=frags)
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
results[v.long_name] = v.summarise()
if not v.error:
v.make_pse()
except Exception as error:
name = '-'.join([file.split('/')[-1].replace('.mol','') for file in x])
error_msg = f'{error.__class__.__name__} {error}'
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
name = '-'.join([file.split('/')[-1].replace('.mol','') for file in x])
results[name] = {'error': error_msg}
Victor.journal.critical(f'*** {error_msg}, files: {x}')
except ConnectionError:
pass
print('DONE', x)
return True
Get stuff started
from multiprocessing import Pool
import itertools, random, re
pool = Pool(cores, maxtasksperchild=1)
# get done list to prevent repeats
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
done = list(results.keys())
hits = [os.path.join(input_foldername, file) for file in os.listdir(input_foldername) if '.mol' in file]
to_do = [(a, b) for a, b in itertools.permutations(hits, 2) if f'{a.split("/")[-1]}-{b.split("/")[-1]}'.replace('.mol', '') not in done]
random.shuffle(to_do)
for pair in to_do:
pool.apply_async(process, (pair,))
The main process will be free. But the pool will be running.
pool.close()
pool.join()
print("completed")
## Tabulate
The upload step requires, `michelanglo_api` uploading the data to github.
Method to make table:
from sqlitedict import SqliteDict
from rdkit.Chem import PandasTools
import json
import pandas as pd
from fragmenstein import Victor
import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.stats import skewnorm, gennorm
def old_ranker(row):
try:
return float(row['∆∆G'])/5 + float(row.comRMSD) + row.N_unconstrained_atoms /5 - row.N_constrained_atoms / 10
#return float(row['∆∆G'])/(row.N_unconstrained_atoms + row.N_constrained_atoms * 0.5)*10 + float(row.comRMSD)
except:
return float('nan')
rank_weights = {'LE': 1., 'comRMSD': 2., 'atom_bonus': 2. , 'novelty_penalty': 5.}
def ranker(row):
try:
#atom_bonus = row.N_constrained_atoms / (20 + row.N_constrained_atoms)
#atom_bonus = skewnorm.pdf((row.N_constrained_atoms - 20)/8, 3)
ζ = (row.N_constrained_atoms**2 - 25**2)/500
atom_bonus = gennorm.pdf(ζ, 5) / 0.5445622105291682
novelty_penalty = row.N_unconstrained_atoms / row.N_constrained_atoms
return rank_weights['LE'] * float(row.LE) + \
rank_weights['comRMSD'] * float(row.comRMSD) + \
- rank_weights['atom_bonus'] * atom_bonus + \
rank_weights['novelty_penalty'] * novelty_penalty
except:
return float('nan')
def LE(row):
try:
return float(row['∆∆G'])/(row.N_unconstrained_atoms + row.N_constrained_atoms)
except:
return float('nan')
def get_mol3D(name):
path = os.path.join(Victor.work_path, name, name+'.minimised.mol')
if os.path.exists(path):
try:
mol = Chem.MolFromMolFile(path, sanitize=True)
if mol is None:
return None
Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL)
return mol
except Exception as error:
print(f'{type(error)}: {error}')
return None
else:
return None
def get_table(db_name, mols=True, mol_only=True):
results = SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True)
result_table = pd.DataFrame(results.values())
print(len(result_table), sum(~result_table['∆∆G'].isna()))
result_table['LE'] = result_table.apply(LE,1)
rank = result_table.apply(ranker, axis=1).rank()
m = np.nanmax(rank.values)
result_table['%Rank'] = rank / m * 100
result_table['N_hits'] = result_table.regarded.apply(lambda x: len(x) if str(x) != 'nan' else float('nan'))
result_table = result_table.loc[~result_table.smiles.isna()].sort_values(['%Rank'], axis=0)
if mols:
result_table['mol3D'] = result_table['name'].apply(get_mol3D)
#result_table['mol2D'] = result_table['name'].apply(get_mol2D)
PandasTools.AddMoleculeColumnToFrame(result_table,'smiles','mol2D')
if mol_only:
result_table = result_table.loc[~result_table.mol3D.isna()]
return result_table
Make table
##############################################
project = 'mergers'
from fragmenstein import Victor
Victor.work_path = project
db_name = f'{project}.sqlite'
result_table = get_table(db_name, mols=True)
#result_table
# shoot. I forgot to count atoms in hits...
atom_Ns = {}
for folder in ('newinputs',): #'input', 'UCSF2-hits', 'frags'):
for file in os.listdir(folder):
if '.mol' in file:
mol = Chem.MolFromMolFile(os.path.join(folder, file), sanitize=False)
if mol is None:
atom_Ns[file.replace('.mol','')] = 0 # float nan?
else:
mol = Chem.GetMolFrags(mol, asMols=True)[0] # just in case
atom_Ns[file.replace('.mol','')] = mol.GetNumAtoms()
# add atom n
hit_counter = lambda hits: sum([atom_Ns[hit] for hit in hits])
merge_counter = lambda row: row.N_hit_atoms - row.N_unconstrained_atoms - row.N_constrained_atoms
result_table = result_table.assign(N_hit_atoms=result_table.regarded.apply(hit_counter))
result_table = result_table.assign(N_diff_atoms=result_table.apply(merge_counter, axis='columns'))
result_table
# upload
from michelanglo_api import MikeAPI
mike = MikeAPI('xxx', 'xxxx')
# p = mike.convert_pdb('6WOJ') # make new
p = mike.get_page('xxxxxxxx') # retrieve old
p.retrieve()
p.show_link()
task = 'Fragmenstein_NSP3_mergers'
#repo_name = 'Data_for_own_Michelanglo_pages2'
repo_name = 'NSP3-macrodomain'
folder = 'molfiles'
title = 'Fragmenstein NSP3 Mergers'
#gitfolder=f'/well/brc/matteo/{repo_name}'
gitfolder=f'/well/brc/matteo/NSP3/git-repo'
sdfile=f'{gitfolder}/{folder}/mergers.sdf'
xlsfile=f'{gitfolder}/{folder}/mergers.xlsx'
targetfolder=f'{gitfolder}/{folder}'
import os, re
if not os.path.exists(targetfolder):
os.mkdir(targetfolder)
target = 'Mike'
#target = 'Excel'
#target = 'Frag'
def clean_refs(x):
h = ','.join([xx for xx in x if 'diamond-x' in xx])
#h = ','.join(x)
if h == '':
return 'x0104_0'
else:
return h
from datetime import datetime, date
outgoing = result_table.sort_values(['%Rank'], axis=0).loc[~result_table.mol3D.isna()]
outgoing = outgoing.loc[outgoing.regarded.apply(lambda x: len(x) >= 1)]
outgoing['ref_mols'] = outgoing.regarded.apply(clean_refs)
# frgaments have _Greek
outgoing.ref_mols.replace('_[α-ω]', '', regex=True, inplace=True)
outgoing.ref_mols.replace('mArh-', '', regex=True, inplace=True)
outgoing.ref_mols.replace('diamond-', '', regex=True, inplace=True)
outgoing.ref_mols.replace('mac-x(\d+)', r'MAC-\1_0_A', regex=True, inplace=True)
outgoing['regarded'] = outgoing.regarded.apply(lambda x: ','.join(x))
outgoing['disregarded'] = outgoing.disregarded.apply(lambda x: ','.join(x))
# outgoing['ref_mols'] = outgoing.apply(lambda row: row.regarded+','+row.disregarded if row.disregarded else row.regarded, axis=1)
outgoing['name'] = outgoing['name'].str.replace('-covalent', '')
outgoing['name'] = outgoing['name'].str.replace('mArh-', '')
outgoing['ref_pdb'] = 'x0104_0' #'6WOJ_0_A' #'X0104_0_A' #
outgoing['original SMILES'] = outgoing['smiles']
# ref_url - the url to the forum post that describes the work
# submitter_name - the name of the person submitting the compounds
# submitter_email - the email address of the submitter
# submitter_institution - the submitters institution
# generation_date - the date that the file was generated in format yyyy-mm-dd
# method
#ref_mols - a comma separated list of the fragments that inspired the design of the new molecule (codes as they appear in fragalysis - e.g. x0104_0,x0692_0)
#ref_pdb - either (a) a filepath (relative to the sdf file) to an uploaded pdb file (e.g. Mpro-x0692_0/Mpro-x0692_0_apo.pdb) or (b) the code to the fragment pdb from fragalysis that should be used (e.g. x0692_0)
#original SMILES
metadata = {'name': 'ver_1.2',
'submitter_name': 'Matteo Ferla',
'submitter_email': 'matteo@well.ox.ac.uk',
'submitter_institution': 'Universtity of Oxford',
'generation_date': date.today().isoformat(),
'method': task, #'Fragmenstein-top500-automerger',
'original SMILES': 'molecule smiles',
#'smiles': 'molecule smiles used',
'ref_url': 'https://github.com/matteoferla/NSP3-macrodomain',
'ref_mols': 'all reference molecules',
'ref_pdb': 'All ligands were evaluated against the apo of Rosetta-ED-guided-minimised 6WOJ',
'N_hits': 'Number of hits used',
'N_constrained_atoms': 'Number of atoms in the submission that were constrained',
'N_diff_atoms': 'Difference in number of heavy atoms between the merger and the hits (negative: atoms added, positive: atoms merged)',
#'N_unconstrained_atoms': 'Number of heavy atoms in the submission that were NOT constrained',
#'runtime': 'seconds it took',
'regarded': 'Fragments used for mapping',
'disregarded': 'Fragments rejected for mapping',
'comRMSD': 'Combined RMSD from the atoms of the fragments that contributed to the position of the followup',
'∆∆G': 'Difference in Gibbs Free energy relative to unbound molecule in kcal/mol (ref2015 scorefxn; negative=Good)',
#'∆G_bound': 'Gibbs Free energy of ligand bound',
#'∆G_unbound': 'Gibbs Free energy of ligand unbound',
'LE': 'Ligand efficiency (kcal/mol/N_heavy)',
'%Rank':f"Sorted by {rank_weights['comRMSD']}x RSMD (high is bad) + "+\
f"{rank_weights['LE']}x ligand efficiency (high is bad) - "+\
f"{rank_weights['atom_bonus']}x N_constrained_atoms/(20+N_constrained_atoms) + "+\
f"{rank_weights['novelty_penalty']}x N_unconstrained_atoms/N_constrained_atoms",
'mol3D': Chem.MolFromSmiles('FOO'),
'mol2D': Chem.MolFromSmiles('FOO'),
}
if target == 'Frag':
del metadata['regarded']
del metadata['disregarded']
del metadata['mol2D']
outgoing = outgoing.iloc[:500]
elif target == 'Mike':
del metadata['mol2D']
elif target == 'Excel':
del metadata['ref_url']
del metadata['ref_mols']
del metadata['ref_pdb']
del metadata['submitter_name']
del metadata['submitter_email']
del metadata['submitter_institution']
del metadata['generation_date']
del metadata['mol3D']
#outgoing = outgoing[list(set(outgoing.columns.values) - set(metadata.keys()))]
# add fist compound
outgoing = pd.concat([pd.DataFrame([metadata]), outgoing], ignore_index=True)
# upload
title = 'Fragmenstein/Smallworld NSP3 mergers'
p.description = f'''
## {title} {date.today().isoformat()}
[Fragmenstein](https://github.com/matteoferla/Fragmenstein) scored suggestions from Smallworld server.
> For files and notes see [NSP3 data on GitHub](https://github.com/matteoferla/NSP3-macrodomain).
'''
p.loadfun = ''
p.title = title
p.columns_viewport = 6
p.columns_text = 6
p.sdf_to_mols(sdfile=sdfile,
targetfolder=targetfolder,
skip_first=True)
p.sdf_to_json(sdfile=sdfile,
keys=('regarded',
'∆∆G', 'LE', 'N_hits', 'N_constrained_atoms', 'N_diff_atoms', 'comRMSD',
'%Rank'),
key_defaults=('', # regarded
999., #∆∆G
999., #LE
0, #N_hits
0, #N_constrained_atoms
0, #N_diff_atoms
999., #comRMSD
100. #%Rank
),
filename=f'{targetfolder}/data.json',
spaced=True)
p.make_fragment_table(sdfile=sdfile,
username='matteoferla',
repo_name=repo_name,
foldername=folder,
protein_sele='81:A',
sort_col=8,
sort_dir='asc',
template_row=-1,
fragment_row=1,
jsonfile='data.json')
p.commit()
Copy the inputs
import os, re, shutil
hit_folder = 'newinputs' #'UCSF2-hits' #'frags' #'input'
for file in os.listdir(hit_folder):
if '.mol' in file:
print(file)
shutil.copy(os.path.join(hit_folder, file),
os.path.join(targetfolder, file.replace('_0_','_'))) #
shutil.copy(os.path.join('input', 'template.pdb'),
os.path.join(targetfolder, 'template.pdb'))
Git push!
Alternatively for an SDF for Fragalysis say
## MAKE SDF
assert target != 'Excel', 'Requires mol2D'
from rdkit.Chem import PandasTools
#Fragmenstein_permissive_rescored_20200609.sdf
PandasTools.WriteSDF(outgoing.iloc[:501], sdfile, molColName='mol3D', idName='name',
properties=list(set(metadata.keys()) - {'name', 'mol3D',
# 'N_diff_atoms',
'method',
'submitter_name',
'submitter_institution',
'submitter_email'
}), allNumeric=False)