Package Chem :: Package ChemUtils :: Module TemplateExpand
[hide private]
[frames] | no frames]

Source Code for Module Chem.ChemUtils.TemplateExpand

  1  # $Id: TemplateExpand.py 872 2008-05-15 15:32:13Z landrgr1 $ 
  2  # 
  3  #  Created by Greg Landrum August, 2006 
  4  # 
  5  # 
  6  import RDLogger as logging 
  7  logger = logging.logger() 
  8  logger.setLevel(logging.INFO) 
  9  import Chem 
 10  from Chem import Crippen 
 11  from Chem import AllChem 
 12  from Chem.ChemUtils.AlignDepict import AlignDepict 
 13  import sys 
 14  _version="0.7.1" 
 15  _greet="This is TemplateExpand version %s"%_version 
 16   
 17  _usage=""" 
 18  Usage: TemplateExpand [options] template <sidechains> 
 19   
 20   Unless otherwise indicated, the template and sidechains are assumed to be 
 21     Smiles 
 22   
 23   Each sidechain entry should be: 
 24     [-r] SMARTS filename 
 25       The SMARTS pattern is used to recognize the attachment point, 
 26       if the -r argument is not provided, then atoms matching the pattern 
 27       will be removed from the sidechains. 
 28     or 
 29     -n filename 
 30       where the attachment atom is the first atom in each molecule 
 31     The filename provides the list of potential sidechains. 
 32   
 33   options: 
 34     -o filename.sdf:      provides the name of the output file, otherwise 
 35                           stdout is used 
 36   
 37     --sdf :               expect the sidechains to be in SD files 
 38   
 39     --moltemplate:        the template(s) are in a mol/SD file, new depiction(s) 
 40                           will not be generated unless the --redraw argument is also 
 41                           provided 
 42   
 43     --smilesFileTemplate: extract the template(s) from a SMILES file instead of  
 44                           expecting SMILES on the command line. 
 45   
 46     --redraw:             generate a new depiction for the molecular template(s) 
 47   
 48     --useall: 
 49       or 
 50     --useallmatches:      generate a product for each possible match of the attachment 
 51                           pattern to each sidechain. If this is not provided, the first 
 52                           match (not canonically defined) will be used. 
 53   
 54     --force:              by default, the program prompts the user if the library is  
 55                           going to contain more than 1000 compounds. This argument  
 56                           disables the prompt. 
 57      
 58     --templateSmarts="smarts":  provides a space-delimited list containing the SMARTS  
 59                                 patterns to be used to recognize attachment points in 
 60                                                   the template 
 61                
 62     --autoNames:          when set this toggle causes the resulting compounds to be named 
 63                           based on there sequence id in the file, e.g.  
 64                           "TemplateEnum: Mol_1", "TemplateEnum: Mol_2", etc. 
 65                           otherwise the names of the template and building blocks (from 
 66                           the input files) will be combined to form a name for each 
 67                           product molecule. 
 68                                
 69  """ 
70 -def Usage():
71 import sys 72 print >>sys.stderr,_usage 73 sys.exit(-1)
74 75 nDumped=0
76 -def _exploder(mol,depth,sidechains,core,chainIndices,autoNames=True,templateName='', 77 resetCounter=True):
78 global nDumped 79 if resetCounter: 80 nDumped=0 81 ourChains = sidechains[depth] 82 patt = '[%d*]'%(depth+1) 83 patt = Chem.MolFromSmiles(patt) 84 for i,(chainIdx,chain) in enumerate(ourChains): 85 tchain = chainIndices[:] 86 tchain.append((i,chainIdx)) 87 rs = Chem.ReplaceSubstructs(mol,patt,chain,replaceAll=True) 88 if rs: 89 r = rs[0] 90 if depth<len(sidechains)-1: 91 for entry in _exploder(r,depth+1,sidechains,core,tchain, 92 autoNames=autoNames,templateName=templateName, 93 resetCounter=0): 94 yield entry 95 else: 96 try: 97 Chem.SanitizeMol(r) 98 except ValueError: 99 import traceback 100 traceback.print_exc() 101 continue 102 if r.HasSubstructMatch(core): 103 try: 104 AlignDepict(r,core) 105 except: 106 import traceback 107 traceback.print_exc() 108 print >>sys.stderr,Chem.MolToSmiles(r) 109 else: 110 print >>sys.stderr,'>>> no match' 111 AllChem.Compute2DCoords(r) 112 Chem.Kekulize(r) 113 if autoNames: 114 tName = "TemplateEnum: Mol_%d"%(nDumped+1) 115 else: 116 tName = templateName 117 for bbI,bb in enumerate(tchain): 118 bbMol = sidechains[bbI][bb[0]][1] 119 if bbMol.HasProp('_Name'): 120 bbNm = bbMol.GetProp('_Name') 121 else: 122 bbNm = str(bb[1]) 123 tName += '_' + bbNm 124 125 r.SetProp("_Name",tName) 126 r.SetProp('seq_num',str(nDumped+1)) 127 r.SetProp('reagent_indices','_'.join([str(x[1]) for x in tchain])) 128 for bbI,bb in enumerate(tchain): 129 bbMol = sidechains[bbI][bb[0]][1] 130 if bbMol.HasProp('_Name'): 131 bbNm = bbMol.GetProp('_Name') 132 else: 133 bbNm = str(bb[1]) 134 r.SetProp('building_block_%d'%(bbI+1),bbNm) 135 for propN in bbMol.GetPropNames(): 136 r.SetProp('building_block_%d_%s'%(bbI+1,propN),bbMol.GetProp(propN)) 137 nDumped += 1 138 if not nDumped%100: 139 logger.info('Done %d molecules'%nDumped) 140 yield r
141
142 -def Explode(template,sidechains,outF,autoNames=True):
143 chainIndices=[] 144 core = Chem.DeleteSubstructs(template,Chem.MolFromSmiles('[*]')) 145 try: 146 templateName = template.GetProp('_Name') 147 except KeyError: 148 templateName="template" 149 for mol in _exploder(template,0,sidechains,core,chainIndices,autoNames=autoNames,templateName=templateName): 150 outF.write(Chem.MolToMolBlock(mol)) 151 for pN in mol.GetPropNames(): 152 print >>outF,'> <%s>\n%s\n'%(pN,mol.GetProp(pN)) 153 print >>outF,'$$$$'
154
155 -def MoveDummyNeighborsToBeginning(mol,useAll=False):
156 dummyPatt=Chem.MolFromSmiles('[*]') 157 matches = mol.GetSubstructMatches(dummyPatt) 158 res = [] 159 for match in matches: 160 matchIdx = match[0] 161 smi = Chem.MolToSmiles(mol,True,rootedAtAtom=matchIdx) 162 entry = Chem.MolFromSmiles(smi) 163 # entry now has [*] as atom 0 and the neighbor 164 # as atom 1. Cleave the [*]: 165 entry = Chem.DeleteSubstructs(entry,dummyPatt) 166 for propN in mol.GetPropNames(): 167 entry.SetProp(propN,mol.GetProp(propN)); 168 169 # now we have a molecule with the atom to be joined 170 # in position zero; Keep that: 171 res.append(entry) 172 if not useAll: 173 break 174 return res
175
176 -def ConstructSidechains(suppl,sma=None,replace=True,useAll=False):
177 if sma: 178 try: 179 patt = Chem.MolFromSmarts(sma) 180 except: 181 logger.error('could not construct pattern from smarts: %s'%sma, 182 exc_info=True) 183 return None 184 else: 185 patt = None 186 187 if replace: 188 replacement = Chem.MolFromSmiles('[*]') 189 190 res = [] 191 for idx,mol in enumerate(suppl): 192 if not mol: 193 continue 194 if patt: 195 if not mol.HasSubstructMatch(patt): 196 logger.warning('The substructure pattern did not match sidechain %d. This may result in errors.'%(idx+1)) 197 if replace: 198 tmp = list(Chem.ReplaceSubstructs(mol,patt,replacement)) 199 if not useAll: tmp = [tmp[0]] 200 for i,entry in enumerate(tmp): 201 entry = MoveDummyNeighborsToBeginning(entry) 202 if not entry: 203 continue 204 entry = entry[0] 205 206 for propN in mol.GetPropNames(): 207 entry.SetProp(propN,mol.GetProp(propN)); 208 # now we have a molecule with the atom to be joined 209 # in position zero; Keep that: 210 tmp[i] = (idx+1,entry) 211 else: 212 # no replacement, use the pattern to reorder 213 # atoms only: 214 matches = mol.GetSubstructMatches(patt) 215 if matches: 216 tmp = [] 217 for match in matches: 218 smi = Chem.MolToSmiles(mol,True,rootedAtAtom=match[0]) 219 entry = Chem.MolFromSmiles(smi) 220 for propN in mol.GetPropNames(): 221 entry.SetProp(propN,mol.GetProp(propN)); 222 223 # now we have a molecule with the atom to be joined 224 # in position zero; Keep that: 225 tmp.append((idx+1,entry)) 226 else: 227 tmp = None 228 else: 229 tmp = [(idx+1,mol)] 230 if tmp: 231 res.extend(tmp) 232 return res
233 234 if __name__=='__main__': 235 import getopt,sys 236 print >>sys.stderr,_greet 237 238 try: 239 args,extras = getopt.getopt(sys.argv[1:],'o:h',[ 240 'sdf', 241 'moltemplate', 242 'smilesFileTemplate', 243 'templateSmarts=', 244 'redraw', 245 'force', 246 'useall', 247 'useallmatches', 248 'autoNames', 249 ]) 250 except: 251 import traceback 252 traceback.print_exc() 253 Usage() 254 255 if len(extras)<3: 256 Usage() 257 258 tooLong=1000 259 sdLigands=False 260 molTemplate=False 261 redrawTemplate=False 262 outF=None 263 forceIt=False 264 useAll=False 265 templateSmarts=[] 266 smilesFileTemplate=False 267 autoNames=False 268 for arg,val in args: 269 if arg=='-o': 270 outF=val 271 elif arg=='--sdf': 272 sdLigands=True 273 elif arg=='--moltemplate': 274 molTemplate=True 275 elif arg=='--smilesFileTemplate': 276 smilesFileTemplate=True 277 elif arg=='--templateSmarts': 278 templateSmarts = val 279 elif arg=='--redraw': 280 redrawTemplate=True 281 elif arg=='--force': 282 forceIt=True 283 elif arg=='--autoNames': 284 autoNames=True 285 elif arg in ('--useall','--useallmatches'): 286 useAll=True 287 elif arg=='-h': 288 Usage() 289 sys.exit(0) 290 291 292 if templateSmarts: 293 splitL = templateSmarts.split(' ') 294 templateSmarts = [] 295 for i,sma in enumerate(splitL): 296 patt = Chem.MolFromSmarts(sma) 297 if not patt: 298 raise ValueError,'could not convert smarts "%s" to a query'%sma 299 if i>=4: 300 i+=1 301 replace = Chem.MolFromSmiles('[%d*]'%(i+1)) 302 templateSmarts.append((patt,replace)) 303 304 if molTemplate: 305 try: 306 s = Chem.SDMolSupplier(extras[0]) 307 templates = [x for x in s] 308 except: 309 logger.error('Could not construct templates from input file: %s'%extras[0], 310 exc_info=True) 311 sys.exit(1) 312 if redrawTemplate: 313 for template in templates: 314 AllChem.Compute2DCoords(template) 315 else: 316 if not smilesFileTemplate: 317 try: 318 templates = [Chem.MolFromSmiles(extras[0])] 319 except: 320 logger.error('Could not construct template from smiles: %s'%extras[0], 321 exc_info=True) 322 sys.exit(1) 323 else: 324 try: 325 s = Chem.SmilesMolSupplier(extras[0],titleLine=False) 326 templates = [x for x in s] 327 except: 328 logger.error('Could not construct templates from input file: %s'%extras[0], 329 exc_info=True) 330 sys.exit(1) 331 for template in templates: 332 AllChem.Compute2DCoords(template) 333 if templateSmarts: 334 finalTs = [] 335 for i,template in enumerate(templates): 336 for j,(patt,replace) in enumerate(templateSmarts): 337 if not template.HasSubstructMatch(patt): 338 logger.error('template %d did not match sidechain pattern %d, skipping it'%(i+1,j+1)) 339 template =None 340 break 341 template = Chem.ReplaceSubstructs(template,patt,replace)[0] 342 if template: 343 Chem.SanitizeMol(template) 344 finalTs.append(template) 345 templates = finalTs 346 347 sidechains = [] 348 pos = 1 349 while pos<len(extras): 350 if extras[pos]=='-r': 351 replaceIt=False 352 pos += 1 353 else: 354 replaceIt=True 355 if extras[pos]=='-n': 356 sma = None 357 else: 358 sma = extras[pos] 359 pos += 1 360 try: 361 dat = extras[pos] 362 except IndexError: 363 logger.error('missing a sidechain filename') 364 sys.exit(-1) 365 pos += 1 366 if sdLigands: 367 try: 368 suppl = Chem.SDMolSupplier(dat) 369 except: 370 logger.error('could not construct supplier from SD file: %s'%dat, 371 exc_info=True) 372 suppl = [] 373 else: 374 tmpF = file(dat,'r') 375 inL = tmpF.readline() 376 if len(inL.split(' '))<2: 377 nmCol=-1 378 else: 379 nmCol=1 380 try: 381 suppl = Chem.SmilesMolSupplier(dat,nameColumn=nmCol) 382 except: 383 logger.error('could not construct supplier from smiles file: %s'%dat, 384 exc_info=True) 385 suppl = [] 386 suppl = [x for x in suppl] 387 chains = ConstructSidechains(suppl,sma=sma,replace=replaceIt,useAll=useAll) 388 if chains: 389 sidechains.append(chains) 390 count = 1 391 for chain in sidechains: 392 count *= len(chain) 393 count *= len(templates) 394 if not sidechains or not count: 395 print >>sys.stderr,"No molecules to be generated." 396 sys.exit(0) 397 398 if not forceIt and count>tooLong: 399 print >>sys.stderr,"This will generate %d molecules."%count 400 print >>sys.stderr,"Continue anyway? [no] ", 401 sys.stderr.flush() 402 ans = sys.stdin.readline() 403 if ans not in ('y','yes','Y','YES'): 404 sys.exit(0) 405 406 if outF and outF!="-": 407 try: 408 outF = file(outF,'w+') 409 except IOError: 410 logger.error('could not open file %s for writing'%(outF), 411 exc_info=True) 412 else: 413 outF = sys.stdout 414 415 for template in templates: 416 Explode(template,sidechains,outF,autoNames=autoNames) 417