Package rdkit :: Package Chem :: Package Fingerprints :: Module FingerprintMols
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Fingerprints.FingerprintMols

  1  # $Id: FingerprintMols.py 997 2009-02-25 06:12:43Z glandrum $ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ utility functionality for fingerprinting sets of molecules 
  8   includes a command line app for working with fingerprints 
  9   and databases 
 10   
 11   
 12  Sample Usage: 
 13   
 14    python FingerprintMols.py  -d data.gdb \ 
 15          -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID"  \ 
 16          --outTable="daylight_sig" 
 17   
 18   
 19  """ 
 20  from rdkit import Chem 
 21  from rdkit.Chem import MACCSkeys 
 22  from rdkit.ML.Cluster import Murtagh 
 23  from rdkit import DataStructs 
 24  import sys 
 25  import cPickle 
 26   
 27  _cvsVersion="$Id: FingerprintMols.py 997 2009-02-25 06:12:43Z glandrum $" 
 28  idx1 = _cvsVersion.find(':')+1 
 29  idx2 = _cvsVersion.rfind('$') 
 30  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 31   
 32   
def error(msg):
    """Write the error text *msg* to standard error.

    The text is written exactly as given; no newline is appended.
    """
    stream = sys.stderr
    stream.write(msg)
def message(msg):
    """Write the informational text *msg* to standard error.

    The text is written exactly as given; no newline is appended.
    """
    stream = sys.stderr
    stream.write(msg)
37
def GetDaylightFingerprint(mol):
    """Fingerprint *mol* using the default parameters.

    The defaults are the attributes of a freshly constructed
    FingerprinterDetails instance; they are forwarded to FingerprintMol()
    as keyword arguments.

    Returns whatever FingerprintMol() returns for *mol*.
    """
    details = FingerprinterDetails()
    # Argument unpacking replaces the deprecated apply() builtin, which is
    # not available in Python 3.
    return FingerprintMol(mol, **details.__dict__)
42
def FoldFingerprintToTargetDensity(fp, **fpArgs):
    """Fold *fp* in half until it is dense enough or cannot shrink further.

    Arguments:
      - fp: a fingerprint providing GetNumOnBits() and GetNumBits()
      - fpArgs: must contain 'tgtDensity' (the target fraction of on bits);
        'minSize' is read only when the fingerprint is below that density.
        Any other keys are ignored.

    Folding stops as soon as the on-bit density reaches fpArgs['tgtDensity']
    or when another fold would leave no more than fpArgs['minSize'] bits.

    Returns the (possibly folded) fingerprint.  A fingerprint reporting zero
    bits is returned unchanged instead of raising ZeroDivisionError.
    """
    tgtDensity = fpArgs['tgtDensity']
    nOn = fp.GetNumOnBits()
    nTot = fp.GetNumBits()
    # `nTot > 0` protects the density computation from dividing by zero;
    # `//` keeps the integer halving identical under Python 2 and Python 3.
    while (nTot > 0 and float(nOn) / nTot < tgtDensity
           and nTot // 2 > fpArgs['minSize']):
        fp = DataStructs.FoldFingerprint(fp, 2)
        nOn = fp.GetNumOnBits()
        nTot = fp.GetNumBits()
    return fp
54
def FingerprintMol(mol,
                   fingerprinter=Chem.RDKFingerprint,
                   **fpArgs):
    """Generate a fingerprint for a single molecule.

    Arguments:
      - mol: the molecule to fingerprint
      - fingerprinter: the callable used to build the fingerprint
      - fpArgs: fingerprinting parameters; when none are supplied the
        attributes of a default FingerprinterDetails instance are used

    With Chem.RDKFingerprint the parameters minPath, maxPath, fpSize,
    bitsPerHash, useHs, tgtDensity and minSize are passed positionally, in
    that order.  Any other fingerprinter receives every parameter as a
    keyword argument and its result is then folded with
    FoldFingerprintToTargetDensity().

    Returns the fingerprint.
    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter == Chem.RDKFingerprint:
        rdkArgNames = ('minPath', 'maxPath', 'fpSize', 'bitsPerHash',
                       'useHs', 'tgtDensity', 'minSize')
        positional = [fpArgs[argName] for argName in rdkArgNames]
        return fingerprinter(mol, *positional)

    unfolded = fingerprinter(mol, **fpArgs)
    return FoldFingerprintToTargetDensity(unfolded, **fpArgs)
71 72
def FingerprintsFromSmiles(dataSource, idCol, smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1,
                           **fpArgs):
    """Fingerprint the molecules described by SMILES in *dataSource*.

    Arguments:
      - dataSource: an iterable of rows that can be indexed with idCol
        and smiCol
      - idCol: index of the molecule id within each row
      - smiCol: index of the SMILES within each row
      - fingerprinter: passed through to FingerprintMol()
      - reportFreq: when > 0, a progress message is written every
        reportFreq successfully processed molecules
      - maxMols: when > 0, processing stops after this many molecules
        have been fingerprinted
      - fpArgs: passed as keyword arguments to the fingerprinter

    Rows whose SMILES cannot be converted into a molecule are reported
    with error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, smi = str(entry[idCol]), str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # Parsing is best-effort, but a bare except would also swallow
            # KeyboardInterrupt and SystemExit.
            mol = None
        if not mol:
            error('Problems parsing SMILES: %s\n' % smi)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
101
def FingerprintsFromMols(mols,
                         fingerprinter=Chem.RDKFingerprint,
                         reportFreq=10, maxMols=-1,
                         **fpArgs):
    """Fingerprint a collection of already constructed molecules.

    Arguments:
      - mols: an iterable of (id, mol) 2-tuples
      - fingerprinter: passed through to FingerprintMol()
      - reportFreq: when > 0, a progress message is written every
        reportFreq successfully processed molecules
      - maxMols: when > 0, processing stops after this many molecules
        have been fingerprinted
      - fpArgs: passed as keyword arguments to the fingerprinter

    Entries whose molecule is false-y (e.g. None) are reported with
    error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for molId, mol in mols:
        if not mol:
            # The original message interpolated an undefined name (`smi`),
            # so a missing molecule raised NameError instead of being
            # reported; the id is the only identifying value available here.
            error('Problems processing molecule with id: %s\n' % (molId,))
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
125
def FingerprintsFromPickles(dataSource, idCol, pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1,
                            **fpArgs):
    """Fingerprint the molecules stored in serialized form in *dataSource*.

    Arguments:
      - dataSource: an iterable of rows that can be indexed with idCol
        and pklCol
      - idCol: index of the molecule id within each row
      - pklCol: index of the serialized molecule within each row; the
        value is converted with str() and handed to Chem.Mol()
      - fingerprinter: passed through to FingerprintMol()
      - reportFreq: when > 0, a progress message is written every
        reportFreq successfully processed molecules
      - maxMols: when > 0, processing stops after this many molecules
        have been fingerprinted
      - fpArgs: passed as keyword arguments to the fingerprinter

    Rows whose molecule cannot be reconstructed are reported with
    error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, pkl = str(entry[idCol]), str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # Reconstruction is best-effort, but a bare except would also
            # swallow KeyboardInterrupt and SystemExit.
            mol = None
        if not mol:
            error('Problems parsing pickle for id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
154
def FingerprintsFromDetails(details, reportFreq=10):
    """Run the fingerprinting job described by *details*.

    Arguments:
      - details: an object carrying the attributes set up by
        FingerprinterDetails (dbName, tableName, inFileName, useSmiles,
        useSD, idName, smilesName, molPklName, maxMols, outFileName,
        outDbName, outTableName, fpColName, replaceTable, ...).
        details.idName is filled in when it is empty.
      - reportFreq: when > 0, progress messages are written every
        reportFreq molecules.  It is forwarded to the fingerprinting
        stage unless details already carries a 'reportFreq' attribute.

    The input is chosen in this order:
      1. the database table details.dbName / details.tableName,
      2. the text file details.inFileName (when details.useSmiles is set),
      3. the SD file details.inFileName (when details.useSD is set).

    When fingerprints were generated they are optionally pickled one by
    one to details.outFileName and/or stored in the table
    details.outTableName of the database details.outDbName (falling back
    to details.dbName).

    Returns the list of (id, fingerprint) 2-tuples, or None when no input
    data could be read.
    """
    import traceback

    data = None
    # Bound up front so that a failed read leaves a well-defined "no data"
    # state instead of an unbound local further down.
    dataSet = None
    idCol = 0
    smiCol = 1
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # Only used to report connection problems early; the data
            # itself is fetched through DataUtils below.
            DbConnect(details.dbName, details.tableName)
        except Exception:
            error('Problems establishing connection to database: %s|%s\n' % (
                details.dbName, details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName,
                                                   details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName,
                                                     details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(
                details.inFileName,
                onlyCols=[details.idName, details.smilesName])
        except IOError:
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            # Plain iteration replaces the Python-2-only `.next()` loop.
            for mol in supplier:
                if not mol:
                    continue
                dataSet.append(mol)
                nRead = len(dataSet)
                if reportFreq > 0 and not nRead % reportFreq:
                    message('Read %d molecules\n' % (nRead))
                if details.maxMols > 0 and nRead >= details.maxMols:
                    break

        # Pair every molecule with its id: the requested property when the
        # molecule has it, otherwise the molecule's '_Name' property.
        for i, mol in enumerate(dataSet):
            if mol.HasProp(details.idName):
                nm = mol.GetProp(details.idName)
            else:
                nm = mol.GetProp('_Name')
            dataSet[i] = (nm, mol)

    fps = None
    if dataSet:
        fpKwargs = dict(details.__dict__)
        # Without this the caller's reportFreq only affected SD reading and
        # was silently dropped for the fingerprinting stage.
        fpKwargs.setdefault('reportFreq', reportFreq)
        if details.useSD:
            fps = FingerprintsFromMols(dataSet, **fpKwargs)
        else:
            data = dataSet.GetNamedData()
            if details.molPklName:
                fps = FingerprintsFromPickles(data, idCol, smiCol, **fpKwargs)
            else:
                fps = FingerprintsFromSmiles(data, idCol, smiCol, **fpKwargs)

    if fps:
        if details.outFileName:
            outF = open(details.outFileName, 'wb+')
            try:
                for entry in fps:
                    cPickle.dump(entry, outF)
            finally:
                # Close the file even when pickling fails part-way.
                outF.close()
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbInfo, DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            # We don't have a db open already, so we'll need to figure out
            # the types of our columns...
            #
            # NOTE(review): `data` is only populated for table/text input;
            # with SD input it is still None here and len(data) will raise.
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings(
                [details.idName, details.smilesName], colTypes,
                keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName,
                                  DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            existing = [x.upper() for x in conn.GetTableNames()]
            if details.replaceTable or \
               details.outTableName.upper() not in existing:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for molId, fp in fps:
                tpl = molId, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
274 # ------------------------------------------------ 275 # 276 # Command line parsing stuff 277 # 278 # ------------------------------------------------ 279
class FingerprinterDetails(object):
    """ container for the settings of a fingerprinting run

    Construction fills in the defaults for fingerprinting, screening
    and clustering.

    """

    def __init__(self):
        # The three groups touch disjoint attributes, so the order in
        # which they are populated does not matter.
        self._clusterInit()
        self._screenerInit()
        self._fingerprinterInit()

    def _fingerprinterInit(self):
        """ installs the defaults used when generating fingerprints """
        # where molecules come from
        self.dbName = ''
        self.tableName = ''
        self.inFileName = ''
        self.idName = ''
        self.smilesName = 'SMILES'
        self.molPklName = ''
        self.useSmiles = True
        self.useSD = False
        self.maxMols = -1

        # how fingerprints are generated
        self.fingerprinter = Chem.RDKFingerprint
        self.fpSize = 2048
        self.minSize = 64
        self.tgtDensity = 0.3
        self.minPath = 1
        self.maxPath = 7
        self.bitsPerHash = 4
        self.discrimHash = 0
        self.useHs = 0
        self.useValence = 0

        # where fingerprints go
        self.outFileName = ''
        self.outDbName = ''
        self.outTableName = ''
        self.fpColName = "AutoFragmentFP"
        self.replaceTable = True

    def _screenerInit(self):
        """ installs the defaults used when screening """
        self.metric = DataStructs.TanimotoSimilarity
        self.doScreen = ''
        self.doThreshold = 0
        self.screenThresh = 0.75
        self.topN = 10
        self.smilesTableName = ''
        self.probeSmiles = ''
        self.probeMol = None
        self.noPickle = 0

    def _clusterInit(self):
        """ installs the defaults used when clustering """
        self.clusterAlgo = Murtagh.WARDS
        self.actTableName = ''
        self.actName = ''

    def GetMetricName(self):
        """ returns a label for the current similarity metric

        'Tanimoto', 'Dice' or 'Cosine' for the corresponding DataStructs
        functions; any other true metric is returned as-is and a false
        one yields 'Unknown'.

        """
        labelled = (
            (DataStructs.TanimotoSimilarity, 'Tanimoto'),
            (DataStructs.DiceSimilarity, 'Dice'),
            (DataStructs.CosineSimilarity, 'Cosine'),
        )
        for metricFn, label in labelled:
            if self.metric == metricFn:
                return label
        if self.metric:
            return self.metric
        return 'Unknown'

    def SetMetricFromName(self, name):
        """ selects the similarity metric by (case-insensitive) name

        Recognized names are "Tanimoto", "Dice" and "Cosine"; any other
        name leaves the current metric untouched.

        """
        byName = {
            "TANIMOTO": DataStructs.TanimotoSimilarity,
            "DICE": DataStructs.DiceSimilarity,
            "COSINE": DataStructs.CosineSimilarity,
        }
        key = name.upper()
        if key in byName:
            self.metric = byName[key]
351
def Usage():
    """ prints a usage string and exits

    The module-level _usageDoc text is written to standard output and the
    interpreter is then terminated via sys.exit(-1).

    """
    # A single parenthesized argument prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print(_usageDoc)
    sys.exit(-1)
# Help text printed by Usage(); keep it in sync with ParseArgs().
_usageDoc = """
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information. If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --molPkl=val: when set, the molecule column is interpreted as
      pickled molecules instead of SMILES.

    - --useSD: Assume that the input file is an SD file, not a SMILES
       table.

    - --idName=val: sets the name of the id column in the input
      database. Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_: name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints. If this table already exists, it will be
      replaced unless --keepTable is given.

    - --keepTable: do not replace the output db table if it already
      exists; the new fingerprints are added to it.

    - --outDbName: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val: base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens). Default is *64*

    - --density=val: target bit density in the fingerprint. The
      fingerprint will be folded until this density is
      reached. Default is *0.3*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

    - -h: print this message and exit.

"""
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
     _FingerprinterDetails_ instance with the results.

     Arguments:

       - details: an existing details object to update; when None a new
         FingerprinterDetails instance is created.

     The options are read from sys.argv[1:].  The first non-option
     argument, if any, is stored as details.inFileName.  Invalid options
     (and -h) print the usage message via Usage(), which exits.

     **Note**:

       - If you make modifications here, please update the global
         _usageDoc string so the Usage message is up to date.

       - This routine is used by both the fingerprinter, the clusterer and the
         screener; not all arguments make sense for all applications.

    """
    import getopt
    import sys

    longOpts = [
        'minSize=', 'maxSize=',
        'density=',
        'minPath=', 'maxPath=',
        # 'nBitsPerHash' is the documented spelling; 'bitsPerHash' was the
        # only one getopt used to accept, so both are recognized.
        'bitsPerHash=', 'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # SCREENING:
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # CLUSTERING:
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(sys.argv[1:], 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if extras:
        details.inFileName = extras[0]

    # Options that store their (converted) value in a single attribute.
    valueOpts = {
        '-d': ('dbName', str),
        '-t': ('tableName', str),
        '-o': ('outFileName', str),
        '--minSize': ('minSize', int),
        '--maxSize': ('fpSize', int),
        '--density': ('tgtDensity', float),
        '--outTable': ('outTableName', str),
        '--outDbName': ('outDbName', str),
        '--fpColName': ('fpColName', str),
        '--minPath': ('minPath', int),
        '--maxPath': ('maxPath', int),
        '--nBitsPerHash': ('bitsPerHash', int),
        '--bitsPerHash': ('bitsPerHash', int),
        '--smilesName': ('smilesName', str),
        '--molPkl': ('molPklName', str),
        '--idName': ('idName', str),
        '--maxMols': ('maxMols', int),
        # SCREENER:
        '--smilesTable': ('smilesTableName', str),
        '--smiles': ('probeSmiles', str),
        # CLUSTERS:
        '--actTable': ('actTableName', str),
        '--actName': ('actName', str),
    }

    for arg, val in args:
        if arg in valueOpts:
            attrName, convert = valueOpts[arg]
            setattr(details, attrName, convert(val))
        elif arg == '-H':
            details.useHs = 1
        elif arg == '-V':
            details.useValence = 1
        elif arg == '--discrim':
            details.discrimHash = 1
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        elif arg == '--keepTable':
            details.replaceTable = False

        # SCREENER:
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity

        # CLUSTERS:
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '-h':
            Usage()
        # '-s' and '--doScreen' are accepted by getopt for backward
        # compatibility but, as before, have no effect here.
    return details
if __name__ == '__main__':
    # Script entry point: announce the version, read the command line,
    # then run the fingerprinting job it describes.
    message("This is FingerprintMols version %s\n\n" % __VERSION_STRING)
    details = ParseArgs()
    FingerprintsFromDetails(details)