Package Chem :: Package Fingerprints :: Module FingerprintMols
[hide private]
[frames | no frames]

Source Code for Module Chem.Fingerprints.FingerprintMols

  1  # $Id: FingerprintMols.py 346 2007-09-27 05:27:16Z glandrum $ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ utility functionality for fingerprinting sets of molecules 
  8   includes a command line app for working with fingerprints 
  9   and databases 
 10   
 11   
 12  Sample Usage: 
 13   
 14    python FingerprintMols.py  -d data.gdb \ 
 15          -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID"  \ 
 16          --outTable="daylight_sig" 
 17   
 18   
 19  """ 
 20  import Chem 
 21  from Chem import MACCSkeys 
 22  from Dbase.DbConnection import DbConnect 
 23  from Dbase import DbInfo,DbUtils,DbModule 
 24  from ML.Data import DataUtils 
 25  from ML.Cluster import Murtagh 
 26  import DataStructs 
 27  import sys 
 28  import cPickle 
 29   
 30  _cvsVersion="$Id: FingerprintMols.py 346 2007-09-27 05:27:16Z glandrum $" 
 31  idx1 = _cvsVersion.find(':')+1 
 32  idx2 = _cvsVersion.rfind('$') 
 33  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 34   
 35   
def error(msg):
    """ writes an error message to stderr

    _msg_ is written as-is: no newline or prefix is added.

    """
    stream = sys.stderr
    stream.write(msg)
def message(msg):
    """ writes a status message to stderr

    _msg_ is written as-is: no newline or prefix is added.

    """
    stream = sys.stderr
    stream.write(msg)
40 41
def FingerprintMol(mol,
                   fingerprinter=Chem.DaylightFingerprint,
                   **fpArgs):
    """ generates a fingerprint for a single molecule

    **Arguments**:

      - mol: the molecule to be fingerprinted

      - fingerprinter: the callable used to generate the fingerprint

      - fpArgs: fingerprinting settings.  If none are provided, the
        defaults from a fresh _FingerprinterDetails_ instance are used.

    **Returns**: whatever _fingerprinter_ returns

    **Notes**:

      - for the default (Daylight-type) fingerprinter the settings are
        passed positionally; any of them missing from _fpArgs_ are
        filled in from the _FingerprinterDetails_ defaults, so callers
        may override only the values they care about.

      - any other fingerprinter receives _fpArgs_ as keyword arguments.

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter != Chem.DaylightFingerprint:
        return fingerprinter(mol,**fpArgs)

    # positional order used for the Daylight-type fingerprinter call
    argOrder = ('minPath','maxPath','fpSize','bitsPerHash',
                'useHs','tgtDensity','minSize')
    if [name for name in argOrder if name not in fpArgs]:
        # partial overrides: start from the defaults and layer the
        # caller's values on top (previously this raised KeyError)
        merged = dict(FingerprinterDetails().__dict__)
        merged.update(fpArgs)
        fpArgs = merged
    return fingerprinter(mol,*[fpArgs[name] for name in argOrder])
57
def FingerprintsFromSmiles(dataSource,idCol,smiCol,
                           fingerprinter=Chem.DaylightFingerprint,
                           reportFreq=10,maxMols=-1,
                           **fpArgs):
    """ fingerprints the molecules described by SMILES in _dataSource_

    **Arguments**:

      - dataSource: an iterable of indexable entries

      - idCol, smiCol: the positions of the id and the SMILES
        within each entry

      - fingerprinter: the callable handed to _FingerprintMol_

      - reportFreq: if > 0, a progress message is written every
        _reportFreq_ fingerprinted molecules

      - maxMols: if > 0, processing stops once this many molecules
        have been fingerprinted

      - fpArgs are passed as keyword arguments to the fingerprinter

    **Returns**: a list of 2-tuples: (id,fp)

    Entries whose SMILES cannot be converted to a molecule are
    reported via _error_ and skipped.

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId,smi = str(entry[idCol]),str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # best effort: a bad SMILES must not abort the batch, but
            # KeyboardInterrupt/SystemExit should still stop the run
            mol = None
        if not mol:
            error('Problems parsing SMILES: %s\n'%smi)
            continue
        fp = FingerprintMol(mol,fingerprinter,**fpArgs)
        res.append((molId,fp))
        nDone += 1
        if reportFreq>0 and not nDone % reportFreq:
            message('Done %d molecules\n'%(nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
86
def FingerprintsFromPickles(dataSource,idCol,pklCol,
                            fingerprinter=Chem.DaylightFingerprint,
                            reportFreq=10,maxMols=-1,
                            **fpArgs):
    """ fingerprints the molecules stored in pickled form in _dataSource_

    **Arguments**:

      - dataSource: an iterable of indexable entries

      - idCol, pklCol: the positions of the id and the molecule
        pickle within each entry

      - fingerprinter: the callable handed to _FingerprintMol_

      - reportFreq: if > 0, a progress message is written every
        _reportFreq_ fingerprinted molecules

      - maxMols: if > 0, processing stops once this many molecules
        have been fingerprinted

      - fpArgs are passed as keyword arguments to the fingerprinter

    **Returns**: a list of 2-tuples: (id,fp)

    Entries whose pickle cannot be converted to a molecule are
    reported via _error_ and skipped.

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId,pkl = str(entry[idCol]),str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # best effort: a bad pickle must not abort the batch, but
            # KeyboardInterrupt/SystemExit should still stop the run
            mol = None
        if not mol:
            error('Problems parsing pickle for id: %s\n'%molId)
            continue
        fp = FingerprintMol(mol,fingerprinter,**fpArgs)
        res.append((molId,fp))
        nDone += 1
        if reportFreq>0 and not nDone % reportFreq:
            message('Done %d molecules\n'%(nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
115
def FingerprintsFromDetails(details):
    """ runs a complete fingerprinting job as described by _details_

    **Arguments**:

      - details: a _FingerprinterDetails_-like object.  Its attributes
        select the input, the fingerprinting settings and the outputs.

    **Behavior**:

      - input comes from the database table details.dbName /
        details.tableName when both are set, otherwise from the text
        file details.inFileName.  If details.idName is empty it is
        filled in here (first db column, or 'ID' for text files).

      - molecules are built from pickles when details.molPklName is
        set, otherwise from SMILES.

      - if details.outFileName is set, each (id,fp) tuple is pickled
        to that file.

      - if details.outTableName is set and a database name is
        available (details.outDbName or details.dbName), the
        fingerprints are inserted into that table.

    **Returns**: the list of (id,fp) 2-tuples, or None if no input
      data could be loaded.

    """
    data = None
    dataSet = None
    idCol = 0
    smiCol = 1
    if details.dbName and details.tableName:
        try:
            # connectivity probe only: the data itself is pulled below
            DbConnect(details.dbName,details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n'%(details.dbName,
                                                                           details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName=DbInfo.GetColumnNames(details.dbName,details.tableName)[0]
        if details.molPklName:
            dataName = details.molPklName
        else:
            dataName = details.smilesName
        dataSet = DataUtils.DBToData(details.dbName,details.tableName,
                                     what='%s,%s'%(details.idName,dataName))
    elif details.inFileName:
        if not details.idName:
            details.idName='ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName,details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n'%(details.inFileName))
            traceback.print_exc()
            # leave dataSet defined so the checks below see "no data"
            # instead of failing on an unbound local
            dataSet = None

    fps = None
    if dataSet:
        data = dataSet.GetNamedData()
        # details.__dict__ supplies fingerprinter, maxMols and the
        # fingerprinting settings as keyword arguments
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data,idCol,smiCol,**details.__dict__)
        else:
            fps = FingerprintsFromPickles(data,idCol,smiCol,**details.__dict__)
    if fps:
        if details.outFileName:
            outF = open(details.outFileName,'wb+')
            try:
                for entry in fps:
                    cPickle.dump(entry,outF)
            finally:
                # make sure the file is closed even if pickling fails
                outF.close()
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #  the types of our columns...
            #
            colTypes = DbUtils.TypeFinder(data,len(data),len(data[0]))
            typeStrs = DbUtils.GetTypeStrings([details.idName,details.smilesName],colTypes,
                                              keyCol=details.idName)
            cols = '%s, %s %s'%(typeStrs[0],details.fpColName,DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName,cols)

            #
            # And add the data
            #
            for molId,fp in fps:
                tpl = molId,DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName,tpl)
            conn.Commit()
    return fps
200 # ------------------------------------------------ 201 # 202 # Command line parsing stuff 203 # 204 # ------------------------------------------------ 205
class FingerprinterDetails(object):
    """ holds the settings for a fingerprinting, screening or
    clustering run; every setting gets a default value on construction

    """
    def __init__(self):
        for setup in (self._fingerprinterInit,
                      self._screenerInit,
                      self._clusterInit):
            setup()

    def _fingerprinterInit(self):
        """ installs the defaults used when generating fingerprints """
        defaults = (
            ('fingerprinter',Chem.DaylightFingerprint),
            ('fpColName',"AutoFragmentFP"),
            ('idName',''),
            ('dbName',''),
            ('outDbName',''),
            ('tableName',''),
            ('minSize',64),
            ('fpSize',2048),
            ('tgtDensity',0.3),
            ('minPath',1),
            ('maxPath',7),
            ('discrimHash',0),
            ('useHs',0),
            ('useValence',0),
            ('bitsPerHash',4),
            ('smilesName','SMILES'),
            ('molPklName',''),
            ('maxMols',-1),
            ('outFileName',''),
            ('outTableName',''),
            ('inFileName',''),
            ('replaceTable',True),
        )
        for attrName,value in defaults:
            setattr(self,attrName,value)

    def _screenerInit(self):
        """ installs the defaults used when screening """
        defaults = (
            ('metric',DataStructs.TanimotoSimilarity),
            ('doScreen',''),
            ('topN',10),
            ('screenThresh',0.75),
            ('doThreshold',0),
            ('smilesTableName',''),
            ('probeSmiles',''),
            ('probeMol',None),
            ('noPickle',0),
        )
        for attrName,value in defaults:
            setattr(self,attrName,value)

    def _clusterInit(self):
        """ installs the defaults used when clustering """
        defaults = (
            ('clusterAlgo',Murtagh.WARDS),
            ('actTableName',''),
            ('actName',''),
        )
        for attrName,value in defaults:
            setattr(self,attrName,value)

    def GetMetricName(self):
        """ returns a label for the current similarity metric

        Known metrics map to 'Tanimoto', 'Dice' or 'Cosine'.  Any other
        true-valued metric is returned unchanged; a false-valued one
        gives 'Unknown'.

        """
        labels = (
            ('TanimotoSimilarity','Tanimoto'),
            ('DiceSimilarity','Dice'),
            ('CosineSimilarity','Cosine'),
        )
        for attrName,label in labels:
            # looked up one at a time, only when actually compared
            if self.metric == getattr(DataStructs,attrName):
                return label
        if self.metric:
            return self.metric
        return 'Unknown'

    def SetMetricFromName(self,name):
        """ selects the similarity metric by (case-insensitive) name

        Recognized names are Tanimoto, Dice and Cosine; any other name
        leaves the current metric untouched.

        """
        attrByName = {
            'TANIMOTO':'TanimotoSimilarity',
            'DICE':'DiceSimilarity',
            'COSINE':'CosineSimilarity',
        }
        attrName = attrByName.get(name.upper())
        if attrName is not None:
            self.metric = getattr(DataStructs,attrName)
275
def Usage():
    """ shows the module's usage message on stdout, then terminates
    the process with exit status -1

    """
    print(_usageDoc)
    raise SystemExit(-1)
# Text printed by Usage().  Keep this in sync with the options that
# ParseArgs() accepts.
_usageDoc="""
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>.  Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information.  If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database.  Default is *SMILES*.

    - --molPkl=val: sets the name of the column containing pickled
      molecules in the input database.  If provided, molecules are
      built from these pickles instead of from SMILES.

    - --idName=val: sets the name of the id column in the input
      database.  Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_:  name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints.  If this table already exists, it will be
      replaced unless --keepTable is given.

    - --keepTable: do not replace the output db table if it already
      exists; the fingerprints are inserted into the existing table.

    - --outDbName=val: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val:  base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens).  Default is *64*

    - --density=val: target bit density in the fingerprint.  The
      fingerprint will be folded until this density is
      reached.  Default is *0.3*

    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints.  Default is *1*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints.  Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment.  Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

"""
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**:

      - details: (optional) an existing details object to update in
        place.  If not provided, a new _FingerprinterDetails_ is
        created.

    **Returns**: the details object

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - The bits-per-hash setting is accepted as either --bitsPerHash
        or --nBitsPerHash.

      - -s and --doScreen are accepted but currently have no effect.

      - Unrecognized options print a traceback and the usage message,
        then exit (via _Usage_).

    """
    import getopt
    import sys

    try:
        argv = sys.argv[1:]
    except AttributeError:
        # sys.argv is not available at all
        Usage()
    try:
        args,extras = getopt.getopt(argv,'HVs:d:t:o:h',
                                    [
                                     'minSize=','maxSize=',
                                     'density=',
                                     'minPath=','maxPath=',
                                     'bitsPerHash=','nBitsPerHash=',
                                     'smilesName=',
                                     'molPkl=',
                                     'idName=',
                                     'discrim',
                                     'outTable=',
                                     'outDbName=',
                                     'fpColName=',
                                     'maxMols=',
                                     'useMACCS',
                                     'keepTable',
                                     # SCREENING:
                                     'smilesTable=',
                                     'doScreen=',
                                     'topN=',
                                     'thresh=',
                                     'smiles=',
                                     'dice',
                                     'cosine',
                                     # CLUSTERING:
                                     'actTable=',
                                     'actName=',
                                     'SLINK',
                                     'CLINK',
                                     'UPGMA',
                                    ])
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if len(extras):
        details.inFileName=extras[0]

    # options that store their (optionally converted) value:
    #   option -> (attribute name, converter or None)
    valueOpts = {
        '-d':('dbName',None),
        '-t':('tableName',None),
        '-o':('outFileName',None),
        '--minSize':('minSize',int),
        '--maxSize':('fpSize',int),
        '--density':('tgtDensity',float),
        '--outTable':('outTableName',None),
        '--outDbName':('outDbName',None),
        '--fpColName':('fpColName',None),
        '--minPath':('minPath',int),
        '--maxPath':('maxPath',int),
        # getopt used to register only --bitsPerHash while the handler
        # only tested --nBitsPerHash, so neither spelling worked
        '--bitsPerHash':('bitsPerHash',int),
        '--nBitsPerHash':('bitsPerHash',int),
        '--smilesName':('smilesName',None),
        '--molPkl':('molPklName',None),
        '--idName':('idName',None),
        '--maxMols':('maxMols',int),
        # SCREENER:
        '--smilesTable':('smilesTableName',None),
        '--smiles':('probeSmiles',None),
        # CLUSTERS:
        '--actTable':('actTableName',None),
        '--actName':('actName',None),
    }
    # options that store a fixed value:
    #   option -> (attribute name, value)
    flagOpts = {
        '-H':('useHs',1),
        '-V':('useValence',1),
        '--discrim':('discrimHash',1),
        '--keepTable':('replaceTable',False),
    }

    for arg,val in args:
        if arg in valueOpts:
            attrName,convert = valueOpts[arg]
            if convert is not None:
                val = convert(val)
            setattr(details,attrName,val)
        elif arg in flagOpts:
            attrName,flagVal = flagOpts[arg]
            setattr(details,attrName,flagVal)
        elif arg=='--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys

        # SCREENER:
        elif arg=='--topN':
            details.doThreshold=0
            details.topN=int(val)
        elif arg=='--thresh':
            details.doThreshold=1
            details.screenThresh=float(val)
        elif arg=='--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg=='--cosine':
            details.metric = DataStructs.CosineSimilarity

        # CLUSTERS:
        elif arg=='--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg=='--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg=='--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg=='-h':
            Usage()
    return details
if __name__ == '__main__':
    # command-line entry point: announce the version on stderr, read
    # the run settings from the command line, then do the fingerprinting
    banner = "This is FingerprintMols version %s\n\n"%(__VERSION_STRING)
    message(banner)
    details = ParseArgs()
    FingerprintsFromDetails(details)