1
2
3
4
5
6
7 """ utility functionality for fingerprinting sets of molecules
8 includes a command line app for working with fingerprints
9 and databases
10
11
12 Sample Usage:
13
14 python FingerprintMols.py -d data.gdb \
15 -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID" \
16 --outTable="daylight_sig"
17
18
19 """
20 from rdkit import Chem
21 from rdkit.Chem import MACCSkeys
22 from rdkit.ML.Cluster import Murtagh
23 from rdkit import DataStructs
24 import sys
25 import cPickle
26
# The version text is whatever sits between the first ':' and the last '$'
# of the keyword-expanded revision id (surrounding blanks are kept).
_cvsVersion = "$Id: FingerprintMols.py 997 2009-02-25 06:12:43Z glandrum $"
idx1 = _cvsVersion.index(':') + 1
idx2 = _cvsVersion.rindex('$')
__VERSION_STRING = _cvsVersion[idx1:idx2]
31
32
37
42
54
71
72
def FingerprintsFromSmiles(dataSource, idCol, smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1,
                           **fpArgs):
    """ fingerprints every row of _dataSource_ whose SMILES can be parsed

    **Arguments**

      - dataSource: an iterable of indexable rows

      - idCol, smiCol: positions of the id and the SMILES within each row
        (both values are converted with str())

      - fingerprinter: handed on to FingerprintMol

      - reportFreq: when >0, a progress message is emitted each time the
        number of fingerprinted molecules is a multiple of it

      - maxMols: when >0, processing stops once that many molecules have
        been fingerprinted

      - fpArgs are passed as keyword arguments to FingerprintMol

    **Returns**

      a list of 2-tuples: (id,fp)

    **Notes**

      - rows whose SMILES cannot be converted into a molecule are reported
        via error() and skipped

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, smi = str(entry[idCol]), str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # best effort: a failing parse is reported below, but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            mol = None
        if not mol:
            error('Problems parsing SMILES: %s\n' % smi)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
101
def FingerprintsFromMols(mols,
                         fingerprinter=Chem.RDKFingerprint,
                         reportFreq=10, maxMols=-1,
                         **fpArgs):
    """ fingerprints a sequence of already-constructed molecules

    **Arguments**

      - mols: an iterable of (id,mol) 2-tuples

      - fingerprinter: handed on to FingerprintMol

      - reportFreq: when >0, a progress message is emitted each time the
        number of fingerprinted molecules is a multiple of it

      - maxMols: when >0, processing stops once that many molecules have
        been fingerprinted

      - fpArgs are passed as keyword arguments to FingerprintMol

    **Returns**

      a list of 2-tuples: (id,fp)

    **Notes**

      - entries whose molecule is false-valued (e.g. None) are reported
        via error() and skipped

    """
    res = []
    nDone = 0
    for molId, mol in mols:
        if not mol:
            # there is no SMILES in this code path (the previous message
            # formatted an undefined name), so identify the entry by its id
            error('Problems with molecule for id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
125
def FingerprintsFromPickles(dataSource, idCol, pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1,
                            **fpArgs):
    """ fingerprints every row of _dataSource_ whose molecule pickle can be
    turned back into a molecule

    **Arguments**

      - dataSource: an iterable of indexable rows

      - idCol, pklCol: positions of the id and the molecule pickle within
        each row (both values are converted with str())

      - fingerprinter: handed on to FingerprintMol

      - reportFreq: when >0, a progress message is emitted each time the
        number of fingerprinted molecules is a multiple of it

      - maxMols: when >0, processing stops once that many molecules have
        been fingerprinted

      - fpArgs are passed as keyword arguments to FingerprintMol

    **Returns**

      a list of 2-tuples: (id,fp)

    **Notes**

      - rows whose pickle cannot be converted are reported via error()
        and skipped

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, pkl = str(entry[idCol]), str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # best effort: the failure is reported below, but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            mol = None
        if not mol:
            error('Problems parsing pickle for id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
154
def FingerprintsFromDetails(details, reportFreq=10):
    """ fingerprints the molecules described by _details_ and stores the
    results wherever _details_ asks for them

    **Arguments**

      - details: a _FingerprinterDetails_ instance (or any object providing
        the same attributes)

      - reportFreq: when >0, a progress message is emitted each time the
        number of molecules read from an SD file is a multiple of it

    **Input** (the first matching source is used)

      - details.dbName and details.tableName: the id and SMILES columns
        are pulled from that database table

      - details.inFileName and details.useSmiles: the id and SMILES columns
        are read from a delimited text file

      - details.inFileName and details.useSD: molecules are read from an
        SD file; the id is the details.idName property when a molecule has
        it, otherwise its _Name property

    **Output**

      - details.outFileName: every (id,fp) tuple is pickled, one after the
        other, into that file

      - details.outTableName: (id,binary fingerprint) rows are inserted
        into that table of details.outDbName (or details.dbName)

    **Returns**

      a list of (id,fp) 2-tuples, or None when no input could be read

    **Notes**

      - an empty details.idName is filled in: first column of the input
        table for database input, 'ID' for file input

      - all instance attributes of _details_ are forwarded as keyword
        arguments to the FingerprintsFrom* function that is used

    """
    data = None
    dataSet = None
    # rows coming from a table or text file are always (id, SMILES)
    idCol = 0
    smiCol = 1

    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # only done to report connection problems early
            DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                                details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
            # make sure the name is bound so the checks below see "no input"
            dataSet = None
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            molsRead = []
            for mol in supplier:
                if not mol:
                    # entries the supplier could not convert are skipped
                    continue
                molsRead.append(mol)
                if reportFreq > 0 and not len(molsRead) % reportFreq:
                    message('Read %d molecules\n' % (len(molsRead)))
                if details.maxMols > 0 and len(molsRead) >= details.maxMols:
                    break
            for mol in molsRead:
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet.append((nm, mol))

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
        else:
            # NOTE(review): column 1 was fetched using details.smilesName,
            # not details.molPklName - confirm that this is intended
            fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            # the context manager closes the file even if pickling fails
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    cPickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            if data is None:
                # SD input provides no (id, SMILES) rows, and those are what
                # the column-type detection below is run on
                error('Cannot store fingerprints in table %s: no tabular input data\n' %
                      (details.outTableName))
            else:
                from rdkit.Dbase.DbConnection import DbConnect
                from rdkit.Dbase import DbUtils, DbModule
                conn = DbConnect(dbName)
                colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
                typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                                  keyCol=details.idName)
                # only the id column definition is reused for the output table
                cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)
                existing = [x.upper() for x in conn.GetTableNames()]
                if details.replaceTable or details.outTableName.upper() not in existing:
                    conn.AddTable(details.outTableName, cols)
                for molId, fp in fps:
                    tpl = molId, DbModule.binaryHolder(fp.ToBinary())
                    conn.InsertData(details.outTableName, tpl)
                conn.Commit()
    return fps
274
275
276
277
278
279
class FingerprinterDetails(object):
    """ class for storing the details of a fingerprinting run,
    generates sensible defaults on construction

    """

    # NOTE(review): the def lines of this class are not part of the reviewed
    # text; the method names used here are reconstructed - confirm them
    # against the callers before merging.
    def __init__(self):
        for setup in (self._fingerprinterInit, self._screenerInit, self._clusterInit):
            setup()

    def _fingerprinterInit(self):
        """ installs the defaults for fingerprint generation and for the
        input/output locations

        """
        defaults = (
            ('fingerprinter', Chem.RDKFingerprint),
            ('fpColName', "AutoFragmentFP"),
            ('idName', ''),
            ('dbName', ''),
            ('outDbName', ''),
            ('tableName', ''),
            ('minSize', 64),
            ('fpSize', 2048),
            ('tgtDensity', 0.3),
            ('minPath', 1),
            ('maxPath', 7),
            ('discrimHash', 0),
            ('useHs', 0),
            ('useValence', 0),
            ('bitsPerHash', 4),
            ('smilesName', 'SMILES'),
            ('maxMols', -1),
            ('outFileName', ''),
            ('outTableName', ''),
            ('inFileName', ''),
            ('replaceTable', True),
            ('molPklName', ''),
            ('useSmiles', True),
            ('useSD', False),
        )
        for attrName, value in defaults:
            setattr(self, attrName, value)

    def _screenerInit(self):
        """ installs the defaults for the similarity-screening options

        """
        defaults = (
            ('metric', DataStructs.TanimotoSimilarity),
            ('doScreen', ''),
            ('topN', 10),
            ('screenThresh', 0.75),
            ('doThreshold', 0),
            ('smilesTableName', ''),
            ('probeSmiles', ''),
            ('probeMol', None),
            ('noPickle', 0),
        )
        for attrName, value in defaults:
            setattr(self, attrName, value)

    def _clusterInit(self):
        """ installs the defaults for the clustering options

        """
        defaults = (
            ('clusterAlgo', Murtagh.WARDS),
            ('actTableName', ''),
            ('actName', ''),
        )
        for attrName, value in defaults:
            setattr(self, attrName, value)
331
351
def Usage():
    """ prints a usage string and exits

    The text of _usageDoc (followed by a newline) is written to stdout,
    then the interpreter is asked to exit with status -1.

    """
    # written through sys.stdout rather than a print statement so the
    # module also parses on Python 3; the output is the same
    sys.stdout.write('%s\n' % _usageDoc)
    sys.exit(-1)
358
# Text shown by Usage().  Keep it in sync with the options handled in
# ParseArgs.
_usageDoc = """
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information. If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --molPkl=val: sets the name of the molecule pickle column.
      If provided, molecules are built from pickles instead of
      from SMILES.

    - --useSD: Assume that the input file is an SD file, not a SMILES
      table.

    - --idName=val: sets the name of the id column in the input
      database. Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_: name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints. If this table already exists, it will be
      replaced (unless --keepTable is used).

    - --keepTable: do not replace the output db table if it
      already exists.

    - --outDbName=val: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val: base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens). Default is *64*

    - --density=val: target bit density in the fingerprint. The
      fingerprint will be folded until this density is
      reached. Default is *0.3*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --bitsPerHash=val: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

    - -h: print this message and exit.

"""
436
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**

      - details: (optional) the object to be updated; a fresh
        _FingerprinterDetails_ is created when it is None

    **Returns**

      the updated _details_ object

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - The first non-option argument, if any, becomes details.inFileName.

      - Unknown options and -h end up in Usage(), which exits.

      - The bits-per-hash option is accepted both as --bitsPerHash and as
        --nBitsPerHash.

      - NOTE(review): -s and --doScreen are accepted but nothing is done
        with their values - confirm whether that is intended.

    """
    import getopt
    import sys

    shortOpts = 'HVs:d:t:o:h'
    longOpts = [
        'minSize=', 'maxSize=',
        'density=',
        'minPath=', 'maxPath=',
        'bitsPerHash=', 'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # screener options
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # clustering options
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]

    # options carrying a value: option -> (attribute, converter or None)
    valueOpts = {
        '-d': ('dbName', None),
        '-t': ('tableName', None),
        '-o': ('outFileName', None),
        '--minSize': ('minSize', int),
        '--maxSize': ('fpSize', int),
        '--density': ('tgtDensity', float),
        '--outTable': ('outTableName', None),
        '--outDbName': ('outDbName', None),
        '--fpColName': ('fpColName', None),
        '--minPath': ('minPath', int),
        '--maxPath': ('maxPath', int),
        '--bitsPerHash': ('bitsPerHash', int),
        '--nBitsPerHash': ('bitsPerHash', int),
        '--smilesName': ('smilesName', None),
        '--molPkl': ('molPklName', None),
        '--idName': ('idName', None),
        '--maxMols': ('maxMols', int),
        '--smilesTable': ('smilesTableName', None),
        '--smiles': ('probeSmiles', None),
        '--actTable': ('actTableName', None),
        '--actName': ('actName', None),
    }
    # plain switches: option -> ((attribute, value), ...)
    flagOpts = {
        '-H': (('useHs', 1),),
        '-V': (('useValence', 1),),
        '--discrim': (('discrimHash', 1),),
        '--useSD': (('useSmiles', False), ('useSD', True)),
        '--keepTable': (('replaceTable', False),),
    }

    try:
        argv = sys.argv[1:]
    except AttributeError:
        # no sys.argv available at all
        Usage()
    try:
        args, extras = getopt.getopt(argv, shortOpts, longOpts)
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if extras:
        details.inFileName = extras[0]

    for arg, val in args:
        if arg in valueOpts:
            attrName, convert = valueOpts[arg]
            if convert is not None:
                val = convert(val)
            setattr(details, attrName, val)
        elif arg in flagOpts:
            for attrName, setting in flagOpts[arg]:
                setattr(details, attrName, setting)
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        # screener: --topN and --thresh select between the two modes
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity
        # clustering
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '-h':
            Usage()
    return details
576
if __name__ == '__main__':
    # script entry point: announce the version, read the command line and
    # run the fingerprinting job it describes
    message("This is FingerprintMols version " + __VERSION_STRING + "\n\n")
    details = ParseArgs()
    FingerprintsFromDetails(details)
581