1
2
3
4
5
6
7 """ utility functionality for fingerprinting sets of molecules
8 includes a command line app for working with fingerprints
9 and databases
10
11
12 Sample Usage:
13
14 python FingerprintMols.py -d data.gdb \
15 -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID" \
16 --outTable="daylight_sig"
17
18
19 """
20 import Chem
21 from Chem import MACCSkeys
22 from Dbase.DbConnection import DbConnect
23 from Dbase import DbInfo,DbUtils,DbModule
24 from ML.Data import DataUtils
25 from ML.Cluster import Murtagh
26 import DataStructs
27 import sys
28 import cPickle
29
# version-control `$Id$` string for this file
_cvsVersion = "$Id: FingerprintMols.py 346 2007-09-27 05:27:16Z glandrum $"
# the version text is everything between the first ':' and the last '$'
idx1 = 1 + _cvsVersion.find(':')
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = _cvsVersion[idx1:idx2]
34
35
40
41
# NOTE(review): the `def` line of this function is missing from the reviewed
# text; the signature below is reconstructed from the call sites in this file
# (FingerprintMol(mol,fingerprinter,**fpArgs)) - confirm the default value.
def FingerprintMol(mol, fingerprinter=Chem.DaylightFingerprint, **fpArgs):
    """ generates the fingerprint for a single molecule

    **Arguments**:

      - mol: the molecule to be fingerprinted

      - fingerprinter: the callable used to build the fingerprint

      - fpArgs: fingerprinting parameters.  If none are provided, the
        attributes of a default _FingerprinterDetails_ instance are used.

    **Returns**: whatever the fingerprinter returns

    **Notes**:

      - _Chem.DaylightFingerprint_ is called positionally with
        (mol, minPath, maxPath, fpSize, bitsPerHash, useHs, tgtDensity,
        minSize); any of those values missing from fpArgs is taken from a
        default _FingerprinterDetails_ instead of raising a KeyError.

      - every other fingerprinter is called as fingerprinter(mol,**fpArgs)

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter != Chem.DaylightFingerprint:
        return fingerprinter(mol, **fpArgs)

    # order matters: these are handed over as positional arguments
    positional = ('minPath', 'maxPath', 'fpSize', 'bitsPerHash',
                  'useHs', 'tgtDensity', 'minSize')
    if [name for name in positional if name not in fpArgs]:
        # only some settings were supplied: fill the gaps with the defaults,
        # letting the caller's values win
        fpArgs = dict(FingerprinterDetails().__dict__, **fpArgs)
    return fingerprinter(mol, *[fpArgs[name] for name in positional])
# NOTE(review): the `def` line of this function is missing from the reviewed
# text; parameter names come from the body and from the call made in
# FingerprintsFromDetails.  The fingerprinter/maxMols defaults mirror the
# FingerprinterDetails defaults; reportFreq=10 is an assumption - confirm.
def FingerprintsFromSmiles(dataSource, idCol, smiCol,
                           fingerprinter=Chem.DaylightFingerprint,
                           reportFreq=10, maxMols=-1,
                           **fpArgs):
    """ fingerprints the molecules described by SMILES in a data source

    **Arguments**:

      - dataSource: an iterable of entries; each entry is indexed with
        idCol and smiCol

      - idCol, smiCol: positions of the id and of the SMILES in an entry

      - fingerprinter: passed on to _FingerprintMol_

      - reportFreq: if > 0, a progress message is sent through message()
        every time this many molecules have been fingerprinted

      - maxMols: if > 0, stop once this many molecules have been
        fingerprinted

      - fpArgs are passed as keyword arguments to the fingerprinter

    Entries whose SMILES cannot be converted to a molecule are reported
    through error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, smi = str(entry[idCol]), str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # a parser failure is treated like an unparseable SMILES;
            # KeyboardInterrupt/SystemExit are deliberately not caught
            mol = None
        if not mol:
            error('Problems parsing SMILES: %s\n' % smi)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
# NOTE(review): the `def` line of this function is missing from the reviewed
# text; parameter names come from the body and from the call made in
# FingerprintsFromDetails.  The fingerprinter/maxMols defaults mirror the
# FingerprinterDetails defaults; reportFreq=10 is an assumption - confirm.
def FingerprintsFromPickles(dataSource, idCol, pklCol,
                            fingerprinter=Chem.DaylightFingerprint,
                            reportFreq=10, maxMols=-1,
                            **fpArgs):
    """ fingerprints the molecules stored as pickles in a data source

    **Arguments**:

      - dataSource: an iterable of entries; each entry is indexed with
        idCol and pklCol

      - idCol, pklCol: positions of the id and of the molecule pickle in
        an entry

      - fingerprinter: passed on to _FingerprintMol_

      - reportFreq: if > 0, a progress message is sent through message()
        every time this many molecules have been fingerprinted

      - maxMols: if > 0, stop once this many molecules have been
        fingerprinted

      - fpArgs are passed as keyword arguments to the fingerprinter

    Entries whose pickle cannot be converted to a molecule are reported
    through error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, pkl = str(entry[idCol]), str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # a construction failure is treated like a bad pickle;
            # KeyboardInterrupt/SystemExit are deliberately not caught
            mol = None
        if not mol:
            error('Problems parsing pickle for id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
115
# NOTE(review): the `def` line of this function is missing from the reviewed
# text; `details` is the only name the body needs and the only argument the
# in-file caller passes.  `reportFreq` is an optional, backward-compatible
# addition - confirm against the original signature.
def FingerprintsFromDetails(details, reportFreq=None):
    """ runs a fingerprinting job as described by a details object

    **Arguments**:

      - details: an object carrying the run settings as attributes (see
        _FingerprinterDetails_).  Its idName attribute is filled in here
        when it is empty.

      - reportFreq: (optional) if provided, it is forwarded to the
        fingerprinting routine; otherwise that routine's own default
        applies.

    **Input**:

      - if details.dbName and details.tableName are set, the id column and
        the data column (details.molPklName if set, otherwise
        details.smilesName) are pulled from that table

      - otherwise, if details.inFileName is set, the id and SMILES columns
        are read from that text file

    **Output**:

      - if details.outFileName is set, each (id,fp) tuple is pickled to
        that file

      - if details.outTableName is set and a database name is available
        (details.outDbName or details.dbName), the fingerprints are
        inserted into that table; the table is created with conn.AddTable
        when details.replaceTable is set or no table of that name exists
        (names are compared case-insensitively)

    **Returns**: the list of (id,fp) 2-tuples, or None if no input data
      could be loaded

    """
    import traceback

    # bound up front so that a failed read leaves us with "no data" rather
    # than an unbound name further down
    dataSet = None
    data = None
    idCol = 0
    dataCol = 1

    if details.dbName and details.tableName:
        try:
            # the handle is not used for reading; opening it here surfaces
            # connection problems before any data is requested
            conn = DbConnect(details.dbName, details.tableName)
        except Exception:
            error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                             details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataName = details.molPklName or details.smilesName
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, dataName))
    elif details.inFileName:
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName, details.smilesName])
        except IOError:
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()

    fps = None
    if dataSet:
        data = dataSet.GetNamedData()
        # every attribute of details is handed over as a keyword argument
        fpKwargs = dict(details.__dict__)
        if reportFreq is not None:
            fpKwargs['reportFreq'] = reportFreq
        if details.molPklName:
            fps = FingerprintsFromPickles(data, idCol, dataCol, **fpKwargs)
        else:
            fps = FingerprintsFromSmiles(data, idCol, dataCol, **fpKwargs)

    if not fps:
        return fps

    if details.outFileName:
        outF = open(details.outFileName, 'wb+')
        try:
            for entry in fps:
                cPickle.dump(entry, outF)
        finally:
            # make sure the file is closed even if pickling fails
            outF.close()

    dbName = details.outDbName or details.dbName
    if details.outTableName and dbName:
        conn = DbConnect(dbName)

        # work out the column definitions for the output table from the
        # input data
        colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
        typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                          keyCol=details.idName)
        cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

        # the table list is only queried when we have been asked to keep
        # an existing table
        if details.replaceTable or \
           details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
            conn.AddTable(details.outTableName, cols)

        for molId, fp in fps:
            conn.InsertData(details.outTableName,
                            (molId, DbModule.binaryHolder(fp.ToBinary())))
        conn.Commit()
    return fps
200
201
202
203
204
205
207 """ class for storing the details of a fingerprinting run,
208 generates sensible defaults on construction
209
210 """
215
217 self.fingerprinter = Chem.DaylightFingerprint
218 self.fpColName="AutoFragmentFP"
219 self.idName=''
220 self.dbName=''
221 self.outDbName=''
222 self.tableName=''
223 self.minSize=64
224 self.fpSize=2048
225 self.tgtDensity=0.3
226 self.minPath=1
227 self.maxPath=7
228 self.discrimHash=0
229 self.useHs=0
230 self.useValence=0
231 self.bitsPerHash=4
232 self.smilesName='SMILES'
233 self.molPklName=''
234 self.maxMols=-1
235 self.outFileName=''
236 self.outTableName=''
237 self.inFileName=''
238 self.replaceTable=True
239
241 self.metric = DataStructs.TanimotoSimilarity
242 self.doScreen=''
243 self.topN=10
244 self.screenThresh=0.75
245 self.doThreshold=0
246 self.smilesTableName=''
247 self.probeSmiles=''
248 self.probeMol=None
249 self.noPickle=0
250
252 self.clusterAlgo = Murtagh.WARDS
253 self.actTableName = ''
254 self.actName = ''
255
# NOTE(review): the `def` line of this function is missing from the reviewed
# text; every call in this file is `Usage()` - confirm it takes no arguments.
def Usage():
    """ shows the contents of the global _usageDoc string on stdout,
    then terminates via sys.exit(-1)

    """
    print(_usageDoc)
    sys.exit(-1)
282
# Text printed by Usage().  Keep it in sync with the options handled in
# ParseArgs.
# NOTE(review): leading whitespace inside this string was lost in the
# reviewed text and has been re-created - confirm the intended layout.
_usageDoc="""
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information. If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --molPkl=val: sets the name of the column in the input
      database which holds pickled molecules. If this is
      provided, molecules are built from the pickles instead
      of from SMILES.

    - --idName=val: sets the name of the id column in the input
      database. Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_: name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints. If this table already exists, it will be
      replaced (unless --keepTable is used).

    - --keepTable: do not replace the output db table if it
      already exists; the new fingerprints are added to it.

    - --outDbName=val: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val: base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens). Default is *64*

    - --density=val: target bit density in the fingerprint. The
      fingerprint will be folded until this density is
      reached. Default is *0.3*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash=val: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

    - -h: print this message and exit.

"""
# NOTE(review): the `def` line of this function is missing from the reviewed
# text; the body tests `details is None` and the in-file caller passes no
# arguments, hence `details=None` - confirm.
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**:

      - details: (optional) the object to be updated.  If this is not
        provided, a new _FingerprinterDetails_ is created.

    **Returns**: the updated details object

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - The number of bits per hash can be given either as --nBitsPerHash
        (the documented spelling) or as --bitsPerHash.

      - If the command line cannot be parsed, the traceback is printed and
        Usage() is called.

    """
    import getopt
    import sys
    import traceback

    longOpts = [
        # fingerprinter
        'minSize=', 'maxSize=',
        'density=',
        'minPath=', 'maxPath=',
        'bitsPerHash=', 'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # screener
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # clusterer
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(sys.argv[1:], 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if extras:
        details.inFileName = extras[0]

    # options which simply store their value in an attribute:
    #   option -> (attribute name, converter or None for the raw string)
    valueOptions = {
        '-d': ('dbName', None),
        '-t': ('tableName', None),
        '-o': ('outFileName', None),
        '--minSize': ('minSize', int),
        '--maxSize': ('fpSize', int),
        '--density': ('tgtDensity', float),
        '--outTable': ('outTableName', None),
        '--outDbName': ('outDbName', None),
        '--fpColName': ('fpColName', None),
        '--minPath': ('minPath', int),
        '--maxPath': ('maxPath', int),
        '--nBitsPerHash': ('bitsPerHash', int),
        '--bitsPerHash': ('bitsPerHash', int),
        '--smilesName': ('smilesName', None),
        '--molPkl': ('molPklName', None),
        '--idName': ('idName', None),
        '--maxMols': ('maxMols', int),
        '--smilesTable': ('smilesTableName', None),
        '--smiles': ('probeSmiles', None),
        '--actTable': ('actTableName', None),
        '--actName': ('actName', None),
    }

    for arg, val in args:
        if arg in valueOptions:
            attrName, convert = valueOptions[arg]
            if convert is not None:
                val = convert(val)
            setattr(details, attrName, val)
        elif arg == '-H':
            details.useHs = 1
        elif arg == '-V':
            details.useValence = 1
        elif arg == '--discrim':
            details.discrimHash = 1
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        elif arg == '--keepTable':
            details.replaceTable = False
        # screener: --topN and --thresh select between the two modes
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity
        # clusterer
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '-h':
            Usage()
        # anything else getopt accepts (-s, --doScreen) is not acted on here
    return details
493
if __name__ == '__main__':
    # command line entry point: announce the version through message(),
    # then fingerprint according to the command line arguments
    message('This is FingerprintMols version %s\n\n' % __VERSION_STRING)
    details = ParseArgs()
    FingerprintsFromDetails(details)
498