|
Package rdkit ::
Package Chem ::
Module BuildFragmentCatalog
|
|
1
2
3
4
5
6
7 """ command line utility for working with FragmentCatalogs (CASE-type analysis)
8
9 **Usage**
10
11 BuildFragmentCatalog [optional args] <filename>
12
13 filename, the name of a delimited text file containing InData, is required
14 for some modes of operation (see below)
15
16 **Command Line Arguments**
17
18 - -n *maxNumMols*: specify the maximum number of molecules to be processed
19
20 - -b: build the catalog and OnBitLists
21 *requires InData*
22
23 - -s: score compounds
24 *requires InData and a Catalog, can use OnBitLists*
25
26 - -g: calculate info gains
27 *requires Scores*
28
29 - -d: show details about high-ranking fragments
30 *requires a Catalog and Gains*
31
32 - --catalog=*filename*: filename with the pickled catalog.
33 If -b is provided, this file will be overwritten.
34
35 - --onbits=*filename*: filename to hold the pickled OnBitLists.
36 If -b is provided, this file will be overwritten
37
38 - --scores=*filename*: filename to hold the text score data.
39 If -s is provided, this file will be overwritten
40
41 - --gains=*filename*: filename to hold the text gains data.
42 If -g is provided, this file will be overwritten
43
44 - --details=*filename*: filename to hold the text details data.
45 If -d is provided, this file will be overwritten.
46
47 - --minPath=2: specify the minimum length for a path
48
49 - --maxPath=6: specify the maximum length for a path
50
51 - --smiCol=1: specify which column in the input data file contains
52 SMILES
53
54 - --actCol=-1: specify which column in the input data file contains
55 activities
56
57 - --nActs=2: specify the number of possible activity values
58
59 - --nBits=-1: specify the maximum number of bits to show details for
60
61 """
62 import sys,os,cPickle
63 from rdkit import Chem
64 from rdkit import RDConfig
65 from rdkit.Chem import FragmentCatalog
66 from rdkit.Dbase.DbConnection import DbConnect
67 import numpy
68 from rdkit.ML import InfoTheory
69 import types,sets
70
71 _cvsVersion="$Revision: 997 $"
72 idx1 = _cvsVersion.find(':')+1
73 idx2 = _cvsVersion.rfind('$')
74 __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2])
75
78
79 -def BuildCatalog(suppl,maxPts=-1,groupFileName=None,
80 minPath=2,maxPath=6,reportFreq=10):
81 """ builds a fragment catalog from a set of molecules in a delimited text block
82
83 **Arguments**
84
85 - suppl: a mol supplier
86
87 - maxPts: (optional) if provided, this will set an upper bound on the
88 number of points to be considered
89
90 - groupFileName: (optional) name of the file containing functional group
91 information
92
93 - minPath, maxPath: (optional) names of the minimum and maximum path lengths
94 to be considered
95
96 - reportFreq: (optional) how often to display status information
97
98 **Returns**
99
100 a FragmentCatalog
101
102 """
103 if groupFileName is None:
104 groupFileName = os.path.join(RDConfig.RDDataDir,"FunctionalGroups.txt")
105
106 fpParams = FragmentCatalog.FragCatParams(minPath,maxPath,groupFileName)
107 catalog = FragmentCatalog.FragCatalog(fpParams)
108 fgen = FragmentCatalog.FragCatGenerator()
109 if maxPts >0:
110 nPts = maxPts
111 else:
112 if hasattr(suppl,'__len__'):
113 nPts = len(suppl)
114 else:
115 nPts = -1
116 for i,mol in enumerate(suppl):
117 if i == nPts:
118 break
119 if i and not i%reportFreq:
120 if nPts>-1:
121 message('Done %d of %d, %d paths\n'%(i,nPts,catalog.GetFPLength()))
122 else:
123 message('Done %d, %d paths\n'%(i,catalog.GetFPLength()))
124 fgen.AddFragsFromMol(mol,catalog)
125 return catalog
126
127 -def ScoreMolecules(suppl,catalog,maxPts=-1,actName='',acts=None,
128 nActs=2,reportFreq=10):
129 """ scores the compounds in a supplier using a catalog
130
131 **Arguments**
132
133 - suppl: a mol supplier
134
135 - catalog: the FragmentCatalog
136
137 - maxPts: (optional) the maximum number of molecules to be
138 considered
139
140 - actName: (optional) the name of the molecule's activity property.
141 If this is not provided, the molecule's last property will be used.
142
143 - acts: (optional) a sequence of activity values (integers).
144 If not provided, the activities will be read from the molecules.
145
146 - nActs: (optional) number of possible activity values
147
148 - reportFreq: (optional) how often to display status information
149
150 **Returns**
151
152 a 2-tuple:
153
154 1) the results table (a 3D array of ints nBits x 2 x nActs)
155
156 2) a list containing the on bit lists for each molecule
157
158 """
159 nBits = catalog.GetFPLength()
160 resTbl = numpy.zeros((nBits,2,nActs),numpy.int)
161 obls = []
162
163 if not actName and not acts:
164 actName = suppl[0].GetPropNames()[-1]
165
166
167 fpgen = FragmentCatalog.FragFPGenerator()
168 suppl.reset()
169 i = 1
170 for mol in suppl:
171 if i and not i%reportFreq:
172 message('Done %d.\n'%(i))
173 if mol:
174 if not acts:
175 act = int(mol.GetProp(actName))
176 else:
177 act = acts[i-1]
178 fp = fpgen.GetFPForMol(mol,catalog)
179 obls.append([x for x in fp.GetOnBits()])
180 for j in range(nBits):
181 resTbl[j,0,act] += 1
182 for id in obls[i-1]:
183 resTbl[id-1,0,act] -= 1
184 resTbl[id-1,1,act] += 1
185 else:
186 obls.append([])
187 i+=1
188 return resTbl,obls
189
190 -def ScoreFromLists(bitLists,suppl,catalog,maxPts=-1,actName='',acts=None,
191 nActs=2,reportFreq=10):
192 """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists
193 for the molecules (this speeds things up a lot)
194
195
196 **Arguments**
197
198 - bitLists: sequence of on bit sequences for the input molecules
199
200 - suppl: the input supplier (we read activities from here)
201
202 - catalog: the FragmentCatalog
203
204 - maxPts: (optional) the maximum number of molecules to be
205 considered
206
207 - actName: (optional) the name of the molecule's activity property.
208 If this is not provided, the molecule's last property will be used.
209
210 - nActs: (optional) number of possible activity values
211
212 - reportFreq: (optional) how often to display status information
213
214 **Returns**
215
216 the results table (a 3D array of ints nBits x 2 x nActs)
217
218 """
219 nBits = catalog.GetFPLength()
220 if maxPts >0:
221 nPts = maxPts
222 else:
223 nPts = len(bitLists)
224 resTbl = numpy.zeros((nBits,2,nActs),numpy.int)
225 if not actName and not acts:
226 actName = suppl[0].GetPropNames()[-1]
227 suppl.reset()
228 for i in range(1,nPts+1):
229 mol = suppl.next()
230 if not acts:
231 act = int(mol.GetProp(actName))
232 else:
233 act = acts[i-1]
234 if i and not i%reportFreq:
235 message('Done %d of %d\n'%(i,nPts))
236 ids = sets.Set()
237 for id in bitLists[i-1]:
238 ids.add(id-1)
239 for j in range(nBits):
240 resTbl[j,0,act] += 1
241 for id in ids:
242 resTbl[id,0,act] -= 1
243 resTbl[id,1,act] += 1
244 return resTbl
245
246 -def CalcGains(suppl,catalog,topN=-1,actName='',acts=None,
247 nActs=2,reportFreq=10,biasList=None,collectFps=0):
248 """ calculates info gains by constructing fingerprints
249 *DOC*
250
251 Returns a 2-tuple:
252 1) gains matrix
253 2) list of fingerprints
254
255 """
256 nBits = catalog.GetFPLength()
257 if topN < 0:
258 topN = nBits
259 if not actName and not acts:
260 actName = suppl[0].GetPropNames()[-1]
261
262 gains = [0]*nBits
263 if hasattr(suppl,'__len__'):
264 nMols = len(suppl)
265 else:
266 nMols = -1
267 fpgen = FragmentCatalog.FragFPGenerator()
268
269 if biasList:
270 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY)
271 ranker.SetBiasList(biasList)
272 else:
273 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY)
274 i = 0
275 fps = []
276 for mol in suppl:
277 if not acts:
278 try:
279 act = int(mol.GetProp(actName))
280 except KeyError:
281 message('ERROR: Molecule has no property: %s\n'%(actName))
282 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames())))
283 raise KeyError,actName
284 else:
285 act = acts[i]
286 if i and not i%reportFreq:
287 if nMols>0:
288 message('Done %d of %d.\n'%(i,nMols))
289 else:
290 message('Done %d.\n'%(i))
291 fp = fpgen.GetFPForMol(mol,catalog)
292 ranker.AccumulateVotes(fp,act)
293 i+=1;
294 if collectFps:
295 fps.append(fp)
296 gains = ranker.GetTopN(topN)
297 return gains,fps
298
299 -def CalcGainsFromFps(suppl,fps,topN=-1,actName='',acts=None,
300 nActs=2,reportFreq=10,biasList=None):
341
343 actHeaders = ['Act-%d'%(x) for x in range(nActs)]
344 if cat:
345 outF.write('id,Description,Gain,%s\n'%(','.join(actHeaders)))
346 else:
347 outF.write('id,Gain,%s\n'%(','.join(actHeaders)))
348 for entry in gains:
349 id = int(entry[0])
350 outL = [str(id)]
351 if cat:
352 descr = cat.GetBitDescription(id)
353 outL.append(descr)
354 outL.append('%.6f'%entry[1])
355 outL += ['%d'%x for x in entry[2:]]
356 outF.write(','.join(outL))
357 outF.write('\n')
358
359
361 """ reads a list of ids and info gains out of an input file
362
363 """
364 res = []
365 inL = inF.readline()
366 for line in inF.xreadlines():
367 splitL = line.strip().split(delim)
368 res.append((splitL[idCol],float(splitL[gainCol])))
369 return res
370
371
372 -def ShowDetails(catalog,gains,nToDo=-1,outF=sys.stdout,idCol=0,gainCol=1,
373 outDelim=','):
374 """
375 gains should be a sequence of sequences. The idCol entry of each
376 sub-sequence should be a catalog ID. _ProcessGainsData()_ provides
377 suitable input.
378
379 """
380 if nToDo < 0:
381 nToDo = len(gains)
382 for i in range(nToDo):
383 id = int(gains[i][idCol])
384 gain = float(gains[i][gainCol])
385 descr = catalog.GetFragDescription(id)
386 if descr:
387 outF.write('%s\n'%(outDelim.join((str(id),descr,str(gain)))))
388
424
425
427 print "This is BuildFragmentCatalog version %s"%(__VERSION_STRING)
428 print 'usage error'
429
430 sys.exit(-1)
431
459
461 import getopt
462 try:
463 args,extras = getopt.getopt(sys.argv[1:],'n:d:cst',
464 ['catalog=','onbits=',
465 'scoresFile=','gainsFile=','detailsFile=','fpFile=',
466 'minPath=','maxPath=','smiCol=','actCol=','nameCol=','nActs=',
467 'nBits=','biasList=','topN=',
468 'build','sigs','gains','details','score','noTitle'])
469 except:
470 sys.stderr.write('Error parsing command line:\n')
471 import traceback
472 traceback.print_exc()
473 Usage()
474 for arg,val in args:
475 if arg=='-n':
476 details.numMols=int(val)
477 elif arg=='-c':
478 details.delim=','
479 elif arg=='-s':
480 details.delim=' '
481 elif arg=='-t':
482 details.delim='\t'
483 elif arg=='-d':
484 details.dbName=val
485 elif arg=='--build':
486 details.doBuild=1
487 elif arg=='--score':
488 details.doScore=1
489 elif arg=='--gains':
490 details.doGains=1
491 elif arg=='--sigs':
492 details.doSigs=1
493 elif arg=='-details':
494 details.doDetails=1
495 elif arg=='--catalog':
496 details.catalogName=val
497 elif arg=='--onbits':
498 details.onBitsName=val
499 elif arg=='--scoresFile':
500 details.scoresName=val
501 elif arg=='--gainsFile':
502 details.gainsName=val
503 elif arg=='--detailsFile':
504 details.detailsName=val
505 elif arg=='--fpFile':
506 details.fpName=val
507 elif arg=='--minPath':
508 details.minPath=int(val)
509 elif arg=='--maxPath':
510 details.maxPath=int(val)
511 elif arg=='--smiCol':
512 try:
513 details.smiCol=int(val)
514 except ValueError:
515 details.smiCol=val
516 elif arg=='--actCol':
517 try:
518 details.actCol=int(val)
519 except ValueError:
520 details.actCol=val
521 elif arg=='--nameCol':
522 try:
523 details.nameCol=int(val)
524 except ValueError:
525 details.nameCol=val
526 elif arg=='--nActs':
527 details.nActs=int(val)
528 elif arg=='--nBits':
529 details.nBits=int(val)
530 elif arg=='--noTitle':
531 details.hasTitle=0
532 elif arg=='--biasList':
533 details.biasList=tuple(eval(val))
534 elif arg=='--topN':
535 details.topN=int(val)
536 elif arg=='-h':
537 Usage()
538 sys.exit(0)
539 else:
540 Usage()
541 if len(extras):
542 if details.dbName:
543 details.tableName=extras[0]
544 else:
545 details.inFileName = extras[0]
546 else:
547 Usage()
548 if __name__=='__main__':
549 import time
550 details = RunDetails()
551 ParseArgs(details)
552 from cStringIO import StringIO
553 suppl = SupplierFromDetails(details)
554
555 cat = None
556 obls = None
557 if details.doBuild:
558 if not suppl:
559 message("We require inData to generate a catalog\n")
560 sys.exit(-2)
561 message("Building catalog\n")
562 t1 = time.time()
563 cat = BuildCatalog(suppl,maxPts=details.numMols,
564 minPath=details.minPath,maxPath=details.maxPath)
565 t2 = time.time()
566 message("\tThat took %.2f seconds.\n"%(t2-t1))
567 if details.catalogName:
568 message("Dumping catalog data\n")
569 cPickle.dump(cat,open(details.catalogName,'wb+'))
570 elif details.catalogName:
571 message("Loading catalog\n")
572 cat = cPickle.load(open(details.catalogName,'rb'))
573 if details.onBitsName:
574 try:
575 obls = cPickle.load(open(details.onBitsName,'rb'))
576 except:
577 obls = None
578 else:
579 if len(obls)<(inD.count('\n')-1):
580 obls = None
581 scores = None
582 if details.doScore:
583 if not suppl:
584 message("We require inData to score molecules\n")
585 sys.exit(-2)
586 if not cat:
587 message("We require a catalog to score molecules\n")
588 sys.exit(-2)
589 message("Scoring compounds\n")
590 if not obls or len(obls)<details.numMols:
591 scores,obls = ScoreMolecules(suppl,cat,maxPts=details.numMols,
592 actName=details.actCol,
593 nActs=details.nActs)
594 if details.scoresName:
595 cPickle.dump(scores,open(details.scoresName,'wb+'))
596 if details.onBitsName:
597 cPickle.dump(obls,open(details.onBitsName,'wb+'))
598 else:
599 scores = ScoreFromLists(obls,suppl,cat,maxPts=details.numMols,
600 actName=details.actCol,
601 nActs=details.nActs)
602 elif details.scoresName:
603 scores = cPickle.load(open(details.scoresName,'rb'))
604
605 if details.fpName and os.path.exists(details.fpName) and not details.doSigs:
606 message("Reading fingerprints from file.\n")
607 fps = cPickle.load(open(details.fpName,'rb'))
608 else:
609 fps = []
610 gains = None
611 if details.doGains:
612 if not suppl:
613 message("We require inData to calculate gains\n")
614 sys.exit(-2)
615 if not (cat or fps):
616 message("We require either a catalog or fingerprints to calculate gains\n")
617 sys.exit(-2)
618 message("Calculating Gains\n")
619 t1 = time.time()
620 if details.fpName:
621 collectFps=1
622 else:
623 collectFps=0
624 if not fps:
625 gains,fps = CalcGains(suppl,cat,topN=details.topN,actName=details.actCol,
626 nActs=details.nActs,biasList=details.biasList,
627 collectFps=collectFps)
628 if details.fpName:
629 message("Writing fingerprint file.\n")
630 tmpF=open(details.fpName,'wb+')
631 cPickle.dump(fps,tmpF,1)
632 tmpF.close()
633 else:
634 gains = CalcGainsFromFps(suppl,fps,topN=details.topN,actName=details.actCol,
635 nActs=details.nActs,biasList=details.biasList)
636 t2=time.time()
637 message("\tThat took %.2f seconds.\n"%(t2-t1))
638 if details.gainsName:
639 outF = open(details.gainsName,'w+')
640 OutputGainsData(outF,gains,cat,nActs=details.nActs)
641 else:
642 if details.gainsName:
643 inF = open(details.gainsName,'r')
644 gains = ProcessGainsData(inF)
645
646
647 if details.doDetails:
648 if not cat:
649 message("We require a catalog to get details\n")
650 sys.exit(-2)
651 if not gains:
652 message("We require gains data to get details\n")
653 sys.exit(-2)
654 io = StringIO()
655 io.write('id,SMILES,gain\n')
656 ShowDetails(cat,gains,nToDo=details.nBits,outF=io)
657 if details.detailsName:
658 open(details.detailsName,'w+').write(io.getvalue())
659 else:
660 sys.stderr.write(io.getvalue())
661