|
Package rdkit ::
Package ML ::
Module ScreenComposite
|
|
1
2
3
4
5
6
7 """ command line utility for screening composite models
8
9 **Usage**
10
11 _ScreenComposite [optional args] modelfile(s) datafile_
12
13 Unless indicated otherwise (via command line arguments), _modelfile_ is
14 a file containing a pickled composite model and _datafile_ is a QDAT file.
15
16 **Command Line Arguments**
17
18 - -t *threshold value(s)*: use high-confidence predictions for the final
19 analysis of the hold-out data. The threshold value can be either a single
20 float or a list/tuple of floats. All thresholds should be between
21 0.0 and 1.0
22
23 - -D: do a detailed screen.
24
25 - -d *database name*: instead of reading the data from a QDAT file,
26 pull it from a database. In this case, the _datafile_ argument
27 provides the name of the database table containing the data set.
28
29 - -N *note*: use all models from the database which have this note.
30 The modelfile argument should contain the name of the table
31 with the models.
32
33 - -H: screen only the hold out set (works only if a version of
34 BuildComposite more recent than 1.2.2 was used).
35
36 - -T: screen only the training set (works only if a version of
37 BuildComposite more recent than 1.2.2 was used).
38
39 - -E: do a detailed Error analysis. This shows each misclassified
40 point and the number of times it was missed across all screened
41 composites. If the --enrich argument is also provided, only compounds
42 that have true activity value equal to the enrichment value will be
43 used.
44
45 - --enrich *enrichVal*: target "active" value to be used in calculating
46 enrichments.
47
48 - -A: show All predictions.
49
50 - -S: shuffle activity values before screening
51
52 - -R: randomize activity values before screening
53
54 - -F *filter frac*: filters the data before training to change the
55 distribution of activity values in the training set. *filter frac*
56 is the fraction of the training set that should have the target value.
57 **See note in BuildComposite help about data filtering**
58
59 - -v *filter value*: filters the data before training to change the
60 distribution of activity values in the training set. *filter value*
61 is the target value to use in filtering.
62 **See note in BuildComposite help about data filtering**
63
64 - -V: be verbose when screening multiple models
65
66 - -h: show this message and exit
67
68 - -X: send a summary of the results to Excel (NOTE: this will alter the
69 contents of the currently active workbook)
70
71 - --OOB: Do an "out-of-bag" generalization error estimate. This only
72 makes sense when applied to the original data set.
73
74 - --pickleCol *colId*: index of the column containing a pickled value
75 (used primarily for cases where fingerprints are used as descriptors)
76
77 *** Options for making Prediction (Hanneke) Plots ***
78
79 - --predPlot=<fileName>: triggers the generation of a Hanneke plot and
80 sets the name of the .txt file which will hold the output data.
81 A Gnuplot control file, <fileName>.gnu, will also be generated.
82
83 - --predActTable=<name> (optional): name of the database table
84 containing activity values. If this is not provided, activities
85 will be read from the same table containing the screening data
86
87 - --predActCol=<name> (optional): name of the activity column. If not
88 provided, the name of the last column in the activity table will
89 be used.
90
91 - --predLogScale (optional): If provided, the x axis of the
92 prediction plot (the activity axis) will be plotted using a log
93 scale
94
95 - --predShow: launch a gnuplot instance and display the prediction
96 plot (the plot will still be written to disk).
97
98 *** The following options are likely obsolete ***
99
100 - -P: read pickled data. The datafile argument should contain
101 a pickled data set. *relevant only to qdat files*
102
103 - -q: data are not quantized (the composite should take care of
104 quantization itself if it requires quantized data). *relevant only to
105 qdat files*
106
107
108
109 """
110 from rdkit import RDConfig
111 from rdkit import DataStructs
112 import sys,cPickle,types,copy
113 import numpy
114
115 try:
116 from PIL import Image,ImageDraw
117 except ImportError:
118 hasPil=0
119 else:
120 hasPil=1
121
122 from rdkit.ML.Data import DataUtils,SplitData
123 from rdkit.ML import CompositeRun
124 from rdkit.Dbase.DbConnection import DbConnect
125 from rdkit.Dbase import DbModule
126 _details = CompositeRun.CompositeRun()
127 try:
128 from rdkit.Excel.ExcelWrapper import ExcelWrapper as Excel
129 except ImportError:
130 Excel = None
131
132 __VERSION_STRING="3.3.0"
133
135 """ emits messages to _sys.stdout_
136 override this in modules which import this one to redirect output
137
138 **Arguments**
139
140 - msg: the string to be displayed
141
142 """
143 if noRet:
144 sys.stdout.write('%s '%(msg))
145 else:
146 sys.stdout.write('%s\n'%(msg))
148 """ emits messages to _sys.stderr_
149 override this in modules which import this one to redirect output
150
151 **Arguments**
152
153 - msg: the string to be displayed
154
155 """
156 sys.stderr.write('ERROR: %s\n'%(msg))
157
159 if tgt<0 or tgt>=mat.shape[0]: return 0
160 nPts = float(sum(sum(mat)))
161 nTgtPred = float(sum(mat[:,tgt]))
162 if nTgtPred:
163 pctCorrect = mat[tgt,tgt]/nTgtPred
164 nTgtReal = float(sum(mat[tgt,:]))
165 pctOverall = nTgtReal/nPts
166 else:
167 return 0.0
168 return pctCorrect/pctOverall
169
170
def CollectResults(indices,dataSet,composite,callback=None,appendExamples=0,
                   errorEstimate=0):
    """ screens a set of examples through a composite and returns the
    results

    **Arguments**

      - indices: a sequence of indices into _dataSet_ selecting the
        examples to be screened

      - dataSet: the data set, indexable by the values in _indices_.
        Unless the composite has activity quantization bounds, the last
        element of each example is taken to be its "value"

      - composite: the composite model to be used

      - callback: (optional) if provided, this should be a function
        taking a single argument that is called after each example is
        screened with the position (in _indices_) of the example that
        was just screened.

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method.

      - errorEstimate: (optional) calculate the "out of bag" error
        estimate for the composite using Breiman's definition.  This
        only makes sense when screening the original data set!
        [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
        Statistics Technical Report (1996)]

    **Returns**

      a list of 3-tuples, one per entry in _indices_:

        1) answer: the value from the example

        2) pred: the composite model's prediction

        3) conf: the confidence of the composite

    **Notes**

      - as a side effect, any model whose _trainIndices attribute is not
        already a dictionary has it replaced with a dictionary keyed by
        the training indices

    """
    nModels = len(composite)

    # for speed, cache each model's training indices as a dictionary so that
    # the out-of-bag membership test below is a constant-time lookup
    for j in range(nModels):
        mdl = composite.GetModel(j)
        if hasattr(mdl, '_trainIndices') and not isinstance(mdl._trainIndices, dict):
            mdl._trainIndices = dict.fromkeys(mdl._trainIndices, 1)

    res = []
    for i, idx in enumerate(indices):
        example = dataSet[idx]
        if errorEstimate:
            # out-of-bag: only let models vote that were NOT trained on this point
            use = [j for j in range(nModels)
                   if not composite.GetModel(j)._trainIndices.get(idx, 0)]
        else:
            use = None

        pred, conf = composite.ClassifyExample(example, appendExample=appendExamples,
                                               onlyModels=use)
        if composite.GetActivityQuantBounds():
            answer = composite.QuantizeActivity(example)[-1]
        else:
            answer = example[-1]
        res.append((answer, pred, conf))
        if callback:
            callback(i)
    return res
244
def DetailedScreen(indices,data,composite,threshold=0,screenResults=None,
                   goodVotes=None,badVotes=None,noVotes=None,callback=None,
                   appendExamples=0,errorEstimate=0):
    """ screens a set of examples across a composite and sorts the
    predictions into *correct*, *incorrect* and *unclassified* sets.

    **Arguments**

      - indices: the indices (into _data_) of the examples to be screened

      - data: the data set holding the examples

      - composite: the composite model to be used

      - threshold: (optional) a prediction is only kept if its confidence
        is strictly greater than this value

      - screenResults: (optional) previously collected screening results
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.  Each entry
        added is a 4-tuple: (answer, prediction, confidence, position
        of the result in the screening results)

      - callback: (optional) passed on to _CollectResults()_

      - appendExamples: (optional) passed on to _CollectResults()_

      - errorEstimate: (optional) passed on to _CollectResults()_
        (toggles the "out of bag" error estimate)

    **Notes**

      - since this function doesn't return anything, if one or more of
        the arguments _goodVotes_, _badVotes_, and _noVotes_ is not
        provided, there's not much reason to call it

    """
    if screenResults is None:
        screenResults = CollectResults(indices, data, composite, callback=callback,
                                       appendExamples=appendExamples,
                                       errorEstimate=errorEstimate)

    # fall back to throw-away lists for any bucket the caller did not supply
    correct = [] if goodVotes is None else goodVotes
    wrong = [] if badVotes is None else badVotes
    skipped = [] if noVotes is None else noVotes

    for pos, (answer, pred, conf) in enumerate(screenResults):
        if not conf > threshold:
            bucket = skipped
        elif pred != answer:
            bucket = wrong
        else:
            bucket = correct
        bucket.append((answer, pred, conf, pos))
307
def ShowVoteResults(indices,data,composite,nResultCodes,threshold,verbose=1,
                    screenResults=None,callback=None,appendExamples=0,
                    goodVotes=None,badVotes=None,noVotes=None,
                    errorEstimate=0):
    """ screens the results and shows a detailed workup

    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

    **Arguments**

      - indices: the indices (into _data_) of the examples to be screened

      - data: the data set holding the examples

      - composite: the composite model to be used

      - nResultCodes: the number of possible results the composite can
        return (sets the size of the results table)

      - threshold: the threshold to be used to decide whether or not a
        given prediction should be kept

      - verbose: (optional) if set, a summary and the results table are
        written out

      - screenResults: (optional) previously collected screening results
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - callback: (optional) if provided, this should be a function
        taking a single argument that is called after each example is
        screened.

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.

      - errorEstimate: (optional) calculate the "out of bag" error
        estimate for the composite using Breiman's definition.  This
        only makes sense when screening the original data set!
        [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
        Statistics Technical Report (1996)]

    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) the results table (indexed [answer, prediction])

    """
    def _pct(count, total):
        # percentage that reports 0 instead of dividing by an empty total
        if not total:
            return 0.0
        return 100. * float(count) / total

    def _avgConf(votes):
        # mean confidence (third tuple element) of a vote list, 0 if empty
        if not votes:
            return 0.
        return sum(v[2] for v in votes) / float(len(votes))

    nExamples = len(indices)
    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []
    DetailedScreen(indices, data, composite, threshold, screenResults=screenResults,
                   goodVotes=goodVotes, badVotes=badVotes, noVotes=noVotes,
                   callback=callback, appendExamples=appendExamples,
                   errorEstimate=errorEstimate)
    nBad = len(badVotes)
    nGood = len(goodVotes)
    nSkip = len(noVotes)
    nClassified = nGood + nBad

    if verbose:
        print('\n\t*** Vote Results ***')
        print('misclassified: %d/%d (%%%4.2f)\t%d/%d (%%%4.2f)' % (
            nBad, nExamples, _pct(nBad, nExamples),
            nBad, nClassified, _pct(nBad, nClassified)))
        if nSkip > 0:
            print('skipped: %d/%d (%%% 4.2f)' % (nSkip, nExamples, _pct(nSkip, nExamples)))

    avgSkip = _avgConf(noVotes)
    avgBad = _avgConf(badVotes)
    avgGood = _avgConf(goodVotes)

    if verbose:
        print('')
        print('average correct confidence: % 6.4f' % avgGood)
        print('average incorrect confidence: % 6.4f' % avgBad)

    # rows are the true answers, columns the predictions
    voteTab = numpy.zeros((nResultCodes, nResultCodes), int)
    for ans, pred, conf, idx in goodVotes:
        voteTab[pred, pred] += 1
    for ans, pred, conf, idx in badVotes:
        voteTab[ans, pred] += 1

    if verbose:
        print('')
        print('\tResults Table:')
        # the table is displayed transposed: one row per predicted value
        vTab = numpy.transpose(voteTab)
        # numpy.sum is required here: the builtin sum() would treat the
        # second argument as a start value rather than an axis
        colCounts = numpy.sum(vTab, 0)
        rowCounts = numpy.sum(vTab, 1)
        message('')
        for i in range(nResultCodes):
            if rowCounts[i] == 0:
                rowCounts[i] = 1
            message(' ', noRet=1)
            for j in range(nResultCodes):
                message(' % 6d' % vTab[i][j], noRet=1)
            message(' | % 4.2f' % (100. * vTab[i, i] / rowCounts[i]))
        message(' ', noRet=1)
        for i in range(nResultCodes):
            message('-------', noRet=1)
        message('')
        message(' ', noRet=1)
        for i in range(nResultCodes):
            if colCounts[i] == 0:
                colCounts[i] = 1
            message(' % 6.2f' % (100. * vTab[i, i] / colCounts[i]), noRet=1)
        message('')

    return nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab
455
def ScreenIt(composite,indices,data,partialVote=0,voteTol=0.0,verbose=1,screenResults=None,
             goodVotes=None,badVotes=None,noVotes=None):
    """ screens a set of data using a composite model and prints out
    statistics about the screen.

    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

    **Arguments**

      - composite: the composite model to be used

      - indices: the indices (into _data_) of the examples to be screened

      - data: the data set holding the examples

      - partialVote: (optional) toggles use of the threshold value in
        the screening.  When this is not set, _voteTol_ is ignored and
        a threshold of 0.0 is used.

      - voteTol: (optional) the threshold to be used to decide whether or not a
        given prediction should be kept

      - verbose: (optional) if set, statistics are printed

      - screenResults: (optional) previously collected screening results
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.


    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) None

    """
    def _pct(count, total):
        # percentage that reports 0 instead of dividing by an empty total
        if not total:
            return 0.0
        return 100. * float(count) / total

    def _avg(accum, count):
        # average that reports 0 instead of dividing by an empty count
        if not count:
            return 0.0
        return accum / count

    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []

    if not partialVote:
        voteTol = 0.0

    DetailedScreen(indices, data, composite, voteTol, screenResults=screenResults,
                   goodVotes=goodVotes, badVotes=badVotes, noVotes=noVotes)

    nGood = len(goodVotes)
    goodAccum = sum((vote[2] for vote in goodVotes), 0.)

    misCount = len(badVotes)
    badAccum = sum((vote[2] for vote in badVotes), 0.)

    nSkipped = len(noVotes)
    skipAccum = sum((vote[2] for vote in noVotes), 0.)
    # split the skipped votes by whether the prediction would have been right
    badSkipped = len([1 for ans, pred, conf, idx in noVotes if ans != pred])
    goodSkipped = nSkipped - badSkipped

    nData = nGood + misCount + nSkipped
    if verbose:
        print('Total N Points: %s' % (nData,))
        if partialVote:
            nCounted = nData - nSkipped
            print('Misclassifications: %d (%%%4.2f)' % (misCount, _pct(misCount, nCounted)))
            print('N Skipped: %d (%%%4.2f)' % (nSkipped, _pct(nSkipped, nData)))
            print('\tGood Votes Skipped: %d (%%%4.2f)' % (goodSkipped,
                                                          _pct(goodSkipped, nSkipped)))
            print('\tBad Votes Skipped: %d (%%%4.2f)' % (badSkipped,
                                                         _pct(badSkipped, nSkipped)))
        else:
            print('Misclassifications: %d (%%%4.2f)' % (misCount, _pct(misCount, nData)))
            # the guards matter here: a screen with no misclassifications
            # used to die with a ZeroDivisionError on the last line
            print('Average Correct Vote Confidence: % 6.4f' % (_avg(goodAccum,
                                                                    nData - misCount)))
            print('Average InCorrect Vote Confidence: % 6.4f' % (_avg(badAccum, misCount)))

    avgGood = 0
    avgBad = 0
    avgSkip = 0
    if nGood:
        avgGood = goodAccum / nGood
    if misCount:
        avgBad = badAccum / misCount
    if nSkipped:
        avgSkip = skipAccum / nSkipped
    return nGood, misCount, nSkipped, avgGood, avgBad, avgSkip, None
568
570 """ *Internal Use Only*
571
572 converts a list of 4 tuples: (answer,prediction,confidence,idx) into
573 an alternate list: (answer,prediction,confidence,data point)
574
575 **Arguments**
576
577 - votes: a list of 4 tuples: (answer, prediction, confidence,
578 index)
579
580 - data: a _DataUtils.MLData.MLDataSet_
581
582
583 **Note**: alterations are done in place in the _votes_ list
584
585 """
586 for i in range(len(votes)):
587 ans,pred,conf,idx = votes[i]
588 votes[i] = (ans,pred,conf,data[idx])
589
591 if (hasattr(details,'doHoldout') and details.doHoldout) or \
592 (hasattr(details,'doTraining') and details.doTraining):
593 try:
594 splitF = model._splitFrac
595 except AttributeError:
596 pass
597 else:
598 if verbose:
599 message('s',noRet=1)
600
601 if hasattr(details,'errorEstimate') and details.errorEstimate and \
602 hasattr(details,'doHoldout') and details.doHoldout:
603 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
604 message('****** WARNING: OOB screening should not be combined with doHoldout option.')
605 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
606 trainIdx,testIdx = SplitData.SplitIndices(data.GetNPts(),splitF,silent=1)
607
608 if hasattr(details,'filterFrac') and details.filterFrac != 0.0:
609 if verbose:
610 message('f',noRet=1)
611 trainFilt,temp = DataUtils.FilterData(data,details.filterVal,
612 details.filterFrac,-1,
613 indicesToUse=trainIdx,
614 indicesOnly=1)
615 testIdx += temp
616 trainIdx = trainFilt
617 elif hasattr(details,'errorEstimate') and details.errorEstimate:
618
619
620 if hasattr(details,'filterFrac') and details.filterFrac != 0.0:
621 if verbose:
622 message('f',noRet=1)
623 testIdx,trainIdx = DataUtils.FilterData(data,details.filterVal,
624 details.filterFrac,-1,
625 indicesToUse=range(data.GetNPts()),
626 indicesOnly=1)
627 testIdx.extend(trainIdx)
628 else:
629 testIdx = range(data.GetNPts())
630 trainIdx = []
631 else:
632 testIdx = range(data.GetNPts())
633 trainIdx = []
634 if hasattr(details,'doTraining') and details.doTraining:
635 testIdx,trainIdx = trainIdx,testIdx
636 return trainIdx,testIdx
637
def ScreenFromDetails(models,details,callback=None,setup=None,appendExamples=0,
                      goodVotes=None,badVotes=None,noVotes=None,data=None,
                      enrichments=None):
    """  Screens a set of data using a _CompositeRun.CompositeRun_
    instance to provide parameters

    Unless _data_ is provided, the data to be used are obtained by
    calling _details.GetDataSet()_

    Aside from dataset construction, _ShowVoteResults()_ does most of
    the heavy lifting here.

    **Arguments**

      - models: a composite model or a list/tuple of composite models

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - callback: (optional) if provided, this should be a function
        taking a single argument that is called after each example is
        screened.  When several models are screened the argument is
        offset by (model number)*(number of points in the data set).

      - setup: (optional) a function taking a single argument which is
        called at the start of screening with
        (number of models)*(number of points in the data set) as the
        argument.

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.  These are
        only filled when a single model is screened; with more than
        one model fresh lists are used for each model.

      - data: (optional) the data set to be screened

      - enrichments: (optional) a list, at least as long as the number
        of models, which is filled with the enrichment of each model
        when _details.enrichTgt_ is set and non-negative


    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) the results table

      When more than one model is screened, elements 1-6 are each
      (average, deviation) 2-tuples taken across the models and the
      results table is averaged over the models.

    """
    if data is None:
        if hasattr(details, 'pickleCol'):
            data = details.GetDataSet(pickleCol=details.pickleCol,
                                      pickleClass=DataStructs.ExplicitBitVect)
        else:
            data = details.GetDataSet()

    if not isinstance(models, (list, tuple)):
        models = (models,)
    nModels = len(models)

    if setup is not None:
        setup(nModels * data.GetNPts())

    # the builtin float is used as the dtype: the numpy.float alias is no
    # longer available in current NumPy releases
    nGood = numpy.zeros(nModels, float)
    nBad = numpy.zeros(nModels, float)
    nSkip = numpy.zeros(nModels, float)
    confGood = numpy.zeros(nModels, float)
    confBad = numpy.zeros(nModels, float)
    confSkip = numpy.zeros(nModels, float)
    voteTab = None
    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []
    if enrichments is None:
        enrichments = [0.0] * nModels
    # per-point miss/skip tallies for the error analysis; they are
    # accumulated here but not part of the return value
    badVoteDict = {}
    noVoteDict = {}

    for i in range(nModels):
        if nModels > 1:
            # votes are collected per model, so start from clean lists
            goodVotes = []
            badVotes = []
            noVotes = []
        model = models[i]

        try:
            seed = model._randomSeed
        except AttributeError:
            pass
        else:
            DataUtils.InitRandomNumbers(seed)

        doShuffle = hasattr(details, 'shuffleActivities') and details.shuffleActivities
        doRandom = hasattr(details, 'randomActivities') and details.randomActivities
        if doShuffle or doRandom:
            if doShuffle:
                shuffle = 1
            else:
                shuffle = 0
            randomize = 1
            DataUtils.RandomizeActivities(data, shuffle=shuffle,
                                          runDetails=details)
        else:
            randomize = 0
            shuffle = 0

        if hasattr(model, '_shuffleActivities') and \
           model._shuffleActivities and \
           not shuffle:
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
            message('****** WARNING: Shuffled model being screened with unshuffled data.')
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        if hasattr(model, '_randomizeActivities') and \
           model._randomizeActivities and \
           not randomize:
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
            message('****** WARNING: Random model being screened with non-random data.')
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')

        trainIdx, testIdx = PrepareDataFromDetails(model, details, data)

        nPossible = model.GetQuantBounds()[1]
        if callback:
            # bind the callback and this model's offset as defaults so the
            # lambda does not pick up later values of the loop variable
            cb = lambda x, y=callback, z=i * data.GetNPts(): y(x + z)
        else:
            cb = None
        if not hasattr(details, 'errorEstimate') or not details.errorEstimate:
            errorEstimate = 0
        else:
            errorEstimate = 1
        g, b, s, aG, aB, aS, vT = ShowVoteResults(testIdx, data, model, nPossible[-1],
                                                  details.threshold, verbose=0,
                                                  callback=cb, appendExamples=appendExamples,
                                                  goodVotes=goodVotes, badVotes=badVotes,
                                                  noVotes=noVotes,
                                                  errorEstimate=errorEstimate)
        if voteTab is None:
            voteTab = numpy.zeros(vT.shape, float)

        useEnrich = hasattr(details, 'enrichTgt') and details.enrichTgt >= 0
        if hasattr(details, 'errorAnalysis') and details.errorAnalysis:
            # when an enrichment target is set, only points whose true
            # activity equals that target are tallied
            for a, p, c, idx in badVotes:
                label = testIdx[idx]
                if not useEnrich or a == details.enrichTgt:
                    badVoteDict[label] = badVoteDict.get(label, 0) + 1
            for a, p, c, idx in noVotes:
                label = testIdx[idx]
                if not useEnrich or a == details.enrichTgt:
                    noVoteDict[label] = noVoteDict.get(label, 0) + 1

        voteTab += vT
        nGood[i] = g
        nBad[i] = b
        nSkip[i] = s
        confGood[i] = aG
        confBad[i] = aB
        confSkip[i] = aS

        if useEnrich:
            enrichments[i] = CalcEnrichment(vT, tgt=details.enrichTgt)

    if nModels == 1:
        return g, b, s, aG, aB, aS, vT

    voteTab /= nModels

    def _avgAndDev(vals):
        # mean and sample standard deviation (n-1 denominator) across models
        avg = sum(vals) / nModels
        dev = numpy.sqrt(sum((vals - avg) ** 2) / (nModels - 1))
        return avg, dev

    return (_avgAndDev(nGood), _avgAndDev(nBad), _avgAndDev(nSkip),
            _avgAndDev(confGood), _avgAndDev(confBad), _avgAndDev(confSkip),
            voteTab)
845
846
848 if not hasPil:
849 return None
850 try:
851 nTot = float(nGood)+float(nBad)+float(nRej)
852 except TypeError:
853 nGood = nGood[0]
854 nBad = nBad[0]
855 nRej = nRej[0]
856 nTot = float(nGood)+float(nBad)+float(nRej)
857
858 if not nTot:
859 return None
860 goodColor = (100,100,255)
861 badColor = (255,100,100)
862 rejColor = (255,255,100)
863
864 pctGood = float(nGood) / nTot
865 pctBad = float(nBad) / nTot
866 pctRej = float(nRej) / nTot
867
868 if size is None:
869 if RDConfig.doingDemo:
870 size = (200,200)
871 else:
872 size = (100,100)
873 img = Image.new('RGB',size,(255,255,255))
874 draw = ImageDraw.Draw(img)
875 box = (0,0,size[0]-1,size[1]-1)
876
877 startP = -90
878 endP = int(startP + pctGood*360)
879 draw.pieslice(box,startP,endP,fill=goodColor)
880 startP = endP
881 endP = int(startP + pctBad*360)
882 draw.pieslice(box,startP,endP,fill=badColor)
883 startP = endP
884 endP = int(startP + pctRej*360)
885 draw.pieslice(box,startP,endP,fill=rejColor)
886
887 return img
888
889
def ScreenToHtml(nGood,nBad,nRej,avgGood,avgBad,avgSkip,voteTable,imgDir='.',
                 fullPage=1,skipImg=0,includeDefs=1):
    """ returns the text of a web page showing the screening details

    Each of the first six arguments may either be a single number (results
    for one model) or an (average, deviation) 2-tuple (results across
    several models); the type of _nGood_ decides which form is used.

    **Arguments**

      - nGood: number of correct predictions

      - nBad: number of incorrect predictions

      - nRej: number of rejected predictions

      - avgGood: average correct confidence

      - avgBad: average incorrect confidence

      - avgSkip: average rejected confidence

      - voteTable: vote table (a square 2D array indexed
        [original, predicted])

      - imgDir: (optional) the directory to be used to hold the vote
        image (if constructed)

      - fullPage: (optional) if set, the output is wrapped in
        <html><body> tags and given a heading

      - skipImg: (optional) if set, no vote image is generated

      - includeDefs: (optional) if set, definitions of "% Pure" and
        "% Accurate" are appended to the output

    **Returns**

      a string containing HTML

    **Notes**

      - unless _skipImg_ is set, the vote image (when one can be
        generated) is written to votes.png in _imgDir_

    """
    multModels = isinstance(nGood, tuple)

    if fullPage:
        outTxt = ["""<html><body>"""]
        outTxt.append('<center><h2>VOTE DETAILS</h2></center>')
    else:
        outTxt = []

    if RDConfig.doingDemo:
        outTxt.append('<font size="+2">')
    else:
        outTxt.append('<font>')

    if not skipImg:
        img = GetScreenImage(nGood, nBad, nRej)
        if img:
            if imgDir:
                imgFileName = '/'.join((imgDir, 'votes.png'))
            else:
                imgFileName = 'votes.png'
            img.save(imgFileName)
            outTxt.append('<center><img src="%s"></center>' % (imgFileName))

    nPoss = len(voteTable)
    # numpy.sum is required here: the builtin sum() would treat the second
    # argument as a start value rather than an axis
    pureCounts = numpy.sum(voteTable, 1)  # number of points per original value
    accCounts = numpy.sum(voteTable, 0)   # number of points per predicted value
    pureVect = numpy.zeros(nPoss, float)
    accVect = numpy.zeros(nPoss, float)
    for i in range(nPoss):
        if pureCounts[i]:
            pureVect[i] = float(voteTable[i, i]) / pureCounts[i]
        if accCounts[i]:
            accVect[i] = float(voteTable[i, i]) / accCounts[i]

    outTxt.append('<center><table border=1>')
    outTxt.append('<tr><td></td>')
    for i in range(nPoss):
        outTxt.append('<th>%d</th>' % i)
    outTxt.append('<th>% Accurate</th>')
    outTxt.append('</tr>')

    # averaged tables hold fractional counts, single-model tables integers
    if multModels:
        cellFmt = '%.2f'
    else:
        cellFmt = '%d'
    for i in range(nPoss):
        outTxt.append('<tr><th>%d</th>' % (i))
        for j in range(nPoss):
            if i == j:
                outTxt.append(('<td bgcolor="#A0A0FF">' + cellFmt + '</td>') % (voteTable[j, i]))
            else:
                outTxt.append(('<td>' + cellFmt + '</td>') % (voteTable[j, i]))
        # the row is closed below, after the optional "Predicted" label cell
        outTxt.append('<td>%4.2f</td>' % (100.0 * accVect[i]))
        if i == 0:
            outTxt.append('<th rowspan=%d>Predicted</th></tr>' % (nPoss))
        else:
            outTxt.append('</tr>')
    outTxt.append('<tr><th>% Pure</th>')
    for i in range(nPoss):
        outTxt.append('<td>%4.2f</td>' % (100.0 * pureVect[i]))
    outTxt.append('</tr>')
    outTxt.append('<tr><td></td><th colspan=%d>Original</th>' % (nPoss))
    outTxt.append('</table></center>')

    if not multModels:
        nTotal = nBad + nGood + nRej
        nClass = nBad + nGood
        if nClass:
            pctErr = 100. * float(nBad) / nClass
        else:
            pctErr = 0.0

        outTxt.append('<p>%d of %d examples were misclassified (%%%4.2f)' % (nBad, nGood + nBad,
                                                                            pctErr))
        if nRej > 0:
            pctErr = 100. * float(nBad) / (nGood + nBad + nRej)
            outTxt.append('<p> %d of %d overall: (%%%4.2f)' % (nBad, nTotal, pctErr))
            pctRej = 100. * float(nRej) / nTotal
            outTxt.append('<p>%d of %d examples were rejected (%%%4.2f)' % (nRej, nTotal, pctRej))
        if nGood != 0:
            outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f' % avgGood)

        if nBad != 0:
            outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f' % avgBad)
        if nRej != 0:
            outTxt.append('<p>The rejected examples had an average confidence of %6.4f' % avgSkip)
    else:
        # every quantity is an (average, deviation) pair here, so the tests
        # below look at the average rather than at the tuple itself
        nTotal = nBad[0] + nGood[0] + nRej[0]
        nClass = nBad[0] + nGood[0]
        devClass = nBad[1] + nGood[1]
        if nClass:
            pctErr = 100. * float(nBad[0]) / nClass
            devPctErr = 100. * float(nBad[1]) / nClass
        else:
            pctErr = 0.0
            devPctErr = 0.0

        outTxt.append('<p>%.2f(%.2f) of %.2f(%.2f) examples were misclassified (%%%4.2f(%4.2f))' %
                      (nBad[0], nBad[1], nClass, devClass, pctErr, devPctErr))
        if nRej[0] > 0:
            pctErr = 100. * float(nBad[0]) / nTotal
            devPctErr = 100. * float(nBad[1]) / nTotal
            outTxt.append('<p> %.2f(%.2f) of %d overall: (%%%4.2f(%4.2f))' %
                          (nBad[0], nBad[1], nTotal, pctErr, devPctErr))
            pctRej = 100. * float(nRej[0]) / nTotal
            devPctRej = 100. * float(nRej[1]) / nTotal
            outTxt.append('<p>%.2f(%.2f) of %d examples were rejected (%%%4.2f(%4.2f))' %
                          (nRej[0], nRej[1], nTotal, pctRej, devPctRej))
        if nGood[0] != 0:
            outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f(%.4f)' % avgGood)

        if nBad[0] != 0:
            outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f(%.4f)' % avgBad)
        if nRej[0] != 0:
            outTxt.append('<p>The rejected examples had an average confidence of %6.4f(%.4f)' % avgSkip)

    outTxt.append('</font>')
    if includeDefs:
        txt = """
 <p><b>Definitions:</b>
 <ul>
 <li> <i>% Pure:</i> The percentage of, for example, known positives predicted to be positive.
 <li> <i>% Accurate:</i> The percentage of, for example, predicted positives that actually
 are positive.
 </ul>
 """
        outTxt.append(txt)

    if fullPage:
        outTxt.append("""</body></html>""")
    return '\n'.join(outTxt)
1057
1058
def MakePredPlot(details,indices,data,goodVotes,badVotes,nRes,idCol=0,verbose=0):
  """ writes a gnuplot data file and a gnuplot script plotting the confidence
  of each prediction against an activity value pulled from a database

  **Arguments**

    - details: a CompositeRun.RunDetails object

    - indices: a sequence of integer indices into _data_

    - data: the data set in question. We assume that the ids for
      the data points are in the _idCol_ column

    - goodVotes/badVotes: predictions where the model was correct/incorrect.
      These are sequences of 4-tuples:
        (answer,prediction,confidence,index into _indices_)
      Either may be None, in which case it is treated as empty.

    - nRes: the number of plot series to write; series _i_ shows the points
      whose prediction equals _i_

    - idCol: the column (in _data_ and in the database table) holding ids

    - verbose: if nonzero, progress messages are emitted

  **Notes**

    - nothing happens unless _details.predPlot_ is set. When it is, the data
      go to that file and the script goes to _details.predPlot_ + '.gnu'

    - points with no activity value in the database (a missing row, None,
      or the string 'None') are written with an activity of 0

    - if _details.predShow_ is set, an attempt is made to display the plot
      using the Gnuplot module; failures there are reported, not raised.

  """
  if not getattr(details,'predPlot',None):
    return

  if verbose: message('\n-> Constructing Prediction (Hanneke) Plot')

  ptIds = [data[x][idCol] for x in indices]

  origConn = DbConnect(details.dbName,details.tableName,
                       user=details.dbUser,password=details.dbPassword)
  idName = origConn.GetColumnNames()[idCol]
  actTable = getattr(details,'predActTable',None)
  if not actTable or actTable==details.tableName:
    actConn = origConn
  else:
    actConn = DbConnect(details.dbName,actTable,
                        user=details.dbUser,password=details.dbPassword)
  if verbose: message('\t-> Pulling Activity Data')

  # the ids are passed to the query as strings
  if ptIds and type(ptIds[0]) not in (type(''),type(u'')):
    ptIds = [str(x) for x in ptIds]
  actColName = getattr(details,'predActCol',None)
  if not actColName:
    actColName = actConn.GetColumnNames()[-1]

  if ptIds:
    whereTxt = "%s in (%s)"%(idName,','.join([DbModule.placeHolder]*len(ptIds)))
    rawD = actConn.GetData(fields='%s,%s'%(idName,actColName),
                           where=whereTxt,extras=ptIds)
  else:
    # an "in ()" clause is not valid SQL, so skip the query entirely
    rawD = []

  if verbose: message('\t-> Creating Plot')
  # map each id to every position it occupies, so that lookups are O(1) and
  # repeated ids all receive their activity value
  positions = {}
  for pos,ptId in enumerate(ptIds):
    positions.setdefault(ptId,[]).append(pos)
  acts = [None]*len(ptIds)
  for rowId,act in rawD:
    hits = positions.get(rowId)
    if hits is None:
      # the database may hand back ids in their native (non-string) type
      hits = positions.get(str(rowId),())
    for pos in hits:
      acts[pos] = act

  # the output files are only opened once the database work has succeeded
  outF = open(details.predPlot,'w+')
  try:
    outF.write('#ID Pred Conf %s\n'%(actColName))
    for votes in (goodVotes,badVotes):
      for ans,pred,conf,idx in (votes or ()):
        act = acts[idx]
        if act is None or act=='None':
          act = 0
        else:
          act = float(act)
        outF.write('%s %d %.4f %f\n'%(ptIds[idx],pred,conf,act))
  finally:
    outF.close()

  logScale = getattr(details,'predLogScale',None)
  if logScale:
    actLabel = 'log(%s)'%(actColName)
    plotFmt = "'%s' us (log10($4)):($2==%d?$3:0/0)"
  else:
    actLabel = actColName
    plotFmt = "'%s' us 4:($2==%d?$3:0/0)"
  actLabel = actLabel.replace('_',' ')

  gnuLines = [
    "# Generated by ScreenComposite.py version: %s"%(__VERSION_STRING),
    "set size square 0.7",
    "set yrange [:1]",
    "set data styl points",
    "set ylab 'confidence'",
    "set xlab '%s'"%(actLabel),
    "set grid",
    "set nokey",
    'set term postscript enh color solid "Helvetica" 16',
    "set term X",
    ]
  plots = [plotFmt%(details.predPlot,i) for i in range(nRes)]

  gnuF = open('%s.gnu'%details.predPlot,'w+')
  try:
    gnuF.write('\n'.join(gnuLines)+'\n')
    gnuF.write("plot %s\n"%(','.join(plots)))
    gnuF.write("\n# EOF\n")
  finally:
    gnuF.close()

  if getattr(details,'predShow',None):
    # displaying the plot is best-effort: report problems and carry on
    try:
      import os
      from Gnuplot import Gnuplot
      try:
        prompt = raw_input
      except NameError:
        prompt = input
      p = Gnuplot()
      p('cd "%s"'%(os.getcwd()))
      p('load "%s.gnu"'%(details.predPlot))
      prompt('press return to continue...\n')
    except Exception:
      import traceback
      traceback.print_exc()
1175
1176
1177
1180
1194
def Usage():
  """ prints a list of arguments for when this is used from the
  command line and then exits

  The text printed is the module docstring; the exit status is -1.

  """
  # NOTE(review): the def line was missing from the listing this was recovered
  # from; the signature is reconstructed from the zero-argument call sites.
  print(__doc__)
  sys.exit(-1)
1202
def ShowVersion(includeArgs=0):
  """ prints the version number of the program

  **Arguments**

    - includeArgs: if nonzero, the command line (sys.argv) is printed too

  """
  # NOTE(review): the def line was missing from the listing this was recovered
  # from; the parameter name comes from the call ShowVersion(includeArgs=1),
  # the default value of 0 is an assumption -- confirm.
  print('This is ScreenComposite.py version %s'%(__VERSION_STRING))
  if includeArgs:
    import sys
    print('command line was:')
    print(' '.join(sys.argv))
1212
def ParseArgs(details):
  """ parses the command line (sys.argv) and stores the results in _details_

  **Arguments**

    - details: the run-details object to be updated; option values are
      stored on it as attributes

  **Returns**

    the list of non-option arguments (there is always at least one)

  **Notes**

    - on an unparseable command line, an unrecognized option, -h, or a
      command line with no non-option arguments, Usage() is called (which
      exits)

    - a voting threshold outside [0,1] is reported via error() and the
      program exits with status -2

  """
  # NOTE(review): the def line was missing from the listing this was recovered
  # from; the signature is reconstructed from the call ParseArgs(details).
  import getopt
  try:
    args,extras = getopt.getopt(sys.argv[1:],'EDd:t:VN:HThSRF:v:AX',
                                ['predPlot=','predActCol=','predActTable=',
                                 'predLogScale','predShow',
                                 'OOB','pickleCol=','enrich=',
                                 ])
  except getopt.GetoptError:
    import traceback
    traceback.print_exc()
    Usage()

  # defaults for everything that is only ever set from the command line
  details.reportToExcel=0
  details.predPlot=''
  details.predActCol=''
  details.predActTable=''
  details.predLogScale=''
  details.predShow=0
  details.errorEstimate=0
  details.pickleCol=-1
  details.enrichTgt=-1
  for arg,val in args:
    if arg == '-d':
      details.dbName = val
    elif arg == '-D':
      details.detailedScreen = 1
    elif arg == '-t':
      details.partialVote = 1
      # NOTE(review): eval() of a command-line value executes arbitrary
      # code; acceptable only because the user already controls the process.
      voteTol = eval(val)
      if not isinstance(voteTol,(list,tuple)):
        voteTol = [voteTol]
      for tol in voteTol:
        if tol > 1 or tol < 0:
          error('Voting threshold must be between 0 and 1')
          sys.exit(-2)
      details.screenVoteTol=voteTol
    elif arg == '-N':
      details.note=val
    elif arg == '-H':
      details.doTraining=0
      details.doHoldout=1
    elif arg == '-T':
      details.doHoldout=0
      details.doTraining=1
    elif arg == '-E':
      details.errorAnalysis=1
      details.detailedScreen=1
    elif arg == '-A':
      details.showAll=1
      details.detailedScreen=1
    elif arg == '-S':
      details.shuffleActivities=1
    elif arg == '-R':
      details.randomActivities=1
    elif arg == '-h':
      Usage()
    elif arg == '-F':
      details.filterFrac=float(val)
    elif arg == '-v':
      details.filterVal=float(val)
    elif arg == '-V':
      # the screening code reads details.verbose; the flag used to be stored
      # in a local variable that nothing ever looked at
      details.verbose=1
    elif arg == '-X':
      if Excel is not None:
        details.reportToExcel = 1
        details.detailedScreen=1
      else:
        message('NOTE: Excel support not enabled, -X option ignored.')
    elif arg == '--predPlot':
      details.detailedScreen=1
      details.predPlot=val
    elif arg == '--predActCol':
      details.predActCol=val
    elif arg == '--predActTable':
      details.predActTable=val
    elif arg == '--predLogScale':
      details.predLogScale=1
    elif arg == '--predShow':
      details.predShow=1
    elif arg == '--OOB':
      details.errorEstimate=1
    elif arg == '--pickleCol':
      # the command line is 1-based, the stored value is 0-based
      details.pickleCol=int(val)-1
    elif arg == '--enrich':
      details.enrichTgt=int(val)
    else:
      Usage()

  if len(extras) < 1:
    Usage()
  return extras
1309
1310
def _MeanAndDev(vals):
  """ returns the mean and the sample standard deviation (n-1 denominator)
  of a sequence of numbers; the deviation of a single value is 0.0

  """
  vals = numpy.asarray(vals,float)
  nVals = len(vals)
  mean = numpy.sum(vals)/nVals
  if nVals > 1:
    dev = numpy.sqrt(numpy.sum((vals-mean)**2)/(nVals-1))
  else:
    dev = 0.0
  return mean,dev


def _VoteTablePercents(voteTab):
  """ returns two lists with, for each result code, the diagonal element of
  the (square) vote table as a percentage of its row total and of its column
  total; an empty row/column yields 0 instead of a division by zero

  """
  voteTab = numpy.asarray(voteTab,float)
  # numpy.sum is used explicitly: the builtin sum(voteTab,1) would add 1 to
  # the column totals instead of summing along the rows
  colCounts = numpy.sum(voteTab,0)
  rowCounts = numpy.sum(voteTab,1)
  rowPcts = []
  colPcts = []
  for i in range(len(voteTab)):
    rowPcts.append(100.*voteTab[i,i]/(rowCounts[i] or 1))
    colPcts.append(100.*voteTab[i,i]/(colCounts[i] or 1))
  return rowPcts,colPcts


def _ShowResultsTable(voteTab,xl,xlRow,xlCol):
  """ prints the vote table with its row and column percentages, copies the
  percentages into the spreadsheet row (if _xl_ is set) and returns the next
  free spreadsheet column

  """
  rowPcts,colPcts = _VoteTablePercents(voteTab)
  nResultCodes = len(voteTab)
  for i in range(nResultCodes):
    message(' ',noRet=1)
    for j in range(nResultCodes):
      message(' % 6.2f'%voteTab[i][j],noRet=1)
    message(' | % 4.2f'%(rowPcts[i]))
  message(' ',noRet=1)
  for i in range(nResultCodes):
    message('-------',noRet=1)
  message('')
  message(' ',noRet=1)
  for i in range(nResultCodes):
    message(' % 6.2f'%(colPcts[i]),noRet=1)
  message('')
  if xl:
    for pct in rowPcts+colPcts:
      xl[xlRow,xlCol]=pct
      xlCol += 1
  return xlCol


# Command-line driver: loads one or more composite models, screens each data
# set named on the command line at every requested tolerance and reports the
# results (optionally into Excel).
#
# NOTE(review): this block was recovered from a listing that had lost its
# indentation; the nesting below is a reconstruction -- confirm against the
# canonical source, in particular the spots marked with NOTE(review).
if __name__ == '__main__':
  details = SetDefaults()
  extras = ParseArgs(details)
  ShowVersion(includeArgs=1)

  models = []
  if details.note and details.dbName:
    tblName = extras[0]
    message('-> Retrieving models from database')
    conn = DbConnect(details.dbName,tblName)
    # NOTE(review): the note is interpolated straight into the SQL text and
    # the blobs are unpickled; both are only safe with a trusted command line
    # and a trusted database.
    blobs = conn.GetData(fields='model',where="where note='%s'"%(details.note))
    for blob in blobs:
      blob = blob[0]
      try:
        models.append(cPickle.loads(str(blob)))
      except Exception:
        import traceback
        traceback.print_exc()
        message('Model load failed')
  else:
    message('-> Loading model')
    modelFile=open(extras[0],'rb')
    try:
      models.append(cPickle.load(modelFile))
    finally:
      modelFile.close()
  if not models:
    error('No composite models found')
    sys.exit(-1)
  else:
    message('-> Working with %d models.'%len(models))

  extras = extras[1:]

  for fName in extras:
    if details.dbName != '':
      details.tableName = fName
      data = details.GetDataSet(pickleCol=details.pickleCol,
                                pickleClass=DataStructs.ExplicitBitVect)
    else:
      data = DataUtils.BuildDataSet(fName)
    descNames = data.GetVarNames()
    nModels = len(models)
    screenResults = [None]*nModels
    dataSets = [None]*nModels
    message('-> Constructing and screening data sets')
    testIdx = range(data.GetNPts())
    trainIdx = testIdx

    for modelIdx in range(nModels):
      tmpD = data
      model = models[modelIdx]
      message('.',noRet=1)

      # reuse the seed the model was built with, when it recorded one
      try:
        seed = model._randomSeed
      except AttributeError:
        pass
      else:
        DataUtils.InitRandomNumbers(seed)

      if details.shuffleActivities or details.randomActivities:
        shuffle = details.shuffleActivities
        randomize = 1
        DataUtils.RandomizeActivities(tmpD,shuffle=details.shuffleActivities,
                                      runDetails=details)
      else:
        randomize = 0
        shuffle = 0

      if hasattr(model,'_shuffleActivities') and \
         model._shuffleActivities and \
         not shuffle:
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        message('****** WARNING: Shuffled model being screened with unshuffled data.')
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
      # this test used to reference an undefined name and so raised a
      # NameError for any model built with randomized activities
      if hasattr(model,'_randomizeActivities') and \
         model._randomizeActivities and \
         not randomize:
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        message('****** WARNING: Random model being screened with non-random data.')
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')

      trainIdx,testIdx = PrepareDataFromDetails(model,details,tmpD,verbose=1)
      screenResults[modelIdx] = CollectResults(testIdx,tmpD,model,
                                               errorEstimate=details.errorEstimate)
      dataSets[modelIdx] = testIdx

    if details.reportToExcel and Excel is not None:
      xl = Excel()
      xlCol = 1
      xlRow = xl.FindLastRow(1,xlCol)
      if xl[xlRow,xlCol] is not None and str(xl[xlRow,xlCol]):
        xlRow += 1
      heads=['Tolerance']
      if details.note:
        heads.append('Note')
      if nModels > 1:
        heads += [
          'Mean(MisClass)','Dev(MisClass)',
          'Mean(Correct Conf)','Dev(Correct Conf)',
          'Mean(Incorrect Conf)','Dev(Incorrect Conf)',
          ]
      else:
        heads += [
          'MisClass',
          'Correct Conf',
          'Incorrect Conf',
          ]
      if models[0].GetActivityQuantBounds():
        nRes = len(models[0].GetActivityQuantBounds())+1
      else:
        nRes = models[0].GetQuantBounds()[1][-1]
      if nModels>1:
        for i in range(nRes):
          heads.append('Mean(Class %d %% pure)'%(i))
        for i in range(nRes):
          heads.append('Mean(Class %d %% correct)'%(i))
      else:
        for i in range(nRes):
          heads.append('Class %d %% pure'%(i))
        for i in range(nRes):
          heads.append('Class %d %% correct'%(i))

      if nModels > 1:
        heads += [
          'Best(MisClass)',
          'Best(Correct Conf)',
          'Best(Incorrect Conf)',
          ]
      for i in range(len(heads)):
        xl[xlRow,xlCol+i] = heads[i]
        xl.Columns(xlCol+i).AutoFit()
    else:
      xl = None

    for tol in details.screenVoteTol:
      if len(details.screenVoteTol)>1:
        message('\n-----*****-----*****-----*****-----*****-----*****-----*****-----\n')
        message('Tolerance: %f'%tol)
      if xl:
        xlRow+=1
        xlCol = 1
        xl[xlRow,xlCol]=tol
        xlCol += 1
        if details.note:
          xl[xlRow,xlCol]=details.note
          xlCol += 1
      # the builtin float is used as the dtype: numpy.float is gone from
      # current numpy releases
      nGood = numpy.zeros(nModels,float)
      nBad = numpy.zeros(nModels,float)
      nSkip = numpy.zeros(nModels,float)
      confGood = numpy.zeros(nModels,float)
      confBad = numpy.zeros(nModels,float)
      confSkip = numpy.zeros(nModels,float)
      if details.enrichTgt >= 0:
        enrichments = numpy.zeros(nModels,float)
      goodVoteDict = {}
      badVoteDict = {}
      noVoteDict = {}
      voteTab = None
      for modelIdx in range(nModels):
        model = models[modelIdx]
        model.SetInputOrder(descNames)
        testIdx = dataSets[modelIdx]
        screenRes = screenResults[modelIdx]
        if not details.detailedScreen:
          g,b,s,aG,aB,aS,vT = ScreenIt(model,testIdx,tmpD,details.partialVote,tol,
                                       verbose=details.verbose,screenResults=screenRes)
        else:
          if model.GetActivityQuantBounds():
            nRes = len(model.GetActivityQuantBounds())+1
          else:
            nRes = model.GetQuantBounds()[1][-1]
          badVotes = []
          noVotes = []
          if (hasattr(details,'showAll') and details.showAll) or \
             (hasattr(details,'predPlot') and details.predPlot):
            goodVotes = []
          else:
            goodVotes = None
          g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,tmpD,model,nRes,tol,
                                              verbose=details.verbose,
                                              screenResults=screenRes,
                                              badVotes=badVotes,noVotes=noVotes,
                                              goodVotes=goodVotes,
                                              errorEstimate=details.errorEstimate)
          # NOTE(review): everything down to "voteTab += vT" is assumed to
          # belong to the detailed-screen branch -- confirm.
          if voteTab is None:
            voteTab = numpy.zeros(vT.shape,float)
          if details.errorAnalysis:
            # with an enrichment target, only points whose true value is the
            # target are counted
            for a,p,c,idx in badVotes:
              label = testIdx[idx]
              if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
                if a==details.enrichTgt:
                  badVoteDict[label] = badVoteDict.get(label,0)+1
              else:
                badVoteDict[label] = badVoteDict.get(label,0)+1
            for a,p,c,idx in noVotes:
              label = testIdx[idx]
              if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
                if a==details.enrichTgt:
                  noVoteDict[label] = noVoteDict.get(label,0)+1
              else:
                noVoteDict[label] = noVoteDict.get(label,0)+1

          if hasattr(details,'showAll') and details.showAll:
            for a,p,c,idx in goodVotes:
              label = testIdx[idx]
              if details.enrichTgt >=0:
                if a==details.enrichTgt:
                  goodVoteDict[label] = goodVoteDict.get(label,0)+1
              else:
                goodVoteDict[label] = goodVoteDict.get(label,0)+1

          if details.enrichTgt>-1:
            enrichments[modelIdx] = CalcEnrichment(vT,tgt=details.enrichTgt)

          voteTab += vT
        if details.detailedScreen and hasattr(details,'predPlot') and details.predPlot:
          MakePredPlot(details,testIdx,tmpD,goodVotes,badVotes,nRes,verbose=1)

        if hasattr(details,'showAll') and details.showAll:
          print('-v-v-v-v-v-v-v- All Votes -v-v-v-v-v-v-v-')
          print('id, prediction, confidence, flag(-1=skipped,0=wrong,1=correct)')
          for votes,flag in ((goodVotes,1),(badVotes,0),(noVotes,-1)):
            for ans,pred,conf,idx in votes:
              pt = tmpD[testIdx[idx]]
              assert model.GetActivityQuantBounds() or pt[-1]==ans,\
                     'bad point?: %s != %s'%(str(pt[-1]),str(ans))
              print('%s, %d, %.4f, %d'%(str(pt[0]),pred,conf,flag))
          print('-^-^-^-^-^-^-^- -^-^-^-^-^-^-^-')

        nGood[modelIdx] = g
        nBad[modelIdx] = b
        nSkip[modelIdx] = s
        confGood[modelIdx] = aG
        confBad[modelIdx] = aB
        confSkip[modelIdx] = aS
      # NOTE(review): assumed to run once per tolerance, after the model
      # loop -- confirm.
      print('')

      if nModels > 1:
        print('-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
        print('AVERAGES:')

        avgNBad,devNBad = _MeanAndDev(nBad)
        bestIdx = numpy.argsort(nBad)[0]
        avgNGood,devNGood = _MeanAndDev(nGood)
        avgNSkip,devNSkip = _MeanAndDev(nSkip)
        avgConfBad,devConfBad = _MeanAndDev(confBad)
        avgConfGood,devConfGood = _MeanAndDev(confGood)
        avgConfSkip,devConfSkip = _MeanAndDev(confSkip)

        nClassified = avgNGood + avgNBad
        nExamples = nClassified + avgNSkip
        print('Misclassifications: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nExamples,
                                                                       100*devNBad/nExamples,
                                                                       avgNBad,devNBad,
                                                                       nExamples))
        if avgNSkip>0:
          print('\tthreshold: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nClassified,
                                                                  100*devNBad/nClassified,
                                                                  avgNBad,devNBad,
                                                                  nClassified))
          print('')
          print('Number Skipped: %%%4.2f(%%%4.2f) %4.2f(%4.2f)'%(100*avgNSkip/nExamples,
                                                               100*devNSkip/nExamples,
                                                               avgNSkip,devNSkip))

        print('')
        print('Confidences:')
        print('\tCorrect: \t%4.2f(%4.2f)'%(100*avgConfGood,100*devConfGood))
        print('\tIncorrect: \t%4.2f(%4.2f)'%(100*avgConfBad,100*devConfBad))
        if avgNSkip>0:
          print('\tSkipped: \t%4.2f(%4.2f)'%(100*avgConfSkip,100*devConfSkip))

        if xl:
          for entry in (100.*avgNBad/nExamples,100.*devNBad/nExamples,
                        100.*avgConfGood,100.*devConfGood,
                        100.*avgConfBad,100.*devConfBad):
            xl[xlRow,xlCol]=entry
            xlCol += 1

        if details.detailedScreen:
          message('Results Table:')
          voteTab = numpy.transpose(voteTab)/nModels
          print('')
          xlCol = _ShowResultsTable(voteTab,xl,xlRow,xlCol)

          if details.enrichTgt >-1:
            # the deviation now uses the same sample formula as the other
            # statistics (the division used to sit outside the square root)
            mean,dev = _MeanAndDev(enrichments)
            message(' Enrichment of value %d: %.4f (%.4f)'%(details.enrichTgt,mean,dev))
      else:
        bestIdx=0
      print('------------------------------------------------')
      print('Best Model:  %s'%(bestIdx+1))
      bestBad = nBad[bestIdx]
      bestGood = nGood[bestIdx]
      bestSkip = nSkip[bestIdx]
      nClassified = bestGood + bestBad
      nExamples = nClassified + bestSkip
      print('Misclassifications: \t%%%5.2f %d / %d'%(100*bestBad/nExamples,
                                                   bestBad,nExamples))
      if bestSkip>0:
        print('\tthreshold: \t%%%5.2f %d / %d'%(100*bestBad/nClassified,
                                              bestBad,nClassified))
        print('')
        print('Number Skipped: %%%4.2f %d'%(100*bestSkip/nExamples,
                                           bestSkip))

      print('')
      print('Confidences:')
      print('\tCorrect: \t%4.2f'%(100*confGood[bestIdx]))
      print('\tIncorrect: \t%4.2f'%(100*confBad[bestIdx]))
      if bestSkip>0:
        print('\tSkipped: \t%4.2f'%(100*confSkip[bestIdx]))
      if xl:
        for entry in (100.*bestBad/nExamples,
                      100.*confGood[bestIdx],
                      100.*confBad[bestIdx]):
          xl[xlRow,xlCol]=entry
          xlCol += 1

      if nModels == 1 and details.detailedScreen:
        message('')
        message('Results Table:')
        voteTab = numpy.transpose(vT)
        message('')
        xlCol = _ShowResultsTable(voteTab,xl,xlRow,xlCol)

      if details.errorAnalysis:
        message('\n*-*-*-*-*-*-*-*- ERROR ANALYSIS -*-*-*-*-*-*-*-*\n')
        ks = badVoteDict.keys()
        if len(ks):
          message(' ---> Bad Vote Counts')
          if xl:
            xlRow += 1
            xl[xlRow,1] = 'Misclassification Counts:'
            xlRow += 1
            xl[xlRow,1] = 'ID'
            xl[xlRow,2] = 'Num_Misses'
            xlRow += 1
        for k in ks:
          pt = data[k]
          message('%s,%d'%(str(pt[0]),badVoteDict[k]))
          if xl:
            xl[xlRow,1] = "'%s"%str(pt[0])
            xl[xlRow,2] = badVoteDict[k]
            xlRow += 1

        ks = noVoteDict.keys()
        if len(ks):
          message(' ---> Skipped Compound Counts')
          if xl:
            xl[xlRow,1] = 'Skipped Compound Counts:'
            xlRow += 1
        for k in ks:
          pt = data[k]
          message('%s,%d'%(str(pt[0]),noVoteDict[k]))
          if xl:
            xl[xlRow,1] = "'%s"%str(pt[0])
            xl[xlRow,2] = noVoteDict[k]
            xlRow += 1

      # NOTE(review): assumed to be independent of the error analysis above
      # (i.e. not nested inside it) -- confirm.
      if hasattr(details,'showAll') and details.showAll:
        ks = goodVoteDict.keys()
        if len(ks):
          message(' ---> Good Vote Counts')
          if xl:
            xlRow += 1
            xl[xlRow,1] = 'Correct Classification Counts:'
            xlRow += 1
            xl[xlRow,1] = 'ID'
            xl[xlRow,2] = 'Num_Picks'
            xlRow += 1
        for k in ks:
          pt = data[k]
          message('%s,%d'%(str(pt[0]),goodVoteDict[k]))
          if xl:
            xl[xlRow,1] = "'%s"%str(pt[0])
            xl[xlRow,2] = goodVoteDict[k]
            xlRow += 1
1772