1
2
3
4
5
6
7 """ command line utility for screening composite models
8
9 **Usage**
10
11 _ScreenComposite [optional args] modelfile(s) datafile_
12
Unless indicated otherwise (via command line arguments), _modelfile_ is
a file containing a pickled composite model and _datafile_ is a QDAT file.
15
16 **Command Line Arguments**
17
18 - -t *threshold value(s)*: use high-confidence predictions for the final
19 analysis of the hold-out data. The threshold value can be either a single
20 float or a list/tuple of floats. All thresholds should be between
21 0.0 and 1.0
22
23 - -D: do a detailed screen.
24
25 - -d *database name*: instead of reading the data from a QDAT file,
26 pull it from a database. In this case, the _datafile_ argument
27 provides the name of the database table containing the data set.
28
29 - -N *note*: use all models from the database which have this note.
30 The modelfile argument should contain the name of the table
31 with the models.
32
33 - -H: screen only the hold out set (works only if a version of
34 BuildComposite more recent than 1.2.2 was used).
35
36 - -T: screen only the training set (works only if a version of
37 BuildComposite more recent than 1.2.2 was used).
38
39 - -E: do a detailed Error analysis. This shows each misclassified
40 point and the number of times it was missed across all screened
41 composites. If the --enrich argument is also provided, only compounds
42 that have true activity value equal to the enrichment value will be
43 used.
44
45 - --enrich *enrichVal*: target "active" value to be used in calculating
46 enrichments.
47
48 - -A: show All predictions.
49
50 - -S: shuffle activity values before screening
51
52 - -R: randomize activity values before screening
53
54 - -F *filter frac*: filters the data before training to change the
55 distribution of activity values in the training set. *filter frac*
56 is the fraction of the training set that should have the target value.
57 **See note in BuildComposite help about data filtering**
58
59 - -v *filter value*: filters the data before training to change the
60 distribution of activity values in the training set. *filter value*
61 is the target value to use in filtering.
62 **See note in BuildComposite help about data filtering**
63
64 - -V: be verbose when screening multiple models
65
66 - -h: show this message and exit
67
68 - -X: send a summary of the results to Excel (NOTE: this will alter the
69 contents of the currently active workbook)
70
- --OOB: Do an "out-of-bag" generalization error estimate. This only
makes sense when applied to the original data set.
73
74 - --pickleCol *colId*: index of the column containing a pickled value
75 (used primarily for cases where fingerprints are used as descriptors)
76
77 *** Options for making Prediction (Hanneke) Plots ***
78
79 - --predPlot=<fileName>: triggers the generation of a Hanneke plot and
80 sets the name of the .txt file which will hold the output data.
81 A Gnuplot control file, <fileName>.gnu, will also be generated.
82
83 - --predActTable=<name> (optional): name of the database table
84 containing activity values. If this is not provided, activities
85 will be read from the same table containing the screening data
86
87 - --predActCol=<name> (optional): name of the activity column. If not
88 provided, the name of the last column in the activity table will
89 be used.
90
91 - --predLogScale (optional): If provided, the x axis of the
92 prediction plot (the activity axis) will be plotted using a log
93 scale
94
95 - --predShow: launch a gnuplot instance and display the prediction
96 plot (the plot will still be written to disk).
97
98 *** The following options are likely obsolete ***
99
100 - -P: read pickled data. The datafile argument should contain
101 a pickled data set. *relevant only to qdat files*
102
103 - -q: data are not quantized (the composite should take care of
104 quantization itself if it requires quantized data). *relevant only to
105 qdat files*
106
107
108
109 """
# Project-level modules (configuration flags and fingerprint data structures).
import RDConfig
import DataStructs
import sys,cPickle,types,copy
# NOTE(review): star import.  The array helpers used throughout this module
# (zeros, array, transpose, sqrt, argsort, Int, Float) are not defined here,
# and sum() is called with an axis argument, so presumably all of them come
# from Numeric and sum() shadows the builtin -- confirm.
from Numeric import *

# PIL is optional: hasPil records whether the imaging modules could be
# imported, so image generation can be skipped when they are missing.
try:
    from PIL import Image,ImageDraw
except ImportError:
    hasPil=0
else:
    hasPil=1

from ML.Data import DataUtils,SplitData
from ML import CompositeRun
from Dbase.DbConnection import DbConnect
from Dbase import DbModule
# Module-level CompositeRun instance created at import time.
# NOTE(review): not referenced in the visible part of this file; presumably
# it holds the command-line run parameters -- confirm against the rest of
# the module.
_details = CompositeRun.CompositeRun()
# The Excel wrapper is optional as well: Excel is None when it is unavailable.
try:
    from Excel.ExcelWrapper import ExcelWrapper as Excel
except ImportError:
    Excel = None

# Version of this screening utility.
__VERSION_STRING="3.2.8"
133
135 """ emits messages to _sys.stdout_
136 override this in modules which import this one to redirect output
137
138 **Arguments**
139
140 - msg: the string to be displayed
141
142 """
143 if noRet:
144 sys.stdout.write('%s '%(msg))
145 else:
146 sys.stdout.write('%s\n'%(msg))
148 """ emits messages to _sys.stderr_
149 override this in modules which import this one to redirect output
150
151 **Arguments**
152
153 - msg: the string to be displayed
154
155 """
156 sys.stderr.write('ERROR: %s\n'%(msg))
157
159 if tgt<0 or tgt>=mat.shape[0]: return 0
160 nPts = float(sum(sum(mat)))
161 nTgtPred = float(sum(mat[:,tgt]))
162 if nTgtPred:
163 pctCorrect = mat[tgt,tgt]/nTgtPred
164 nTgtReal = float(sum(mat[tgt,:]))
165 pctOverall = nTgtReal/nPts
166 else:
167 return 0.0
168 return pctCorrect/pctOverall
169
170
def CollectResults(indices,dataSet,composite,callback=None,appendExamples=0,
                   errorEstimate=0):
    """ runs selected examples through a composite and gathers the outcomes

    **Arguments**

      - indices: a sequence of indices into _dataSet_ picking the examples
        to be screened; results are returned in this order

      - dataSet: the collection of examples, indexable with the entries of
        _indices_.  Unless the composite reports activity quantization
        bounds, the last element of each example is used as its "value"

      - composite: the composite model to be used

      - callback: (optional) if provided, this should be a function
        taking a single argument.  It is called after each example is
        screened with the position of that example within _indices_
        (0, 1, 2, ...) as the argument.

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method (as _appendExample_).

      - errorEstimate: (optional) calculate the "out of bag" error
        estimate for the composite using Breiman's definition: each
        example is classified using only those models which either have
        no _trainIndices attribute or do not list the example's index in
        it.  This only makes sense when screening the original data set!
        [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
        Statistics Technical Report (1996)]

    **Returns**

      a list of 3-tuples, one for each entry of _indices_:

        1) answer: the value from the example (the last element of the
           quantized example if the composite has activity quant bounds)

        2) pred: the composite model's prediction

        3) conf: the confidence of the composite

    """
    results = []
    for pos,ptIdx in enumerate(indices):
        example = dataSet[ptIdx]

        if errorEstimate:
            # out-of-bag: drop every model that was trained on this point
            members = [composite.GetModel(n) for n in range(len(composite))]
            eligible = [n for n,mdl in enumerate(members)
                        if not (hasattr(mdl,'_trainIndices') and
                                ptIdx in mdl._trainIndices)]
        else:
            # None lets the composite use all of its models
            eligible = None

        pred,conf = composite.ClassifyExample(example,
                                              appendExample=appendExamples,
                                              onlyModels=eligible)

        if composite.GetActivityQuantBounds():
            answer = composite.QuantizeActivity(example)[-1]
        else:
            answer = example[-1]

        results.append((answer,pred,conf))
        if callback:
            callback(pos)
    return results
236
def DetailedScreen(indices,data,composite,threshold=0,screenResults=None,
                   goodVotes=None,badVotes=None,noVotes=None,callback=None,
                   appendExamples=0,errorEstimate=0):
    """ screens a set of examples across a composite and sorts the
      predictions into *correct*, *incorrect* and *unclassified* sets.

    **Arguments**

      - indices: the indices (into _data_) of the examples to be screened

      - data: the data set holding the examples; only used (together with
        _indices_ and _composite_) when _screenResults_ is not provided

      - composite: the composite model to be used

      - threshold: (optional) a prediction is only kept if its confidence
        is strictly greater than this value; everything else counts as
        unclassified

      - screenResults: (optional) the results of a previous screen
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.  Each entry
        appended is a 4-tuple:
        (answer, prediction, confidence, position in _screenResults_)

      - callback: (optional) passed on to _CollectResults()_

      - appendExamples: (optional) passed on to _CollectResults()_

      - errorEstimate: (optional) passed on to _CollectResults()_ to
        request the "out of bag" error estimate.

    **Notes**

      - since this function doesn't return anything, if one or more of
        the arguments _goodVotes_, _badVotes_, and _noVotes_ is not
        provided, there's not much reason to call it

    """
    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []

    if screenResults is None:
        screenResults = CollectResults(indices,data,composite,
                                       callback=callback,
                                       appendExamples=appendExamples,
                                       errorEstimate=errorEstimate)

    for pos,(answer,pred,conf) in enumerate(screenResults):
        vote = (answer,pred,conf,pos)
        if not conf > threshold:
            # not confident enough: the prediction is not counted
            noVotes.append(vote)
        elif pred != answer:
            badVotes.append(vote)
        else:
            goodVotes.append(vote)
299
def ShowVoteResults(indices,data,composite,nResultCodes,threshold,verbose=1,
                    screenResults=None,callback=None,appendExamples=0,
                    goodVotes=None,badVotes=None,noVotes=None,
                    errorEstimate=0):
    """ screens the results and shows a detailed workup

    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

    **Arguments**

      - indices: the indices (into _data_) of the examples to be screened

      - data: the data set holding the examples

      - composite: the composite model to be used

      - nResultCodes: the number of possible results the composite can
        return

      - threshold: the threshold to be used to decide whether or not a
        given prediction should be kept

      - verbose: (optional) if true, a summary and the results table are
        written to standard output

      - screenResults: (optional) the results of a previous screen
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - callback: (optional) passed on to _DetailedScreen()_

      - appendExamples: (optional) passed on to _DetailedScreen()_

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.

      - errorEstimate: (optional) passed on to _DetailedScreen()_ to
        request the "out of bag" error estimate.

    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) the results table, indexed as [answer, prediction]

    """
    def pct(count,total):
        # percentages are only displayed; an empty total is reported as 0
        # instead of raising ZeroDivisionError (e.g. when every prediction
        # falls below the threshold)
        if not total:
            return 0.0
        return 100.*float(count)/total

    def avgConf(votes):
        # mean of the confidence (third) field, 0. for an empty vote list
        if not votes:
            return 0.
        return sum([vote[2] for vote in votes])/float(len(votes))

    nExamples = len(indices)
    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []
    DetailedScreen(indices,data,composite,threshold,screenResults=screenResults,
                   goodVotes=goodVotes,badVotes=badVotes,noVotes=noVotes,
                   callback=callback,appendExamples=appendExamples,
                   errorEstimate=errorEstimate)
    nBad = len(badVotes)
    nGood = len(goodVotes)
    nSkip = len(noVotes)
    nClassified = nGood + nBad

    # single-argument print(...) behaves the same as the print statement
    if verbose:
        print('\n\t*** Vote Results ***')
        print('misclassified: %d/%d (%%%4.2f)\t%d/%d (%%%4.2f)'%(
            nBad,nExamples,pct(nBad,nExamples),
            nBad,nClassified,pct(nBad,nClassified)))
        if nSkip > 0:
            print('skipped: %d/%d (%%% 4.2f)'%(nSkip,nExamples,
                                              pct(nSkip,nExamples)))

    avgSkip = avgConf(noVotes)
    avgBad = avgConf(badVotes)
    avgGood = avgConf(goodVotes)

    if verbose:
        print('')
        print('average correct confidence: % 6.4f'%avgGood)
        print('average incorrect confidence: % 6.4f'%avgBad)

    # rows are the true values, columns are the predictions
    voteTab = zeros((nResultCodes,nResultCodes),Int)
    for ans,res,conf,idx in goodVotes:
        voteTab[res,res] += 1
    for ans,res,conf,idx in badVotes:
        voteTab[ans,res] += 1

    if verbose:
        print('')
        print('\tResults Table:')
        # the table is displayed with one row per prediction
        vTab=transpose(voteTab)
        colCounts = sum(vTab)
        rowCounts = sum(vTab,1)
        message('')
        for i in range(nResultCodes):
            # avoid dividing by zero for result codes which never occur
            if rowCounts[i]==0: rowCounts[i]=1
            row = vTab[i]
            message(' ',noRet=1)
            for j in range(nResultCodes):
                entry = row[j]
                message(' % 6d'%entry,noRet=1)
            message(' | % 4.2f'%(100.*vTab[i,i]/rowCounts[i]))
        message(' ',noRet=1)
        for i in range(nResultCodes):
            message('-------',noRet=1)
        message('')
        message(' ',noRet=1)
        for i in range(nResultCodes):
            if colCounts[i]==0: colCounts[i]=1
            message(' % 6.2f'%(100.*vTab[i,i]/colCounts[i]),noRet=1)
        message('')

    return nGood,nBad,nSkip,avgGood,avgBad,avgSkip,voteTab
447
def ScreenIt(composite,indices,data,partialVote=0,voteTol=0.0,verbose=1,screenResults=None,
             goodVotes=None,badVotes=None,noVotes=None):
    """ screens a set of data using a composite model and prints out
      statistics about the screen.

    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

    **Arguments**

      - composite: the composite model to be used

      - indices: the indices (into _data_) of the examples to be screened

      - data: the data set holding the examples

      - partialVote: (optional) toggles use of the threshold value in
        the screening.  When this is false, _voteTol_ is reset to 0.0

      - voteTol: (optional) the threshold to be used to decide whether or not a
        given prediction should be kept

      - verbose: (optional) if true, the statistics are written to
        standard output

      - screenResults: (optional) the results of a previous screen
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.


    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) None

    """
    def pct(count,total):
        # display helper: an empty total is reported as 0 rather than
        # raising ZeroDivisionError
        if not total:
            return 0.0
        return 100.*float(count)/total

    def mean(accum,count):
        # display helper: average which is 0 when there is nothing to average
        if not count:
            return 0.0
        return accum/count

    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []

    if not partialVote:
        voteTol = 0.0

    DetailedScreen(indices,data,composite,voteTol,screenResults=screenResults,
                   goodVotes=goodVotes,badVotes=badVotes,noVotes=noVotes)

    # explicit accumulation loops: sum() in this module may be the array
    # version rather than the builtin
    nGood = len(goodVotes)
    goodAccum = 0.
    for res,pred,conf,idx in goodVotes:
        goodAccum += conf

    misCount = len(badVotes)
    badAccum = 0.
    for res,pred,conf,idx in badVotes:
        badAccum += conf

    nSkipped = len(noVotes)
    goodSkipped = 0
    badSkipped = 0
    skipAccum = 0.
    for ans,pred,conf,idx in noVotes:
        skipAccum += conf
        if ans != pred:
            badSkipped += 1
        else:
            goodSkipped += 1

    nData = nGood + misCount + nSkipped
    # single-argument print(...) behaves the same as the print statement
    if verbose:
        print('Total N Points: %s'%(nData,))
        if partialVote:
            nCounted = nData-nSkipped
            print('Misclassifications: %d (%%%4.2f)'%(misCount,pct(misCount,nCounted)))
            print('N Skipped: %d (%%%4.2f)'%(nSkipped,pct(nSkipped,nData)))
            print('\tGood Votes Skipped: %d (%%%4.2f)'%(goodSkipped,pct(goodSkipped,nSkipped)))
            print('\tBad Votes Skipped: %d (%%%4.2f)'%(badSkipped,pct(badSkipped,nSkipped)))
        else:
            print('Misclassifications: %d (%%%4.2f)'%(misCount,pct(misCount,nData)))
            # a screen without any misclassification used to die here with
            # a ZeroDivisionError
            print('Average Correct Vote Confidence: % 6.4f'%mean(goodAccum,nData-misCount))
            print('Average InCorrect Vote Confidence: % 6.4f'%mean(badAccum,misCount))

    avgGood=0
    avgBad=0
    avgSkip=0
    if nGood:
        avgGood = goodAccum/nGood
    if misCount:
        avgBad = badAccum/misCount
    if nSkipped:
        avgSkip = skipAccum/nSkipped
    return nGood,misCount,nSkipped,avgGood,avgBad,avgSkip,None
560
562 """ *Internal Use Only*
563
564 converts a list of 4 tuples: (answer,prediction,confidence,idx) into
565 an alternate list: (answer,prediction,confidence,data point)
566
567 **Arguments**
568
569 - votes: a list of 4 tuples: (answer, prediction, confidence,
570 index)
571
572 - data: a _DataUtils.MLData.MLDataSet_
573
574
575 **Note**: alterations are done in place in the _votes_ list
576
577 """
578 for i in range(len(votes)):
579 ans,pred,conf,idx = votes[i]
580 votes[i] = (ans,pred,conf,data[idx])
581
583 if (hasattr(details,'doHoldout') and details.doHoldout) or \
584 (hasattr(details,'doTraining') and details.doTraining):
585 try:
586 splitF = model._splitFrac
587 except AttributeError:
588 pass
589 else:
590 if verbose:
591 message('s',noRet=1)
592
593 if hasattr(details,'errorEstimate') and details.errorEstimate and \
594 hasattr(details,'doHoldout') and details.doHoldout:
595 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
596 message('****** WARNING: OOB screening should not be combined with doHoldout option.')
597 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
598 trainIdx,testIdx = SplitData.SplitIndices(data.GetNPts(),splitF,silent=1)
599
600 if hasattr(details,'filterFrac') and details.filterFrac != 0.0:
601 if verbose:
602 message('f',noRet=1)
603 trainFilt,temp = DataUtils.FilterData(data,details.filterVal,
604 details.filterFrac,-1,
605 indicesToUse=trainIdx,
606 indicesOnly=1)
607 testIdx += temp
608 trainIdx = trainFilt
609 elif hasattr(details,'errorEstimate') and details.errorEstimate:
610
611
612 if hasattr(details,'filterFrac') and details.filterFrac != 0.0:
613 if verbose:
614 message('f',noRet=1)
615 testIdx,trainIdx = DataUtils.FilterData(data,details.filterVal,
616 details.filterFrac,-1,
617 indicesToUse=range(data.GetNPts()),
618 indicesOnly=1)
619 testIdx.extend(trainIdx)
620 else:
621 testIdx = range(data.GetNPts())
622 trainIdx = []
623 else:
624 testIdx = range(data.GetNPts())
625 trainIdx = []
626 if hasattr(details,'doTraining') and details.doTraining:
627 testIdx,trainIdx = trainIdx,testIdx
628 return trainIdx,testIdx
629
def ScreenFromDetails(models,details,callback=None,setup=None,appendExamples=0,
                      goodVotes=None,badVotes=None,noVotes=None,data=None,
                      enrichments=None):
    """ Screens a set of data using a _CompositeRun.CompositeRun_
     instance to provide parameters

    Unless _data_ is provided, the data set is obtained by calling
    _details.GetDataSet()_

    Aside from dataset construction, _ShowVoteResults()_ does most of
    the heavy lifting here.

    **Arguments**

      - models: a composite model or a list/tuple of composite models

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - callback: (optional) if provided, this should be a function
        taking a single argument that is called after each example is
        screened.  The argument is the position of the example in the
        current model's screen plus (model number)*(number of points in
        the data set).

      - setup: (optional) a function taking a single argument which is
        called at the start of screening with
        (number of models)*(number of points in the data set) as the
        argument.

      - appendExamples: (optional) this value is passed on to
        _ShowVoteResults()_.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.  These are only
        filled when a single model is screened; with several models
        fresh lists are used for every model.

      - data: (optional) the data set to be screened

      - enrichments: (optional) a list, at least as long as the number of
        models, which is used to pass back the enrichment of each model
        (only filled if _details.enrichTgt_ is present and >= 0)


    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) the results table

      When more than one model is screened, each of the first six entries
      is an (average, deviation) 2-tuple calculated across the models and
      the results table is averaged over the models.

    """
    def flag(name):
        # true when _details_ has the named option and it is switched on
        return hasattr(details,name) and bool(getattr(details,name))

    if data is None:
        if hasattr(details,'pickleCol'):
            data = details.GetDataSet(pickleCol=details.pickleCol,
                                      pickleClass=DataStructs.ExplicitBitVect)
        else:
            data = details.GetDataSet()

    # isinstance() also accepts list/tuple subclasses, which the former
    # exact-type comparison wrapped as if they were a single model
    if not isinstance(models,(list,tuple)):
        models = (models,)

    nModels = len(models)

    if setup is not None:
        setup(nModels*data.GetNPts())

    nGood = zeros(nModels,Float)
    nBad = zeros(nModels,Float)
    nSkip = zeros(nModels,Float)
    confGood = zeros(nModels,Float)
    confBad = zeros(nModels,Float)
    confSkip = zeros(nModels,Float)
    voteTab = None
    if goodVotes is None:
        goodVotes = []
    if badVotes is None:
        badVotes = []
    if noVotes is None:
        noVotes = []
    if enrichments is None:
        enrichments = [0.0]*nModels
    # how often each data point was misclassified / skipped across models
    # NOTE(review): these tallies are filled below but not returned or
    # otherwise used in this function.
    badVoteDict = {}
    noVoteDict = {}

    for i in range(nModels):
        if nModels>1:
            # the vote lists are per-model scratch space in this case
            goodVotes = []
            badVotes = []
            noVotes = []
        model = models[i]

        # re-seed the random number generator from the model, if it
        # recorded a seed
        try:
            seed = model._randomSeed
        except AttributeError:
            pass
        else:
            DataUtils.InitRandomNumbers(seed)

        shuffle = 0
        randomize = 0
        if flag('shuffleActivities'):
            shuffle = 1
            randomize = 1
        elif flag('randomActivities'):
            randomize = 1
        if randomize:
            DataUtils.RandomizeActivities(data,shuffle=shuffle,
                                          runDetails=details)

        if hasattr(model,'_shuffleActivities') and \
           model._shuffleActivities and \
           not shuffle:
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
            message('****** WARNING: Shuffled model being screened with unshuffled data.')
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        if hasattr(model,'_randomizeActivities') and \
           model._randomizeActivities and \
           not randomize:
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
            message('****** WARNING: Random model being screened with non-random data.')
            message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')

        trainIdx,testIdx = PrepareDataFromDetails(model,details,data)

        nPossible = model.GetQuantBounds()[1]
        if callback:
            # offset the progress count by the points of the models already
            # screened; the defaults bind the current values
            cb = lambda x,y=callback,z=i*data.GetNPts():y(x+z)
        else:
            cb = None
        if flag('errorEstimate'):
            errorEstimate = 1
        else:
            errorEstimate = 0
        g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,data,model,nPossible[-1],
                                            details.threshold,verbose=0,
                                            callback=cb,appendExamples=appendExamples,
                                            goodVotes=goodVotes,badVotes=badVotes,
                                            noVotes=noVotes,
                                            errorEstimate=errorEstimate)
        if voteTab is None:
            voteTab = zeros(vT.shape,Float)

        useEnrich = hasattr(details,'enrichTgt') and details.enrichTgt >=0
        if flag('errorAnalysis'):
            # the vote tuples carry positions in testIdx; translate them
            # back to data set indices before counting.  With an enrichment
            # target only points with that true value are counted.
            for a,p,c,idx in badVotes:
                label = testIdx[idx]
                if not useEnrich or a==details.enrichTgt:
                    badVoteDict[label] = badVoteDict.get(label,0)+1
            for a,p,c,idx in noVotes:
                label = testIdx[idx]
                if not useEnrich or a==details.enrichTgt:
                    noVoteDict[label] = noVoteDict.get(label,0)+1

        voteTab += vT
        nGood[i] = g
        nBad[i] = b
        nSkip[i] = s
        confGood[i] = aG
        confBad[i] = aB
        confSkip[i] = aS

        if useEnrich:
            enrichments[i] = CalcEnrichment(vT,tgt=details.enrichTgt)

    if nModels == 1:
        return g,b,s,aG,aB,aS,vT

    voteTab /= nModels

    def avgAndDev(vals):
        # average over the models and the (n-1 normalized) deviation
        avg = sum(vals)/nModels
        dev = sqrt(sum((vals-avg)**2)/(nModels-1))
        return avg,dev

    return (avgAndDev(nGood),avgAndDev(nBad),avgAndDev(nSkip),
            avgAndDev(confGood),avgAndDev(confBad),avgAndDev(confSkip),
            voteTab)
837
838
840 if not hasPil:
841 return None
842 try:
843 nTot = float(nGood)+float(nBad)+float(nRej)
844 except TypeError:
845 nGood = nGood[0]
846 nBad = nBad[0]
847 nRej = nRej[0]
848 nTot = float(nGood)+float(nBad)+float(nRej)
849
850 if not nTot:
851 return None
852 goodColor = (100,100,255)
853 badColor = (255,100,100)
854 rejColor = (255,255,100)
855
856 pctGood = float(nGood) / nTot
857 pctBad = float(nBad) / nTot
858 pctRej = float(nRej) / nTot
859
860 if size is None:
861 if RDConfig.doingDemo:
862 size = (200,200)
863 else:
864 size = (100,100)
865 img = Image.new('RGB',size,(255,255,255))
866 draw = ImageDraw.Draw(img)
867 box = (0,0,size[0]-1,size[1]-1)
868
869 startP = -90
870 endP = int(startP + pctGood*360)
871 draw.pieslice(box,startP,endP,fill=goodColor)
872 startP = endP
873 endP = int(startP + pctBad*360)
874 draw.pieslice(box,startP,endP,fill=badColor)
875 startP = endP
876 endP = int(startP + pctRej*360)
877 draw.pieslice(box,startP,endP,fill=rejColor)
878
879 return img
880
881
def ScreenToHtml(nGood,nBad,nRej,avgGood,avgBad,avgSkip,voteTable,imgDir='.',
                 fullPage=1,skipImg=0,includeDefs=1):
    """ returns the text of a web page showing the screening details

    The count and confidence arguments may either be plain numbers (the
    results of screening a single model) or (average, deviation) tuples
    (the results of screening several models); _nGood_ is used to tell
    the two cases apart.

    **Arguments**

      - nGood: number of correct predictions

      - nBad: number of incorrect predictions

      - nRej: number of rejected predictions

      - avgGood: average correct confidence

      - avgBad: average incorrect confidence

      - avgSkip: average rejected confidence

      - voteTable: vote table, indexed as [original, predicted]

      - imgDir: (optional) the directory to be used to hold the vote
        image (if constructed)

      - fullPage: (optional) if true, the output is wrapped in
        <html><body> tags and gets a heading

      - skipImg: (optional) if true, no vote image is generated

      - includeDefs: (optional) if true, definitions of "% Pure" and
        "% Accurate" are appended

    **Returns**

      a string containing HTML

    """
    multModels = isinstance(nGood,tuple)

    if fullPage:
        outTxt = ["""<html><body>"""]
        outTxt.append('<center><h2>VOTE DETAILS</h2></center>')
    else:
        outTxt = []

    if RDConfig.doingDemo:
        outTxt.append('<font size="+2">')
    else:
        outTxt.append('<font>')

    if not skipImg:
        img = GetScreenImage(nGood,nBad,nRej)
        if img:
            if imgDir:
                imgFileName = '/'.join((imgDir,'votes.png'))
            else:
                imgFileName = 'votes.png'
            img.save(imgFileName)
            outTxt.append('<center><img src="%s"></center>'%(imgFileName))

    nPoss = len(voteTable)
    pureCounts = sum(voteTable,1)
    accCounts = sum(voteTable,0)
    # entries stay at 0 for result codes which never occur
    pureVect = [0.0]*nPoss
    accVect = [0.0]*nPoss
    for i in range(nPoss):
        if pureCounts[i]:
            pureVect[i] = float(voteTable[i,i])/pureCounts[i]
        if accCounts[i]:
            accVect[i] = float(voteTable[i,i])/accCounts[i]

    if multModels:
        # averaged tables hold fractional counts
        cellFmt = '%.2f'
    else:
        cellFmt = '%d'

    outTxt.append('<center><table border=1>')
    outTxt.append('<tr><td></td>')
    for i in range(nPoss):
        outTxt.append('<th>%d</th>'%i)
    outTxt.append('<th>% Accurate</th>')
    outTxt.append('</tr>')

    # one table row per predicted value, one column per original value
    for i in range(nPoss):
        outTxt.append('<tr><th>%d</th>'%(i))
        for j in range(nPoss):
            if i == j:
                outTxt.append(('<td bgcolor="#A0A0FF">'+cellFmt+'</td>')%(voteTable[j,i]))
            else:
                outTxt.append(('<td>'+cellFmt+'</td>')%(voteTable[j,i]))
        # the row is closed below; this cell used to be emitted as the
        # malformed '</td</tr>'
        outTxt.append('<td>%4.2f</td>'%(100.0*accVect[i]))
        if i == 0:
            outTxt.append('<th rowspan=%d>Predicted</th></tr>'%(nPoss))
        else:
            outTxt.append('</tr>')
    outTxt.append('<tr><th>% Pure</th>')
    for i in range(nPoss):
        outTxt.append('<td>%4.2f</td>'%(100.0*pureVect[i]))
    outTxt.append('</tr>')
    outTxt.append('<tr><td></td><th colspan=%d>Original</th></tr>'%(nPoss))
    outTxt.append('</table></center>')

    if not multModels:
        nTotal = nBad+nGood+nRej
        nClass = nBad+nGood
        if nClass:
            pctErr = 100.*float(nBad)/nClass
        else:
            pctErr = 0.0

        outTxt.append('<p>%d of %d examples were misclassified (%%%4.2f)'%(nBad,nGood+nBad,pctErr))
        if nRej > 0:
            pctErr = 100.*float(nBad)/(nGood+nBad+nRej)
            outTxt.append('<p> %d of %d overall: (%%%4.2f)'%(nBad,nTotal,pctErr))
            pctRej = 100.*float(nRej)/nTotal
            outTxt.append('<p>%d of %d examples were rejected (%%%4.2f)'%(nRej,nTotal,pctRej))
        if nGood != 0:
            outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f'%avgGood)

        if nBad != 0:
            outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f'%avgBad)
        if nRej != 0:
            outTxt.append('<p>The rejected examples had an average confidence of %6.4f'%avgSkip)
    else:
        # every quantity is an (average, deviation) pair here, so the
        # tests below look at the averages; comparing the tuples themselves
        # with a number is always true on Python 2
        nTotal = nBad[0]+nGood[0]+nRej[0]
        nClass = nBad[0]+nGood[0]
        devClass = nBad[1]+nGood[1]
        if nClass:
            pctErr = 100.*float(nBad[0])/nClass
            devPctErr = 100.*float(nBad[1])/nClass
        else:
            pctErr = 0.0
            devPctErr = 0.0

        outTxt.append('<p>%.2f(%.2f) of %.2f(%.2f) examples were misclassified (%%%4.2f(%4.2f))'%\
                      (nBad[0],nBad[1],nClass,devClass,pctErr,devPctErr))
        if nRej[0] > 0:
            pctErr = 100.*float(nBad[0])/nTotal
            devPctErr = 100.*float(nBad[1])/nTotal
            outTxt.append('<p> %.2f(%.2f) of %d overall: (%%%4.2f(%4.2f))'%\
                          (nBad[0],nBad[1],nTotal,pctErr,devPctErr))
            pctRej = 100.*float(nRej[0])/nTotal
            devPctRej = 100.*float(nRej[1])/nTotal
            outTxt.append('<p>%.2f(%.2f) of %d examples were rejected (%%%4.2f(%4.2f))'%\
                          (nRej[0],nRej[1],nTotal,pctRej,devPctRej))
        if nGood[0] != 0:
            outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f(%.4f)'%avgGood)

        if nBad[0] != 0:
            outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f(%.4f)'%avgBad)
        if nRej[0] != 0:
            outTxt.append('<p>The rejected examples had an average confidence of %6.4f(%.4f)'%avgSkip)

    outTxt.append('</font>')
    if includeDefs:
        txt = """
    <p><b>Definitions:</b>
    <ul>
    <li> <i>% Pure:</i> The percentage of, for example, known positives predicted to be positive.
    <li> <i>% Accurate:</i> The percentage of, for example, predicted positives that actually
      are positive.
    </ul>
    """
        outTxt.append(txt)

    if fullPage:
        outTxt.append("""</body></html>""")
    return '\n'.join(outTxt)
1049
1050
def MakePredPlot(details,indices,data,goodVotes,badVotes,nRes,idCol=0,verbose=0):
  """ writes the data file and gnuplot script for a prediction (Hanneke) plot

  Nothing is done (and None is returned) unless _details.predPlot_ is set.
  Otherwise two files are written:

    - _details.predPlot_: one line per vote: id, prediction, confidence
      and activity value

    - _details.predPlot_.gnu: a gnuplot script plotting confidence
      against activity, with one series per possible prediction value

  **Arguments**

    - details: a CompositeRun.RunDetails object. The attributes used here
      are: predPlot, dbName, tableName, dbUser, dbPassword and, optionally,
      predActTable, predActCol, predLogScale and predShow

    - indices: a sequence of integer indices into _data_

    - data: the data set in question.  We assume that the ids for
      the data points are in the _idCol_ column

    - goodVotes/badVotes: predictions where the model was correct/incorrect.
      These are sequences of 4-tuples:
        (answer,prediction,confidence,index into _indices_)

    - nRes: the number of plot series to generate, one for each
      prediction value in range(nRes)

    - idCol: the column holding the point ids, both in _data_ and in the
      database table

    - verbose: if set, progress messages are displayed

  **Notes**

    - activity values are pulled from the database: from column
      _details.predActCol_ (default: the last column) of table
      _details.predActTable_ (default: _details.tableName_)

    - points with no activity value in the database (or with the
      value 'None') are written with an activity of 0

    - if _details.predShow_ is set, the plot is displayed using the
      Gnuplot module; failures there are reported but not fatal.

  """
  if not hasattr(details,'predPlot') or not details.predPlot:
    return

  if verbose: message('\n-> Constructing Prediction (Hanneke) Plot')

  ptIds = [data[x][idCol] for x in indices]

  origConn = DbConnect(details.dbName,details.tableName,
                       user=details.dbUser,password=details.dbPassword)
  idName = origConn.GetColumnNames()[idCol]
  if not hasattr(details,'predActTable') or \
     not details.predActTable or \
     details.predActTable==details.tableName:
    actConn = origConn
  else:
    actConn = DbConnect(details.dbName,details.predActTable,
                        user=details.dbUser,password=details.dbPassword)

  if verbose: message('\t-> Pulling Activity Data')
  if hasattr(details,'predActCol') and details.predActCol:
    actColName = details.predActCol
  else:
    actColName = actConn.GetColumnNames()[-1]

  # the ids are passed to the query as strings; remember whether we had to
  # convert them so that the ids coming back can be matched up again.
  idsConverted = 0
  if ptIds and type(ptIds[0]) not in [type(''),type(u'')]:
    ptIds = [str(x) for x in ptIds]
    idsConverted = 1

  # map id -> activity.  A dictionary lookup (instead of list.index())
  # is linear overall and also copes with ids that occur more than once.
  actsById = {}
  if ptIds:
    whereL = [DbModule.placeHolder]*len(ptIds)
    whereTxt = "%s in (%s)"%(idName,','.join(whereL))
    rawD = actConn.GetData(fields='%s,%s'%(idName,actColName),
                           where=whereTxt,extras=ptIds)
    for rowId,act in rawD:
      if idsConverted:
        rowId = str(rowId)
      actsById[rowId] = act

  if verbose: message('\t-> Creating Plot')
  # the output files are only opened once the data is in hand, and are
  # always closed, even if writing fails.
  outF = open(details.predPlot,'w+')
  try:
    outF.write('#ID Pred Conf %s\n'%(actColName))
    for votes in (goodVotes,badVotes):
      for ans,pred,conf,idx in votes:
        ptId = ptIds[idx]
        act = actsById.get(ptId)
        if act is None or act=='None':
          # no activity value available for this point
          act = 0
        else:
          act = float(act)
        outF.write('%s %d %.4f %f\n'%(ptId,pred,conf,act))
  finally:
    outF.close()

  logScale = hasattr(details,'predLogScale') and details.predLogScale
  if logScale:
    actLabel = 'log(%s)'%(actColName)
  else:
    actLabel = actColName
  actLabel = actLabel.replace('_',' ')
  gnuHdr="""# Generated by ScreenComposite.py version: %s
set size square 0.7
set yrange [:1]
set data styl points
set ylab 'confidence'
set xlab '%s'
set grid
set nokey
set term postscript enh color solid "Helvetica" 16
set term win
"""%(__VERSION_STRING,actLabel)
  gnuTail="""
# EOF
"""
  # one series per prediction value: points with a different prediction
  # evaluate to 0/0, which gnuplot treats as undefined and skips.
  plots = []
  for i in range(nRes):
    if logScale:
      plots.append("'%s' us (log10($4)):($2==%d?$3:0/0)"%(details.predPlot,i))
    else:
      plots.append("'%s' us 4:($2==%d?$3:0/0)"%(details.predPlot,i))

  gnuF = open('%s.gnu'%details.predPlot,'w+')
  try:
    gnuF.write(gnuHdr)
    gnuF.write("plot %s\n"%(','.join(plots)))
    gnuF.write(gnuTail)
  finally:
    gnuF.close()

  if hasattr(details,'predShow') and details.predShow:
    # displaying the plot is best-effort: report problems and carry on
    try:
      import os
      from Gnuplot import Gnuplot
      p = Gnuplot()
      p('cd "%s"'%(os.getcwd()))
      p('load "%s.gnu"'%(details.predPlot))
      raw_input('press return to continue...\n')
    except Exception:
      import traceback
      traceback.print_exc()
1167
1168
1169
1172
1186
def Usage():
  """ displays the command-line help (this module's documentation)
   and then terminates the program with exit status -1

  """
  helpText = __doc__
  print(helpText)
  sys.exit(-1)
1194
def ShowVersion(includeArgs=0):
  """ displays the version of the program

  **Arguments**

    - includeArgs: if set, the command line used to launch the program
      (sys.argv) is displayed as well

  """
  print('This is ScreenComposite.py version %s'%(__VERSION_STRING))
  if not includeArgs:
    return
  import sys
  print('command line was:')
  print(' '.join(sys.argv))
1204
def ParseArgs(details):
  """ parses the command line (sys.argv) and stores the results in _details_

  **Arguments**

    - details: the run-details object to be updated; each recognized
      option sets one or more attributes on it.

  **Returns**

    the list of positional (non-option) arguments

  **Notes**

    - Usage() is called if the command line cannot be parsed, if an
      unhandled option is found, or if there are no positional arguments.

    - the program exits with status -2 if a voting threshold (-t) is not
      between 0 and 1.

  """
  import getopt
  try:
    args,extras = getopt.getopt(sys.argv[1:],'EDd:t:VN:HThSRF:v:AX',
                                ['predPlot=','predActCol=','predActTable=',
                                 'predLogScale','predShow',
                                 'OOB','pickleCol=','enrich=',
                                 ])
  except getopt.GetoptError:
    import traceback
    traceback.print_exc()
    Usage()

  # defaults for the options handled here
  details.reportToExcel=0
  details.predPlot=''
  details.predActCol=''
  details.predActTable=''
  details.predLogScale=''
  details.predShow=0
  details.errorEstimate=0
  details.pickleCol=-1
  details.enrichTgt=-1
  for arg,val in args:
    if arg == '-d':
      details.dbName = val
    elif arg == '-D':
      details.detailedScreen = 1
    elif arg == '-t':
      details.partialVote = 1
      # NOTE: eval() executes whatever expression is given on the command
      # line; only use this with trusted input.
      voteTol = eval(val)
      # a single threshold is treated as a one-element list
      if type(voteTol) not in [type([]),type((1,1))]:
        voteTol = [voteTol]
      for tol in voteTol:
        if tol > 1 or tol < 0:
          error('Voting threshold must be between 0 and 1')
          sys.exit(-2)
      details.screenVoteTol=voteTol
    elif arg == '-N':
      details.note=val
    elif arg == '-H':
      details.doTraining=0
      details.doHoldout=1
    elif arg == '-T':
      details.doHoldout=0
      details.doTraining=1
    elif arg == '-E':
      # error analysis requires the detailed screen
      details.errorAnalysis=1
      details.detailedScreen=1
    elif arg == '-A':
      details.showAll=1
      details.detailedScreen=1
    elif arg == '-S':
      details.shuffleActivities=1
    elif arg == '-R':
      details.randomActivities=1
    elif arg == '-h':
      Usage()
    elif arg == '-F':
      details.filterFrac=float(val)
    elif arg == '-v':
      details.filterVal=float(val)
    elif arg == '-V':
      # NOTE(review): this option is accepted but has no effect; the
      # original code only assigned a local variable that was never read.
      # Presumably details.verbose was intended - confirm before changing.
      pass
    elif arg == '-X':
      if Excel is not None:
        details.reportToExcel = 1
        details.detailedScreen=1
      else:
        message('NOTE: Excel support not enabled, -X option ignored.')
    elif arg == '--predPlot':
      details.detailedScreen=1
      details.predPlot=val
    elif arg == '--predActCol':
      details.predActCol=val
    elif arg == '--predActTable':
      details.predActTable=val
    elif arg == '--predLogScale':
      details.predLogScale=1
    elif arg == '--predShow':
      details.predShow=1
    elif arg == '--OOB':
      details.errorEstimate=1
    elif arg == '--pickleCol':
      # the column is provided 1-based on the command line
      details.pickleCol=int(val)-1
    elif arg == '--enrich':
      details.enrichTgt=int(val)
    else:
      Usage()

  if not extras:
    Usage()
  return extras
1301
1302
if __name__ == '__main__':
  # Command-line driver.  Overall flow:
  #   1) parse the arguments and load the composite model(s), either from
  #      a pickle file or from a database table (selected by note)
  #   2) for each data set: prepare the data and collect the raw screening
  #      results for every model
  #   3) for each voting threshold: tabulate the results, then report
  #      averages (multiple models), the best model and, optionally, the
  #      results table, the error analysis and an Excel report.
  details = SetDefaults()
  extras = ParseArgs(details)
  ShowVersion(includeArgs=1)

  models = []
  if details.note and details.dbName:
    tblName = extras[0]
    message('-> Retrieving models from database')
    conn = DbConnect(details.dbName,tblName)
    # NOTE(review): the note is inserted directly into the SQL text;
    # a note containing a quote character will break the query.
    blobs = conn.GetData(fields='model',where="where note='%s'"%(details.note))
    for blob in blobs:
      blob = blob[0]
      # NOTE: unpickling can execute arbitrary code; only load models
      # from trusted sources.
      try:
        models.append(cPickle.loads(str(blob)))
      except Exception:
        # a model that fails to load is reported and skipped
        import traceback
        traceback.print_exc()
        message('Model load failed')

  else:
    message('-> Loading model')
    modelFile=open(extras[0],'rb')
    try:
      models.append(cPickle.load(modelFile))
    finally:
      modelFile.close()
  if not models:
    error('No composite models found')
    sys.exit(-1)
  else:
    message('-> Working with %d models.'%len(models))

  # everything after the model argument is a data set to be screened
  extras = extras[1:]

  for fName in extras:
    if details.dbName != '':
      details.tableName = fName
      data = details.GetDataSet(pickleCol=details.pickleCol,
                                pickleClass=DataStructs.ExplicitBitVect)
    else:
      data = DataUtils.BuildDataSet(fName)
    descNames = data.GetVarNames()
    nModels = len(models)
    screenResults = [None]*nModels
    dataSets = [None]*nModels
    message('-> Constructing and screening data sets')
    testIdx = range(data.GetNPts())
    trainIdx = testIdx

    for modelIdx in range(nModels):

      tmpD = data
      model = models[modelIdx]
      message('.',noRet=1)

      # re-seed the random number generator with the model's own seed,
      # when the model provides one
      try:
        seed = model._randomSeed
      except AttributeError:
        pass
      else:
        DataUtils.InitRandomNumbers(seed)

      if details.shuffleActivities or details.randomActivities:
        shuffle = details.shuffleActivities
        randomize = 1
        DataUtils.RandomizeActivities(tmpD,shuffle=details.shuffleActivities,
                                      runDetails=details)
      else:
        randomize = 0
        shuffle = 0

      if hasattr(model,'_shuffleActivities') and \
         model._shuffleActivities and \
         not shuffle:
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        message('****** WARNING: Shuffled model being screened with unshuffled data.')
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
      # the flag tested here used to be assigned under a different name
      # ("random"), which made this test fail with a NameError.
      if hasattr(model,'_randomizeActivities') and \
         model._randomizeActivities and \
         not randomize:
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        message('****** WARNING: Random model being screened with non-random data.')
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')

      trainIdx,testIdx = PrepareDataFromDetails(model,details,tmpD,verbose=1)
      screenResults[modelIdx] = CollectResults(testIdx,tmpD,model,
                                               errorEstimate=details.errorEstimate)
      dataSets[modelIdx] = testIdx

    if details.reportToExcel and Excel is not None:
      # set up the column headings for the Excel report
      xl = Excel()
      xlCol = 1
      xlRow = xl.FindLastRow(1,xlCol)
      if xl[xlRow,xlCol] is not None and str(xl[xlRow,xlCol]):
        xlRow += 1
      heads=['Tolerance']
      if details.note:
        heads.append('Note')
      if nModels > 1:
        heads += [
          'Mean(MisClass)','Dev(MisClass)',
          'Mean(Correct Conf)','Dev(Correct Conf)',
          'Mean(Incorrect Conf)','Dev(Incorrect Conf)',
          ]
      else:
        heads += [
          'MisClass',
          'Correct Conf',
          'Incorrect Conf',
          ]
      if models[0].GetActivityQuantBounds():
        nRes = len(models[0].GetActivityQuantBounds())+1
      else:
        nRes = models[0].GetQuantBounds()[1][-1]
      if nModels>1:
        for i in range(nRes):
          heads.append('Mean(Class %d %% pure)'%(i))
        for i in range(nRes):
          heads.append('Mean(Class %d %% correct)'%(i))
      else:
        for i in range(nRes):
          heads.append('Class %d %% pure'%(i))
        for i in range(nRes):
          heads.append('Class %d %% correct'%(i))

      if nModels > 1:
        heads += [
          'Best(MisClass)',
          'Best(Correct Conf)',
          'Best(Incorrect Conf)',
          ]
      for i,head in enumerate(heads):
        xl[xlRow,xlCol+i] = head
        xl.Columns(xlCol+i).AutoFit()
    else:
      xl = None

    for tol in details.screenVoteTol:
      if len(details.screenVoteTol)>1:
        message('\n*****-----*****-----*****-----*****-----*****-----*****-----*****\n')
        message('Tolerance: %f'%tol)
      if xl:
        xlRow+=1
        xlCol = 1
        xl[xlRow,xlCol]=tol
        xlCol += 1
        if details.note:
          xl[xlRow,xlCol]=details.note
          xlCol += 1
      # per-model counts and average confidences
      nGood = zeros(nModels,Float)
      nBad = zeros(nModels,Float)
      nSkip = zeros(nModels,Float)
      confGood = zeros(nModels,Float)
      confBad = zeros(nModels,Float)
      confSkip = zeros(nModels,Float)
      if details.enrichTgt >= 0:
        enrichments = zeros(nModels,Float)
      # point index -> number of models with a good/bad/skipped vote
      goodVoteDict = {}
      badVoteDict = {}
      noVoteDict = {}
      voteTab = None
      for modelIdx in range(nModels):
        model = models[modelIdx]
        model.SetInputOrder(descNames)
        testIdx = dataSets[modelIdx]
        screenRes = screenResults[modelIdx]
        if not details.detailedScreen:
          g,b,s,aG,aB,aS,vT = ScreenIt(model,testIdx,tmpD,details.partialVote,tol,
                                       verbose=details.verbose,screenResults=screenRes)
        else:
          if model.GetActivityQuantBounds():
            nRes = len(model.GetActivityQuantBounds())+1
          else:
            nRes = model.GetQuantBounds()[1][-1]
          badVotes = []
          noVotes = []
          # the correct votes are only collected when they are needed
          if (hasattr(details,'showAll') and details.showAll) or \
             (hasattr(details,'predPlot') and details.predPlot):
            goodVotes = []
          else:
            goodVotes = None
          g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,tmpD,model,nRes,tol,
                                              verbose=details.verbose,
                                              screenResults=screenRes,
                                              badVotes=badVotes,noVotes=noVotes,
                                              goodVotes=goodVotes,
                                              errorEstimate=details.errorEstimate)
          if voteTab is None:
            voteTab = zeros(vT.shape,Float)
          if details.errorAnalysis:
            # when an enrichment target is set, only points with that
            # true activity value are counted
            for a,p,c,idx in badVotes:
              label = testIdx[idx]
              if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
                if a==details.enrichTgt:
                  badVoteDict[label] = badVoteDict.get(label,0)+1
              else:
                badVoteDict[label] = badVoteDict.get(label,0)+1
            for a,p,c,idx in noVotes:
              label = testIdx[idx]
              if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
                if a==details.enrichTgt:
                  noVoteDict[label] = noVoteDict.get(label,0)+1
              else:
                noVoteDict[label] = noVoteDict.get(label,0)+1

          if hasattr(details,'showAll') and details.showAll:
            for a,p,c,idx in goodVotes:
              label = testIdx[idx]
              if details.enrichTgt >=0:
                if a==details.enrichTgt:
                  goodVoteDict[label] = goodVoteDict.get(label,0)+1
              else:
                goodVoteDict[label] = goodVoteDict.get(label,0)+1

          if details.enrichTgt>-1:
            enrichments[modelIdx] = CalcEnrichment(vT,tgt=details.enrichTgt)

          voteTab += vT
        if details.detailedScreen and hasattr(details,'predPlot') and details.predPlot:
          MakePredPlot(details,testIdx,tmpD,goodVotes,badVotes,nRes,verbose=1)

        if hasattr(details,'showAll') and details.showAll:
          print('-v-v-v-v-v-v-v- All Votes -v-v-v-v-v-v-v-')
          print('id, prediction, confidence, flag(-1=skipped,0=wrong,1=correct)')
          for ans,pred,conf,idx in goodVotes:
            pt = tmpD[testIdx[idx]]
            assert model.GetActivityQuantBounds() or pt[-1]==ans,\
                   'bad point?: %s != %s'%(str(pt[-1]),str(ans))
            print('%s, %d, %.4f, 1'%(str(pt[0]),pred,conf))
          for ans,pred,conf,idx in badVotes:
            pt = tmpD[testIdx[idx]]
            assert model.GetActivityQuantBounds() or pt[-1]==ans,\
                   'bad point?: %s != %s'%(str(pt[-1]),str(ans))
            print('%s, %d, %.4f, 0'%(str(pt[0]),pred,conf))
          for ans,pred,conf,idx in noVotes:
            pt = tmpD[testIdx[idx]]
            assert model.GetActivityQuantBounds() or pt[-1]==ans,\
                   'bad point?: %s != %s'%(str(pt[-1]),str(ans))
            print('%s, %d, %.4f, -1'%(str(pt[0]),pred,conf))
          print('-^-^-^-^-^-^-^- -^-^-^-^-^-^-^-')

        nGood[modelIdx] = g
        nBad[modelIdx] = b
        nSkip[modelIdx] = s
        confGood[modelIdx] = aG
        confBad[modelIdx] = aB
        confSkip[modelIdx] = aS
      print('')

      if nModels > 1:
        print('-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
        print('AVERAGES:')

        # means and sample standard deviations across the models
        avgNBad = sum(nBad)/nModels
        devNBad = sqrt(sum((nBad-avgNBad)**2)/(nModels-1))

        # the best model is the one with the fewest misclassifications
        bestIdx = argsort(nBad)[0]

        avgNGood = sum(nGood)/nModels
        devNGood = sqrt(sum((nGood-avgNGood)**2)/(nModels-1))

        avgNSkip = sum(nSkip)/nModels
        devNSkip = sqrt(sum((nSkip-avgNSkip)**2)/(nModels-1))

        avgConfBad = sum(confBad)/nModels
        devConfBad = sqrt(sum((confBad-avgConfBad)**2)/(nModels-1))

        avgConfGood = sum(confGood)/nModels
        devConfGood = sqrt(sum((confGood-avgConfGood)**2)/(nModels-1))

        avgConfSkip = sum(confSkip)/nModels
        devConfSkip = sqrt(sum((confSkip-avgConfSkip)**2)/(nModels-1))

        nClassified = avgNGood + avgNBad
        nExamples = nClassified + avgNSkip
        print('Misclassifications: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nExamples,
                                                                          100*devNBad/nExamples,
                                                                          avgNBad,devNBad,
                                                                          nExamples))
        if avgNSkip>0:
          print('\tthreshold: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nClassified,
                                                                     100*devNBad/nClassified,
                                                                     avgNBad,devNBad,
                                                                     nClassified))
          print('')
          print('Number Skipped: %%%4.2f(%%%4.2f) %4.2f(%4.2f)'%(100*avgNSkip/nExamples,
                                                                 100*devNSkip/nExamples,
                                                                 avgNSkip,devNSkip))

        print('')
        print('Confidences:')
        print('\tCorrect: \t%4.2f(%4.2f)'%(100*avgConfGood,100*devConfGood))
        print('\tIncorrect: \t%4.2f(%4.2f)'%(100*avgConfBad,100*devConfBad))
        if avgNSkip>0:
          print('\tSkipped: \t%4.2f(%4.2f)'%(100*avgConfSkip,100*devConfSkip))

        if xl:
          xl[xlRow,xlCol]=100.*avgNBad/nExamples
          xlCol+=1
          xl[xlRow,xlCol]=100.*devNBad/nExamples
          xlCol+=1
          xl[xlRow,xlCol]=100.*avgConfGood
          xlCol+=1
          xl[xlRow,xlCol]=100.*devConfGood
          xlCol += 1
          xl[xlRow,xlCol]=100.*avgConfBad
          xlCol+=1
          xl[xlRow,xlCol]=100.*devConfBad
          xlCol += 1

        if details.detailedScreen:
          message('Results Table:')
          # average the accumulated vote table over the models
          voteTab = transpose(voteTab)/nModels
          nResultCodes = len(voteTab)
          colCounts = sum(voteTab)
          rowCounts = sum(voteTab,1)
          print('')
          for i in range(nResultCodes):
            # avoid dividing by zero for empty rows
            if rowCounts[i]==0: rowCounts[i]=1
            row = voteTab[i]
            message(' ',noRet=1)
            for j in range(nResultCodes):
              entry = row[j]
              message(' % 6.2f'%entry,noRet=1)
            message(' | % 4.2f'%(100.*voteTab[i,i]/rowCounts[i]))
          message(' ',noRet=1)
          for i in range(nResultCodes):
            message('-------',noRet=1)
          message('')
          message(' ',noRet=1)
          for i in range(nResultCodes):
            # avoid dividing by zero for empty columns
            if colCounts[i]==0: colCounts[i]=1
            message(' % 6.2f'%(100.*voteTab[i,i]/colCounts[i]),noRet=1)
          message('')
          if xl:
            for i in range(nResultCodes):
              xl[xlRow,xlCol]=100.*voteTab[i,i]/rowCounts[i]
              xlCol += 1
            for i in range(nResultCodes):
              xl[xlRow,xlCol]=100.*voteTab[i,i]/colCounts[i]
              xlCol += 1

          if details.enrichTgt >-1:
            mean = sum(enrichments)/nModels
            enrichments -= mean
            # sample standard deviation, computed the same way as the
            # other deviations above (the division belongs inside the
            # square root).
            dev = sqrt(sum(enrichments*enrichments)/(nModels-1))
            message(' Enrichment of value %d: %.4f (%.4f)'%(details.enrichTgt,mean,dev))
      else:
        bestIdx=0
      print('------------------------------------------------')
      # equivalent to the statement form: print 'Best Model: ',bestIdx+1
      print('Best Model:  %s'%(bestIdx+1))
      bestBad = nBad[bestIdx]
      bestGood = nGood[bestIdx]
      bestSkip = nSkip[bestIdx]
      nClassified = bestGood + bestBad
      nExamples = nClassified + bestSkip
      print('Misclassifications: \t%%%5.2f %d / %d'%(100*bestBad/nExamples,
                                                     bestBad,nExamples))
      if bestSkip>0:
        print('\tthreshold: \t%%%5.2f %d / %d'%(100*bestBad/nClassified,
                                                bestBad,nClassified))
        print('')
        print('Number Skipped: %%%4.2f %d'%(100*bestSkip/nExamples,
                                            bestSkip))

      print('')
      print('Confidences:')
      print('\tCorrect: \t%4.2f'%(100*confGood[bestIdx]))
      print('\tIncorrect: \t%4.2f'%(100*confBad[bestIdx]))
      if bestSkip>0:
        print('\tSkipped: \t%4.2f'%(100*confSkip[bestIdx]))
      if xl:
        xl[xlRow,xlCol]=100.*bestBad/nExamples
        xlCol+=1
        xl[xlRow,xlCol]=100.*confGood[bestIdx]
        xlCol+=1
        xl[xlRow,xlCol]=100.*confBad[bestIdx]
        xlCol+=1


      if nModels == 1 and details.detailedScreen:
        message('')
        message('Results Table:')
        voteTab = transpose(vT)
        nResultCodes = len(vT)
        colCounts = sum(voteTab)
        rowCounts = sum(voteTab,1)
        message('')
        for i in range(nResultCodes):
          # avoid dividing by zero for empty rows
          if rowCounts[i]==0: rowCounts[i]=1
          row = voteTab[i]
          message(' ',noRet=1)
          for j in range(nResultCodes):
            entry = row[j]
            message(' % 6.2f'%entry,noRet=1)
          message(' | % 4.2f'%(100.*voteTab[i,i]/rowCounts[i]))
        message(' ',noRet=1)
        for i in range(nResultCodes):
          message('-------',noRet=1)
        message('')
        message(' ',noRet=1)
        for i in range(nResultCodes):
          # avoid dividing by zero for empty columns
          if colCounts[i]==0: colCounts[i]=1
          message(' % 6.2f'%(100.*voteTab[i,i]/colCounts[i]),noRet=1)
        message('')
        if xl:
          for i in range(nResultCodes):
            xl[xlRow,xlCol]=100.*voteTab[i,i]/rowCounts[i]
            xlCol += 1
          for i in range(nResultCodes):
            xl[xlRow,xlCol]=100.*voteTab[i,i]/colCounts[i]
            xlCol += 1
      if details.errorAnalysis:
        message('\n*-*-*-*-*-*-*-*- ERROR ANALYSIS -*-*-*-*-*-*-*-*\n')
        ks = badVoteDict.keys()
        if len(ks):
          message(' ---> Bad Vote Counts')
          if xl:
            xlRow += 1
            xl[xlRow,1] = 'Misclassification Counts:'
            xlRow += 1
            xl[xlRow,1] = 'ID'
            xl[xlRow,2] = 'Num_Misses'
            xlRow += 1
          for k in ks:
            pt = data[k]
            message('%s,%d'%(str(pt[0]),badVoteDict[k]))
            if xl:
              # the leading quote is written into the cell ahead of the id
              xl[xlRow,1] = "'%s"%str(pt[0])
              xl[xlRow,2] = badVoteDict[k]
              xlRow += 1

        ks = noVoteDict.keys()
        if len(ks):
          message(' ---> Skipped Compound Counts')
          if xl:
            xl[xlRow,1] = 'Skipped Compound Counts:'
            xlRow += 1
          for k in ks:
            pt = data[k]
            message('%s,%d'%(str(pt[0]),noVoteDict[k]))
            if xl:
              xl[xlRow,1] = "'%s"%str(pt[0])
              xl[xlRow,2] = noVoteDict[k]
              xlRow += 1

      if hasattr(details,'showAll') and details.showAll:
        ks = goodVoteDict.keys()
        if len(ks):
          message(' ---> Good Vote Counts')
          if xl:
            xlRow += 1
            xl[xlRow,1] = 'Correct Classification Counts:'
            xlRow += 1
            xl[xlRow,1] = 'ID'
            xl[xlRow,2] = 'Num_Picks'
            xlRow += 1
          for k in ks:
            pt = data[k]
            message('%s,%d'%(str(pt[0]),goodVoteDict[k]))
            if xl:
              xl[xlRow,1] = "'%s"%str(pt[0])
              xl[xlRow,2] = goodVoteDict[k]
              xlRow += 1
1764