1
2
3
4
5
6
7
8 """ command line utility for growing composite models
9
10 **Usage**
11
12 _GrowComposite [optional args] filename_
13
14 **Command Line Arguments**
15
16 - -n *count*: number of new models to build
17
18 - -C *pickle file name*: name of file containing composite upon which to build.
19
20 - --inNote *note*: note to be used in loading composite models from the database
21 for growing
22
23 - --balTable *table name*: table from which to take the original data set
24 (for balancing)
25
26 - --balWeight *weight*: (between 0 and 1) weighting factor for the new data
27 (for balancing). OR, *weight* can be a list of weights
28
29 - --balCnt *count*: number of individual models in the balanced composite
30 (for balancing)
31
32 - --balH: use only the holdout set from the original data set in the balancing
33 (for balancing)
34
35 - --balT: use only the training set from the original data set in the balancing
36 (for balancing)
37
38 - -S: shuffle the activities of the original data set
39 (for balancing)
40
41 - -r: randomize the activities of the original data set
42 (for balancing)
43
44 - -N *note*: note to be attached to the grown composite when it's saved in the
45 database
46
47 - --outNote *note*: equivalent to -N
48
49 - -o *filename*: name of an output file to hold the pickled composite after
50 it has been grown.
51 If multiple balance weights are used, the weights will be added to
52 the filenames.
53
54 - -L *limit*: provide an (integer) limit on individual model complexity
55
56 - -d *database name*: instead of reading the data from a QDAT file,
57 pull it from a database. In this case, the _filename_ argument
58 provides the name of the database table containing the data set.
59
60 - -p *tablename*: store persistence data in the database
61 in table *tablename*
62
63 - -l: locks the random number generator to give consistent sets
64 of training and hold-out data. This is primarily intended
65 for testing purposes.
66
67 - -g: be less greedy when training the models.
68
69 - -G *number*: force trees to be rooted at descriptor *number*.
70
71 - -D: show a detailed breakdown of the composite model performance
72 across the training and, when appropriate, hold-out sets.
73
74 - -t *threshold value*: use high-confidence predictions for the final
75 analysis of the hold-out data.
76
77 - -q *list string*: Add QuantTrees to the composite and use the list
78 specified in *list string* as the number of target quantization
79 bounds for each descriptor. Don't forget to include 0's at the
80 beginning and end of *list string* for the name and value fields.
81 For example, if there are 4 descriptors and you want 2 quant bounds
82 apiece, you would use _-q "[0,2,2,2,2,0]"_.
83 Two special cases:
84 1) If you would like to ignore a descriptor in the model building,
85 use '-1' for its number of quant bounds.
86 2) If you have integer valued data that should not be quantized
87 further, enter 0 for that descriptor.
88
89 - -V: print the version number and exit
90
91 """
92 import RDConfig
93 from Numeric import *
94 from ML.Data import DataUtils,SplitData
95 from ML import ScreenComposite,BuildComposite
96 from ML.Composite import AdjustComposite
97 from Dbase.DbConnection import DbConnect
98 from ML import CompositeRun
99 import sys,cPickle,time,types
100
101 _runDetails = CompositeRun.CompositeRun()
102
103 __VERSION_STRING="0.5.0"
104
105 _verbose = 1
107 """ emits messages to _sys.stdout_
108 override this in modules which import this one to redirect output
109
110 **Arguments**
111
112 - msg: the string to be displayed
113
114 """
115 if _verbose: sys.stdout.write('%s\n'%(msg))
116
def GrowIt(details, composite, progressCallback=None,
           saveIt=1, setDescNames=0, data=None):
    """ does the actual work of growing a composite model

    **Arguments**

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - composite: the composite model to grow

      - progressCallback: (optional) forwarded to the composite's _Grow()_
        method when tree models are being built (it is not forwarded for
        neural-network models).  It is intended to be a function which is
        called with a single argument (the number of models built so far)
        after each model is built.

      - saveIt: (optional) accepted for backwards compatibility; this
        function does not reference it and does not pickle the model itself.

      - setDescNames: (optional) if nonzero and tree models are being built,
        the composite's _SetInputOrder()_ method is called with the results
        of the data set's _GetVarNames()_ method before the composite is
        grown.

      - data: (optional) the data set to be used.  If this is not provided,
        the data set described in details will be used.

    **Returns**

      the enlarged composite model

    **Notes**

      - _details.rundate_ is always updated; _details.outName_ (and, for
        quantized database runs, _details.tableName_) may be updated when the
        data set has to be loaded here.

      - the performance statistics (_overall_error_, etc.) are stored on the
        module-level __runDetails_ object, not on _details_.

    """
    details.rundate = time.asctime()

    if data is None:
        fName = details.tableName.strip()
        if details.outName == '':
            details.outName = fName + '.pkl'
        if details.dbName == '':
            # no database: the "table name" is a file name
            data = DataUtils.BuildQuantDataSet(fName)
        elif details.qBounds != []:
            details.tableName = fName
            data = details.GetDataSet()
        else:
            data = DataUtils.DBToQuantData(details.dbName, fName,
                                           quantName=details.qTableName,
                                           user=details.dbUser,
                                           password=details.dbPassword)

    # reuse the seed of the existing composite when initializing the
    # random number generator
    DataUtils.InitRandomNumbers(composite._randomSeed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    # the full data set is used for training; no hold-out split is done here
    namedExamples = data.GetNamedData()
    trainExamples = namedExamples
    message('Training with %d examples' % (len(trainExamples)))
    message('\t%d descriptors' % (len(trainExamples[0]) - 2))
    nVars = data.GetNVars()
    nPossibleVals = composite.nPossibleVals
    attrs = list(range(1, nVars + 1))

    if details.useTrees:
        from ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        if setDescNames:
            composite.SetInputOrder(data.GetVarNames())
        composite.Grow(trainExamples, attrs, [0] + nPossibleVals,
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=details.nModels, pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy, needsQuantization=0,
                       treeBuilder=builder, nQuantBounds=details.qBounds,
                       startAt=details.startAt,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       silent=not _verbose)
    else:
        from ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples, attrs, [0] + nPossibleVals,
                       nTries=details.nModels,
                       buildDriver=driver, needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = array(counts)
    avgErrs = array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # count-weighted mean error and mean absolute deviation across the models
    weightedErrs = counts * avgErrs
    averageErr = sum(weightedErrs) / sum(counts)
    devs = (avgErrs - averageErr) * counts
    devs = sqrt(devs * devs)
    avgDev = sum(devs) / sum(counts)
    if _verbose:
        message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' %
                (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    badExamples = []
    if not details.detailedRes:
        if _verbose:
            message('Testing all examples')
        wrong = BuildComposite.testall(composite, namedExamples, badExamples)
        if _verbose:
            message('%d examples (%% %5.2f) were misclassified' %
                    (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
        _runDetails.overall_error = float(len(wrong)) / len(namedExamples)

    if details.detailedRes:
        if _verbose:
            message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(list(range(data.GetNPts())), data,
                                                 composite, nPossibleVals[-1],
                                                 details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # points that were neither counted as good nor as bad were dropped;
        # the previous "nClass - nPts" could never be positive, so the
        # dropped fraction was never recorded
        nRej = nPts - nClass
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

    return composite
262
274
276 """ balances the composite using the parameters provided in details
277
278 **Arguments**
279
280 - details: a _CompositeRun.CompositeRun_ object
281
282 - composite: the composite model to be balanced
283
284 - data1: (optional) if provided, this should be the
285 data set used to construct the original models
286
287 - data2: (optional) if provided, this should be the
288 data set used to construct the new individual models
289
290 """
291 if not details.balCnt or details.balCnt > len(composite):
292 return composite
293 message("Balancing Composite")
294
295
296
297
298
299 if data1 is None:
300 message("\tReading First Data Set")
301 fName = details.balTable.strip()
302 tmp = details.tableName
303 details.tableName = fName
304 dbName = details.dbName
305 details.dbName = details.balDb
306 data1 = details.GetDataSet()
307 details.tableName = tmp
308 details.dbName = dbName
309 if data1 is None:
310 return composite
311 details.splitFrac = composite._splitFrac
312 details.randomSeed = composite._randomSeed
313 DataUtils.InitRandomNumbers(details.randomSeed)
314 if details.shuffleActivities == 1:
315 DataUtils.RandomizeActivities(data1,shuffle=1,runDetails=details)
316 elif details.randomActivities == 1:
317 DataUtils.RandomizeActivities(data1,shuffle=0,runDetails=details)
318 namedExamples = data1.GetNamedData()
319 if details.balDoHoldout or details.balDoTrain:
320 trainIdx,testIdx = SplitData.SplitIndices(len(namedExamples),details.splitFrac,
321 silent=1)
322 trainExamples = [namedExamples[x] for x in trainIdx]
323 testExamples = [namedExamples[x] for x in testIdx]
324 if details.filterFrac != 0.0:
325 trainIdx,temp = DataUtils.FilterData(trainExamples,details.filterVal,
326 details.filterFrac,-1,
327 indicesOnly=1)
328 tmp = [trainExamples[x] for x in trainIdx]
329 testExamples += [trainExamples[x] for x in temp]
330 trainExamples = tmp
331 if details.balDoHoldout:
332 testExamples,trainExamples = trainExamples,testExamples
333 else:
334 trainExamples = namedExamples
335 dataSet1 = trainExamples
336 cols1 = [x.upper() for x in data1.GetVarNames()]
337 data1 = None
338
339
340
341
342 if data2 is None:
343 message("\tReading Second Data Set")
344 data2 = details.GetDataSet()
345 if data2 is None:
346 return composite
347 details.splitFrac = composite._splitFrac
348 details.randomSeed = composite._randomSeed
349 DataUtils.InitRandomNumbers(details.randomSeed)
350 if details.shuffleActivities == 1:
351 DataUtils.RandomizeActivities(data2,shuffle=1,runDetails=details)
352 elif details.randomActivities == 1:
353 DataUtils.RandomizeActivities(data2,shuffle=0,runDetails=details)
354 dataSet2 = data2.GetNamedData()
355 cols2 = [x.upper() for x in data2.GetVarNames()]
356 data2 = None
357
358
359 res = []
360 weights = details.balWeight
361 if type(weights) not in (types.TupleType,types.ListType):
362 weights = (weights,)
363 for weight in weights:
364 message("\tBalancing with Weight: %.4f"%(weight))
365 res.append(AdjustComposite.BalanceComposite(composite,dataSet1,dataSet2,
366 weight,
367 details.balCnt,
368 names1=cols1,names2=cols2))
369 return res
370
372 """ prints the version number
373
374 """
375 print 'This is GrowComposite.py version %s'%(__VERSION_STRING)
376 if includeArgs:
377 import sys
378 print 'command line was:'
379 print ' '.join(sys.argv)
380
382 """ provides a list of arguments for when this is used from the command line
383
384 """
385 import sys
386 print __doc__
387 sys.exit(-1)
388
390 """ initializes a details object with default values
391
392 **Arguments**
393
394 - runDetails: (optional) a _CompositeRun.CompositeRun_ object.
395 If this is not provided, the global _runDetails will be used.
396
397 **Returns**
398
399 the initialized _CompositeRun_ object.
400
401
402 """
403 if runDetails is None: runDetails = _runDetails
404 return CompositeRun.SetDefaults(runDetails)
405
407 """ parses command line arguments and updates _runDetails_
408
409 **Arguments**
410
411 - runDetails: a _CompositeRun.CompositeRun_ object.
412
413 """
414 import getopt
415 args,extra = getopt.getopt(sys.argv[1:],'P:o:n:p:b:sf:F:v:hlgd:rSTt:Q:q:DVG:L:C:N:',
416 ['inNote=','outNote=','balTable=','balWeight=','balCnt=',
417 'balH','balT','balDb=',])
418 runDetails.inNote=''
419 runDetails.composFileName=''
420 runDetails.balTable=''
421 runDetails.balWeight=(0.5,)
422 runDetails.balCnt=0
423 runDetails.balDoHoldout=0
424 runDetails.balDoTrain=0
425 runDetails.balDb=''
426 for arg,val in args:
427 if arg == '-n':
428 runDetails.nModels = int(val)
429 elif arg == '-C':
430 runDetails.composFileName=val
431 elif arg=='--balTable':
432 runDetails.balTable=val
433 elif arg=='--balWeight':
434 runDetails.balWeight=eval(val)
435 if type(runDetails.balWeight) not in (types.TupleType,types.ListType):
436 runDetails.balWeight=(runDetails.balWeight,)
437 elif arg=='--balCnt':
438 runDetails.balCnt=int(val)
439 elif arg=='--balH':
440 runDetails.balDoHoldout=1
441 elif arg=='--balT':
442 runDetails.balDoTrain=1
443 elif arg=='--balDb':
444 runDetails.balDb=val
445 elif arg == '--inNote':
446 runDetails.inNote=val
447 elif arg == '-N' or arg=='--outNote':
448 runDetails.note=val
449 elif arg == '-o':
450 runDetails.outName = val
451 elif arg == '-p':
452 runDetails.persistTblName=val
453 elif arg == '-r':
454 runDetails.randomActivities = 1
455 elif arg == '-S':
456 runDetails.shuffleActivities = 1
457 elif arg == '-h':
458 Usage()
459 elif arg == '-l':
460 runDetails.lockRandom = 1
461 elif arg == '-g':
462 runDetails.lessGreedy=1
463 elif arg == '-G':
464 runDetails.startAt = int(val)
465 elif arg == '-d':
466 runDetails.dbName=val
467 elif arg == '-T':
468 runDetails.useTrees = 0
469 elif arg == '-t':
470 runDetails.threshold=float(val)
471 elif arg == '-D':
472 runDetails.detailedRes = 1
473 elif arg == '-L':
474 runDetails.limitDepth = int(val)
475 elif arg == '-q':
476 qBounds = eval(val)
477 assert type(qBounds) in (types.TupleType,types.ListType),'bad argument type for -q, specify a list as a string'
478 runDetails.qBoundCount=val
479 runDetails.qBounds = qBounds
480 elif arg == '-Q':
481 qBounds = eval(val)
482 assert type(qBounds) in [type([]),type(())],'bad argument type for -Q, specify a list as a string'
483 runDetails.activityBounds=qBounds
484 runDetails.activityBoundsVals=val
485 elif arg == '-V':
486 ShowVersion()
487 sys.exit(0)
488 else:
489 print >>sys.stderr,'bad argument:',arg
490 Usage()
491 runDetails.tableName=extra[0]
492 if not runDetails.balDb:
493 runDetails.balDb=runDetails.dbName
def OutputNamesForComposites(outName, weights, nComposites):
    """ returns the file names to use when pickling a set of grown composites

    **Arguments**

      - outName: the output file name requested by the user

      - weights: the sequence of balance weights

      - nComposites: the number of composites which are going to be saved

    **Returns**

      a list of file names, to be paired in order with the composites:

        - a single composite (or a single weight) is saved under _outName_

        - otherwise each name is built by replacing the '.pkl' extension of
          _outName_ with '.<weight as a percentage>.pkl'

    """
    if nComposites < 1:
        return []
    if nComposites == 1 or len(weights) == 1:
        return [outName]
    base = outName.split('.pkl')[0]
    # round() rather than truncation: 100*0.29 is 28.999..., which int()
    # alone would label as 28
    return ['%s.%d.pkl' % (base, int(round(100 * weight)))
            for weight in weights[:nComposites]]


def StoreComposites(details, composites):
    """ pickles the composites and/or stores them in the results table

    **Arguments**

      - details: the _CompositeRun.CompositeRun_ object describing the run.
        _details.outName_ controls pickling; _details.persistTblName_ and
        _details.dbName_ control the database update.

      - composites: a list of grown (and possibly balanced) composites

    """
    if details.outName:
        names = OutputNamesForComposites(details.outName, details.balWeight,
                                         len(composites))
        for name, model in zip(names, composites):
            model.Pickle(name)
    if details.persistTblName and details.dbName:
        message('Updating results table %s:%s' % (details.dbName, details.persistTblName))
        if len(composites) > 1:
            message('WARNING: updating results table with models having different weights')
        for model in composites:
            details.model = cPickle.dumps(model)
            details.Store(db=details.dbName, table=details.persistTblName)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        Usage()

    _runDetails.cmd = ' '.join(sys.argv)
    SetDefaults(_runDetails)
    ParseArgs(_runDetails)

    ShowVersion(includeArgs=1)

    initModels = GetComposites(_runDetails)
    nModels = len(initModels)
    if not nModels:
        message("No models found")

    # NOTE(review): when several initial models are grown, each one is
    # pickled to the same output name(s), so later models overwrite earlier
    # ones on disk (this matches the previous behavior).
    for idx, initModel in enumerate(initModels):
        if nModels > 1:
            sys.stderr.write('---------------------------------\n\tDoing %d of %d\n---------------------------------\n' % (idx + 1, nModels))
        composite = GrowIt(_runDetails, initModel, setDescNames=1)
        if _runDetails.balTable and _runDetails.balCnt:
            composites = BalanceComposite(_runDetails, composite)
            # BalanceComposite hands back the bare composite (not a list)
            # when it decides that no balancing can be done
            if not isinstance(composites, (list, tuple)):
                composites = [composites]
        else:
            composites = [composite]
        for mdl in composites:
            mdl.ClearModelExamples()
        StoreComposites(_runDetails, composites)
563