1
2
3
4
5
6
7 """ command line utility to report on the contributions of descriptors to
8 tree-based composite models
9
10 Usage: AnalyzeComposite [optional args] <models>
11
12 <models>: file name(s) of pickled composite model(s)
13 (this is the name of the db table if using a database)
14
15 Optional Arguments:
16
17 -n number: the number of levels of each model to consider
18
19 -d dbname: the database from which to read the models
20
21 -N Note: the note string to search for to pull models from the database
22
23 -X: Send the results to Excel. Note: will alter the current
24 worksheet (by adding data to the end) and only works on
25 systems with Excel installed. It *is* safe to call this
26 multiple times with a single worksheet.
27
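  -v: be verbose whilst screening

  --skip: when reading from a database, do not load or analyze the models;
     only the stored error statistics are reported

  --enrich=val: the target value passed to the enrichment calculation when
     reporting error statistics from a database (default: 1, a negative
     value disables the enrichment calculation)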
29 """
30
31 from Numeric import *
32 import sys,cPickle
33 from ML.DecTree import TreeUtils,Tree
34 from ML.Data import Stats
35 from Dbase.DbConnection import DbConnect
36 from ML import ScreenComposite
37 try:
38 from Excel.ExcelWrapper import ExcelWrapper as Excel
39 except ImportError:
40 Excel = None
41
42 __VERSION_STRING="2.2.0"
43
def ProcessIt(composites,nToConsider=3,verbose=0,reportToExcel=0):
    """Report how often each descriptor is used at each level of the trees.

    For every composite, each member model that is a ``Tree.TreeNode`` is
    scanned with ``TreeUtils.CollectLabelLevels`` and
    ``TreeUtils.CollectDescriptorNames``.  Each descriptor gets a vector of
    length ``nToConsider``; the entry indexed by the level reported for the
    descriptor is incremented by ``1/len(composite)`` for each tree.  The
    per-composite vectors are then averaged over all composites.

    Arguments:

      - composites: a sequence of composite models.  Each one must support
        ``GetDescriptorNames()``, ``len()`` and ``GetModel(i)``.

      - nToConsider: the length of the per-descriptor vectors (it is also
        passed to the TreeUtils collection functions).

      - verbose: if > 0 the results for each individual composite are
        printed, if >= 0 the averaged results are printed, negative values
        silence all output.

      - reportToExcel: if true (and the Excel wrapper could be imported) the
        averaged results are also appended to the current Excel worksheet.

    Returns:

      a list with one row per descriptor:
        ``[name, value_0, ..., value_(nToConsider-1), total]``
      The list is empty if ``composites`` is empty or if the first composite
      has two or fewer descriptor names.
    """
    # Robustness: nothing to analyze.  (Indexing composites[0] on an empty
    # sequence would raise IndexError.)
    if not composites:
        return []

    nComposites = len(composites)
    ns = composites[0].GetDescriptorNames()
    if len(ns) <= 2:
        return []

    globalRes = {}   # descriptor id -> vector averaged over all composites
    descNames = {}   # descriptor id -> name, filled in by TreeUtils
    nDone = 1
    for composite in composites:
        if verbose > 0:
            print('#------------------------------------')
            print('Doing:  %d'%nDone)
        nDone += 1
        nModels = len(composite)

        # accumulate the level occupancies for this composite
        res = {}
        for i in range(nModels):
            model = composite.GetModel(i)
            if not isinstance(model,Tree.TreeNode):
                # only tree models contribute
                continue
            levels = TreeUtils.CollectLabelLevels(model,{},0,nToConsider)
            TreeUtils.CollectDescriptorNames(model,descNames,0,nToConsider)
            for descId in levels.keys():
                if descId not in res:
                    res[descId] = zeros(nToConsider,Float)
                res[descId][levels[descId]] += 1./nModels

        # fold this composite into the running average
        for k in res:
            if k not in globalRes:
                globalRes[k] = zeros(nToConsider,Float)
            globalRes[k] += res[k]/nComposites

        # report the per-composite results once, after they are complete
        if verbose > 0:
            for k in res.keys():
                strRes = ', '.join(['%4.2f'%x for x in res[k]])
                print('%s,%s,%5.4f'%(descNames[k],strRes,sum(res[k])))
            print('')

    if verbose >= 0:
        print('# Average Descriptor Positions')

    if reportToExcel and Excel is not None:
        # append below whatever is already in the first column of the sheet,
        # starting with the command line that generated the data
        xl = Excel()
        xlCol = 1
        xlRow = xl.FindLastRow(1,xlCol)
        xlRow += 1
        xl[xlRow,xlCol] = ' '.join(sys.argv)
        xlRow += 1
    else:
        xl = None

    retVal = []
    for k in globalRes.keys():
        name = descNames[k]
        vals = globalRes[k]
        total = sum(vals)
        if verbose >= 0:
            strRes = ', '.join(['%4.2f'%x for x in vals])
            print('%s,%s,%5.4f'%(name,strRes,total))
        if xl:
            # one row per descriptor: name, per-level values, total
            xlCol = 1
            xl[xlRow,xlCol] = name
            xlCol += 1
            for v in vals:
                xl[xlRow,xlCol] = v
                xlCol += 1
            xl[xlRow,xlCol] = total
            xlRow += 1
        row = [name]
        row.extend(vals)
        row.append(total)
        retVal.append(row)
    if verbose >= 0:
        print('')
    return retVal
119
120
122 fields = 'overall_error,holdout_error,overall_result_matrix,holdout_result_matrix,overall_correct_conf,overall_incorrect_conf,holdout_correct_conf,holdout_incorrect_conf'
123 try:
124 data = conn.GetData(fields=fields,where=where)
125 except:
126 import traceback
127 traceback.print_exc()
128 return None
129 nPts = len(data)
130 if not nPts:
131 sys.stderr.write('no runs found\n')
132 return None
133 overall = zeros(nPts,Float)
134 overallEnrich = zeros(nPts,Float)
135 oCorConf = 0.0
136 oInCorConf = 0.0
137 holdout = zeros(nPts,Float)
138 holdoutEnrich = zeros(nPts,Float)
139 hCorConf = 0.0
140 hInCorConf = 0.0
141 overallMatrix = None
142 holdoutMatrix = None
143 for i in range(nPts):
144 if data[i][0] is not None:
145 overall[i] = data[i][0]
146 oCorConf += data[i][4]
147 oInCorConf += data[i][5]
148 if data[i][1] is not None:
149 holdout[i] = data[i][1]
150 haveHoldout=1
151 else:
152 haveHoldout=0
153 tmpOverall = 1.*eval(data[i][2])
154 if enrich >=0:
155 overallEnrich[i] = ScreenComposite.CalcEnrichment(tmpOverall,tgt=enrich)
156 if haveHoldout:
157 tmpHoldout = 1.*eval(data[i][3])
158 if enrich >=0:
159 holdoutEnrich[i] = ScreenComposite.CalcEnrichment(tmpHoldout,tgt=enrich)
160 if overallMatrix is None:
161 if data[i][2] is not None:
162 overallMatrix = tmpOverall
163 if haveHoldout and data[i][3] is not None:
164 holdoutMatrix = tmpHoldout
165 else:
166 overallMatrix += tmpOverall
167 if haveHoldout:
168 holdoutMatrix += tmpHoldout
169 if haveHoldout:
170 hCorConf += data[i][6]
171 hInCorConf += data[i][7]
172
173 avgOverall = sum(overall)/nPts
174 oCorConf /= nPts
175 oInCorConf /= nPts
176 overallMatrix /= nPts
177 oSort = argsort(overall)
178 oMin = overall[oSort[0]]
179 overall -= avgOverall
180 devOverall = sqrt(sum(overall**2)/(nPts-1))
181 res = {}
182 res['oAvg'] = 100*avgOverall
183 res['oDev'] = 100*devOverall
184 res['oCorrectConf'] = 100*oCorConf
185 res['oIncorrectConf'] = 100*oInCorConf
186 res['oResultMat']=overallMatrix
187 res['oBestIdx']=oSort[0]
188 res['oBestErr']=100*oMin
189
190 if enrich>=0:
191 mean,dev = Stats.MeanAndDev(overallEnrich)
192 res['oAvgEnrich'] = mean
193 res['oDevEnrich'] = dev
194
195 if haveHoldout:
196 avgHoldout = sum(holdout)/nPts
197 hCorConf /= nPts
198 hInCorConf /= nPts
199 holdoutMatrix /= nPts
200 hSort = argsort(holdout)
201 hMin = holdout[hSort[0]]
202 holdout -= avgHoldout
203 devHoldout = sqrt(sum(holdout**2)/(nPts-1))
204 res['hAvg'] = 100*avgHoldout
205 res['hDev'] = 100*devHoldout
206 res['hCorrectConf'] = 100*hCorConf
207 res['hIncorrectConf'] = 100*hInCorConf
208 res['hResultMat']=holdoutMatrix
209 res['hBestIdx']=hSort[0]
210 res['hBestErr']=100*hMin
211 if enrich>=0:
212 mean,dev = Stats.MeanAndDev(holdoutEnrich)
213 res['hAvgEnrich'] = mean
214 res['hDevEnrich'] = dev
215 return res
216
218 statD = statD.copy()
219 statD['oBestIdx'] = statD['oBestIdx']+1
220 txt="""
221 # Error Statistics:
222 \tOverall: %(oAvg)6.3f%% (%(oDev)6.3f) %(oCorrectConf)4.1f/%(oIncorrectConf)4.1f
223 \t\tBest: %(oBestIdx)d %(oBestErr)6.3f%%"""%(statD)
224 if statD.has_key('hAvg'):
225 statD['hBestIdx'] = statD['hBestIdx']+1
226 txt += """
227 \tHoldout: %(hAvg)6.3f%% (%(hDev)6.3f) %(hCorrectConf)4.1f/%(hIncorrectConf)4.1f
228 \t\tBest: %(hBestIdx)d %(hBestErr)6.3f%%
229 """%(statD)
230 print txt
231 print
232 print '# Results matrices:'
233 print '\tOverall:'
234 tmp = transpose(statD['oResultMat'])
235 colCounts = sum(tmp)
236 rowCounts = sum(tmp,1)
237 for i in range(len(tmp)):
238 if rowCounts[i]==0: rowCounts[i]=1
239 row = tmp[i]
240 print '\t\t',
241 for j in range(len(row)):
242 print '% 6.2f'%row[j],
243 print '\t| % 4.2f'%(100.*tmp[i,i]/rowCounts[i])
244 print '\t\t',
245 for i in range(len(tmp)):
246 print '------',
247 print
248 print '\t\t',
249 for i in range(len(tmp)):
250 if colCounts[i]==0: colCounts[i]=1
251 print '% 6.2f'%(100.*tmp[i,i]/colCounts[i]),
252 print
253 if enrich>-1 and statD.has_key('oAvgEnrich'):
254 print '\t\tEnrich(%d): %.3f (%.3f)'%(enrich,statD['oAvgEnrich'],statD['oDevEnrich'])
255
256
257 if statD.has_key('hResultMat'):
258 print '\tHoldout:'
259 tmp = transpose(statD['hResultMat'])
260 colCounts = sum(tmp)
261 rowCounts = sum(tmp,1)
262 for i in range(len(tmp)):
263 if rowCounts[i]==0: rowCounts[i]=1
264 row = tmp[i]
265 print '\t\t',
266 for j in range(len(row)):
267 print '% 6.2f'%row[j],
268 print '\t| % 4.2f'%(100.*tmp[i,i]/rowCounts[i])
269 print '\t\t',
270 for i in range(len(tmp)):
271 print '------',
272 print
273 print '\t\t',
274 for i in range(len(tmp)):
275 if colCounts[i]==0: colCounts[i]=1
276 print '% 6.2f'%(100.*tmp[i,i]/colCounts[i]),
277 print
278 if enrich>-1 and statD.has_key('hAvgEnrich'):
279 print '\t\tEnrich(%d): %.3f (%.3f)'%(enrich,statD['hAvgEnrich'],statD['hDevEnrich'])
280
281
282 return
283
284
288
if __name__ == "__main__":
    # Command-line driver: collect the composite models (from pickle files or
    # from a database table), report the descriptor positions and, in
    # database mode, the stored error statistics.
    import getopt
    try:
        args,extras = getopt.getopt(sys.argv[1:],'n:d:N:vX',('skip',
                                                             'enrich=',
                                                             ))
    except getopt.GetoptError:
        # only option-parsing failures are handled here; anything else
        # (including KeyboardInterrupt) propagates
        Usage()
        # in case Usage() returns: args/extras are undefined, so stop here
        sys.exit(-1)

    count = 3
    db = None
    note = ''
    verbose = 0
    skip = 0
    enrich = 1
    reportToExcel = 0
    for arg,val in args:
        if arg == '-n':
            count = int(val)+1
        elif arg == '-d':
            db = val
        elif arg == '-N':
            note = val
        elif arg == '-v':
            verbose = 1
        elif arg == '-X':
            if Excel is not None:
                reportToExcel = 1
            else:
                ScreenComposite.message('NOTE: Excel support not enabled, -X option ignored.')
        elif arg == '--skip':
            skip = 1
        elif arg == '--enrich':
            enrich = int(val)

    composites = []
    if db is None:
        # file mode: every remaining argument is a pickled composite.
        # SECURITY: unpickling can execute arbitrary code; only load files
        # from trusted sources.
        for arg in extras:
            inF = open(arg,'rb')
            try:
                composites.append(cPickle.load(inF))
            finally:
                inF.close()
    else:
        # database mode: the first remaining argument is the table name
        if not extras:
            Usage()
            sys.exit(-1)
        tbl = extras[0]
        conn = DbConnect(db,tbl)
        if note:
            # double embedded single quotes so that the note can neither
            # break nor alter the where clause
            where = "where note='%s'"%(note.replace("'","''"))
        else:
            where = ''
        if not skip:
            # SECURITY: the models are unpickled from database contents; the
            # database must be trusted.
            pkls = conn.GetData(fields='model',where=where)
            for pkl in pkls:
                composites.append(cPickle.loads(str(pkl[0])))

    if composites:
        ProcessIt(composites,count,verbose=verbose,reportToExcel=reportToExcel)
    elif not skip:
        print('ERROR: no composite models found')
        sys.exit(-1)

    if db:
        res = ErrorStats(conn,where,enrich=enrich)
        if res:
            # NOTE(review): ShowStats is called without the enrich value used
            # above; its signature is not visible from here -- confirm whether
            # it should receive enrich as well.
            ShowStats(res)
354