Package ML :: Module AnalyzeComposite
[hide private]
[frames] | no frames]

Source Code for Module ML.AnalyzeComposite

  1  # $Id: AnalyzeComposite.py 2 2006-05-06 22:54:39Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2002-2006  greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ command line utility to report on the contributions of descriptors to 
  8  tree-based composite models 
  9   
 10  Usage:  AnalyzeComposite [optional args] <models> 
 11   
 12        <models>: file name(s) of pickled composite model(s) 
 13          (this is the name of the db table if using a database) 
 14   
 15      Optional Arguments: 
 16   
 17        -n number: the number of levels of each model to consider 
 18   
 19        -d dbname: the database from which to read the models 
 20   
 21        -N Note: the note string to search for to pull models from the database 
 22   
 23        -X: Send the results to Excel.  Note: will alter the current 
 24            worksheet (by adding data to the end) and only works on 
 25            systems with Excel installed.  It *is* safe to call this 
 26            multiple times with a single worksheet. 
 27   
 28        -v: be verbose whilst screening 
 29  """ 
 30   
 31  from Numeric import * 
 32  import sys,cPickle 
 33  from ML.DecTree import TreeUtils,Tree 
 34  from ML.Data import Stats 
 35  from Dbase.DbConnection import DbConnect 
 36  from ML import ScreenComposite 
 37  try: 
 38    from Excel.ExcelWrapper import ExcelWrapper as Excel 
 39  except ImportError: 
 40    Excel = None 
 41   
 42  __VERSION_STRING="2.2.0" 
 43   
def ProcessIt(composites,nToConsider=3,verbose=0,reportToExcel=0):
  """ tabulates how often each descriptor appears at each examined tree level

    Arguments:

      - composites: a sequence of composite models.  Each one must support
        len() and GetModel(i); the first one is also asked for
        GetDescriptorNames().

      - nToConsider: handed to TreeUtils.CollectLabelLevels() and
        TreeUtils.CollectDescriptorNames() as their final argument; it is also
        the length of the per-descriptor count vectors built here.

      - verbose: values > 0 print a table for every composite, values >= 0
        print the averaged table, negative values print nothing.

      - reportToExcel: if true (and the Excel wrapper was imported) the
        averaged rows are also written after the last used row of the
        worksheet provided by Excel().

    Returns:

      a list with one row per descriptor:
        [name, value_0, ..., value_(nToConsider-1), sum of the values]
      The list is empty if no composites were provided or if the first
      composite reports two or fewer descriptor names.

  """
  # an empty sequence used to die with an IndexError on composites[0]
  if not composites:
    return []

  nComposites = len(composites)
  ns = composites[0].GetDescriptorNames()
  if len(ns) <= 2:
    return []

  globalRes = {}
  descNames = {}
  nDone = 1
  for composite in composites:
    if verbose > 0:
      print('#------------------------------------')
      # two spaces: same bytes as the py2 statement "print 'Doing: ',nDone"
      print('Doing:  %s'%(nDone,))
    nModels = len(composite)
    nDone += 1
    res = {}
    for i in range(nModels):
      model = composite.GetModel(i)
      # only tree models contribute; other model types are skipped, but they
      # still count towards nModels in the normalization below
      if isinstance(model,Tree.TreeNode):
        levels = TreeUtils.CollectLabelLevels(model,{},0,nToConsider)
        TreeUtils.CollectDescriptorNames(model,descNames,0,nToConsider)
        for descId in levels.keys():
          v = res.get(descId,zeros(nToConsider,Float))
          # levels[descId] is used as the index of the slot to increment
          v[levels[descId]] += 1./nModels
          res[descId] = v
    # fold this composite's fractions into the running average
    for k in res:
      v = globalRes.get(k,zeros(nToConsider,Float))
      v += res[k]/nComposites
      globalRes[k] = v
    if verbose > 0:
      for k in res.keys():
        name = descNames[k]
        strRes = ', '.join(['%4.2f'%x for x in res[k]])
        print('%s,%s,%5.4f'%(name,strRes,sum(res[k])))
      print('')

  if verbose >= 0:
    print('# Average Descriptor Positions')
  retVal = []
  if reportToExcel and Excel is not None:
    xl = Excel()
    xlCol = 1
    # start one row below whatever is already in the sheet and record the
    # command line there
    xlRow = xl.FindLastRow(1,xlCol)
    xlRow += 1
    xl[xlRow,xlCol] = ' '.join(sys.argv)
    xlRow += 1
  else:
    xl = None
  for k in globalRes.keys():
    name = descNames[k]
    vals = globalRes[k]
    total = sum(vals)
    if verbose >= 0:
      strRes = ', '.join(['%4.2f'%x for x in vals])
      print('%s,%s,%5.4f'%(name,strRes,total))
    if xl:
      xlCol = 1
      xl[xlRow,xlCol] = name
      xlCol += 1
      for v in vals:
        xl[xlRow,xlCol] = v
        xlCol += 1
      xl[xlRow,xlCol] = total
      xlRow += 1
    tmp = [name]
    tmp.extend(vals)
    tmp.append(total)
    retVal.append(tmp)
  if verbose >= 0:
    print('')
  return retVal
119 120
def ErrorStats(conn,where,enrich=1):
  """ summarizes the stored error statistics of a set of runs

    Arguments:

      - conn: object with a GetData(fields=...,where=...) method returning one
        row per run

      - where: the where clause passed through to conn.GetData()

      - enrich: if >= 0, ScreenComposite.CalcEnrichment() is evaluated for
        every result matrix with tgt=enrich; negative values skip that step

    Returns:

      None if the query raises or returns no rows (a traceback or a message
      is written to stderr); otherwise a dictionary with the keys
        oAvg, oDev, oCorrectConf, oIncorrectConf, oBestErr (all scaled by 100),
        oBestIdx (0-based index of the run with the lowest overall error),
        oResultMat (element-wise mean of the overall result matrices)
      plus oAvgEnrich/oDevEnrich if enrich >= 0.  The h* counterparts of all
      of these are added if the last row had a holdout error.

    NOTE(review): whether holdout values are reported depends only on the last
      row, and the accumulation below expects either all rows or no rows to
      have holdout data -- confirm that mixed tables cannot occur.

  """
  fields = ('overall_error,holdout_error,overall_result_matrix,holdout_result_matrix,'
            'overall_correct_conf,overall_incorrect_conf,holdout_correct_conf,holdout_incorrect_conf')
  try:
    data = conn.GetData(fields=fields,where=where)
  except Exception:
    # report query problems but let SystemExit/KeyboardInterrupt propagate
    import traceback
    traceback.print_exc()
    return None
  nPts = len(data)
  if not nPts:
    sys.stderr.write('no runs found\n')
    return None

  overall = zeros(nPts,Float)
  overallEnrich = zeros(nPts,Float)
  oCorConf = 0.0
  oInCorConf = 0.0
  holdout = zeros(nPts,Float)
  holdoutEnrich = zeros(nPts,Float)
  hCorConf = 0.0
  hInCorConf = 0.0
  overallMatrix = None
  holdoutMatrix = None
  for i in range(nPts):
    row = data[i]
    if row[0] is not None:
      overall[i] = row[0]
      oCorConf += row[4]
      oInCorConf += row[5]
    if row[1] is not None:
      holdout[i] = row[1]
      haveHoldout = 1
    else:
      haveHoldout = 0
    # NOTE: eval() executes whatever text is stored in the result-matrix
    #  columns; only point this at databases whose contents are trusted.
    tmpOverall = 1.*eval(row[2])
    if enrich >= 0:
      overallEnrich[i] = ScreenComposite.CalcEnrichment(tmpOverall,tgt=enrich)
    if haveHoldout:
      tmpHoldout = 1.*eval(row[3])
      if enrich >= 0:
        holdoutEnrich[i] = ScreenComposite.CalcEnrichment(tmpHoldout,tgt=enrich)
    if overallMatrix is None:
      # first row: start the running sums
      if row[2] is not None:
        overallMatrix = tmpOverall
      if haveHoldout and row[3] is not None:
        holdoutMatrix = tmpHoldout
    else:
      overallMatrix += tmpOverall
      if haveHoldout:
        holdoutMatrix += tmpHoldout
    if haveHoldout:
      hCorConf += row[6]
      hInCorConf += row[7]

  avgOverall = sum(overall)/nPts
  oCorConf /= nPts
  oInCorConf /= nPts
  overallMatrix /= nPts
  oSort = argsort(overall)
  oMin = overall[oSort[0]]
  overall -= avgOverall
  if nPts > 1:
    devOverall = sqrt(sum(overall**2)/(nPts-1))
  else:
    # a single run has no spread; dividing by nPts-1 == 0 used to blow up here
    devOverall = 0.0
  res = {}
  res['oAvg'] = 100*avgOverall
  res['oDev'] = 100*devOverall
  res['oCorrectConf'] = 100*oCorConf
  res['oIncorrectConf'] = 100*oInCorConf
  res['oResultMat'] = overallMatrix
  res['oBestIdx'] = oSort[0]
  res['oBestErr'] = 100*oMin

  if enrich >= 0:
    mean,dev = Stats.MeanAndDev(overallEnrich)
    res['oAvgEnrich'] = mean
    res['oDevEnrich'] = dev

  if haveHoldout:
    avgHoldout = sum(holdout)/nPts
    hCorConf /= nPts
    hInCorConf /= nPts
    holdoutMatrix /= nPts
    hSort = argsort(holdout)
    hMin = holdout[hSort[0]]
    holdout -= avgHoldout
    if nPts > 1:
      devHoldout = sqrt(sum(holdout**2)/(nPts-1))
    else:
      devHoldout = 0.0
    res['hAvg'] = 100*avgHoldout
    res['hDev'] = 100*devHoldout
    res['hCorrectConf'] = 100*hCorConf
    res['hIncorrectConf'] = 100*hInCorConf
    res['hResultMat'] = holdoutMatrix
    res['hBestIdx'] = hSort[0]
    res['hBestErr'] = 100*hMin
    if enrich >= 0:
      mean,dev = Stats.MeanAndDev(holdoutEnrich)
      res['hAvgEnrich'] = mean
      res['hDevEnrich'] = dev
  return res
216
def _ShowResultMatrix(resultMat):
  """ prints the transpose of a result matrix along with diagonal percentages

    Each printed row ends with 100*diagonal/row-sum; a final line holds
    100*diagonal/column-sum for every column.  Zero sums are replaced by 1 so
    that empty rows/columns do not cause a division by zero.

  """
  tmp = transpose(resultMat)
  # axis given explicitly; 0 is what the one-argument Numeric sum() uses
  colCounts = sum(tmp,0)
  rowCounts = sum(tmp,1)
  nRows = len(tmp)
  for i in range(nRows):
    if rowCounts[i]==0: rowCounts[i]=1
    pieces = ['% 6.2f'%x for x in tmp[i]]
    pieces.append('\t| % 4.2f'%(100.*tmp[i,i]/rowCounts[i]))
    # the pieces are space-separated, exactly as the py2 "print x," sequence
    # used to emit them
    print('\t\t'+' '.join(pieces))
  print('\t\t'+' '.join(['------']*nRows))
  pcts = []
  for i in range(nRows):
    if colCounts[i]==0: colCounts[i]=1
    pcts.append('% 6.2f'%(100.*tmp[i,i]/colCounts[i]))
  print('\t\t'+' '.join(pcts))


def ShowStats(statD,enrich=1):
  """ prints the statistics dictionary produced by ErrorStats()

    Arguments:

      - statD: dictionary with (at least) the keys oAvg, oDev, oCorrectConf,
        oIncorrectConf, oBestIdx, oBestErr and oResultMat.  The holdout lines
        are printed if 'hAvg' / 'hResultMat' are present.  The caller's
        dictionary is not modified; a copy is used for the 1-based run indices.

      - enrich: only used in the label of the enrichment lines, which are
        printed if enrich > -1 and the matching *AvgEnrich key is present

  """
  statD = statD.copy()
  # best-run indices are reported 1-based
  statD['oBestIdx'] = statD['oBestIdx']+1
  # NOTE(review): the spacing inside these templates was recovered from a
  #  listing with collapsed whitespace -- confirm it against the original.
  txt = """
# Error Statistics:
\tOverall: %(oAvg)6.3f%% (%(oDev)6.3f) %(oCorrectConf)4.1f/%(oIncorrectConf)4.1f
\t\tBest: %(oBestIdx)d %(oBestErr)6.3f%%"""%(statD)
  if 'hAvg' in statD:
    statD['hBestIdx'] = statD['hBestIdx']+1
    txt += """
\tHoldout: %(hAvg)6.3f%% (%(hDev)6.3f) %(hCorrectConf)4.1f/%(hIncorrectConf)4.1f
\t\tBest: %(hBestIdx)d %(hBestErr)6.3f%%
"""%(statD)
  print(txt)
  print('')
  print('# Results matrices:')
  print('\tOverall:')
  _ShowResultMatrix(statD['oResultMat'])
  if enrich>-1 and 'oAvgEnrich' in statD:
    print('\t\tEnrich(%d): %.3f (%.3f)'%(enrich,statD['oAvgEnrich'],statD['oDevEnrich']))

  if 'hResultMat' in statD:
    print('\tHoldout:')
    _ShowResultMatrix(statD['hResultMat'])
    if enrich>-1 and 'hAvgEnrich' in statD:
      print('\t\tEnrich(%d): %.3f (%.3f)'%(enrich,statD['hAvgEnrich'],statD['hDevEnrich']))

  return
283 284
def Usage():
  """ writes the module documentation to stdout, then exits with status -1 """
  helpText = __doc__
  print(helpText)
  raise SystemExit(-1)
def _ParseArgs(argv):
  """ parses the command line arguments (program name already removed)

    Returns a 2-tuple:
      1) a dictionary with the keys count, db, note, verbose, skip, enrich and
         reportToExcel
      2) the list of remaining (non-option) arguments

    Usage() is called, which exits, if getopt rejects the arguments.

  """
  import getopt
  try:
    args,extras = getopt.getopt(argv,'n:d:N:vX',('skip','enrich='))
  except getopt.GetoptError:
    Usage()

  opts = {'count':3,'db':None,'note':'','verbose':0,'skip':0,'enrich':1,
          'reportToExcel':0}
  for arg,val in args:
    if arg == '-n':
      opts['count'] = int(val)+1
    elif arg == '-d':
      opts['db'] = val
    elif arg == '-N':
      opts['note'] = val
    elif arg == '-v':
      opts['verbose'] = 1
    elif arg == '-X':
      if Excel is not None:
        opts['reportToExcel'] = 1
      else:
        ScreenComposite.message('NOTE: Excel support not enabled, -X option ignored.')
    elif arg == '--skip':
      opts['skip'] = 1
    elif arg == '--enrich':
      opts['enrich'] = int(val)
  return opts,extras


def _Main(argv):
  """ command line driver: loads the composites, reports on them and, if a
  database was specified, prints the error statistics of the selected runs

  """
  opts,extras = _ParseArgs(argv)
  db = opts['db']
  skip = opts['skip']
  enrich = opts['enrich']

  composites = []
  conn = None
  where = ''
  if db is None:
    for fName in extras:
      # NOTE: unpickling can run arbitrary code; only load trusted model files
      inF = open(fName,'rb')
      try:
        composites.append(cPickle.load(inF))
      finally:
        inF.close()
  else:
    if not extras:
      # the table name is required with -d; extras[0] used to raise IndexError
      Usage()
    tbl = extras[0]
    conn = DbConnect(db,tbl)
    if opts['note']:
      # NOTE: the note is pasted into the query as-is, so a note containing a
      #  single quote will break the where clause
      where = "where note='%s'"%(opts['note'])
    if not skip:
      pkls = conn.GetData(fields='model',where=where)
      for pkl in pkls:
        composites.append(cPickle.loads(str(pkl[0])))

  if len(composites):
    ProcessIt(composites,opts['count'],verbose=opts['verbose'],
              reportToExcel=opts['reportToExcel'])
  elif not skip:
    print('ERROR: no composite models found')
    sys.exit(-1)

  if db:
    res = ErrorStats(conn,where,enrich=enrich)
    if res:
      # pass the --enrich value along so that the printed label matches the
      # class the enrichment was actually calculated for
      ShowStats(res,enrich=enrich)


if __name__ == "__main__":
  _Main(sys.argv[1:])