Package ML :: Package Data :: Module Quantize
[hide private]
[frames] | no frames]

Source Code for Module ML.Data.Quantize

  1  # $Id: Quantize.py 2 2006-05-06 22:54:39Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2001,2002,2003  Greg Landrum and Rational Discovery LLC 
  4  #   All Rights Reserved 
  5  # 
  6   
  7  """ Automatic search for quantization bounds 
  8   
  9  This uses the expected informational gain to determine where quantization bounds should 
 10  lie. 
 11   
 12  **Notes**: 
 13   
 14    - bounds are less than, so if the bounds are [1.,2.], 
 15      [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2] 
 16   
 17  """ 
 18  from Numeric import * 
 19  from ML.InfoTheory import entropy 
 20  try: 
 21    import cQuantize 
 22  except: 
 23    hascQuantize = 0 
 24  else: 
 25    hascQuantize = 1   
 26     
 27  _float_tol = 1e-8 
28 -def feq(v1,v2,tol=_float_tol):
29 """ floating point equality with a tolerance factor 30 31 **Arguments** 32 33 - v1: a float 34 35 - v2: a float 36 37 - tol: the tolerance for comparison 38 39 **Returns** 40 41 0 or 1 42 43 """ 44 return abs(v1-v2) < tol
45
46 -def FindVarQuantBound(vals,results,nPossibleRes):
47 """ Uses FindVarMultQuantBounds, only here for historic reasons 48 """ 49 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes) 50 return (bounds[0],gain)
51 52
53 -def _GenVarTable(vals,cuts,starts,results,nPossibleRes):
54 """ Primarily intended for internal use 55 56 constructs a variable table for the data passed in 57 The table for a given variable records the number of times each possible value 58 of that variable appears for each possible result of the function. 59 60 **Arguments** 61 62 - vals: a 1D Numeric array with the values of the variables 63 64 - cuts: a list with the indices of the quantization bounds 65 (indices are into _starts_ ) 66 67 - starts: a list of potential starting points for quantization bounds 68 69 - results: a 1D Numeric array of integer result codes 70 71 - nPossibleRes: an integer with the number of possible result codes 72 73 **Returns** 74 75 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes 76 77 **Notes** 78 79 - _vals_ should be sorted! 80 81 """ 82 nVals = len(cuts)+1 83 varTable = zeros((nVals,nPossibleRes),Int) 84 idx = 0 85 for i in xrange(nVals-1): 86 cut = cuts[i] 87 while idx < starts[cut]: 88 varTable[i,results[idx]] += 1 89 idx += 1 90 while idx < len(vals): 91 varTable[-1,results[idx]] += 1 92 idx += 1 93 return varTable
94
95 -def _PyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
96 """ Primarily intended for internal use 97 98 Recursively finds the best quantization boundaries 99 100 **Arguments** 101 102 - vals: a 1D Numeric array with the values of the variables, 103 this should be sorted 104 105 - cuts: a list with the indices of the quantization bounds 106 (indices are into _starts_ ) 107 108 - which: an integer indicating which bound is being adjusted here 109 (and index into _cuts_ ) 110 111 - starts: a list of potential starting points for quantization bounds 112 113 - results: a 1D Numeric array of integer result codes 114 115 - nPossibleRes: an integer with the number of possible result codes 116 117 **Returns** 118 119 - a 2-tuple containing: 120 121 1) the best information gain found so far 122 123 2) a list of the quantization bound indices ( _cuts_ for the best case) 124 125 **Notes** 126 127 - this is not even remotely efficient, which is why a C replacement 128 was written 129 130 """ 131 nBounds = len(cuts) 132 maxGain = -1e6 133 bestCuts = None 134 highestCutHere = len(starts) - nBounds + which 135 if varTable is None: 136 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 137 while cuts[which] <= highestCutHere: 138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 139 gainHere = entropy.InfoGain(varTable) 140 if gainHere > maxGain: 141 maxGain = gainHere 142 bestCuts = cuts[:] 143 # recurse on the next vars if needed 144 if which < nBounds-1: 145 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes, 146 varTable = varTable) 147 if gainHere > maxGain: 148 maxGain = gainHere 149 bestCuts = cutsHere 150 # update this cut 151 cuts[which] += 1 152 for i in range(which+1,nBounds): 153 if cuts[i] == cuts[i-1]: 154 cuts[i] += 1 155 156 return maxGain,bestCuts
157
158 -def _NewPyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
159 """ Primarily intended for internal use 160 161 Recursively finds the best quantization boundaries 162 163 **Arguments** 164 165 - vals: a 1D Numeric array with the values of the variables, 166 this should be sorted 167 168 - cuts: a list with the indices of the quantization bounds 169 (indices are into _starts_ ) 170 171 - which: an integer indicating which bound is being adjusted here 172 (and index into _cuts_ ) 173 174 - starts: a list of potential starting points for quantization bounds 175 176 - results: a 1D Numeric array of integer result codes 177 178 - nPossibleRes: an integer with the number of possible result codes 179 180 **Returns** 181 182 - a 2-tuple containing: 183 184 1) the best information gain found so far 185 186 2) a list of the quantization bound indices ( _cuts_ for the best case) 187 188 **Notes** 189 190 - this is not even remotely efficient, which is why a C replacement 191 was written 192 193 """ 194 nBounds = len(cuts) 195 maxGain = -1e6 196 bestCuts = None 197 highestCutHere = len(starts) - nBounds + which 198 if varTable is None: 199 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 200 while cuts[which] <= highestCutHere: 201 gainHere = entropy.InfoGain(varTable) 202 if gainHere > maxGain: 203 maxGain = gainHere 204 bestCuts = cuts[:] 205 # recurse on the next vars if needed 206 if which < nBounds-1: 207 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes, 208 varTable = None) 209 if gainHere > maxGain: 210 maxGain = gainHere 211 bestCuts = cutsHere 212 # update this cut 213 oldCut = cuts[which] 214 cuts[which] += 1 215 bot = starts[oldCut] 216 if oldCut+1 < len(starts): 217 top = starts[oldCut+1] 218 else: 219 top = starts[-1] 220 for i in range(bot,top): 221 v = results[i] 222 varTable[which,v] += 1 223 varTable[which+1,v] -= 1 224 for i in range(which+1,nBounds): 225 if cuts[i] == cuts[i-1]: 226 cuts[i] += 1 227 228 229 return maxGain,bestCuts
230 231 232 # -------------------------------- 233 # 234 # find all possible dividing points 235 # 236 # There are a couple requirements for a dividing point: 237 # 1) the dependent variable (descriptor) must change across it, 238 # 2) the result score must change across it 239 # 240 # So, in the list [(0,0),(1,0),(1,1),(2,1)]: 241 # - we cannot divide before (1,0) (same activity value) 242 # - we cannot divide before (1,1) (same descriptor value) 243 # - we can divide before (2,1) (same descriptor value) 244 # 245 # --------------------------------
246 -def _NewPyFindStartPoints(sortVals,sortResults,nData):
247 startNext = [] 248 tol = 1e-8 249 i = 0 250 start=0 251 actHomog=1 252 valHomog=1 253 while i<nData: 254 # we don't need to use abs (the list is ordered) 255 if sortVals[i]-sortVals[start]>tol: 256 valHomog=0 257 if sortResults[i]!=sortResults[start]: 258 actHomog=0 259 if not actHomog and not valHomog: 260 # we have a switch, now we just need to figure out where the 261 # switch is. 262 if( sortVals[i]-sortVals[i-1]<tol ): 263 # we're in a block with constant descriptor value, find its beginning: 264 while i>1 and sortVals[i]-sortVals[i-1]<tol: 265 i-=1 266 # i is now just upstream of the transition, which is exactly where 267 # we want the cut point 268 else: 269 # we don't need to touch i, the dividing line goes right before it 270 pass 271 startNext.append(i) 272 start = i 273 actHomog = 1 274 valHomog = 1 275 i += 1 276 return startNext
277
278 -def FindVarMultQuantBounds(vals,nBounds,results,nPossibleRes):
279 """ finds multiple quantization bounds for a single variable 280 281 **Arguments** 282 283 - vals: sequence of variable values (assumed to be floats) 284 285 - nBounds: the number of quantization bounds to find 286 287 - results: a list of result codes (should be integers) 288 289 - nPossibleRes: an integer with the number of possible values of the 290 result variable 291 292 **Returns** 293 294 - a 2-tuple containing: 295 296 1) a list of the quantization bounds (floats) 297 298 2) the information gain associated with this quantization 299 300 301 """ 302 assert len(vals) == len(results), 'vals/results length mismatch' 303 304 nData = len(vals) 305 if nData == 0: 306 return [],-1e8 307 308 # sort the variable values: 309 # Bypass the type-checking stuff that happens in a normal 310 # argsort call: 311 sortIdx = multiarray.argsort(vals) 312 sortVals = array(take(vals,sortIdx),Float) 313 sortResults = array(take(results,sortIdx),Int) 314 startNext=_FindStartPoints(sortVals,sortResults,nData) 315 if not len(startNext): 316 return [0],0.0 317 if len(startNext)<nBounds: 318 nBounds = len(startNext)-1 319 if nBounds == 0: 320 nBounds=1 321 initCuts = range(nBounds) 322 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext, 323 sortResults,nPossibleRes) 324 quantBounds = [] 325 nVs = len(sortVals) 326 for cut in bestCuts: 327 idx = startNext[cut] 328 if idx == nVs: 329 quantBounds.append(sortVals[-1]) 330 elif idx == 0: 331 quantBounds.append(sortVals[idx]) 332 else: 333 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.) 334 335 return quantBounds,maxGain
336 337 if hascQuantize: 338 _RecurseOnBounds = cQuantize._RecurseOnBounds 339 _FindStartPoints = cQuantize._FindStartPoints 340 else: 341 _RecurseOnBounds = _NewPyRecurseOnBounds 342 _FindStartPoints = _NewPyFindStartPoints 343 344 if __name__ == '__main__': 345 import sys 346 if 1: 347 d = [(1.,0), 348 (1.1,0), 349 (1.2,0), 350 (1.4,1), 351 (1.4,0), 352 (1.6,1), 353 (2.,1), 354 (2.1,0), 355 (2.1,0), 356 (2.1,0), 357 (2.2,1), 358 (2.3,0)] 359 varValues = map(lambda x:x[0],d) 360 resCodes = map(lambda x:x[1],d) 361 nPossibleRes = 2 362 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes) 363 print 'RES:',res 364 target = ([1.3, 2.05],.34707 ) 365 else: 366 d = [(1.,0), 367 (1.1,0), 368 (1.2,0), 369 (1.4,1), 370 (1.4,0), 371 (1.6,1), 372 (2.,1), 373 (2.1,0), 374 (2.1,0), 375 (2.1,0), 376 (2.2,1), 377 (2.3,0)] 378 varValues = map(lambda x:x[0],d) 379 resCodes = map(lambda x:x[1],d) 380 nPossibleRes =2 381 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes) 382 print res 383 #sys.exit(1) 384 d = [(1.4,1), 385 (1.4,0)] 386 387 varValues = map(lambda x:x[0],d) 388 resCodes = map(lambda x:x[1],d) 389 nPossibleRes =2 390 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes) 391 print res 392 393 d = [(1.4,0), 394 (1.4,0),(1.6,1)] 395 varValues = map(lambda x:x[0],d) 396 resCodes = map(lambda x:x[1],d) 397 nPossibleRes =2 398 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes) 399 print res 400