Package rdkit :: Package ML :: Package Data :: Module Quantize
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Data.Quantize

  1  # $Id: Quantize.py 997 2009-02-25 06:12:43Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2001-2008  Greg Landrum and Rational Discovery LLC 
  4  #   All Rights Reserved 
  5  # 
  6   
  7  """ Automatic search for quantization bounds 
  8   
  9  This uses the expected informational gain to determine where quantization bounds should 
 10  lie. 
 11   
 12  **Notes**: 
 13   
 14    - bounds are less than, so if the bounds are [1.,2.], 
 15      [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2] 
 16   
 17  """ 
 18  import numpy 
 19  from rdkit.ML.InfoTheory import entropy 
 20  try: 
 21    import cQuantize 
 22  except: 
 23    hascQuantize = 0 
 24  else: 
 25    hascQuantize = 1   
 26     
 27  _float_tol = 1e-8 
28 -def feq(v1,v2,tol=_float_tol):
29 """ floating point equality with a tolerance factor 30 31 **Arguments** 32 33 - v1: a float 34 35 - v2: a float 36 37 - tol: the tolerance for comparison 38 39 **Returns** 40 41 0 or 1 42 43 """ 44 return abs(v1-v2) < tol
45
46 -def FindVarQuantBound(vals,results,nPossibleRes):
47 """ Uses FindVarMultQuantBounds, only here for historic reasons 48 """ 49 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes) 50 return (bounds[0],gain)
51 52
53 -def _GenVarTable(vals,cuts,starts,results,nPossibleRes):
54 """ Primarily intended for internal use 55 56 constructs a variable table for the data passed in 57 The table for a given variable records the number of times each possible value 58 of that variable appears for each possible result of the function. 59 60 **Arguments** 61 62 - vals: a 1D Numeric array with the values of the variables 63 64 - cuts: a list with the indices of the quantization bounds 65 (indices are into _starts_ ) 66 67 - starts: a list of potential starting points for quantization bounds 68 69 - results: a 1D Numeric array of integer result codes 70 71 - nPossibleRes: an integer with the number of possible result codes 72 73 **Returns** 74 75 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes 76 77 **Notes** 78 79 - _vals_ should be sorted! 80 81 """ 82 nVals = len(cuts)+1 83 varTable = numpy.zeros((nVals,nPossibleRes),'i') 84 idx = 0 85 for i in range(nVals-1): 86 cut = cuts[i] 87 while idx < starts[cut]: 88 varTable[i,results[idx]] += 1 89 idx += 1 90 while idx < len(vals): 91 varTable[-1,results[idx]] += 1 92 idx += 1 93 return varTable
94
95 -def _PyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
96 """ Primarily intended for internal use 97 98 Recursively finds the best quantization boundaries 99 100 **Arguments** 101 102 - vals: a 1D Numeric array with the values of the variables, 103 this should be sorted 104 105 - cuts: a list with the indices of the quantization bounds 106 (indices are into _starts_ ) 107 108 - which: an integer indicating which bound is being adjusted here 109 (and index into _cuts_ ) 110 111 - starts: a list of potential starting points for quantization bounds 112 113 - results: a 1D Numeric array of integer result codes 114 115 - nPossibleRes: an integer with the number of possible result codes 116 117 **Returns** 118 119 - a 2-tuple containing: 120 121 1) the best information gain found so far 122 123 2) a list of the quantization bound indices ( _cuts_ for the best case) 124 125 **Notes** 126 127 - this is not even remotely efficient, which is why a C replacement 128 was written 129 130 """ 131 nBounds = len(cuts) 132 maxGain = -1e6 133 bestCuts = None 134 highestCutHere = len(starts) - nBounds + which 135 if varTable is None: 136 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 137 while cuts[which] <= highestCutHere: 138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 139 gainHere = entropy.InfoGain(varTable) 140 if gainHere > maxGain: 141 maxGain = gainHere 142 bestCuts = cuts[:] 143 # recurse on the next vars if needed 144 if which < nBounds-1: 145 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes, 146 varTable = varTable) 147 if gainHere > maxGain: 148 maxGain = gainHere 149 bestCuts = cutsHere 150 # update this cut 151 cuts[which] += 1 152 for i in range(which+1,nBounds): 153 if cuts[i] == cuts[i-1]: 154 cuts[i] += 1 155 156 return maxGain,bestCuts
157
158 -def _NewPyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
159 """ Primarily intended for internal use 160 161 Recursively finds the best quantization boundaries 162 163 **Arguments** 164 165 - vals: a 1D Numeric array with the values of the variables, 166 this should be sorted 167 168 - cuts: a list with the indices of the quantization bounds 169 (indices are into _starts_ ) 170 171 - which: an integer indicating which bound is being adjusted here 172 (and index into _cuts_ ) 173 174 - starts: a list of potential starting points for quantization bounds 175 176 - results: a 1D Numeric array of integer result codes 177 178 - nPossibleRes: an integer with the number of possible result codes 179 180 **Returns** 181 182 - a 2-tuple containing: 183 184 1) the best information gain found so far 185 186 2) a list of the quantization bound indices ( _cuts_ for the best case) 187 188 **Notes** 189 190 - this is not even remotely efficient, which is why a C replacement 191 was written 192 193 """ 194 nBounds = len(cuts) 195 maxGain = -1e6 196 bestCuts = None 197 highestCutHere = len(starts) - nBounds + which 198 if varTable is None: 199 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 200 while cuts[which] <= highestCutHere: 201 gainHere = entropy.InfoGain(varTable) 202 if gainHere > maxGain: 203 maxGain = gainHere 204 bestCuts = cuts[:] 205 # recurse on the next vars if needed 206 if which < nBounds-1: 207 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes, 208 varTable = None) 209 if gainHere > maxGain: 210 maxGain = gainHere 211 bestCuts = cutsHere 212 # update this cut 213 oldCut = cuts[which] 214 cuts[which] += 1 215 bot = starts[oldCut] 216 if oldCut+1 < len(starts): 217 top = starts[oldCut+1] 218 else: 219 top = starts[-1] 220 for i in range(bot,top): 221 v = results[i] 222 varTable[which,v] += 1 223 varTable[which+1,v] -= 1 224 for i in range(which+1,nBounds): 225 if cuts[i] == cuts[i-1]: 226 cuts[i] += 1 227 228 229 return maxGain,bestCuts
230 231 232 # -------------------------------- 233 # 234 # find all possible dividing points 235 # 236 # There are a couple requirements for a dividing point: 237 # 1) the dependent variable (descriptor) must change across it, 238 # 2) the result score must change across it 239 # 240 # So, in the list [(0,0),(1,0),(1,1),(2,1)]: 241 # we should divide before (1,0) and (2,1) 242 # 243 # --------------------------------
244 -def _NewPyFindStartPoints(sortVals,sortResults,nData):
245 startNext = [] 246 tol = 1e-8 247 blockAct=sortResults[0] 248 lastBlockAct=None 249 lastDiv=None 250 i = 1 251 while i<nData: 252 # move to the end of this block: 253 while i<nData and sortVals[i]-sortVals[i-1]<=tol: 254 if sortResults[i] != blockAct: 255 # this block is heterogeneous 256 blockAct=-1 257 i+=1 258 if lastBlockAct is None: 259 # first time through: 260 lastBlockAct = blockAct 261 lastDiv = i 262 else: 263 if blockAct==-1 or lastBlockAct==-1 or blockAct!=lastBlockAct: 264 startNext.append(lastDiv) 265 lastDiv = i 266 lastBlockAct = blockAct 267 else: 268 lastDiv=i 269 if i<nData: 270 blockAct=sortResults[i] 271 i+=1 272 # catch the case that the last point also sets a bin: 273 if blockAct != lastBlockAct : 274 startNext.append(lastDiv) 275 return startNext
276
277 -def FindVarMultQuantBounds(vals,nBounds,results,nPossibleRes):
278 """ finds multiple quantization bounds for a single variable 279 280 **Arguments** 281 282 - vals: sequence of variable values (assumed to be floats) 283 284 - nBounds: the number of quantization bounds to find 285 286 - results: a list of result codes (should be integers) 287 288 - nPossibleRes: an integer with the number of possible values of the 289 result variable 290 291 **Returns** 292 293 - a 2-tuple containing: 294 295 1) a list of the quantization bounds (floats) 296 297 2) the information gain associated with this quantization 298 299 300 """ 301 assert len(vals) == len(results), 'vals/results length mismatch' 302 303 nData = len(vals) 304 if nData == 0: 305 return [],-1e8 306 307 # sort the variable values: 308 svs = zip(vals,results) 309 svs.sort() 310 sortVals,sortResults = zip(*svs) 311 startNext=_FindStartPoints(sortVals,sortResults,nData) 312 if not len(startNext): 313 return [0],0.0 314 if len(startNext)<nBounds: 315 nBounds = len(startNext)-1 316 if nBounds == 0: 317 nBounds=1 318 initCuts = range(nBounds) 319 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext, 320 sortResults,nPossibleRes) 321 quantBounds = [] 322 nVs = len(sortVals) 323 for cut in bestCuts: 324 idx = startNext[cut] 325 if idx == nVs: 326 quantBounds.append(sortVals[-1]) 327 elif idx == 0: 328 quantBounds.append(sortVals[idx]) 329 else: 330 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.) 331 332 return quantBounds,maxGain
333 334 #hascQuantize=0 335 if hascQuantize: 336 _RecurseOnBounds = cQuantize._RecurseOnBounds 337 _FindStartPoints = cQuantize._FindStartPoints 338 else: 339 _RecurseOnBounds = _NewPyRecurseOnBounds 340 _FindStartPoints = _NewPyFindStartPoints 341 342 if __name__ == '__main__': 343 import sys 344 if 1: 345 d = [(1.,0), 346 (1.1,0), 347 (1.2,0), 348 (1.4,1), 349 (1.4,0), 350 (1.6,1), 351 (2.,1), 352 (2.1,0), 353 (2.1,0), 354 (2.1,0), 355 (2.2,1), 356 (2.3,0)] 357 varValues = map(lambda x:x[0],d) 358 resCodes = map(lambda x:x[1],d) 359 nPossibleRes = 2 360 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes) 361 print 'RES:',res 362 target = ([1.3, 2.05],.34707 ) 363 else: 364 d = [(1.,0), 365 (1.1,0), 366 (1.2,0), 367 (1.4,1), 368 (1.4,0), 369 (1.6,1), 370 (2.,1), 371 (2.1,0), 372 (2.1,0), 373 (2.1,0), 374 (2.2,1), 375 (2.3,0)] 376 varValues = map(lambda x:x[0],d) 377 resCodes = map(lambda x:x[1],d) 378 nPossibleRes =2 379 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes) 380 print res 381 #sys.exit(1) 382 d = [(1.4,1), 383 (1.4,0)] 384 385 varValues = map(lambda x:x[0],d) 386 resCodes = map(lambda x:x[1],d) 387 nPossibleRes =2 388 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes) 389 print res 390 391 d = [(1.4,0), 392 (1.4,0),(1.6,1)] 393 varValues = map(lambda x:x[0],d) 394 resCodes = map(lambda x:x[1],d) 395 nPossibleRes =2 396 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes) 397 print res 398