1
2
3
4
5
6
7 """ Automatic search for quantization bounds
8
9 This uses the expected informational gain to determine where quantization bounds should
10 lie.
11
12 **Notes**:
13
14 - bounds are less than, so if the bounds are [1.,2.],
15 [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2]
16
17 """
18 import numpy
19 from rdkit.ML.InfoTheory import entropy
20 try:
21 import cQuantize
22 except:
23 hascQuantize = 0
24 else:
25 hascQuantize = 1
26
27 _float_tol = 1e-8
29 """ floating point equality with a tolerance factor
30
31 **Arguments**
32
33 - v1: a float
34
35 - v2: a float
36
37 - tol: the tolerance for comparison
38
39 **Returns**
40
41 0 or 1
42
43 """
44 return abs(v1-v2) < tol
45
47 """ Uses FindVarMultQuantBounds, only here for historic reasons
48 """
49 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes)
50 return (bounds[0],gain)
51
52
54 """ Primarily intended for internal use
55
56 constructs a variable table for the data passed in
57 The table for a given variable records the number of times each possible value
58 of that variable appears for each possible result of the function.
59
60 **Arguments**
61
62 - vals: a 1D Numeric array with the values of the variables
63
64 - cuts: a list with the indices of the quantization bounds
65 (indices are into _starts_ )
66
67 - starts: a list of potential starting points for quantization bounds
68
69 - results: a 1D Numeric array of integer result codes
70
71 - nPossibleRes: an integer with the number of possible result codes
72
73 **Returns**
74
75 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes
76
77 **Notes**
78
79 - _vals_ should be sorted!
80
81 """
82 nVals = len(cuts)+1
83 varTable = numpy.zeros((nVals,nPossibleRes),'i')
84 idx = 0
85 for i in range(nVals-1):
86 cut = cuts[i]
87 while idx < starts[cut]:
88 varTable[i,results[idx]] += 1
89 idx += 1
90 while idx < len(vals):
91 varTable[-1,results[idx]] += 1
92 idx += 1
93 return varTable
94
96 """ Primarily intended for internal use
97
98 Recursively finds the best quantization boundaries
99
100 **Arguments**
101
102 - vals: a 1D Numeric array with the values of the variables,
103 this should be sorted
104
105 - cuts: a list with the indices of the quantization bounds
106 (indices are into _starts_ )
107
108 - which: an integer indicating which bound is being adjusted here
109 (and index into _cuts_ )
110
111 - starts: a list of potential starting points for quantization bounds
112
113 - results: a 1D Numeric array of integer result codes
114
115 - nPossibleRes: an integer with the number of possible result codes
116
117 **Returns**
118
119 - a 2-tuple containing:
120
121 1) the best information gain found so far
122
123 2) a list of the quantization bound indices ( _cuts_ for the best case)
124
125 **Notes**
126
127 - this is not even remotely efficient, which is why a C replacement
128 was written
129
130 """
131 nBounds = len(cuts)
132 maxGain = -1e6
133 bestCuts = None
134 highestCutHere = len(starts) - nBounds + which
135 if varTable is None:
136 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
137 while cuts[which] <= highestCutHere:
138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
139 gainHere = entropy.InfoGain(varTable)
140 if gainHere > maxGain:
141 maxGain = gainHere
142 bestCuts = cuts[:]
143
144 if which < nBounds-1:
145 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
146 varTable = varTable)
147 if gainHere > maxGain:
148 maxGain = gainHere
149 bestCuts = cutsHere
150
151 cuts[which] += 1
152 for i in range(which+1,nBounds):
153 if cuts[i] == cuts[i-1]:
154 cuts[i] += 1
155
156 return maxGain,bestCuts
157
159 """ Primarily intended for internal use
160
161 Recursively finds the best quantization boundaries
162
163 **Arguments**
164
165 - vals: a 1D Numeric array with the values of the variables,
166 this should be sorted
167
168 - cuts: a list with the indices of the quantization bounds
169 (indices are into _starts_ )
170
171 - which: an integer indicating which bound is being adjusted here
172 (and index into _cuts_ )
173
174 - starts: a list of potential starting points for quantization bounds
175
176 - results: a 1D Numeric array of integer result codes
177
178 - nPossibleRes: an integer with the number of possible result codes
179
180 **Returns**
181
182 - a 2-tuple containing:
183
184 1) the best information gain found so far
185
186 2) a list of the quantization bound indices ( _cuts_ for the best case)
187
188 **Notes**
189
190 - this is not even remotely efficient, which is why a C replacement
191 was written
192
193 """
194 nBounds = len(cuts)
195 maxGain = -1e6
196 bestCuts = None
197 highestCutHere = len(starts) - nBounds + which
198 if varTable is None:
199 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
200 while cuts[which] <= highestCutHere:
201 gainHere = entropy.InfoGain(varTable)
202 if gainHere > maxGain:
203 maxGain = gainHere
204 bestCuts = cuts[:]
205
206 if which < nBounds-1:
207 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
208 varTable = None)
209 if gainHere > maxGain:
210 maxGain = gainHere
211 bestCuts = cutsHere
212
213 oldCut = cuts[which]
214 cuts[which] += 1
215 bot = starts[oldCut]
216 if oldCut+1 < len(starts):
217 top = starts[oldCut+1]
218 else:
219 top = starts[-1]
220 for i in range(bot,top):
221 v = results[i]
222 varTable[which,v] += 1
223 varTable[which+1,v] -= 1
224 for i in range(which+1,nBounds):
225 if cuts[i] == cuts[i-1]:
226 cuts[i] += 1
227
228
229 return maxGain,bestCuts
230
231
232
233
234
235
236
237
238
239
240
241
242
243
245 startNext = []
246 tol = 1e-8
247 blockAct=sortResults[0]
248 lastBlockAct=None
249 lastDiv=None
250 i = 1
251 while i<nData:
252
253 while i<nData and sortVals[i]-sortVals[i-1]<=tol:
254 if sortResults[i] != blockAct:
255
256 blockAct=-1
257 i+=1
258 if lastBlockAct is None:
259
260 lastBlockAct = blockAct
261 lastDiv = i
262 else:
263 if blockAct==-1 or lastBlockAct==-1 or blockAct!=lastBlockAct:
264 startNext.append(lastDiv)
265 lastDiv = i
266 lastBlockAct = blockAct
267 else:
268 lastDiv=i
269 if i<nData:
270 blockAct=sortResults[i]
271 i+=1
272
273 if blockAct != lastBlockAct :
274 startNext.append(lastDiv)
275 return startNext
276
278 """ finds multiple quantization bounds for a single variable
279
280 **Arguments**
281
282 - vals: sequence of variable values (assumed to be floats)
283
284 - nBounds: the number of quantization bounds to find
285
286 - results: a list of result codes (should be integers)
287
288 - nPossibleRes: an integer with the number of possible values of the
289 result variable
290
291 **Returns**
292
293 - a 2-tuple containing:
294
295 1) a list of the quantization bounds (floats)
296
297 2) the information gain associated with this quantization
298
299
300 """
301 assert len(vals) == len(results), 'vals/results length mismatch'
302
303 nData = len(vals)
304 if nData == 0:
305 return [],-1e8
306
307
308 svs = zip(vals,results)
309 svs.sort()
310 sortVals,sortResults = zip(*svs)
311 startNext=_FindStartPoints(sortVals,sortResults,nData)
312 if not len(startNext):
313 return [0],0.0
314 if len(startNext)<nBounds:
315 nBounds = len(startNext)-1
316 if nBounds == 0:
317 nBounds=1
318 initCuts = range(nBounds)
319 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext,
320 sortResults,nPossibleRes)
321 quantBounds = []
322 nVs = len(sortVals)
323 for cut in bestCuts:
324 idx = startNext[cut]
325 if idx == nVs:
326 quantBounds.append(sortVals[-1])
327 elif idx == 0:
328 quantBounds.append(sortVals[idx])
329 else:
330 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.)
331
332 return quantBounds,maxGain
333
334
335 if hascQuantize:
336 _RecurseOnBounds = cQuantize._RecurseOnBounds
337 _FindStartPoints = cQuantize._FindStartPoints
338 else:
339 _RecurseOnBounds = _NewPyRecurseOnBounds
340 _FindStartPoints = _NewPyFindStartPoints
341
342 if __name__ == '__main__':
343 import sys
344 if 1:
345 d = [(1.,0),
346 (1.1,0),
347 (1.2,0),
348 (1.4,1),
349 (1.4,0),
350 (1.6,1),
351 (2.,1),
352 (2.1,0),
353 (2.1,0),
354 (2.1,0),
355 (2.2,1),
356 (2.3,0)]
357 varValues = map(lambda x:x[0],d)
358 resCodes = map(lambda x:x[1],d)
359 nPossibleRes = 2
360 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
361 print 'RES:',res
362 target = ([1.3, 2.05],.34707 )
363 else:
364 d = [(1.,0),
365 (1.1,0),
366 (1.2,0),
367 (1.4,1),
368 (1.4,0),
369 (1.6,1),
370 (2.,1),
371 (2.1,0),
372 (2.1,0),
373 (2.1,0),
374 (2.2,1),
375 (2.3,0)]
376 varValues = map(lambda x:x[0],d)
377 resCodes = map(lambda x:x[1],d)
378 nPossibleRes =2
379 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
380 print res
381
382 d = [(1.4,1),
383 (1.4,0)]
384
385 varValues = map(lambda x:x[0],d)
386 resCodes = map(lambda x:x[1],d)
387 nPossibleRes =2
388 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
389 print res
390
391 d = [(1.4,0),
392 (1.4,0),(1.6,1)]
393 varValues = map(lambda x:x[0],d)
394 resCodes = map(lambda x:x[1],d)
395 nPossibleRes =2
396 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
397 print res
398