1
2
3
4
5
6
7 """ Automatic search for quantization bounds
8
9 This uses the expected informational gain to determine where quantization bounds should
10 lie.
11
12 **Notes**:
13
14 - bounds are less than, so if the bounds are [1.,2.],
15 [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2]
16
17 """
18 from Numeric import *
19 from ML.InfoTheory import entropy
20 try:
21 import cQuantize
22 except:
23 hascQuantize = 0
24 else:
25 hascQuantize = 1
26
27 _float_tol = 1e-8
29 """ floating point equality with a tolerance factor
30
31 **Arguments**
32
33 - v1: a float
34
35 - v2: a float
36
37 - tol: the tolerance for comparison
38
39 **Returns**
40
41 0 or 1
42
43 """
44 return abs(v1-v2) < tol
45
47 """ Uses FindVarMultQuantBounds, only here for historic reasons
48 """
49 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes)
50 return (bounds[0],gain)
51
52
54 """ Primarily intended for internal use
55
56 constructs a variable table for the data passed in
57 The table for a given variable records the number of times each possible value
58 of that variable appears for each possible result of the function.
59
60 **Arguments**
61
62 - vals: a 1D Numeric array with the values of the variables
63
64 - cuts: a list with the indices of the quantization bounds
65 (indices are into _starts_ )
66
67 - starts: a list of potential starting points for quantization bounds
68
69 - results: a 1D Numeric array of integer result codes
70
71 - nPossibleRes: an integer with the number of possible result codes
72
73 **Returns**
74
75 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes
76
77 **Notes**
78
79 - _vals_ should be sorted!
80
81 """
82 nVals = len(cuts)+1
83 varTable = zeros((nVals,nPossibleRes),Int)
84 idx = 0
85 for i in xrange(nVals-1):
86 cut = cuts[i]
87 while idx < starts[cut]:
88 varTable[i,results[idx]] += 1
89 idx += 1
90 while idx < len(vals):
91 varTable[-1,results[idx]] += 1
92 idx += 1
93 return varTable
94
96 """ Primarily intended for internal use
97
98 Recursively finds the best quantization boundaries
99
100 **Arguments**
101
102 - vals: a 1D Numeric array with the values of the variables,
103 this should be sorted
104
105 - cuts: a list with the indices of the quantization bounds
106 (indices are into _starts_ )
107
108 - which: an integer indicating which bound is being adjusted here
109 (and index into _cuts_ )
110
111 - starts: a list of potential starting points for quantization bounds
112
113 - results: a 1D Numeric array of integer result codes
114
115 - nPossibleRes: an integer with the number of possible result codes
116
117 **Returns**
118
119 - a 2-tuple containing:
120
121 1) the best information gain found so far
122
123 2) a list of the quantization bound indices ( _cuts_ for the best case)
124
125 **Notes**
126
127 - this is not even remotely efficient, which is why a C replacement
128 was written
129
130 """
131 nBounds = len(cuts)
132 maxGain = -1e6
133 bestCuts = None
134 highestCutHere = len(starts) - nBounds + which
135 if varTable is None:
136 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
137 while cuts[which] <= highestCutHere:
138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
139 gainHere = entropy.InfoGain(varTable)
140 if gainHere > maxGain:
141 maxGain = gainHere
142 bestCuts = cuts[:]
143
144 if which < nBounds-1:
145 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
146 varTable = varTable)
147 if gainHere > maxGain:
148 maxGain = gainHere
149 bestCuts = cutsHere
150
151 cuts[which] += 1
152 for i in range(which+1,nBounds):
153 if cuts[i] == cuts[i-1]:
154 cuts[i] += 1
155
156 return maxGain,bestCuts
157
159 """ Primarily intended for internal use
160
161 Recursively finds the best quantization boundaries
162
163 **Arguments**
164
165 - vals: a 1D Numeric array with the values of the variables,
166 this should be sorted
167
168 - cuts: a list with the indices of the quantization bounds
169 (indices are into _starts_ )
170
171 - which: an integer indicating which bound is being adjusted here
172 (and index into _cuts_ )
173
174 - starts: a list of potential starting points for quantization bounds
175
176 - results: a 1D Numeric array of integer result codes
177
178 - nPossibleRes: an integer with the number of possible result codes
179
180 **Returns**
181
182 - a 2-tuple containing:
183
184 1) the best information gain found so far
185
186 2) a list of the quantization bound indices ( _cuts_ for the best case)
187
188 **Notes**
189
190 - this is not even remotely efficient, which is why a C replacement
191 was written
192
193 """
194 nBounds = len(cuts)
195 maxGain = -1e6
196 bestCuts = None
197 highestCutHere = len(starts) - nBounds + which
198 if varTable is None:
199 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
200 while cuts[which] <= highestCutHere:
201 gainHere = entropy.InfoGain(varTable)
202 if gainHere > maxGain:
203 maxGain = gainHere
204 bestCuts = cuts[:]
205
206 if which < nBounds-1:
207 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
208 varTable = None)
209 if gainHere > maxGain:
210 maxGain = gainHere
211 bestCuts = cutsHere
212
213 oldCut = cuts[which]
214 cuts[which] += 1
215 bot = starts[oldCut]
216 if oldCut+1 < len(starts):
217 top = starts[oldCut+1]
218 else:
219 top = starts[-1]
220 for i in range(bot,top):
221 v = results[i]
222 varTable[which,v] += 1
223 varTable[which+1,v] -= 1
224 for i in range(which+1,nBounds):
225 if cuts[i] == cuts[i-1]:
226 cuts[i] += 1
227
228
229 return maxGain,bestCuts
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
247 startNext = []
248 tol = 1e-8
249 i = 0
250 start=0
251 actHomog=1
252 valHomog=1
253 while i<nData:
254
255 if sortVals[i]-sortVals[start]>tol:
256 valHomog=0
257 if sortResults[i]!=sortResults[start]:
258 actHomog=0
259 if not actHomog and not valHomog:
260
261
262 if( sortVals[i]-sortVals[i-1]<tol ):
263
264 while i>1 and sortVals[i]-sortVals[i-1]<tol:
265 i-=1
266
267
268 else:
269
270 pass
271 startNext.append(i)
272 start = i
273 actHomog = 1
274 valHomog = 1
275 i += 1
276 return startNext
277
279 """ finds multiple quantization bounds for a single variable
280
281 **Arguments**
282
283 - vals: sequence of variable values (assumed to be floats)
284
285 - nBounds: the number of quantization bounds to find
286
287 - results: a list of result codes (should be integers)
288
289 - nPossibleRes: an integer with the number of possible values of the
290 result variable
291
292 **Returns**
293
294 - a 2-tuple containing:
295
296 1) a list of the quantization bounds (floats)
297
298 2) the information gain associated with this quantization
299
300
301 """
302 assert len(vals) == len(results), 'vals/results length mismatch'
303
304 nData = len(vals)
305 if nData == 0:
306 return [],-1e8
307
308
309
310
311 sortIdx = multiarray.argsort(vals)
312 sortVals = array(take(vals,sortIdx),Float)
313 sortResults = array(take(results,sortIdx),Int)
314 startNext=_FindStartPoints(sortVals,sortResults,nData)
315 if not len(startNext):
316 return [0],0.0
317 if len(startNext)<nBounds:
318 nBounds = len(startNext)-1
319 if nBounds == 0:
320 nBounds=1
321 initCuts = range(nBounds)
322 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext,
323 sortResults,nPossibleRes)
324 quantBounds = []
325 nVs = len(sortVals)
326 for cut in bestCuts:
327 idx = startNext[cut]
328 if idx == nVs:
329 quantBounds.append(sortVals[-1])
330 elif idx == 0:
331 quantBounds.append(sortVals[idx])
332 else:
333 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.)
334
335 return quantBounds,maxGain
336
337 if hascQuantize:
338 _RecurseOnBounds = cQuantize._RecurseOnBounds
339 _FindStartPoints = cQuantize._FindStartPoints
340 else:
341 _RecurseOnBounds = _NewPyRecurseOnBounds
342 _FindStartPoints = _NewPyFindStartPoints
343
344 if __name__ == '__main__':
345 import sys
346 if 1:
347 d = [(1.,0),
348 (1.1,0),
349 (1.2,0),
350 (1.4,1),
351 (1.4,0),
352 (1.6,1),
353 (2.,1),
354 (2.1,0),
355 (2.1,0),
356 (2.1,0),
357 (2.2,1),
358 (2.3,0)]
359 varValues = map(lambda x:x[0],d)
360 resCodes = map(lambda x:x[1],d)
361 nPossibleRes = 2
362 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
363 print 'RES:',res
364 target = ([1.3, 2.05],.34707 )
365 else:
366 d = [(1.,0),
367 (1.1,0),
368 (1.2,0),
369 (1.4,1),
370 (1.4,0),
371 (1.6,1),
372 (2.,1),
373 (2.1,0),
374 (2.1,0),
375 (2.1,0),
376 (2.2,1),
377 (2.3,0)]
378 varValues = map(lambda x:x[0],d)
379 resCodes = map(lambda x:x[1],d)
380 nPossibleRes =2
381 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
382 print res
383
384 d = [(1.4,1),
385 (1.4,0)]
386
387 varValues = map(lambda x:x[0],d)
388 resCodes = map(lambda x:x[1],d)
389 nPossibleRes =2
390 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
391 print res
392
393 d = [(1.4,0),
394 (1.4,0),(1.6,1)]
395 varValues = map(lambda x:x[0],d)
396 resCodes = map(lambda x:x[1],d)
397 nPossibleRes =2
398 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
399 print res
400