1
2
3
4
5 """ classes to be used to help work with data sets
6
7 """
8 import numpy
9 import math
10 import copy,types
11
12
# Exact built-in types that count as "numeric" for the data sets in this
# module.  Membership is tested with ``type(x) in numericTypes``, so
# subclasses such as bool do not match.  ``long`` only exists on Python 2;
# on Python 3 plain ``int`` already covers arbitrary-size integers.
try:
    numericTypes = [int, float, long]
except NameError:
    numericTypes = [int, float]
15 """ A data set for holding general data (floats, ints, and strings)
16
17 **Note**
18 this is intended to be a read-only data structure
19 (i.e. after calling the constructor you cannot touch it)
20 """
def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
             qBounds=None, varNames=None, ptNames=None, nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data.  Every row is copied
        (by slicing), so the caller's row lists are never modified.

      - nVars: the number of variables.  When omitted it is the row
        length minus _nResults_ (0 for an empty data set).

      - nPts: the number of points.  When omitted it is len(data).

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant).
        When omitted it is computed with _CalcNPossible().

      - qBounds: a list of lists containing quantization bounds for
        variables which are to be quantized.  (Note: this class does not
        quantize the variables itself, it merely stores the bounds.)
        An empty sublist indicates no quantization for a given variable.
        When omitted, one independent empty list is created for every
        data column.

      - varNames: a list of the names of the variables.
        This is _nVars_ long; defaults to empty strings.

      - ptNames: the names (labels) of the individual data points.
        This is _nPts_ long; defaults to empty strings.

      - nResults: the number of results columns in the data lists.
        This is usually 1, but can be higher.

    """
    self.data = [x[:] for x in data]
    self.nResults = nResults
    if nVars is None:
        # an empty data set has no first row to measure
        nVars = len(self.data[0]) - self.nResults if self.data else 0
    self.nVars = nVars
    if nPts is None:
        nPts = len(data)
    self.nPts = nPts
    if qBounds is None:
        nCols = len(self.data[0]) if self.data else 0
        # build a distinct list per column; [[]]*n would make every entry
        # the *same* list object, so changing one would change them all
        qBounds = [[] for _ in range(nCols)]
    self.qBounds = qBounds
    if nPossibleVals is None:
        # must run after data, nVars, nResults and qBounds are set,
        # because _CalcNPossible reads them from self
        nPossibleVals = self._CalcNPossible(self.data)
    self.nPossibleVals = nPossibleVals
    if varNames is None:
        varNames = [''] * self.nVars
    self.varNames = varNames
    if ptNames is None:
        ptNames = [''] * self.nPts
    self.ptNames = ptNames
73
# Presumably the body of the _CalcNPossible helper that the constructor
# calls when no nPossibleVals list is supplied.
# NOTE(review): the lines below read self.data and self.qBounds; the `data`
# argument described in the docstring is never referenced here.
75 """calculates the number of possible values of each variable (where possible)
76
77 **Arguments**
78
79 -data: a list of examples to be used
80
81 **Returns**
82
83 a list of nPossible values for each variable
84
85 """
# one slot per column (variables plus result columns); -1 means "unknown"
86 nVars = self.GetNVars()+self.nResults
87 nPossible = [-1]*nVars
# cols holds the columns still being scanned.  It is mutated with .remove()
# below, so it must be a real list (range() only returns one on Python 2).
88 cols = range(nVars)
# columns with quantization bounds take len(bounds) and are not scanned
89 for i,bounds in enumerate(self.qBounds):
90 if len(bounds)>0:
91 nPossible[i] = len(bounds)
92 cols.remove(i)
93
# NOTE(review): nPts is assigned but not used in the lines that follow
94 nPts = self.GetNPts()
95 for i,pt in enumerate(self.data):
# iterate over a copy, because columns are dropped from cols inside the loop
96 for col in cols[:]:
97 d = pt[col]
98 if type(d) in numericTypes:
99 if math.floor(d) == d:
# integral value: keep track of the largest one seen in this column
100 nPossible[col] = max(math.floor(d),nPossible[col])
101 else:
# non-integral number: mark the column unknown and stop scanning it
102 nPossible[col] = -1
103 cols.remove(col)
104 else:
# non-numeric value: mark the column unknown and stop scanning it
105 nPossible[col] = -1
106 cols.remove(col)
# +1 turns a largest value (or a bounds count) into a number of possible
# values, and turns the -1 "unknown" marker into 0
107 return [int(x)+1 for x in nPossible]
108
# hands back the stored list of possible-value counts itself (no copy is made)
116 return self.nPossibleVals
119
# builds a "named example" for position idx: the point's name followed by a
# slice copy of its data row
# (presumably the item-getter of the data set -- confirm against the def line)
121 res = [self.ptNames[idx]]+self.data[idx][:]
122 return res
# val must be a named example: one name followed by nVars+nResults values
# (presumably the item-setter of the data set -- confirm against the def line)
124 if len(val) != self.GetNVars()+self.GetNResults()+1:
# NOTE: "raise Class,'message'" is Python-2-only syntax
125 raise ValueError,'bad value in assignment'
# split the named example back into its name and its data row
126 self.ptNames[idx] = val[0]
127 self.data[idx] = val[1:]
128 return val
129
131 """ returns a list of named examples
132
133 **Note**
134
135 a named example is the result of prepending the example
136 name to the data list
137
138 """
# the result is sized from self.nPts rather than len(self.data)
139 res = [None]*self.nPts
# NOTE: xrange exists only on Python 2
140 for i in xrange(self.nPts):
# each entry is a new list: [name] + a slice copy of the stored row
141 res[i] = [self.ptNames[i]]+self.data[i][:]
142 return res
143
145 """ returns a *copy* of the data
146
147 """
# deep copy: neither the outer list nor the rows are shared with self.data
148 return copy.deepcopy(self.data)
160
162 """ Returns the result fields from each example
163
164 """
# the result columns are the trailing columns of every row
165 if self.GetNResults()>1:
166 v = self.GetNResults()
# several result columns: each entry is the slice of the last v values
167 res = [x[-v:] for x in self.data]
168 else:
# otherwise each entry is the bare last value, not a one-element list
169 res = [x[-1] for x in self.data]
170 return res
171
176
# appends one named example: pt[0] is the point's name, pt[1:] its data row
# (presumably the single-point append method -- confirm against the def line)
# NOTE(review): nPossibleVals is not recomputed by these lines
178 self.data.append(pt[1:])
179 self.ptNames.append(pt[0])
180 self.nPts += 1
181
# appends several points at once: pts are bare data rows (no leading name)
# and names is the parallel list of point names
# (presumably the multi-point append method -- confirm against the def line)
183 if len(pts)!=len(names):
# NOTE: "raise Class,'message'" is Python-2-only syntax
184 raise ValueError,"input length mismatch"
# += extends the stored lists in place; the row objects in pts are not copied
185 self.data += pts
186 self.ptNames += names
187 self.nPts = len(self.data)
188
190 """ a data set for holding quantized data
191
192
193 **Note**
194
195 this is intended to be a read-only data structure
196 (i.e. after calling the constructor you cannot touch it)
197
198 **Big differences to MLDataSet**
199
200 1) data are stored in a numpy array since they are homogeneous
201
202 2) results are assumed to be quantized (i.e. no qBounds entry is required)
203
204 """
206 """calculates the number of possible values of each variable
207
208 **Arguments**
209
210 -data: a list of examples to be used
211
212 **Returns**
213
214 a list of nPossible values for each variable
215
216 """
# transposing yields one sequence per column; the count is taken to be the
# column's largest value plus one
# (presumably the values are integer codes starting at 0 -- TODO confirm)
217 return [max(x)+1 for x in numpy.transpose(data)]
218
220 """ returns a list of named examples
221
222 **Note**
223
224 a named example is the result of prepending the example
225 name to the data list
226
227 """
# the result is sized from self.nPts rather than len(self.data)
228 res = [None]*self.nPts
# NOTE: xrange exists only on Python 2
229 for i in xrange(self.nPts):
# .tolist() turns the stored row into a plain list so it can be appended
# to the one-element name list
230 res[i] = [self.ptNames[i]]+self.data[i].tolist()
231 return res
232
234 """ returns a *copy* of the data
235
236 """
# .tolist() builds new nested Python lists, so the caller gets a copy
# (assumes self.data is the numpy array built by the constructor)
237 return self.data.tolist()
249 """ Returns the result fields from each example
250
251 """
# the result columns are the trailing columns of every row
252 if self.GetNResults()>1:
253 v = self.GetNResults()
# several result columns: each entry is the slice of the last v values
# NOTE(review): if self.data is a numpy array these slices are views of it,
# not copies -- confirm callers do not modify them
254 res = [x[-v:] for x in self.data]
255 else:
# otherwise each entry is the bare last value, not a one-element sequence
256 res = [x[-1] for x in self.data]
257 return res
258
259
def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
             qBounds=None, varNames=None, ptNames=None, nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data.  The data are stored
        as a numpy array (numpy.array copies its input by default), so
        the caller's lists are not modified.

      - nVars: the number of variables.  When omitted it is the row
        length minus _nResults_ (0 for an empty data set).

      - nPts: the number of points.  When omitted it is len(data).

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant).
        When omitted it is computed with _CalcNPossible().

      - qBounds: a list of lists containing quantization bounds for
        variables which are to be quantized.  (Note: this class does not
        quantize the variables itself, it merely stores the bounds.)
        An empty sublist indicates no quantization for a given variable.
        This is _nVars_ long; when omitted, one independent empty list
        is created per variable.

      - varNames: a list of the names of the variables.
        This is _nVars_ long; defaults to empty strings.

      - ptNames: the names (labels) of the individual data points.
        This is _nPts_ long; defaults to empty strings.

      - nResults: the number of results columns in the data lists.
        This is usually 1, but can be higher.

    """
    self.data = numpy.array(data)
    self.nResults = nResults
    if nVars is None:
        # an empty data set has no first row to measure
        nVars = len(data[0]) - self.nResults if len(data) else 0
    self.nVars = nVars
    if nPts is None:
        nPts = len(data)
    self.nPts = nPts
    if qBounds is None:
        # build a distinct list per variable; [[]]*n would make every entry
        # the *same* list object, so changing one would change them all
        qBounds = [[] for _ in range(self.nVars)]
    self.qBounds = qBounds
    if nPossibleVals is None:
        nPossibleVals = self._CalcNPossible(data)
    self.nPossibleVals = nPossibleVals
    if varNames is None:
        varNames = [''] * self.nVars
    self.varNames = varNames
    if ptNames is None:
        ptNames = [''] * self.nPts
    self.ptNames = ptNames
312
313
if __name__ == '__main__':
    # Demo / smoke test: builds one quantized and one general data set,
    # pickles each of them and prints their summary fields.
    import DataUtils

    def _Report(dataSet):
        """ prints the summary fields of a data set, one "label value" line each """
        fields = (
            ('nVars:', 'GetNVars'),
            ('nPts:', 'GetNPts'),
            ('nPoss:', 'GetNPossibleVals'),
            ('qBounds:', 'GetQuantBounds'),
            ('data:', 'GetAllData'),
            ('Input data:', 'GetInputData'),
            ('results:', 'GetResults'),
            ('nameddata:', 'GetNamedData'),
        )
        for label, methodName in fields:
            # each getter is looked up and called only when its line is
            # printed.  A single pre-formatted string prints identically
            # with the Python 2 print statement and the Python 3 function.
            print('%s %s' % (label, getattr(dataSet, methodName)()))

    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']

    # --- quantized data set -------------------------------------------
    examples = [[0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0],
                [1, 0, 0, 0, 1],
                [2, 1, 0, 0, 1],
                [2, 2, 1, 0, 1],
                ]
    # named quantSet/generalSet rather than "set" to avoid hiding the builtin
    quantSet = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl', quantSet)
    _Report(quantSet)

    # --- general (mixed-type) data set --------------------------------
    examples = [
        ['foo', 1, 1.0, 1, 1.1],
        ['foo', 2, 1.0, 1, 2.1],
        ['foo', 3, 1.2, 1.1, 3.1],
        ['foo', 4, 1.0, 1, 4.1],
        ['foo', 5, 1.1, 1, 5.1],
    ]
    qBounds = [[], [], [], [], [2, 4]]
    # NOTE(review): as in the original script, this data set is built without
    # varNames/ptNames, so the constructor's empty-string defaults are used.
    generalSet = MLDataSet(examples, qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl', generalSet)
    _Report(generalSet)
357