1
2
3
4
5 """ classes to be used to help work with data sets
6
7 """
8 import copy,types
9 from Numeric import *
10
# Exact builtin types that count as numeric (Python 2 int, float and long).
# Membership is tested with type(), so subclasses such as bool do not match.
numericTypes = [type(1),type(1.0),type(1L)]
13 """ A data set for holding general data (floats, ints, and strings)
14
15 **Note**
16 this is intended to be a read-only data structure
17 (i.e. after calling the constructor you cannot touch it)
18 """
def __init__(self,data,nVars=None,nPts=None,nPossibleVals=None,
             qBounds=None,varNames=None,ptNames=None,nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data.  Each row is copied,
        so don't worry about us overwriting the caller's rows.

      - nVars: the number of variables.  When omitted it is taken to be
        the length of the first row minus _nResults_.

      - nPts: the number of points.  When omitted it is len(data).

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant).
        This is _nVars_ long.  When omitted it is calculated from the
        data with _CalcNPossible().

      - qBounds: a list of lists containing quantization bounds for
        variables which are to be quantized (note: this class does not
        quantize the variables itself, it merely stores quantization
        bounds).  An empty sublist indicates no quantization for a given
        variable.  This is _nVars_ long.  When omitted, one separate
        empty list is created for every column of the first row.

      - varNames: a list of the names of the variables.
        This is _nVars_ long.  When omitted, empty strings are used.

      - ptNames: the names (labels) of the individual data points.
        This is _nPts_ long.  The list is copied.  When omitted, empty
        strings are used.

      - nResults: the number of results columns in the data lists.
        This is usually 1, but can be higher.

    """
    self.data = [x[:] for x in data]
    self.nResults = nResults
    if nVars is None:
        nVars = len(self.data[0])-self.nResults
    self.nVars = nVars
    if nPts is None:
        nPts = len(data)
    self.nPts = nPts
    if qBounds is None:
        # build a distinct list per column: [[]]*n would repeat one shared
        # list object, so filling in one column's bounds would change all
        qBounds = [[] for _ in range(len(self.data[0]))]
    self.qBounds = qBounds
    if nPossibleVals is None:
        nPossibleVals = self._CalcNPossible(self.data)
    self.nPossibleVals = nPossibleVals
    if varNames is None:
        varNames = ['']*self.nVars
    self.varNames = varNames
    if ptNames is None:
        ptNames = ['']*self.nPts
    else:
        # keep a private copy: self.ptNames is modified in place later,
        # which must not leak back into the caller's list
        ptNames = list(ptNames)
    self.ptNames = ptNames
71
def _CalcNPossible(self,data):
    """ calculates the number of possible values of each column (where possible)

    **Arguments**

      - data: a list of examples (rows) to be used

    **Returns**

      a list with nVars + nResults entries, one per column:

        - for a column that has quantization bounds: len(bounds)+1

        - for a column in which every value is an integral number:
          the largest value seen plus one (never less than 0)

        - 0 for every other column (e.g. strings or non-integral floats)

    """
    nCols = self.GetNVars()+self.nResults
    nPossible = [-1]*nCols
    # columns that still need to be scanned; list() so .remove() is available
    cols = list(range(nCols))
    for i,bounds in enumerate(self.qBounds):
        if len(bounds)>0:
            # quantized columns are settled by their bounds alone
            nPossible[i] = len(bounds)
            cols.remove(i)

    for pt in data:
        # iterate over a copy because disqualified columns are removed on the fly
        for col in cols[:]:
            d = pt[col]
            if type(d) in numericTypes and floor(d) == d:
                nPossible[col] = max(floor(d),nPossible[col])
            else:
                # one non-numeric or non-integral value disqualifies the column
                nPossible[col] = -1
                cols.remove(col)
    # the +1 converts "largest value" into "number of values" (and -1 into 0)
    return [int(x)+1 for x in nPossible]
106
# the stored list itself is handed back (no copy is made)
return self.nPossibleVals
117
# named example for point idx: its name followed by a copy of its data row
res = [self.ptNames[idx]]+self.data[idx][:]
return res
# val must be a named example: one name plus nVars+nResults data fields
if len(val) != self.GetNVars()+self.GetNResults()+1:
    raise ValueError,'bad value in assignment'
# the first element is the point name, the remainder becomes the data row
self.ptNames[idx] = val[0]
self.data[idx] = val[1:]
return val
127
def GetNamedData(self):
    """ returns the examples with their names attached

    **Note**

      a named example is a new list made of the example's name
      followed by the example's data values

    **Returns**

      a list with one named example per point

    """
    return [[self.ptNames[i]]+self.data[i][:] for i in range(self.nPts)]
141
""" returns a *copy* of the data

**Returns**

  a deep copy of the stored list of rows, so the result can be
  modified without affecting the data set

"""
return copy.deepcopy(self.data)
158
def GetResults(self):
    """ Returns the result fields from each example

    **Returns**

      with a single result column: a list holding the last element of
      every row; with several result columns: a list holding, for every
      row, the slice of its last nResults elements

    """
    nRes = self.GetNResults()
    if nRes>1:
        return [row[-nRes:] for row in self.data]
    return [row[-1] for row in self.data]
169
174
179
# every new point needs exactly one name
if len(pts)!=len(names):
    raise ValueError,"input length mismatch"
# both lists are extended in place; the new rows are not copied
self.data += pts
self.ptNames += names
# keep the stored point count in step with the data
self.nPts = len(self.data)
186
188 """ a data set for holding quantized data
189
190
191 **Note**
192
193 this is intended to be a read-only data structure
194 (i.e. after calling the constructor you cannot touch it)
195
196 **Big differences to MLDataSet**
197
1) data are stored in a Numeric array since they are homogeneous
199
200 2) results are assumed to be quantized (i.e. no qBounds entry is required)
201
202 """
"""calculates the number of possible values of each variable

**Arguments**

  -data: a list of examples to be used

**Returns**

  a list with one entry per column of _data_; each entry is the
  largest value found in that column plus one

"""
# transpose() turns rows into columns so that max() runs per variable
return [max(x)+1 for x in transpose(data)]
216
def GetNamedData(self):
    """ returns the examples with their names attached

    **Note**

      a named example is a new list made of the example's name
      followed by the example's data values (converted with tolist())

    **Returns**

      a list with one named example per point

    """
    return [[self.ptNames[i]]+self.data[i].tolist() for i in range(self.nPts)]
230
""" returns a *copy* of the data

**Returns**

  the stored array converted to nested lists with tolist()

"""
return self.data.tolist()
def GetResults(self):
    """ Returns the result fields from each example

    **Returns**

      with a single result column: a list holding the last element of
      every row; with several result columns: a list holding, for every
      row, the slice of its last nResults elements

    """
    nRes = self.GetNResults()
    if nRes>1:
        return [row[-nRes:] for row in self.data]
    return [row[-1] for row in self.data]
256
257
def __init__(self,data,nVars=None,nPts=None,nPossibleVals=None,
             qBounds=None,varNames=None,ptNames=None,nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data.  The data are copied
        into an array, so don't worry about us overwriting them.

      - nVars: the number of variables.  When omitted it is taken to be
        the length of the first row minus _nResults_.

      - nPts: the number of points.  When omitted it is len(data).

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant).
        This is _nVars_ long.  When omitted it is calculated from the
        data with _CalcNPossible().

      - qBounds: a list of lists containing quantization bounds for
        variables which are to be quantized (note: this class does not
        quantize the variables itself, it merely stores quantization
        bounds).  An empty sublist indicates no quantization for a given
        variable.  This is _nVars_ long.  When omitted, one separate
        empty list is created per variable.

      - varNames: a list of the names of the variables.
        This is _nVars_ long.  When omitted, empty strings are used.

      - ptNames: the names (labels) of the individual data points.
        This is _nPts_ long.  The list is copied.  When omitted, empty
        strings are used.

      - nResults: the number of results columns in the data lists.
        This is usually 1, but can be higher.

    """
    self.data = array(data)
    self.nResults = nResults
    if nVars is None:
        nVars = len(data[0])-self.nResults
    self.nVars = nVars
    if nPts is None:
        nPts = len(data)
    self.nPts = nPts
    if qBounds is None:
        # build a distinct list per variable: [[]]*n would repeat one shared
        # list object, so filling in one variable's bounds would change all
        qBounds = [[] for _ in range(self.nVars)]
    self.qBounds = qBounds
    if nPossibleVals is None:
        nPossibleVals = self._CalcNPossible(data)
    self.nPossibleVals = nPossibleVals
    if varNames is None:
        varNames = ['']*self.nVars
    self.varNames = varNames
    if ptNames is None:
        ptNames = ['']*self.nPts
    else:
        # keep a private copy so later changes to the stored names
        # cannot leak back into the caller's list
        ptNames = list(ptNames)
    self.ptNames = ptNames
310
311
if __name__ == '__main__':
    # Demo / smoke test: builds one quantized and one general data set,
    # pickles each of them and prints their summary fields.
    import DataUtils

    def _Report(dataSet):
        """ prints the summary fields of a data set, one labelled line each """
        fields = (('nVars:',dataSet.GetNVars),
                  ('nPts:',dataSet.GetNPts),
                  ('nPoss:',dataSet.GetNPossibleVals),
                  ('qBounds:',dataSet.GetQuantBounds),
                  ('data:',dataSet.GetAllData),
                  ('Input data:',dataSet.GetInputData),
                  ('results:',dataSet.GetResults),
                  ('nameddata:',dataSet.GetNamedData))
        for label,getter in fields:
            # a single pre-formatted argument gives the same output whether
            # print is a statement or a function
            print('%s %s'%(label,getter()))

    examples = [[0,0,0,0,0],
                [0,0,0,1,0],
                [1,0,0,0,1],
                [2,1,0,0,1],
                [2,2,1,0,1]
                ]
    varNames = ['foo1','foo2','foo3','foo4','res']
    ptNames = ['p1','p2','p3','p4','p5']
    # named quantSet rather than "set" to avoid hiding the builtin
    quantSet = MLQuantDataSet(examples,varNames=varNames,ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl',quantSet)
    _Report(quantSet)

    examples = [
      ['foo',1,1.0,1,1.1],
      ['foo',2,1.0,1,2.1],
      ['foo',3,1.2,1.1,3.1],
      ['foo',4,1.0,1,4.1],
      ['foo',5,1.1,1,5.1],
      ]
    qBounds = [[],[],[],[],[2,4]]
    # NOTE(review): varNames/ptNames are not passed for this data set, so it
    # gets the default (empty) names - confirm that this is intended
    generalSet = MLDataSet(examples,qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl',generalSet)
    _Report(generalSet)
355