Package ML :: Package Data :: Module MLData
[hide private]
[frames | no frames]

Source Code for Module ML.Data.MLData

  1  # 
  2  #  Copyright (C) 2000-2004  greg Landrum and Rational Discovery LLC 
  3  #    All Rights Reserved 
  4  # 
  5  """ classes to be used to help work with data sets 
  6   
  7  """ 
  8  import copy,types 
  9  from Numeric import * 
 10   
 11  numericTypes = [type(1),type(1.0),type(1L)] 
class MLDataSet(object):
    """ A data set for holding general data (floats, ints, and strings)

    **Note**
      this is intended to be a read-only data structure
      (i.e. after calling the constructor you cannot touch it)
    """

    def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
                 qBounds=None, varNames=None, ptNames=None, nResults=1):
        """ Constructor

        **Arguments**

          - data: a list of lists containing the data. The data are copied, so don't worry
            about us overwriting them.

          - nVars: the number of variables

          - nPts: the number of points

          - nPossibleVals: a list containing the number of possible values
            for each variable (should contain 0 when not relevant)
            This is _nVars_ long

          - qBounds: a list of lists containing quantization bounds for variables
            which are to be quantized (note, this class does not quantize
            the variables itself, it merely stores quantization bounds).
            An empty sublist indicates no quantization for a given variable
            This is _nVars_ long

          - varNames: a list of the names of the variables.
            This is _nVars_ long

          - ptNames: the names (labels) of the individual data points
            This is _nPts_ long

          - nResults: the number of results columns in the data lists.  This is usually
            1, but can be higher.

        **Notes**

          - when _data_ is empty and _nVars_ is not provided, _nVars_ is set to 0

          - _ptNames_ is copied, so adding or replacing points never modifies
            the list the caller passed in

        """
        # shallow-copy every example so the caller's rows are never shared
        self.data = [x[:] for x in data]
        self.nResults = nResults
        if nVars is None:
            if self.data:
                nVars = len(self.data[0]) - self.nResults
            else:
                # there is no row to measure in an empty data set
                nVars = 0
        self.nVars = nVars
        if nPts is None:
            nPts = len(self.data)
        self.nPts = nPts
        if qBounds is None:
            if self.data:
                nCols = len(self.data[0])
            else:
                nCols = self.nVars + self.nResults
            # one fresh list per column: [[]]*n would repeat a single shared list
            qBounds = [[] for _ in range(nCols)]
        self.qBounds = qBounds
        if nPossibleVals is None:
            nPossibleVals = self._CalcNPossible(self.data)
        self.nPossibleVals = nPossibleVals
        if varNames is None:
            varNames = [''] * self.nVars
        self.varNames = varNames
        if ptNames is None:
            ptNames = [''] * self.nPts
        else:
            # AddPoint(), AddPoints() and __setitem__() modify this list in place
            ptNames = list(ptNames)
        self.ptNames = ptNames

    def _CalcNPossible(self, data):
        """calculates the number of possible values of each variable (where possible)

        **Arguments**

          - data: a list of examples to be used

        **Returns**

          a list of nPossible values, one entry per column
          (the variables followed by the result columns)

        **Notes**

          - a column with quantization bounds gets len(bounds)+1

          - a column where every value is a whole number of one of the types in
            _numericTypes_ gets its largest value + 1

          - every other column gets 0

        """
        nCols = self.GetNVars() + self.nResults
        nPossible = [-1] * nCols
        # columns whose count still has to be worked out from the data
        cols = list(range(nCols))
        for i, bounds in enumerate(self.qBounds):
            if len(bounds) > 0:
                nPossible[i] = len(bounds)
                cols.remove(i)

        for pt in data:
            # walk a copy: columns are dropped from _cols_ as they are ruled out
            for col in cols[:]:
                d = pt[col]
                if type(d) in numericTypes and floor(d) == d:
                    nPossible[col] = max(floor(d), nPossible[col])
                else:
                    # a non-numeric or fractional value rules the column out for good
                    nPossible[col] = -1
                    cols.remove(col)
        return [int(x) + 1 for x in nPossible]

    def GetNResults(self):
        """ returns the number of result columns """
        return self.nResults

    def GetNVars(self):
        """ returns the number of variables """
        return self.nVars

    def GetNPts(self):
        """ returns the number of points """
        return self.nPts

    def GetNPossibleVals(self):
        """ returns the list holding the number of possible values per column """
        return self.nPossibleVals

    def GetQuantBounds(self):
        """ returns the list of quantization bounds (not a copy) """
        return self.qBounds

    def __getitem__(self, idx):
        """ returns the named example at position _idx_ (name followed by the data) """
        return [self.ptNames[idx]] + self.data[idx][:]

    def __setitem__(self, idx, val):
        """ replaces the example at position _idx_ with the named example _val_

        **Arguments**

          - idx: the index of the example to be replaced

          - val: a named example (the name followed by the variables and results)

        **Returns**

          _val_

        **Raises**

          ValueError if _val_ does not have nVars+nResults+1 entries

        """
        if len(val) != self.GetNVars() + self.GetNResults() + 1:
            raise ValueError('bad value in assignment')
        self.ptNames[idx] = val[0]
        self.data[idx] = list(val[1:])
        return val

    def GetNamedData(self):
        """ returns a list of named examples

        **Note**

          a named example is the result of prepending the example
          name to the data list

        """
        return [[self.ptNames[i]] + self.data[i][:] for i in range(self.nPts)]

    def GetAllData(self):
        """ returns a *copy* of the data

        """
        return copy.deepcopy(self.data)

    def GetInputData(self):
        """ returns the input data

        **Note**

          _inputData_ means the examples without their result fields
          (the last _NResults_ entries)

        """
        nRes = self.GetNResults()
        if nRes == 0:
            # x[:-0] is x[:0], which would drop every column
            return [x[:] for x in self.data]
        return [x[:-nRes] for x in self.data]

    def GetResults(self):
        """ Returns the result fields from each example

        **Returns**

          - with one result column: a list with the last entry of each example

          - with more than one: a list of lists holding the last _NResults_ entries

          - with none: a list with one empty list per example

        """
        nRes = self.GetNResults()
        if nRes > 1:
            return [x[-nRes:] for x in self.data]
        if nRes == 1:
            return [x[-1] for x in self.data]
        # without result columns x[-1] would hand back an input variable
        return [[] for x in self.data]

    def GetVarNames(self):
        """ returns the list of variable names (not a copy) """
        return self.varNames

    def GetPtNames(self):
        """ returns the list of point names (not a copy) """
        return self.ptNames

    def AddPoint(self, pt):
        """ adds a single named example to the data set

        **Arguments**

          - pt: a named example (the name followed by the variables and results)

        **Note**

          the stored number of possible values is not recalculated

        """
        self.data.append(list(pt[1:]))
        self.ptNames.append(pt[0])
        self.nPts += 1

    def AddPoints(self, pts, names):
        """ adds a list of examples and their names to the data set

        **Arguments**

          - pts: a list of examples (without names); each example is copied

          - names: the names of the new points, this must be as long as _pts_

        **Raises**

          ValueError if _pts_ and _names_ differ in length

        **Note**

          the stored number of possible values is not recalculated

        """
        if len(pts) != len(names):
            raise ValueError("input length mismatch")
        self.data.extend([x[:] for x in pts])
        self.ptNames.extend(names)
        self.nPts = len(self.data)
186
class MLQuantDataSet(MLDataSet):
    """ a data set for holding quantized data


    **Note**

      this is intended to be a read-only data structure
      (i.e. after calling the constructor you cannot touch it)

    **Big differences to MLDataSet**

      1) data are stored in a Numeric array since they are homogenous

      2) results are assumed to be quantized (i.e. no qBounds entry is required)

    """

    def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
                 qBounds=None, varNames=None, ptNames=None, nResults=1):
        """ Constructor

        **Arguments**

          - data: a list of lists containing the data. The data are copied, so don't worry
            about us overwriting them.

          - nVars: the number of variables

          - nPts: the number of points

          - nPossibleVals: a list containing the number of possible values
            for each variable (should contain 0 when not relevant)
            This is _nVars_ long

          - qBounds: a list of lists containing quantization bounds for variables
            which are to be quantized (note, this class does not quantize
            the variables itself, it merely stores quantization bounds).
            An empty sublist indicates no quantization for a given variable
            This is _nVars_ long

          - varNames: a list of the names of the variables.
            This is _nVars_ long

          - ptNames: the names (labels) of the individual data points
            This is _nPts_ long

          - nResults: the number of results columns in the data lists.  This is usually
            1, but can be higher.

        **Note**

          _ptNames_ is copied, so adding or replacing points never modifies
          the list the caller passed in

        """
        self.data = array(data)
        self.nResults = nResults
        if nVars is None:
            nVars = len(data[0]) - self.nResults
        self.nVars = nVars
        if nPts is None:
            nPts = len(data)
        self.nPts = nPts
        if qBounds is None:
            # one fresh list per variable: [[]]*n would repeat a single shared list
            qBounds = [[] for _ in range(self.nVars)]
        self.qBounds = qBounds
        if nPossibleVals is None:
            nPossibleVals = self._CalcNPossible(data)
        self.nPossibleVals = nPossibleVals
        if varNames is None:
            varNames = [''] * self.nVars
        self.varNames = varNames
        if ptNames is None:
            ptNames = [''] * self.nPts
        else:
            # AddPoint(), AddPoints() and __setitem__() modify this list in place
            ptNames = list(ptNames)
        self.ptNames = ptNames

    def _CalcNPossible(self, data):
        """calculates the number of possible values of each variable

        **Arguments**

          - data: a list of examples to be used

        **Returns**

          a list of nPossible values for each variable
          (the largest value found in each column + 1)

        """
        return [max(x) + 1 for x in transpose(data)]

    def __getitem__(self, idx):
        """ returns the named example at position _idx_ (name followed by the data)

        **Note**

          the row is converted with tolist() first: adding a list to an array
          is not list concatenation

        """
        return [self.ptNames[idx]] + self.data[idx].tolist()

    def GetNamedData(self):
        """ returns a list of named examples

        **Note**

          a named example is the result of prepending the example
          name to the data list

        """
        return [[self.ptNames[i]] + self.data[i].tolist()
                for i in range(self.nPts)]

    def GetAllData(self):
        """ returns a *copy* of the data

        """
        return self.data.tolist()

    def GetInputData(self):
        """ returns the input data

        **Note**

          _inputData_ means the examples without their result fields
          (the last _NResults_ entries)

        """
        if self.nResults == 0:
            # a slice ending at -0 ends at 0, which would drop every column
            return self.data.tolist()
        return (self.data[:, :-self.nResults]).tolist()

    def GetResults(self):
        """ Returns the result fields from each example

        **Returns**

          - with one result column: a list with the last entry of each example

          - with more than one: a list with the last _NResults_ entries of each
            example; these are slices of the stored rows, not plain lists

          - with none: a list with one empty list per example

        """
        nRes = self.GetNResults()
        if nRes > 1:
            return [x[-nRes:] for x in self.data]
        if nRes == 1:
            return [x[-1] for x in self.data]
        # without result columns x[-1] would hand back an input variable
        return [[] for x in self.data]

    def AddPoint(self, pt):
        """ adds a single named example to the data set

        **Arguments**

          - pt: a named example (the name followed by the variables and results)

        **Notes**

          - the data array is rebuilt with the new row at the end; the
            list-style append() of the base class is not available on it

          - the stored number of possible values is not recalculated

        """
        self.data = array(self.data.tolist() + [list(pt[1:])])
        self.ptNames.append(pt[0])
        self.nPts += 1

    def AddPoints(self, pts, names):
        """ adds a list of examples and their names to the data set

        **Arguments**

          - pts: a list of examples (without names)

          - names: the names of the new points, this must be as long as _pts_

        **Raises**

          ValueError if _pts_ and _names_ differ in length

        **Notes**

          - the data array is rebuilt with the new rows at the end; the `+=`
            used by the base class is arithmetic on an array, not an append

          - the stored number of possible values is not recalculated

        """
        if len(pts) != len(names):
            raise ValueError("input length mismatch")
        self.data = array(self.data.tolist() + [list(x) for x in pts])
        self.ptNames.extend(names)
        self.nPts = len(self.data)
if __name__ == '__main__':
    # Demo: build one data set of each kind, pickle it and print what the
    # accessors return.
    import DataUtils

    def _ShowDataSet(dataSet):
        """ prints one labelled line for each accessor of _dataSet_ """
        report = (
            ('nVars:', dataSet.GetNVars),
            ('nPts:', dataSet.GetNPts),
            ('nPoss:', dataSet.GetNPossibleVals),
            ('qBounds:', dataSet.GetQuantBounds),
            ('data:', dataSet.GetAllData),
            ('Input data:', dataSet.GetInputData),
            ('results:', dataSet.GetResults),
            ('nameddata:', dataSet.GetNamedData),
        )
        # each accessor is called right before its line is printed, so a
        # failure still leaves the earlier lines on screen
        for label, getter in report:
            print('%s %s' % (label, getter()))

    examples = [[0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0],
                [1, 0, 0, 0, 1],
                [2, 1, 0, 0, 1],
                [2, 2, 1, 0, 1],
                ]
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    # not called `set`: that would hide the builtin for the rest of the script
    quantSet = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl', quantSet)
    _ShowDataSet(quantSet)

    examples = [
        ['foo', 1, 1.0, 1, 1.1],
        ['foo', 2, 1.0, 1, 2.1],
        ['foo', 3, 1.2, 1.1, 3.1],
        ['foo', 4, 1.0, 1, 4.1],
        ['foo', 5, 1.1, 1, 5.1],
    ]
    qBounds = [[], [], [], [], [2, 4]]
    # NOTE(review): varNames and ptNames are defined here but not handed to
    # MLDataSet, so its defaults are used - confirm that this is intended
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    generalSet = MLDataSet(examples, qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl', generalSet)
    _ShowDataSet(generalSet)