1
2
3
4
5
6 """ Utilities for data manipulation
7
8 **FILE FORMATS:**
9
10 - *.qdat files* contain quantized data suitable for
11 feeding to learning algorithms.
12
13 The .qdat file, written by _DecTreeGui_, is structured as follows:
14
15 1) Any number of lines which are ignored.
16
17 2) A line containing the string 'Variable Table'
18
19 any number of variable definitions in the format:
20
21 '# Variable_name [quant_bounds]'
22
23 where '[quant_bounds]' is a list of the boundaries used for quantizing
24 that variable. If the variable is inherently integral (i.e. not
25 quantized), this can be an empty list.
26
27 3) A line beginning with '# ----' which signals the end of the variable list
28
29 4) Any number of lines containing data points, in the format:
30
31 'Name_of_point var1 var2 var3 .... varN'
32
33 all variable values should be integers
34
35 Throughout, it is assumed that varN is the result
36
37 - *.dat files* contain the same information as .qdat files, but the variable
38 values can be anything (floats, ints, strings). **These files should
39 still contain quant_bounds!**
40
41 - *.qdat.pkl files* contain a pickled (binary) representation of
42 the data read in. They store, in order:
43
44 1) A python list of the variable names
45
46 2) A python list of lists with the quantization bounds
47
48 3) A python list of the point names
49
50 4) A python list of lists with the data points
51
52 """
53 import RDConfig
54 from utils import fileutils
55 from ML.Data import MLData
56 from Dbase.DbConnection import DbConnect
57 from DataStructs import BitUtils
58 import string
59 import re,csv
60 import cPickle
61 import RandomArray
62
def WriteData(outFile, varNames, qBounds, examples):
    """ writes out a .qdat file

    The output is a short comment header, the variable table (one
    '# name bounds' line per variable), a '# ----------' terminator and
    then one line per example with its fields separated by single spaces.

    **Arguments**

      - outFile: a file object (anything providing a _write()_ method)

      - varNames: a list of variable names

      - qBounds: the list of quantization bounds (should be the same length
         as _varNames_)

      - examples: the data to be written, each example is a sequence whose
         elements are converted to text with _str()_

    **Notes**

      - an _IndexError_ is raised if _qBounds_ is shorter than _varNames_

    """
    outFile.write('# Quantized data from DataUtils\n')
    outFile.write('# ----------\n')
    outFile.write('# Variable Table\n')
    for i, name in enumerate(varNames):
        outFile.write('# %s %s\n' % (name, str(qBounds[i])))
    outFile.write('# ----------\n')
    for example in examples:
        # the str.join method replaces the deprecated string.join() function
        outFile.write(' '.join(map(str, example)) + '\n')
86
87
89 """ reads the variables and quantization bounds from a .qdat or .dat file
90
91 **Arguments**
92
93 - inFile: a file object
94
95 **Returns**
96
97 a 2-tuple containing:
98
99 1) varNames: a list of the variable names
100
101 2) qbounds: the list of quantization bounds for each variable
102
103 """
104 varNames = []
105 qBounds = []
106 fileutils.MoveToMatchingLine(inFile,'Variable Table')
107 inLine = inFile.readline()
108 while string.find(inLine,'# ----') == -1:
109 splitLine = string.split(inLine[2:],'[')
110 varNames.append(string.strip(splitLine[0]))
111 qBounds.append(splitLine[1][:-2])
112 inLine = inFile.readline()
113 for i in xrange(len(qBounds)):
114
115 if qBounds[i] != '':
116 l = string.split(qBounds[i],',')
117 qBounds[i] = []
118 for item in l:
119 qBounds[i].append(float(item))
120 else:
121 qBounds[i] = []
122 return varNames,qBounds
123
125 """ reads the examples from a .qdat file
126
127 **Arguments**
128
129 - inFile: a file object
130
131 **Returns**
132
133 a 2-tuple containing:
134
135 1) the names of the examples
136
137 2) a list of lists containing the examples themselves
138
139 **Note**
140
141 because this is reading a .qdat file, it assumed that all variable values
142 are integers
143
144 """
145 expr1 = re.compile(r'^#')
146 expr2 = re.compile(r'[\ ]*|[\t]*')
147 examples = []
148 names = []
149 inLine = inFile.readline()
150 while inLine:
151 if expr1.search(inLine) is None:
152 resArr = expr2.split(inLine)
153 if len(resArr)>1:
154 examples.append(map(lambda x: int(x),resArr[1:]))
155 names.append(resArr[0])
156 inLine = inFile.readline()
157 return names,examples
158
160 """ reads the examples from a .dat file
161
162 **Arguments**
163
164 - inFile: a file object
165
166 **Returns**
167
168 a 2-tuple containing:
169
170 1) the names of the examples
171
172 2) a list of lists containing the examples themselves
173
174 **Note**
175
176 - this attempts to convert variable values to ints, then floats.
177 if those both fail, they are left as strings
178
179 """
180 expr1 = re.compile(r'^#')
181 expr2 = re.compile(r'[\ ]*|[\t]*')
182 examples = []
183 names = []
184 inLine = inFile.readline()
185 while inLine:
186 if expr1.search(inLine) is None:
187 resArr = expr2.split(inLine)[:-1]
188 if len(resArr)>1:
189 for i in xrange(1,len(resArr)):
190 d = resArr[i]
191 try:
192 resArr[i] = int(d)
193 except ValueError:
194 try:
195 resArr[i] = float(d)
196 except ValueError:
197 pass
198 examples.append(resArr[1:])
199 names.append(resArr[0])
200 inLine = inFile.readline()
201 return names,examples
202
204 """ builds a data set from a .qdat file
205
206 **Arguments**
207
208 - fileName: the name of the .qdat file
209
210 **Returns**
211
212 an _MLData.MLQuantDataSet_
213
214 """
215 inFile = open(fileName,'r')
216
217 varNames,qBounds = ReadVars(inFile)
218 ptNames,examples = ReadQuantExamples(inFile)
219 data = MLData.MLQuantDataSet(examples,qBounds=qBounds,varNames=varNames,
220 ptNames=ptNames)
221 return data
222
223
225 """ builds a data set from a .dat file
226
227 **Arguments**
228
229 - fileName: the name of the .dat file
230
231 **Returns**
232
233 an _MLData.MLDataSet_
234
235 """
236 inFile = open(fileName,'r')
237
238 varNames,qBounds = ReadVars(inFile)
239 ptNames,examples = ReadGeneralExamples(inFile)
240 data = MLData.MLDataSet(examples,qBounds=qBounds,varNames=varNames,
241 ptNames=ptNames)
242 return data
243
244
246 """ calculates the number of possible values for each variable in a data set
247
248 **Arguments**
249
250 - data: a list of examples
251
252 - order: the ordering map between the variables in _data_ and _qBounds_
253
254 - qBounds: the quantization bounds for the variables
255
256 **Returns**
257
258 a list with the number of possible values each variable takes on in the data set
259
260 **Notes**
261
262 - variables present in _qBounds_ will have their _nPossible_ number read
263 from _qbounds
264
265 - _nPossible_ for other numeric variables will be calculated
266
267 """
268 numericTypes = [type(1),type(1.0),type(1L)]
269 print 'order:',order, len(order)
270 print 'qB:',qBounds
271
272 assert (qBounds and len(order)==len(qBounds)) or (nQBounds and len(order)==len(nQBounds)),\
273 'order/qBounds mismatch'
274 nVars = len(order)
275 nPossible = [-1]*nVars
276 cols = range(nVars)
277 for i in xrange(nVars):
278 if nQBounds and nQBounds[i] != 0:
279 nPossible[i] = -1
280 cols.remove(i)
281 elif len(qBounds[i])>0:
282 nPossible[i] = len(qBounds[i])
283 cols.remove(i)
284
285 nPts = len(data)
286 for i in xrange(nPts):
287 for col in cols[:]:
288 d = data[i][order[col]]
289 if type(d) in numericTypes:
290 if int(d) == d:
291 nPossible[col] = max(int(d),nPossible[col])
292 else:
293 nPossible[col] = -1
294 cols.remove(col)
295 else:
296 print 'bye bye col %d: %s'%(col,repr(d))
297 nPossible[col] = -1
298 cols.remove(col)
299
300 return map(lambda x:int(x)+1,nPossible)
301
302
303
305 """ writes either a .qdat.pkl or a .dat.pkl file
306
307 **Arguments**
308
309 - outName: the name of the file to be used
310
311 - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_
312
313 """
314 outFile = open(outName,'wb+')
315 varNames = data.GetVarNames()
316 qBounds = data.GetQuantBounds()
317 ptNames = data.GetPtNames()
318 examples = data.GetAllData()
319
320 cPickle.dump(varNames,outFile)
321 cPickle.dump(qBounds,outFile)
322 cPickle.dump(ptNames,outFile)
323 cPickle.dump(examples,outFile)
324 outFile.close()
325
def QuantDataToDB(qData, dbName, baseTableName, user='sysdba', password='masterkey'):
    """ fires an _MLData.MLQuantDataSet_ into a database

    **Arguments**

      - qData: an _MLData.MLQuantDataSet_

      - dbName: the name of the database to be opened

      - baseTableName: the table name to contain the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

    **Notes**

      - at the moment this is specific to using *Interbase* with the *gvib*
        adaptor, but that would be straightforward to change.

      - two tables are actually created:

         1) _baseTableName_ contains the quantized data

         2) _baseTableName_QBounds_ contains the quantization bounds

      - any existing tables with those names are dropped first

      - table and column names are interpolated directly into the SQL, so
        they must be trusted identifiers; string *values* have embedded
        single quotes doubled before being inserted

      - a _ValueError_ is raised (by _max()_) if the data set has no point
        names or no variable names

    """
    import gvib

    def _quote(text):
        # SQL string literal; an embedded single quote is escaped by doubling it
        return "'%s'" % str(text).replace("'", "''")

    cn = gvib.connect(dbName, user, password)
    c = cn.cursor()
    varNames = qData.varNames
    maxPtNameLen = max(map(len, qData.GetPtNames()))

    # the first column holds the point name, every other column is an integer
    colDefs = ['%s varchar(%d)' % (varNames[0], maxPtNameLen)]
    for i in range(1, len(varNames)):
        colDefs.append('%s int' % (varNames[i]))
    valStr = ', '.join(colDefs)

    try:
        c.execute('drop table %s' % baseTableName)
    except Exception:
        # best effort: normally this just means the table does not exist yet
        pass
    createStr = 'create table %s (%s)' % (baseTableName, valStr)
    print('create: %s' % createStr)
    c.execute(createStr)
    for pt in qData.GetNamedData():
        vals = [str(x) for x in pt]
        vals[0] = _quote(vals[0])
        valStr = ','.join(vals)
        c.execute('insert into %s values (%s)' % (baseTableName, valStr))

    # second table: one row per variable with the text form of its bounds
    quantName = '%s_QBounds' % (baseTableName)
    maxVarNameLen = max(map(len, varNames))
    strBounds = [str(x) for x in qData.GetQuantBounds()]
    maxStrBoundLen = max(map(len, strBounds))
    try:
        c.execute('drop table %s' % quantName)
    except Exception:
        # best effort, as above
        pass
    createStr = 'create table %s (variable_name varchar(%d), quant_bounds varchar(%d))' % (
        quantName, maxVarNameLen, maxStrBoundLen)
    c.execute(createStr)
    for i, name in enumerate(varNames):
        c.execute('insert into %s values (%s,%s)' % (quantName, _quote(name),
                                                     _quote(strBounds[i])))

    cn.commit()
391
def DBToQuantData(dbName, baseTableName, quantName='', user='sysdba', password='masterkey'):
    """ constructs an _MLData.MLQuantDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - baseTableName: the table name containing the data in the database

      - quantName: the table name containing the quantization bounds in the
        database, if this is empty '_baseTableName_QBounds_' is used

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

    **Returns**

      an _MLData.MLQuantDataSet_, or _None_ (after printing a message) if the
      connection or either of the queries fails

    **Notes**

      - at the moment this is specific to using *Interbase* with the *gvib*
        adaptor, but that would be straightforward to change.

      - two tables are actually required:

         1) _baseTableName_ contains the quantized data

         2) _baseTableName_QBounds_ contains the quantization bounds

      - the stored quantization bounds are passed to _eval()_, so the
        database contents must be trusted

    """
    import gvib
    # "except Exception" rather than a bare except so that KeyboardInterrupt
    # and SystemExit are not reported as database problems
    try:
        cn = gvib.connect(dbName, user, password)
    except Exception:
        print('cannot connect to database %s' % (dbName,))
        return None
    c = cn.cursor()

    if quantName == '':
        quantName = '%s_QBounds' % (baseTableName)
    try:
        c.execute('select * from %s' % quantName)
    except Exception:
        print('cannot query table %s in database %s' % (quantName, dbName))
        return None

    res = c.fetchall()
    # NOTE(review): the last character of every stored variable name is
    # dropped here; presumably this strips padding added by the database,
    # confirm against the table definition
    varNames = [row[0][:-1] for row in res]
    # SECURITY: eval() executes whatever text is stored in the bounds column
    qBounds = [eval(row[1]) for row in res]

    try:
        c.execute('select * from %s' % baseTableName)
    except Exception:
        print('cannot query table %s in database %s' % (baseTableName, dbName))
        return None
    res = c.fetchall()
    # the first column is the point name, the remainder are integer values
    vals = [[int(y) for y in row[1:]] for row in res]
    ptNames = [row[0] for row in res]

    data = MLData.MLQuantDataSet(vals, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
    return data
455
457 """
458
459 >>> v = [10,20,30,40,50]
460 >>> TakeEnsemble(v,(1,2,3))
461 [20, 30, 40]
462 >>> v = ['foo',10,20,30,40,50,1]
463 >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
464 ['foo', 20, 30, 40, 1]
465
466
467
468 """
469 if isDataVect:
470 ensembleIds = [x+1 for x in ensembleIds]
471 vect = [vect[0]]+[vect[x] for x in ensembleIds]+[vect[-1]]
472 else:
473 vect = [vect[x] for x in ensembleIds]
474 return vect
475
476
477
def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1,
             what='*', where='', join='', pickleCol=-1, pickleClass=None,
             ensembleIds=None):
    """ constructs an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: handed to the connection's _GetData()_ as _removeDups_, the
        column to be used to recognize duplicates

      - what, where, join: handed to _GetData()_ as _fields_, _where_ and
        _join_

      - pickleCol: if >= 0, the index (counted after the leading name column
        has been removed) of a column whose contents are to be unpickled

      - pickleClass: optional callable tried first on the text of the pickle
        column. The first time it fails, _cPickle.loads()_ is used instead
        for that row and for all following rows

      - ensembleIds: optional ids. With a pickle column they are passed to
        _BitUtils.ConstructEnsembleBV()_ along with the unpickled object,
        otherwise the row is reduced with _TakeEnsemble()_

    **Returns**

       an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

      - the first column of each row is used as the point name

      - the pickle column is loaded with _cPickle_, so the database contents
        must be trusted

    """
    conn = DbConnect(dbName, tableName, user, password)
    rows = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol,
                        forceList=1)
    vals = []
    ptNames = []
    classWorks = True
    for row in rows:
        fields = list(row)
        ptNames.append(fields.pop(0))
        if pickleCol < 0:
            if ensembleIds:
                fields = TakeEnsemble(fields, ensembleIds, isDataVect=True)
        else:
            pklText = str(fields[pickleCol])
            if pickleClass and classWorks:
                try:
                    obj = pickleClass(pklText)
                except:
                    # deliberate catch-all: fall back to plain unpickling and
                    # stop trying the class for the remaining rows
                    obj = cPickle.loads(pklText)
                    classWorks = False
            else:
                obj = cPickle.loads(pklText)
            if ensembleIds:
                obj = BitUtils.ConstructEnsembleBV(obj, ensembleIds)
            fields[pickleCol] = obj
        vals.append(fields)
    varNames = conn.GetColumnNames(join=join, what=what)
    return MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
533
def TextToData(reader, ignoreCols=[], onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a bunch of text

    **Arguments**

      - reader: needs to be iterable and return lists of elements
        (like a csv.reader). The first row is taken as the column names.

      - ignoreCols: names of columns to be skipped, only consulted when
        _onlyCols_ is not provided (the default list is never modified)

      - onlyCols: optional list of the names of the columns to keep, the
        columns are returned in this order

    **Returns**

       an _MLData.MLDataSet_

    **Notes**

      - the first kept column provides the point names

      - the remaining values are converted to ints if possible, then to
        floats, otherwise they are left as strings

      - empty rows are skipped

      - a _ValueError_ is raised if a row does not have the same length as
        the header or if a name in _onlyCols_ cannot be located in the header

    """
    def _toValue(text):
        # int first, then float, finally plain text
        for converter in (int, float):
            try:
                return converter(text)
            except (ValueError, TypeError):
                pass
        return str(text)

    # iter() lets plain sequences of rows work as well as real iterators
    rows = iter(reader)
    varNames = next(rows)
    if not onlyCols:
        keepCols = [i for i, name in enumerate(varNames) if name not in ignoreCols]
    else:
        keepCols = [-1] * len(onlyCols)
        for i, name in enumerate(varNames):
            if name in onlyCols:
                keepCols[onlyCols.index(name)] = i
        # a -1 left behind would silently select the last column below
        missing = [str(name) for name, idx in zip(onlyCols, keepCols) if idx < 0]
        if missing:
            raise ValueError('could not locate column(s): %s' % ', '.join(missing))

    nCols = len(varNames)
    varNames = tuple([varNames[x] for x in keepCols])
    vals = []
    ptNames = []
    for splitLine in rows:
        if not len(splitLine):
            continue
        if len(splitLine) != nCols:
            raise ValueError('unequal line lengths')
        tmp = [splitLine[x] for x in keepCols]
        ptNames.append(tmp[0])
        vals.append([_toValue(x) for x in tmp[1:]])
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
583
def TextFileToData(fName, onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a delimited text file

    **Arguments**

      - fName: the name of the file. If its extension is 'csv' (in any case)
        the file is parsed as comma-separated text, otherwise as
        tab-delimited text

      - onlyCols: passed through to _TextToData()_

    **Returns**

       whatever _TextToData()_ returns for the contents of the file

    **Notes**

      - the file is closed before returning, even if parsing fails

    """
    ext = fName.split('.')[-1]
    inFile = open(fName, 'rU')
    try:
        if ext.upper() == 'CSV':
            splitter = csv.reader(inFile)
        else:
            splitter = csv.reader(inFile, delimiter='\t')
        # TextToData reads every row before returning, so closing is safe
        return TextToData(splitter, onlyCols=onlyCols)
    finally:
        inFile.close()
596
598 """ Seeds the random number generators
599
600 **Arguments**
601
602 - seed: a 2-tuple containing integers to be used as the random number seeds
603
604 **Notes**
605
606 this seeds both the _Numeric.RandomArray_ generator and the one in the standard
607 Python _random_ module
608
609 """
610 import RandomArray
611 apply(RandomArray.seed,seed)
612 import RDRandom
613 RDRandom.seed(seed[0])
614 import random
615 random.seed(seed[0])
616
def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
    """ randomly discards points so that (about) a fraction _frac_ of the
    points which are kept have the value _val_ in column _col_

    **Arguments**

      - inData: a sequence of data points, each of which can be indexed

      - val: the target value

      - frac: the desired fraction of kept points with the target value,
        must be between 0 and 1

      - col: the column of each point holding the value to be examined

      - indicesToUse: optional list of indices into _inData_, if provided
        only those points are considered

      - indicesOnly: if set, indices into _inData_ are returned instead of
        the points themselves

    **Returns**

      a 2-tuple containing:

        1) the points (or indices) which were kept

        2) the points (or indices) which were rejected

      both in random order

    **Notes**

      - a _ValueError_ is raised if _frac_ is out of bounds, if _col_ is not
        a valid index for the first point, or if _val_ does not occur

      - if the current fraction of target points is below _frac_, points
        with other values are discarded, otherwise target points are

      - the random selections use _RandomArray.permutation()_

      - NOTE(review): when the rounded number of target points to keep comes
        out as zero while _frac_ > 0 (e.g. 1 target point, 2 others,
        frac=0.1) the adjustment loop runs out of points and a
        _ZeroDivisionError_ results. This is unchanged from the original.

    """
    if frac < 0 or frac > 1:
        raise ValueError('filter fraction out of bounds')
    try:
        inData[0][col]
    except IndexError:
        raise ValueError('target column index out of range')

    # collect the points under consideration
    if indicesToUse:
        tmp = [inData[x] for x in indicesToUse]
    else:
        tmp = list(inData)
    nOrig = len(tmp)

    # order the points by their value in the target column; pairing each
    # value with its position keeps ties in their original order
    decorated = [(tmp[i][col], i) for i in range(nOrig)]
    decorated.sort()
    sortOrder = [pair[1] for pair in decorated]
    tmp = [tmp[x] for x in sortOrder]

    # locate the run of points with the target value: [start, finish)
    start = 0
    while start < nOrig and tmp[start][col] != val:
        start += 1
    if start >= nOrig:
        # %s (not %d) so that non-numeric targets give the intended error
        raise ValueError('target value (%s) not found in data' % (val,))

    finish = start + 1
    while finish < nOrig and tmp[finish][col] == val:
        finish += 1

    nWithVal = finish - start
    nOthers = len(tmp) - nWithVal

    currFrac = float(nWithVal) / nOrig
    if currFrac < frac:
        # too few target points: keep all of them and discard some others
        nTgtFinal = nWithVal
        nFinal = int(round(nWithVal / frac))
        nOthersFinal = nFinal - nTgtFinal
        # rounding may have overshot, shrink until the fraction is <= frac
        while float(nTgtFinal) / nFinal > frac:
            nTgtFinal -= 1
            nFinal -= 1
    else:
        # too many target points: keep all the others and discard targets
        nOthersFinal = nOthers
        nFinal = int(round(nOthers / (1 - frac)))
        nTgtFinal = nFinal - nOthersFinal
        # rounding may have undershot, drop others until the fraction is >= frac
        while float(nTgtFinal) / nFinal < frac:
            nOthersFinal -= 1
            nFinal -= 1

    # the order of the three permutation() calls is kept as it was so that a
    # given random seed still produces the same selection
    others = list(range(start)) + list(range(finish, nOrig))
    othersTake = RandomArray.permutation(nOthers)
    others = [others[x] for x in othersTake[:nOthersFinal]]

    targets = list(range(start, finish))
    targetsTake = RandomArray.permutation(nWithVal)
    targets = [targets[x] for x in targetsTake[:nTgtFinal]]

    # flag list for constant-time "is this point kept?" checks
    keepFlags = [0] * nOrig
    for idx in targets + others:
        keepFlags[idx] = 1

    res = []
    rej = []
    if not indicesOnly:
        for i in RandomArray.permutation(nOrig):
            if keepFlags[i]:
                res.append(tmp[i])
            else:
                rej.append(tmp[i])
    else:
        # map positions in the sorted list back to indices into inData
        for i in RandomArray.permutation(nOrig):
            if not indicesToUse:
                idx = sortOrder[i]
            else:
                idx = indicesToUse[sortOrder[i]]
            if keepFlags[i]:
                res.append(idx)
            else:
                rej.append(idx)
    return res, rej
729
731 """ #DOC
732 """
733 counts = {}
734 for p in inData:
735 if not bounds:
736 r = p[col]
737 else:
738 act = p[col]
739 bound = 0
740 placed = 0
741 while not placed and bound < len(bounds):
742 if act < bounds[bound]:
743 r = bound
744 placed = 1
745 else:
746 bound += 1
747 if not placed:
748 r = bound
749
750 counts[r] = counts.get(r,0)+1
751 return counts
752
753
755 """ randomizes the activity values of a dataset
756
757 **Arguments**
758
759 - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized
760
761 - shuffle: an optional toggle. If this is set, the activity values
762 will be shuffled (so the number in each class remains constant)
763
764 - runDetails: an optional CompositeRun object
765
766 **Note**
767
768 - _examples_ are randomized in place
769
770 - this uses Numeric's _RandomArray_ to do the randomization
771
772 """
773 import RandomArray
774 nPossible = dataSet.GetNPossibleVals()[-1]
775 nPts = dataSet.GetNPts()
776 if shuffle:
777 if runDetails: runDetails.shuffled = 1
778 origActs = dataSet.GetResults()
779 perm = RandomArray.permutation(nPts)
780 acts = [origActs[x] for x in perm]
781 else:
782 if runDetails: runDetails.randomized = 1
783 acts = RandomArray.randint(0,nPossible,[len(examples)])
784 for i in xrange(nPts):
785 tmp = dataSet[i]
786 tmp[-1] = acts[i]
787 dataSet[i] = tmp
788
789
790
791
792
793
794
795
796
798 import doctest,sys
799 return doctest.testmod(sys.modules["__main__"])
800
if __name__ == '__main__':
    # run the module's doctests; the number of failures is the exit status
    import sys
    nFailed, nTried = _test()
    sys.exit(nFailed)
805