Package ML :: Package Data :: Module DataUtils
[hide private]
[frames] | no frames]

Source Code for Module ML.Data.DataUtils

  1  # 
  2  #  Copyright (C) 2000-2004  greg Landrum and Rational Discovery LLC 
  3  #   All Rights Reserved 
  4  # 
  5   
  6  """ Utilities for data manipulation 
  7   
  8  **FILE FORMATS:** 
  9   
 10   - *.qdat files* contain quantized data suitable for 
 11    feeding to learning algorithms. 
 12   
 13    The .qdat file, written by _DecTreeGui_, is structured as follows: 
 14   
 15     1) Any number of lines which are ignored. 
 16   
 17     2) A line containing the string 'Variable Table' 
 18   
 19        any number of variable definitions in the format: 
 20   
 21        '# Variable_name [quant_bounds]' 
 22   
 23          where '[quant_bounds]' is a list of the boundaries used for quantizing 
 24           that variable.  If the variable is inherently integral (i.e. not 
 25           quantized), this can be an empty list. 
 26   
 27     3) A line beginning with '# ----' which signals the end of the variable list 
 28   
 29     4) Any number of lines containing data points, in the format: 
 30   
 31        'Name_of_point var1 var2 var3 .... varN' 
 32   
 33        all variable values should be integers 
 34   
 35     Throughout, it is assumed that varN is the result 
 36   
 37   - *.dat files* contain the same information as .qdat files, but the variable 
 38     values can be anything (floats, ints, strings).  **These files should 
 39     still contain quant_bounds!** 
 40      
 41   - *.qdat.pkl files* contain a pickled (binary) representation of 
 42     the data read in.  They store, in order: 
 43   
 44      1) A python list of the variable names 
 45   
 46      2) A python list of lists with the quantization bounds 
 47   
 48      3) A python list of the point names 
 49       
 50      4) A python list of lists with the data points 
 51   
 52  """ 
 53  import RDConfig 
 54  from utils import fileutils 
 55  from ML.Data import MLData 
 56  from Dbase.DbConnection import DbConnect 
 57  from DataStructs import BitUtils 
 58  import string 
 59  import re,csv 
 60  import cPickle 
 61  import RandomArray 
 62   
def WriteData(outFile, varNames, qBounds, examples):
    """ writes out a .qdat file

      **Arguments**

        - outFile: a file object (anything with a _write()_ method)

        - varNames: a list of variable names

        - qBounds: the list of quantization bounds (should be the same length
           as _varNames_)

        - examples: the data to be written; each example is written on its
           own line as the space-separated _str()_ of its elements

    """
    separator = '# ----------\n'
    outFile.write('# Quantized data from DataUtils\n')
    outFile.write(separator)
    outFile.write('# Variable Table\n')
    for idx, name in enumerate(varNames):
        outFile.write('# %s %s\n' % (name, str(qBounds[idx])))
    outFile.write(separator)
    for example in examples:
        fields = [str(value) for value in example]
        outFile.write(' '.join(fields) + '\n')
86 87
def ReadVars(inFile):
    """ reads the variables and quantization bounds from a .qdat or .dat file

      **Arguments**

        - inFile: a file object

      **Returns**

        a 2-tuple containing:

          1) varNames: a list of the variable names

          2) qbounds: the list of quantization bounds for each variable
             (a list of floats per variable, empty when no bounds are given)

      **Raises**

        - ValueError: if the file ends before the '# ----' terminator line or
          a variable definition does not contain a '[quant_bounds]' section

      **Notes**

        - the bounds are taken as the text between '[' and the first ']', so
          the line ending ('\\n', '\\r\\n' or none at all) does not matter

    """
    varNames = []
    qBounds = []
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')
    inLine = inFile.readline()
    while inLine.find('# ----') == -1:
        if not inLine:
            # readline() returns '' only at EOF: the terminator never showed up
            raise ValueError("end of file reached before the '# ----' line")
        # the first two characters are the leading '# ' of the definition
        pieces = inLine[2:].split('[')
        if len(pieces) < 2:
            raise ValueError('malformed variable definition: %r' % (inLine,))
        varNames.append(pieces[0].strip())
        boundText = pieces[1].split(']')[0].strip()
        if boundText:
            qBounds.append([float(item) for item in boundText.split(',')])
        else:
            qBounds.append([])
        inLine = inFile.readline()
    return varNames, qBounds
123
def ReadQuantExamples(inFile):
    """ reads the examples from a .qdat file

      **Arguments**

        - inFile: a file object

      **Returns**

        a 2-tuple containing:

          1) the names of the examples

          2) a list of lists containing the examples themselves

      **Note**

        - because this is reading a .qdat file, it is assumed that all variable
          values are integers (a ValueError from _int()_ propagates otherwise)

        - lines starting with '#' and lines with fewer than two fields are
          skipped

        - fields may be separated by any run of spaces and/or tabs; leading
          and trailing whitespace is ignored

    """
    names = []
    examples = []
    # iter(readline, '') keeps readline() semantics, so this can safely follow
    # earlier readline() calls on the same file object
    for inLine in iter(inFile.readline, ''):
        if inLine.startswith('#'):
            continue
        # str.split() replaces the old pattern r'[\ ]*|[\t]*', which could
        # match the empty string and therefore never split on tabs
        fields = inLine.split()
        if len(fields) > 1:
            examples.append([int(x) for x in fields[1:]])
            names.append(fields[0])
    return names, examples
158
def ReadGeneralExamples(inFile):
    """ reads the examples from a .dat file

      **Arguments**

        - inFile: a file object

      **Returns**

        a 2-tuple containing:

          1) the names of the examples

          2) a list of lists containing the examples themselves

      **Note**

        - this attempts to convert variable values to ints, then floats.
          if those both fail, they are left as strings

        - lines starting with '#' and lines with fewer than two fields are
          skipped

        - fields may be separated by any run of spaces and/or tabs.  Every
          field after the name is kept: the previous implementation always
          discarded the last piece of the split line, which lost the final
          (result) column unless the line ended in whitespace.

    """
    names = []
    examples = []
    for inLine in iter(inFile.readline, ''):
        if inLine.startswith('#'):
            continue
        fields = inLine.split()
        if len(fields) > 1:
            values = []
            for text in fields[1:]:
                try:
                    value = int(text)
                except ValueError:
                    try:
                        value = float(text)
                    except ValueError:
                        value = text
                values.append(value)
            examples.append(values)
            names.append(fields[0])
    return names, examples
202
def BuildQuantDataSet(fileName):
    """ builds a data set from a .qdat file

      **Arguments**

        - fileName: the name of the .qdat file

      **Returns**

        an _MLData.MLQuantDataSet_

      **Notes**

        - the file is closed before returning, even if parsing fails

    """
    inFile = open(fileName, 'r')
    try:
        # the variable table comes first; the examples follow it in the file
        varNames, qBounds = ReadVars(inFile)
        ptNames, examples = ReadQuantExamples(inFile)
    finally:
        inFile.close()
    data = MLData.MLQuantDataSet(examples, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
    return data
222 223
def BuildDataSet(fileName):
    """ builds a data set from a .dat file

      **Arguments**

        - fileName: the name of the .dat file

      **Returns**

        an _MLData.MLDataSet_

      **Notes**

        - the file is closed before returning, even if parsing fails

    """
    inFile = open(fileName, 'r')
    try:
        # the variable table comes first; the examples follow it in the file
        varNames, qBounds = ReadVars(inFile)
        ptNames, examples = ReadGeneralExamples(inFile)
    finally:
        inFile.close()
    data = MLData.MLDataSet(examples, qBounds=qBounds, varNames=varNames,
                            ptNames=ptNames)
    return data
243 244
def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None, silent=False):
    """ calculates the number of possible values for each variable in a data set

      **Arguments**

        - data: a list of examples

        - order: the ordering map between the variables in _data_ and _qBounds_
          (_order[i]_ is the index into each example for variable _i_)

        - qBounds: the quantization bounds for the variables; may be empty or
          None when _nQBounds_ is provided

        - nQBounds: (optional) per-variable values; variables with a nonzero
          entry are not counted (they get 0 in the result)

        - silent: (optional) if true, the diagnostic printing is suppressed

      **Returns**

         a list with the number of possible values each variable takes on in
         the data set

      **Notes**

         - variables with non-empty _qBounds_ get _len(qBounds[i])+1_

         - for the remaining variables, if every value is an integral number
           the result is the largest value seen plus one

         - variables with any non-integral or non-numeric value get 0

    """
    # type(1 << 64) is `long` on Python 2 and `int` on Python 3
    numericTypes = (type(1), type(1.0), type(1 << 64))
    if not silent:
        print('order: %s %s' % (order, len(order)))
        print('qB: %s' % (qBounds,))
    assert (qBounds and len(order) == len(qBounds)) or \
           (nQBounds and len(order) == len(nQBounds)), \
           'order/qBounds mismatch'
    nVars = len(order)
    nPossible = [-1] * nVars
    # columns whose count still has to be worked out from the data
    cols = list(range(nVars))
    for i in range(nVars):
        if nQBounds and nQBounds[i] != 0:
            nPossible[i] = -1
            cols.remove(i)
        elif qBounds and len(qBounds[i]) > 0:
            # the qBounds guard matters: the assertion above accepts a call
            # that only supplies nQBounds
            nPossible[i] = len(qBounds[i])
            cols.remove(i)

    for example in data:
        # iterate over a copy: columns are dropped as soon as they are
        # found to be non-integral
        for col in cols[:]:
            d = example[order[col]]
            if type(d) in numericTypes:
                if int(d) == d:
                    nPossible[col] = max(int(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
            else:
                if not silent:
                    print('bye bye col %d: %s' % (col, repr(d)))
                nPossible[col] = -1
                cols.remove(col)

    return [int(x) + 1 for x in nPossible]
301 302 303
def WritePickledData(outName, data):
    """ writes either a .qdat.pkl or a .dat.pkl file

      **Arguments**

        - outName: the name of the file to be used

        - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

      **Notes**

        - four pickles are written, in order: the variable names, the
          quantization bounds, the point names and the examples

        - the data are collected before the file is opened, so a failing
          accessor does not leave a truncated file behind, and the file is
          always closed, even if pickling fails

    """
    varNames = data.GetVarNames()
    qBounds = data.GetQuantBounds()
    ptNames = data.GetPtNames()
    examples = data.GetAllData()

    outFile = open(outName, 'wb+')
    try:
        for item in (varNames, qBounds, ptNames, examples):
            cPickle.dump(item, outFile)
    finally:
        outFile.close()
325
def QuantDataToDB(qData, dbName, baseTableName, user='sysdba', password='masterkey'):
    """ fires an _MLData.MLQuantDataSet_ into a database

      **Arguments**

        - qData: an _MLData.MLQuantDataSet_

        - dbName: the name of the database to be opened

        - baseTableName: the table name to contain the data in the database

        - user: the user name to be used to connect to the database

        - password: the password to be used to connect to the database

      **Notes**

        - at the moment this is specific to using *Interbase* with the *gvib*
          adaptor, but that would be straightforward to change.

        - two tables are actually created (any existing table with the same
          name is dropped first):

           1) _baseTableName_ contains the quantized data; its first column
              is a varchar holding the point name, the rest are int columns

           2) _baseTableName_QBounds_ contains the quantization bounds, stored
              as the _str()_ of each bounds list

        - the SQL is assembled by string formatting, so names containing
          quote characters will produce invalid statements

    """
    import gvib
    connection = gvib.connect(dbName, user, password)
    cursor = connection.cursor()
    varNames = qData.varNames
    maxPtNameLen = max([len(name) for name in qData.GetPtNames()])

    # first column holds the point name, every other column is an int
    columnDefs = ['%s varchar(%d)' % (varNames[0], maxPtNameLen)]
    for idx in range(1, len(varNames)):
        columnDefs.append('%s int' % (varNames[idx]))
    valStr = ', '.join(columnDefs)

    # any failure of the drop is ignored
    try:
        cursor.execute('drop table %s' % baseTableName)
    except:
        pass
    createStr = 'create table %s (%s)' % (baseTableName, valStr)
    print('create: %s' % createStr)
    cursor.execute(createStr)
    for pt in qData.GetNamedData():
        fields = [str(value) for value in pt]
        fields[0] = "'%s'" % fields[0]
        valStr = ','.join(fields)
        cursor.execute('insert into %s values (%s)' % (baseTableName, valStr))

    quantName = '%s_QBounds' % (baseTableName)
    maxVarNameLen = max([len(name) for name in varNames])
    strBounds = [str(bounds) for bounds in qData.GetQuantBounds()]
    maxStrBoundLen = max([len(text) for text in strBounds])
    try:
        cursor.execute('drop table %s' % quantName)
    except:
        pass
    createStr = 'create table %s (variable_name varchar(%d), quant_bounds varchar(%d))' % (
        quantName, maxVarNameLen, maxStrBoundLen)
    cursor.execute(createStr)
    for idx in range(len(varNames)):
        cursor.execute("insert into %s values ('%s','%s')" %
                       (quantName, varNames[idx], strBounds[idx]))

    connection.commit()
391
def DBToQuantData(dbName, baseTableName, quantName='', user='sysdba', password='masterkey'):
    """ constructs an _MLData.MLQuantDataSet_ from a database

      **Arguments**

        - dbName: the name of the database to be opened

        - baseTableName: the table name containing the data in the database

        - quantName: the table name containing the quantization bounds in the
          database; defaults to _baseTableName_QBounds_ when empty

        - user: the user name to be used to connect to the database

        - password: the password to be used to connect to the database

      **Returns**

        an _MLData.MLQuantDataSet_, or None (after printing a message) if the
        connection or either query fails

      **Notes**

        - at the moment this is specific to using *Interbase* with the *gvib*
          adaptor, but that would be straightforward to change.

        - two tables are actually required:

           1) _baseTableName_ contains the quantized data

           2) _baseTableName_QBounds_ contains the quantization bounds

        - the last character of each stored variable name is dropped

    """
    import gvib
    try:
        connection = gvib.connect(dbName, user, password)
    except:
        print('cannot connect to database %s' % (dbName))
        return None
    cursor = connection.cursor()

    if quantName == '':
        quantName = '%s_QBounds' % (baseTableName)
    try:
        cursor.execute('select * from %s' % quantName)
    except:
        print('cannot query table %s in database %s' % (quantName, dbName))
        return None

    boundRows = cursor.fetchall()
    varNames = [row[0][:-1] for row in boundRows]
    # NOTE(review): eval() executes whatever text is stored in the bounds
    # column; only use this against databases whose contents are trusted
    qBounds = [eval(row[1]) for row in boundRows]

    try:
        cursor.execute('select * from %s' % baseTableName)
    except:
        print('cannot query table %s in database %s' % (baseTableName, dbName))
        return None
    dataRows = cursor.fetchall()
    # column 0 is the point name, everything after it is integer data
    vals = [[int(entry) for entry in row[1:]] for row in dataRows]
    ptNames = [row[0] for row in dataRows]

    return MLData.MLQuantDataSet(vals, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
455
def TakeEnsemble(vect, ensembleIds, isDataVect=False):
    """ picks the elements of _vect_ listed in _ensembleIds_

      **Arguments**

        - vect: the sequence to pick from

        - ensembleIds: the indices of the elements to keep

        - isDataVect: (optional) if set, the first and last elements of
          _vect_ are always kept and the indices in _ensembleIds_ are shifted
          up by one

      **Returns**

        a new list

    >>> v = [10,20,30,40,50]
    >>> TakeEnsemble(v,(1,2,3))
    [20, 30, 40]
    >>> v = ['foo',10,20,30,40,50,1]
    >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
    ['foo', 20, 30, 40, 1]

    """
    if not isDataVect:
        return [vect[idx] for idx in ensembleIds]
    # shift by one to step over the leading element, which is kept as-is
    shifted = [idx + 1 for idx in ensembleIds]
    return [vect[0]] + [vect[idx] for idx in shifted] + [vect[-1]]
475 476 477
def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1,
             what='*', where='', join='', pickleCol=-1, pickleClass=None,
             ensembleIds=None):
    """ constructs an _MLData.MLDataSet_ from a database

      **Arguments**

        - dbName: the name of the database to be opened

        - tableName: the table name containing the data in the database

        - user: the user name to be used to connect to the database

        - password: the password to be used to connect to the database

        - dupCol: passed to the query as _removeDups_; specifies which column
          should be used to recognize duplicates.

        - what, where, join: passed through to the query (and _what_/_join_
          to the column-name lookup)

        - pickleCol: if >= 0, the index (counted after the leading name column
          has been removed) of a column holding pickled data

        - pickleClass: (optional) called on the string form of the pickled
          column; once that fails, _cPickle.loads_ is used for that row and
          every later one

        - ensembleIds: (optional) ids used to reduce each row: applied to the
          unpickled column via _BitUtils.ConstructEnsembleBV_ when _pickleCol_
          is set, otherwise to the row itself via _TakeEnsemble_

      **Returns**

         an _MLData.MLDataSet_

      **Notes**

        - this uses Dbase.DataUtils functionality

        - the first column of every row is used as the point name

    """
    conn = DbConnect(dbName, tableName, user, password)
    rows = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol,
                        forceList=1)
    nPts = len(rows)
    vals = [None] * nPts
    ptNames = [None] * nPts
    # flips to False the first time pickleClass fails to build an object
    classWorks = True
    for i in range(nPts):
        row = list(rows[i])
        ptNames[i] = row.pop(0)
        if pickleCol < 0:
            if ensembleIds:
                row = TakeEnsemble(row, ensembleIds, isDataVect=True)
        else:
            if pickleClass and classWorks:
                try:
                    row[pickleCol] = pickleClass(str(row[pickleCol]))
                except:
                    row[pickleCol] = cPickle.loads(str(row[pickleCol]))
                    classWorks = False
            else:
                row[pickleCol] = cPickle.loads(str(row[pickleCol]))
            if ensembleIds:
                row[pickleCol] = BitUtils.ConstructEnsembleBV(row[pickleCol], ensembleIds)
        vals[i] = row
    varNames = conn.GetColumnNames(join=join, what=what)
    return MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
533
def TextToData(reader, ignoreCols=(), onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a bunch of text

      **Arguments**

        - reader: needs to be iterable and return lists of elements
          (like a csv.reader).  The first row is taken as the column names.

        - ignoreCols: (optional) names of columns to be dropped; only used
          when _onlyCols_ is not provided

        - onlyCols: (optional) names of the columns to keep, in the order
          they should appear in the result

      **Returns**

         an _MLData.MLDataSet_

      **Raises**

        - ValueError: if a non-empty row has a different number of entries
          than the header, or if a name in _onlyCols_ cannot be matched to a
          header column (previously such a name silently selected the last
          column)

      **Notes**

        - the first kept column provides the point names

        - the remaining values are converted to ints, then floats; if both
          fail they are kept as strings

        - empty rows are skipped

    """
    varNames = next(reader)
    if not onlyCols:
        keepCols = [i for i, name in enumerate(varNames) if name not in ignoreCols]
    else:
        keepCols = [-1] * len(onlyCols)
        for i, name in enumerate(varNames):
            if name in onlyCols:
                keepCols[onlyCols.index(name)] = i
        missing = [str(name) for name, idx in zip(onlyCols, keepCols) if idx < 0]
        if missing:
            raise ValueError('columns not found in header: %s' % ', '.join(missing))

    nCols = len(varNames)
    varNames = tuple([varNames[x] for x in keepCols])
    vals = []
    ptNames = []
    for splitLine in reader:
        if not len(splitLine):
            continue
        if len(splitLine) != nCols:
            raise ValueError('unequal line lengths')
        kept = [splitLine[x] for x in keepCols]
        ptNames.append(kept[0])
        pt = []
        for text in kept[1:]:
            try:
                val = int(text)
            except (ValueError, TypeError):
                try:
                    val = float(text)
                except (ValueError, TypeError):
                    val = str(text)
            pt.append(val)
        vals.append(pt)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
583
def TextFileToData(fName, onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a delimited text file

      **Arguments**

        - fName: the name of the file.  If its extension is 'csv' (in any
          case) the file is read as comma-separated, otherwise as
          tab-separated.

        - onlyCols: (optional) passed on to _TextToData_

      **Returns**

         whatever _TextToData_ returns

      **Notes**

        - the file is closed before returning; _TextToData_ has consumed the
          whole reader by then

    """
    ext = fName.split('.')[-1]
    inFile = open(fName, 'rU')
    try:
        if ext.upper() == 'CSV':
            # CSV module distributed with python2.3 and later
            splitter = csv.reader(inFile)
        else:
            splitter = csv.reader(inFile, delimiter='\t')
        return TextToData(splitter, onlyCols=onlyCols)
    finally:
        inFile.close()
596
def InitRandomNumbers(seed):
    """ Seeds the random number generators

      **Arguments**

        - seed: a 2-tuple containing integers to be used as the random number seeds

      **Notes**

        - the elements of _seed_ are passed as the arguments of
          _RandomArray.seed_

        - only the first element is used for _RDRandom_ and the standard
          Python _random_ module

    """
    import RandomArray
    RandomArray.seed(*seed)
    import RDRandom
    RDRandom.seed(seed[0])
    import random
    random.seed(seed[0])
616
def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
    """ randomly picks points so that a given fraction has a target value

      **Arguments**

        - inData: a sequence of data points

        - val: the target value

        - frac: the desired fraction (between 0 and 1) of kept points that
          have _val_ in column _col_

        - col: (optional) the column holding the value to be tested

        - indicesToUse: (optional) indices into _inData_ of the points to be
          considered; if not provided all points are used

        - indicesOnly: (optional) if set, indices into _inData_ are returned
          instead of the points themselves

      **Returns**

        a 2-tuple (kept, rejected), each in random order

      **Raises**

        - ValueError: if _frac_ is outside [0,1], _col_ is not a valid column
          of the first point, or no point has the target value

      **Notes**

        - this uses Numeric's _RandomArray_ to do the random selections

        - NOTE(review): the balancing loops divide by the running total, so
          extreme combinations (e.g. a single target point with _frac_ close
          to 1) can still end in a ZeroDivisionError

    """
    if frac < 0 or frac > 1:
        raise ValueError('filter fraction out of bounds')
    try:
        inData[0][col]
    except IndexError:
        raise ValueError('target column index out of range')

    # convert the input data to a list and sort them
    if indicesToUse:
        tmp = [inData[x] for x in indicesToUse]
    else:
        tmp = list(inData)
    nOrig = len(tmp)
    # sortOrder[i] is the pre-sort position of the point that ends up at i
    sortOrder = list(range(nOrig))
    sortOrder.sort(key=lambda x: tmp[x][col])
    tmp = [tmp[x] for x in sortOrder]

    # find the start of the entries with value val
    start = 0
    while start < nOrig and tmp[start][col] != val:
        start += 1
    if start >= nOrig:
        # %s rather than %d so that non-integer targets can be reported
        raise ValueError('target value (%s) not found in data' % (val,))

    # find the end of the entries with value val
    finish = start + 1
    while finish < nOrig and tmp[finish][col] == val:
        finish += 1

    # how many entries have the target value?
    nWithVal = finish - start

    # how many don't?
    nOthers = len(tmp) - nWithVal

    currFrac = float(nWithVal) / nOrig
    if currFrac < frac:
        #
        # We're going to keep most of (all) the points with the target value,
        # We need to figure out how many of the other points we'll
        # toss out
        #
        nTgtFinal = nWithVal
        nFinal = int(round(nWithVal / frac))
        nOthersFinal = nFinal - nTgtFinal

        #
        # We may need to reduce the number of targets to keep
        # because it may make it impossible to hit exactly the
        # fraction we're trying for.  Take care of that now
        #
        while float(nTgtFinal) / nFinal > frac:
            nTgtFinal -= 1
            nFinal -= 1
    else:
        #
        # There are too many points with the target value,
        # we'll keep most of (all) the other points and toss a random
        # selection of the target value points
        #
        nOthersFinal = nOthers
        nFinal = int(round(nOthers / (1 - frac)))
        nTgtFinal = nFinal - nOthersFinal

        #
        # We may need to reduce the number of others to keep
        # because it may make it impossible to hit exactly the
        # fraction we're trying for.  Take care of that now
        #
        while float(nTgtFinal) / nFinal < frac:
            nOthersFinal -= 1
            nFinal -= 1

    # the three permutations are drawn in the same order as before, so a
    # given seed consumes the random stream the same way
    others = list(range(start)) + list(range(finish, nOrig))
    othersTake = RandomArray.permutation(nOthers)
    others = [others[x] for x in othersTake[:nOthersFinal]]

    targets = list(range(start, finish))
    targetsTake = RandomArray.permutation(nWithVal)
    targets = [targets[x] for x in targetsTake[:nTgtFinal]]

    # these are all the (sorted-order) indices we'll be keeping; a set makes
    # the membership tests below constant time
    indicesToKeep = set(targets + others)

    res = []
    rej = []
    # now pull the points, but in random order
    for i in RandomArray.permutation(nOrig):
        if not indicesOnly:
            item = tmp[i]
        elif not indicesToUse:
            item = sortOrder[i]
        else:
            # map back through the sort and then through the caller's subset
            item = indicesToUse[sortOrder[i]]
        if i in indicesToKeep:
            res.append(item)
        else:
            rej.append(item)
    return res, rej
729
def CountResults(inData, col=-1, bounds=None):
    """ counts how often each result occurs in a data set

      **Arguments**

        - inData: a sequence of data points

        - col: (optional) the column of each point to be counted

        - bounds: (optional) a sequence of bounds.  If provided (and not
          empty), each value is replaced by the index of the first bound it
          is strictly less than, or by _len(bounds)_ if there is none.

      **Returns**

        a dictionary mapping each value (or bin index) to its count

    """
    tally = {}
    for point in inData:
        key = point[col]
        if bounds:
            activity = key
            # used when the activity is not below any of the bounds
            key = len(bounds)
            for idx, edge in enumerate(bounds):
                if activity < edge:
                    key = idx
                    break
        tally[key] = tally.get(key, 0) + 1
    return tally
752 753
def RandomizeActivities(dataSet, shuffle=0, runDetails=None):
    """ randomizes the activity values of a dataset

      **Arguments**

        - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized

        - shuffle: an optional toggle. If this is set, the activity values
          will be shuffled (so the number in each class remains constant)

        - runDetails: an optional CompositeRun object; its _shuffled_ or
          _randomized_ attribute is set to 1 to record what was done

      **Note**

        - the last element of every point in _dataSet_ is replaced in place

        - this uses Numeric's _RandomArray_ to do the randomization

        - without _shuffle_, one random integer per point is drawn with
          _RandomArray.randint(0,nPossible,...)_, where _nPossible_ is the
          last entry of _dataSet.GetNPossibleVals()_

    """
    import RandomArray
    nPossible = dataSet.GetNPossibleVals()[-1]
    nPts = dataSet.GetNPts()
    if shuffle:
        if runDetails:
            runDetails.shuffled = 1
        origActs = dataSet.GetResults()
        perm = RandomArray.permutation(nPts)
        acts = [origActs[x] for x in perm]
    else:
        if runDetails:
            runDetails.randomized = 1
        # one value per point; this used to reference an undefined name
        # (`examples`) and so raised a NameError
        acts = RandomArray.randint(0, nPossible, [nPts])
    for i in range(nPts):
        tmp = dataSet[i]
        tmp[-1] = acts[i]
        dataSet[i] = tmp
788 789 790 791 792 793 #------------------------------------ 794 # 795 # doctest boilerplate 796 #
def _test():
    """ runs the doctests of the module currently executing as __main__

      **Returns**

        the result of _doctest.testmod()_

    """
    import sys
    import doctest
    mainModule = sys.modules["__main__"]
    return doctest.testmod(mainModule)
if __name__ == '__main__':
    # run the doctests; the number of failures becomes the exit status
    import sys
    failed, tried = _test()
    nFailures = failed
    sys.exit(nFailures)