Package rdkit :: Package ML :: Package Data :: Module DataUtils
[hide private]
[frames] | [no frames]

Source Code for Module rdkit.ML.Data.DataUtils

  1  ## Automatically adapted for numpy.oldnumeric Jun 27, 2008 by -c 
  2   
  3  # 
  4  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  5  #   All Rights Reserved 
  6  # 
  7   
  8  """ Utilities for data manipulation 
  9   
 10  **FILE FORMATS:** 
 11   
 12   - *.qdat files* contain quantized data suitable for 
 13    feeding to learning algorithms. 
 14   
 15    The .qdat file, written by _DecTreeGui_, is structured as follows: 
 16   
 17     1) Any number of lines which are ignored. 
 18   
 19     2) A line containing the string 'Variable Table' 
 20   
 21        any number of variable definitions in the format: 
 22   
 23        '# Variable_name [quant_bounds]' 
 24   
 25          where '[quant_bounds]' is a list of the boundaries used for quantizing 
 26           that variable.  If the variable is inherently integral (i.e. not 
 27           quantized), this can be an empty list. 
 28   
 29     3) A line beginning with '# ----' which signals the end of the variable list 
 30   
 31     4) Any number of lines containing data points, in the format: 
 32   
 33        'Name_of_point var1 var2 var3 .... varN' 
 34   
 35        all variable values should be integers 
 36   
 37     Throughout, it is assumed that varN is the result 
 38   
 39   - *.dat files* contain the same information as .qdat files, but the variable 
 40     values can be anything (floats, ints, strings).  **These files should 
 41     still contain quant_bounds!** 
 42      
 43   - *.qdat.pkl files* contain a pickled (binary) representation of
 44     the data read in.  They store, in order:
 45   
 46      1) A python list of the variable names 
 47   
 48      2) A python list of lists with the quantization bounds 
 49   
 50      3) A python list of the point names 
 51       
 52      4) A python list of lists with the data points 
 53   
 54  """ 
 55  from rdkit import RDConfig 
 56  from rdkit.utils import fileutils 
 57  from rdkit.ML.Data import MLData 
 58  from rdkit.Dbase.DbConnection import DbConnect 
 59  from rdkit.DataStructs import BitUtils 
 60  import string 
 61  import re,csv 
 62  import cPickle 
 63  import random 
 64   
def permutation(nToDo):
    """ returns the integers 0..nToDo-1 in random order

    **Arguments**

      - nToDo: the number of indices to generate

    **Returns**

      a new list containing every integer in range(nToDo) exactly once,
      shuffled using the standard Python _random_ module

    """
    # range() has to be materialised: it is only a real (mutable) list on
    # Python 2, and random.shuffle() works in place.
    res = list(range(nToDo))
    random.shuffle(res)
    return res
69
def WriteData(outFile, varNames, qBounds, examples):
    """ writes out a .qdat file

    **Arguments**

      - outFile: a file object

      - varNames: a list of variable names

      - qBounds: the list of quantization bounds (should be the same length
         as _varNames_)

      - examples: the data to be written

    """
    outFile.write('# Quantized data from DataUtils\n')
    outFile.write('# ----------\n')
    outFile.write('# Variable Table\n')
    # qBounds is indexed (rather than zipped) so that a qBounds list that is
    # too short still raises instead of silently dropping variables.
    for i, name in enumerate(varNames):
        outFile.write('# %s %s\n' % (name, str(qBounds[i])))
    outFile.write('# ----------\n')
    for example in examples:
        outFile.write(' '.join(map(str, example)) + '\n')
93 94
def ReadVars(inFile):
    """ reads the variables and quantization bounds from a .qdat or .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qbounds: the list of quantization bounds for each variable
           (a list of floats per variable, empty when no bounds are given)

    **Raises**

      - ValueError: if the file ends before the '# ----' terminator is found
        or a variable definition line contains no '['

    """
    varNames = []
    qBounds = []
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')
    inLine = inFile.readline()
    while inLine.find('# ----') == -1:
        if not inLine:
            # readline() returns '' at end of file; fail with a clear message
            raise ValueError('end of file reached before the end of the variable table')
        # skip the leading '# ' and separate the name from the bounds list
        name, sep, boundsText = inLine[2:].partition('[')
        if not sep:
            raise ValueError('malformed variable definition: %r' % (inLine,))
        varNames.append(name.strip())
        # strip the line terminator and the closing bracket without assuming
        # the line ends in exactly "]\n" (handles "\r\n" and a missing newline)
        boundsText = boundsText.strip()
        if boundsText.endswith(']'):
            boundsText = boundsText[:-1]
        boundsText = boundsText.strip()
        if boundsText:
            qBounds.append([float(item) for item in boundsText.split(',')])
        else:
            qBounds.append([])
        inLine = inFile.readline()
    return varNames, qBounds
130
def ReadQuantExamples(inFile):
    """ reads the examples from a .qdat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      - because this is reading a .qdat file, it is assumed that all variable
        values are integers

      - lines starting with '#' and lines with fewer than two fields are
        skipped

    """
    # Fields are separated by a run of spaces or a run of tabs.  The pattern
    # must not be able to match the empty string: with '*' quantifiers tabs
    # never split on Python 2 and every character splits on Python >= 3.7.
    splitter = re.compile(r'[\ ]+|[\t]+')
    names = []
    examples = []
    inLine = inFile.readline()
    while inLine:
        if not inLine.startswith('#'):
            # trailing whitespace would otherwise produce a bogus last field
            fields = splitter.split(inLine.rstrip())
            if len(fields) > 1:
                values = [int(x) for x in fields[1:]]
                examples.append(values)
                names.append(fields[0])
        inLine = inFile.readline()
    return names, examples
165
def ReadGeneralExamples(inFile):
    """ reads the examples from a .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      - this attempts to convert variable values to ints, then floats.
        if those both fail, they are left as strings

      - lines starting with '#' are skipped

    """
    # Fields are separated by a run of spaces or a run of tabs.  The pattern
    # must not be able to match the empty string: with '*' quantifiers tabs
    # never split on Python 2 and every character splits on Python >= 3.7.
    splitter = re.compile(r'[\ ]+|[\t]+')
    names = []
    examples = []
    inLine = inFile.readline()
    while inLine:
        if not inLine.startswith('#'):
            # NOTE(review): the last token is always discarded.  That is
            # harmless when each data line ends with whitespace before the
            # newline (the token is then just the line terminator), but it
            # drops the final value otherwise -- confirm the expected .dat
            # layout before changing this.
            fields = splitter.split(inLine)[:-1]
            if len(fields) > 1:
                point = []
                for text in fields[1:]:
                    try:
                        value = int(text)
                    except ValueError:
                        try:
                            value = float(text)
                        except ValueError:
                            value = text
                    point.append(value)
                examples.append(point)
                names.append(fields[0])
        inLine = inFile.readline()
    return names, examples
209
def BuildQuantDataSet(fileName):
    """ builds a data set from a .qdat file

    **Arguments**

      - fileName: the name of the .qdat file

    **Returns**

      an _MLData.MLQuantDataSet_

    """
    # the context manager closes the file even if parsing fails
    with open(fileName, 'r') as inFile:
        varNames, qBounds = ReadVars(inFile)
        ptNames, examples = ReadQuantExamples(inFile)
    data = MLData.MLQuantDataSet(examples, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
    return data
229 230
def BuildDataSet(fileName):
    """ builds a data set from a .dat file

    **Arguments**

      - fileName: the name of the .dat file

    **Returns**

      an _MLData.MLDataSet_

    """
    # the context manager closes the file even if parsing fails
    with open(fileName, 'r') as inFile:
        varNames, qBounds = ReadVars(inFile)
        ptNames, examples = ReadGeneralExamples(inFile)
    data = MLData.MLDataSet(examples, qBounds=qBounds, varNames=varNames,
                            ptNames=ptNames)
    return data
250 251
def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None, silent=False):
    """ calculates the number of possible values for each variable in a data set

    **Arguments**

      - data: a list of examples

      - order: the ordering map between the variables in _data_ and _qBounds_

      - qBounds: the quantization bounds for the variables

      - nQBounds: (optional) a per-variable sequence; variables with a nonzero
        entry are excluded from the calculation (they are reported as 0)

      - silent: (optional) if set, the diagnostic messages normally printed
        to stdout are suppressed

    **Returns**

      a list with the number of possible values each variable takes on in the data set

    **Notes**

      - variables present in _qBounds_ will have their _nPossible_ number read
        from _qbounds

      - _nPossible_ for other numeric variables will be calculated as the
        largest integral value seen plus one

      - a variable for which a non-integral or non-numeric value is seen gets
        an _nPossible_ of 0

    """
    # exact-type check on purpose (bool and other subclasses are not numeric
    # here); "long" only exists on Python 2
    try:
        numericTypes = (int, float, long)
    except NameError:
        numericTypes = (int, float)
    if not silent:
        print('order: %s %s' % (order, len(order)))
        print('qB: %s' % (qBounds,))
    assert (qBounds and len(order) == len(qBounds)) or \
           (nQBounds and len(order) == len(nQBounds)), 'order/qBounds mismatch'
    nVars = len(order)
    nPossible = [-1] * nVars
    # cols tracks the columns that still have to be scanned in the data
    cols = list(range(nVars))
    for i in range(nVars):
        if nQBounds and nQBounds[i] != 0:
            nPossible[i] = -1
            cols.remove(i)
        elif qBounds and len(qBounds[i]) > 0:
            # the qBounds guard covers callers that only provide nQBounds
            nPossible[i] = len(qBounds[i])
            cols.remove(i)

    for pt in data:
        # iterate over a copy: columns are dropped from cols inside the loop
        for col in cols[:]:
            d = pt[order[col]]
            if type(d) in numericTypes:
                if int(d) == d:
                    nPossible[col] = max(int(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
            else:
                if not silent:
                    print('bye bye col %d: %s' % (col, repr(d)))
                nPossible[col] = -1
                cols.remove(col)

    return [int(x) + 1 for x in nPossible]
308 309 310
def WritePickledData(outName, data):
    """ writes either a .qdat.pkl or a .dat.pkl file

    **Arguments**

      - outName: the name of the file to be used

      - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

    **Notes**

      - four objects are pickled, in this order: the variable names, the
        quantization bounds, the point names and the data points

    """
    varNames = data.GetVarNames()
    qBounds = data.GetQuantBounds()
    ptNames = data.GetPtNames()
    examples = data.GetAllData()

    # the context manager closes the file even if one of the dumps fails
    with open(outName, 'wb+') as outFile:
        for obj in (varNames, qBounds, ptNames, examples):
            cPickle.dump(obj, outFile)
332
def QuantDataToDB(qData, dbName, baseTableName, user='sysdba', password='masterkey'):
    """ fires an _MLData.MLQuantDataSet_ into a database

    **Arguments**

      - qData: an _MLData.MLQuantDataSet_

      - dbName: the name of the database to be opened

      - baseTableName: the table name to contain the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

    **Notes**

      - at the moment this is specific to using *Interbase* with the *gvib*
        adaptor, but that would be straightforward to change.

      - two tables are actually created (any existing tables with the same
        names are dropped first):

         1) _baseTableName_ contains the quantized data

         2) _baseTableName_QBounds_ contains the quantization bounds

      - the SQL is assembled by string formatting; table and variable names
        are inserted verbatim, so they must come from a trusted source.

    """
    import gvib
    cn = gvib.connect(dbName, user, password)
    c = cn.cursor()
    varNames = qData.varNames
    maxPtNameLen = max(map(len, qData.GetPtNames()))

    # first column holds the point name, the rest are integer columns
    colDefs = ['%s varchar(%d)' % (varNames[0], maxPtNameLen)]
    colDefs.extend(['%s int' % name for name in varNames[1:]])
    valStr = ', '.join(colDefs)

    try:
        c.execute('drop table %s' % baseTableName)
    except Exception:
        # best effort: the table usually just does not exist yet
        pass
    createStr = 'create table %s (%s)' % (baseTableName, valStr)
    print('create: %s' % createStr)
    c.execute(createStr)
    for pt in qData.GetNamedData():
        vals = [str(x) for x in pt]
        # quote the point name, doubling embedded quotes so that a name
        # containing "'" cannot break the statement
        vals[0] = "'%s'" % vals[0].replace("'", "''")
        valStr = ','.join(vals)
        c.execute('insert into %s values (%s)' % (baseTableName, valStr))

    quantName = '%s_QBounds' % (baseTableName)
    maxVarNameLen = max(map(len, varNames))
    strBounds = [str(x) for x in qData.GetQuantBounds()]
    maxStrBoundLen = max(map(len, strBounds))
    try:
        c.execute('drop table %s' % quantName)
    except Exception:
        # best effort: the table usually just does not exist yet
        pass
    createStr = 'create table %s (variable_name varchar(%d), quant_bounds varchar(%d))' % (
        quantName, maxVarNameLen, maxStrBoundLen)
    c.execute(createStr)
    for i, name in enumerate(varNames):
        c.execute("insert into %s values ('%s','%s')" % (quantName, name, strBounds[i]))

    cn.commit()
398
def DBToQuantData(dbName, baseTableName, quantName='', user='sysdba', password='masterkey'):
    """ constructs an _MLData.MLQuantDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - baseTableName: the table name containing the data in the database

      - quantName: the table name containing the quantization bounds in the
        database (defaults to _baseTableName_QBounds_)

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

    **Returns**

      an _MLData.MLQuantDataSet_, or None (after printing a message) if the
      connection or one of the queries fails

    **Notes**

      - at the moment this is specific to using *Interbase* with the *gvib*
        adaptor, but that would be straightforward to change.

      - two tables are actually required:

         1) _baseTableName_ contains the quantized data

         2) _baseTableName_QBounds_ contains the quantization bounds

    """
    import gvib
    try:
        cn = gvib.connect(dbName, user, password)
    except Exception:
        print('cannot connect to database %s' % (dbName))
        return None
    c = cn.cursor()

    if quantName == '':
        quantName = '%s_QBounds' % (baseTableName)
    try:
        c.execute('select * from %s' % quantName)
    except Exception:
        print('cannot query table %s in database %s' % (quantName, dbName))
        return None

    res = c.fetchall()
    # NOTE(review): the last character of every stored variable name is
    # dropped -- presumably padding added by the database; confirm.
    varNames = [row[0][:-1] for row in res]
    # SECURITY: eval() executes whatever text is stored in the bounds table;
    # only use this with databases whose contents are trusted.
    qBounds = [eval(row[1]) for row in res]

    try:
        c.execute('select * from %s' % baseTableName)
    except Exception:
        print('cannot query table %s in database %s' % (baseTableName, dbName))
        return None
    res = c.fetchall()
    # first column is the point name, the remaining ones are the values
    vals = [[int(y) for y in row[1:]] for row in res]
    ptNames = [row[0] for row in res]
    data = MLData.MLQuantDataSet(vals, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
    return data
462
def TakeEnsemble(vect, ensembleIds, isDataVect=False):
    """ picks the elements of _vect_ selected by _ensembleIds_

    **Arguments**

      - vect: the sequence to pick from

      - ensembleIds: the indices of the elements to keep

      - isDataVect: if set, the first and last elements of _vect_ are always
        kept and the ids are taken relative to the second element

    **Returns**

      a new list

    >>> v = [10,20,30,40,50]
    >>> TakeEnsemble(v,(1,2,3))
    [20, 30, 40]
    >>> v = ['foo',10,20,30,40,50,1]
    >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
    ['foo', 20, 30, 40, 1]

    """
    if not isDataVect:
        return [vect[x] for x in ensembleIds]
    # shift by one to skip the leading element, which is kept together with
    # the trailing one
    picked = [vect[x + 1] for x in ensembleIds]
    return [vect[0]] + picked + [vect[-1]]
482 483 484
def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1,
             what='*', where='', join='', pickleCol=-1, pickleClass=None,
             ensembleIds=None):
    """ constructs an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: passed to the connection's _GetData()_ as _removeDups_ to
        select the column used to recognize duplicates.

      - what, where, join: passed to the connection's _GetData()_ as
        _fields_, _where_ and _join_

      - pickleCol: if >= 0, the index (counted after the point name has been
        removed from the row) of a column whose contents are reconstructed
        with _pickleClass_ or, failing that, unpickled

      - pickleClass: (optional) a callable used to build the object in
        _pickleCol_ from its string form; after the first failure the
        remaining rows are unpickled directly

      - ensembleIds: (optional) the ids to keep: applied to the object in
        _pickleCol_ via _BitUtils.ConstructEnsembleBV_ when _pickleCol_ is
        set, otherwise to the row itself via _TakeEnsemble_

    **Returns**

      an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

      - the first column of every row is used as the point name

    """
    conn = DbConnect(dbName, tableName, user, password)
    res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol,
                       forceList=1)
    vals = []
    ptNames = []
    classWorks = True
    for row in res:
        tmp = list(row)
        ptNames.append(tmp.pop(0))
        if pickleCol >= 0:
            raw = str(tmp[pickleCol])
            # SECURITY: unpickling executes code embedded in the data; only
            # use this with databases whose contents are trusted.
            if pickleClass and classWorks:
                try:
                    tmp[pickleCol] = pickleClass(raw)
                except Exception:
                    tmp[pickleCol] = cPickle.loads(raw)
                    # do not retry the constructor on the remaining rows
                    classWorks = False
            else:
                tmp[pickleCol] = cPickle.loads(raw)
            if ensembleIds:
                tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
        elif ensembleIds:
            # NOTE(review): the point name has already been popped, so with
            # isDataVect=True the first *value* column is always retained;
            # confirm that this is intended.
            tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
        vals.append(tmp)
    varNames = conn.GetColumnNames(join=join, what=what)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
540
def TextToData(reader, ignoreCols=None, onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a bunch of text

    **Arguments**

      - reader: needs to be iterable and return lists of elements
        (like a csv.reader).  The first row is taken as the column names.

      - ignoreCols: (optional) names of columns to be skipped; only used
        when _onlyCols_ is not provided

      - onlyCols: (optional) names of the columns to keep, in the order in
        which they should appear in the result

    **Returns**

      an _MLData.MLDataSet_

    **Raises**

      - ValueError: if a row's length differs from that of the header, or if
        a name in _onlyCols_ cannot be matched to a header column

    **Notes**

      - the first kept column provides the point names; every other value is
        converted to an int if possible, then to a float, and is otherwise
        kept as a string

      - empty rows are skipped

    """
    if ignoreCols is None:
        ignoreCols = []
    # accept any iterable, not just iterators
    reader = iter(reader)
    varNames = next(reader)
    if not onlyCols:
        keepCols = [i for i, name in enumerate(varNames) if name not in ignoreCols]
    else:
        keepCols = [-1] * len(onlyCols)
        for i, name in enumerate(varNames):
            if name in onlyCols:
                keepCols[onlyCols.index(name)] = i
        # an unmatched entry would otherwise silently select the last column
        # (index -1)
        missing = [str(name) for name, idx in zip(onlyCols, keepCols) if idx < 0]
        if missing:
            raise ValueError('could not locate requested columns: %s' % ', '.join(missing))

    nCols = len(varNames)
    varNames = tuple([varNames[x] for x in keepCols])
    nVars = len(varNames)
    vals = []
    ptNames = []
    for splitLine in reader:
        if len(splitLine):
            if len(splitLine) != nCols:
                raise ValueError('unequal line lengths')
            tmp = [splitLine[x] for x in keepCols]
            ptNames.append(tmp[0])
            pt = [None] * (nVars - 1)
            for j in range(nVars - 1):
                raw = tmp[j + 1]
                try:
                    val = int(raw)
                except (ValueError, TypeError, OverflowError):
                    try:
                        val = float(raw)
                    except (ValueError, TypeError):
                        val = str(raw)
                pt[j] = val
            vals.append(pt)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
590
def TextFileToData(fName, onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a delimited text file

    **Arguments**

      - fName: the name of the file.  Files whose extension is 'csv' (in any
        case) are read as comma-separated, anything else as tab-separated.

      - onlyCols: (optional) passed on to _TextToData()_

    **Returns**

      the result of _TextToData()_

    """
    ext = fName.split('.')[-1]
    # 'rU' is Python 2's universal-newline mode (the 'U' flag was removed in
    # Python 3.11).  TextToData() consumes the whole reader before returning,
    # so the file can safely be closed on the way out.
    with open(fName, 'rU') as inF:
        if ext.upper() == 'CSV':
            # CSV module distributed with python2.3 and later
            splitter = csv.reader(inF)
        else:
            splitter = csv.reader(inF, delimiter='\t')
        return TextToData(splitter, onlyCols=onlyCols)
603
def InitRandomNumbers(seed):
    """ Seeds the random number generators

    **Arguments**

      - seed: a sequence (nominally a 2-tuple) of integers to be used as the
        random number seeds; only the first element, seed[0], is used

    **Notes**

      this seeds both the RDRandom generator and the one in the standard
      Python _random_ module, and both receive the same value

    """
    from rdkit import RDRandom
    RDRandom.seed(seed[0])
    # the standard random module is already imported at module level
    random.seed(seed[0])
621
def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
    """ randomly splits a data set so that (approximately) a fraction _frac_
    of the points that are kept have the value _val_ in column _col_

    **Arguments**

      - inData: a sequence of data points (each an indexable sequence)

      - val: the target value

      - frac: the desired fraction of kept points having the target value;
        must be between 0 and 1

      - col: the column holding the values to be compared with _val_

      - indicesToUse: (optional) indices into _inData_ of the points to be
        considered; if not provided (or empty) all points are used

      - indicesOnly: if set, indices into _inData_ are returned instead of
        the points themselves

    **Returns**

      a 2-tuple containing:

        1) the points (or indices) that were kept

        2) the points (or indices) that were rejected

      both are in random order

    **Raises**

      - ValueError: if _frac_ is out of bounds, _col_ cannot be indexed in
        the first point, or _val_ does not occur in the data

    **Notes**

      - if the requested fraction cannot be reached with the available
        points, the number of kept points is reduced, possibly to zero

    """
    if frac < 0 or frac > 1:
        raise ValueError('filter fraction out of bounds')
    try:
        inData[0][col]
    except IndexError:
        raise ValueError('target column index out of range')

    # convert the input data to a list and sort them
    if indicesToUse:
        tmp = [inData[x] for x in indicesToUse]
    else:
        tmp = list(inData)
    nOrig = len(tmp)
    # sortOrder[i] is the position in the unsorted list of sorted entry i
    sortOrder = sorted(range(nOrig), key=lambda x: tmp[x][col])
    tmp = [tmp[x] for x in sortOrder]

    # find the start of the entries with value val
    start = 0
    while start < nOrig and tmp[start][col] != val:
        start += 1
    if start >= nOrig:
        # %s rather than %d: val is not necessarily a number
        raise ValueError('target value (%s) not found in data' % (val,))

    # find the end of the entries with value val
    finish = start + 1
    while finish < nOrig and tmp[finish][col] == val:
        finish += 1

    # how many entries have the target value?
    nWithVal = finish - start

    # how many don't?
    nOthers = nOrig - nWithVal

    currFrac = float(nWithVal) / nOrig
    if currFrac < frac:
        #
        # We're going to keep most of (all) the points with the target value,
        # We need to figure out how many of the other points we'll
        # toss out
        #
        nTgtFinal = nWithVal
        nFinal = int(round(float(nWithVal) / frac))
        nOthersFinal = nFinal - nTgtFinal

        #
        # We may need to reduce the number of targets to keep
        # because it may make it impossible to hit exactly the
        # fraction we're trying for.  Take care of that now
        # (the nFinal test avoids dividing by zero once nothing is left)
        #
        while nFinal > 0 and float(nTgtFinal) / nFinal > frac:
            nTgtFinal -= 1
            nFinal -= 1
    else:
        #
        # There are too many points with the target value,
        # we'll keep most of (all) the other points and toss a random
        # selection of the target value points
        #
        nOthersFinal = nOthers
        if frac < 1:
            nFinal = int(round(nOthers / (1. - frac)))
        else:
            # frac == 1 is only reachable here when every point has the
            # target value, so everything is kept
            nFinal = nWithVal
        nTgtFinal = nFinal - nOthersFinal

        #
        # We may need to reduce the number of others to keep
        # because it may make it impossible to hit exactly the
        # fraction we're trying for.  Take care of that now
        # (the nFinal test avoids dividing by zero once nothing is left)
        #
        while nFinal > 0 and float(nTgtFinal) / nFinal < frac:
            nOthersFinal -= 1
            nFinal -= 1

    others = list(range(start)) + list(range(finish, nOrig))
    othersTake = permutation(nOthers)
    others = [others[x] for x in othersTake[:nOthersFinal]]

    targets = list(range(start, finish))
    targetsTake = permutation(nWithVal)
    targets = [targets[x] for x in targetsTake[:nTgtFinal]]

    # these are all the indices we'll be keeping (a set gives O(1) lookups)
    indicesToKeep = set(targets + others)

    res = []
    rej = []
    # now pull the points, but in random order
    for i in permutation(nOrig):
        if not indicesOnly:
            item = tmp[i]
        elif not indicesToUse:
            item = sortOrder[i]
        else:
            # map back to an index into the original inData
            item = indicesToUse[sortOrder[i]]
        if i in indicesToKeep:
            res.append(item)
        else:
            rej.append(item)
    return res, rej
734
def CountResults(inData, col=-1, bounds=None):
    """ counts how often each result occurs in a set of data points

    **Arguments**

      - inData: an iterable of data points (each an indexable sequence)

      - col: the column holding the value to be counted

      - bounds: (optional) a sequence of bounds.  If provided, each value is
        replaced by the index of the first bound it is strictly less than
        (or by len(bounds) if there is no such bound) before being counted.

    **Returns**

      a dictionary mapping each value (or bin index) to the number of times
      it was seen

    """
    counts = {}
    for p in inData:
        r = p[col]
        if bounds:
            act = r
            # linear scan on purpose: bounds is not required to be sorted
            for idx, bound in enumerate(bounds):
                if act < bound:
                    r = idx
                    break
            else:
                r = len(bounds)
        counts[r] = counts.get(r, 0) + 1
    return counts
757 758
def RandomizeActivities(dataSet, shuffle=0, runDetails=None):
    """ randomizes the activity values of a dataset

    **Arguments**

      - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized

      - shuffle: an optional toggle. If this is set, the activity values
        will be shuffled (so the number in each class remains constant)

      - runDetails: an optional CompositeRun object; its _shuffled_ or
        _randomized_ attribute is set to 1 to record what was done

    **Note**

      - _examples_ are randomized in place

      - the activity is taken to be the last element of each point

    """
    nPossible = dataSet.GetNPossibleVals()[-1]
    nPts = dataSet.GetNPts()
    if shuffle:
        if runDetails:
            runDetails.shuffled = 1
        # work on a copy so that the data set's own results are not reordered
        acts = list(dataSet.GetResults())
        random.shuffle(acts)
    else:
        if runDetails:
            runDetails.randomized = 1
        # NOTE(review): random.randint() includes both end points, so values
        # from 0 through nPossible can be generated; confirm whether
        # nPossible-1 was intended as the upper limit.
        acts = [random.randint(0, nPossible) for _ in range(nPts)]
    for i in range(nPts):
        tmp = dataSet[i]
        tmp[-1] = acts[i]
        dataSet[i] = tmp
#------------------------------------
#
#  doctest boilerplate
#
def _test():
    """ runs the doctests of the module currently loaded as __main__

    **Returns**

      the result of _doctest.testmod()_ (the number of failures followed by
      the number of tests attempted)

    """
    import doctest
    import sys
    return doctest.testmod(sys.modules["__main__"])
if __name__ == '__main__':
    import sys
    # the number of failed doctests becomes the process exit status
    failed, tried = _test()
    sys.exit(failed)