1
2
3
4
5
6
7
8 """ Utilities for data manipulation
9
10 **FILE FORMATS:**
11
12 - *.qdat files* contain quantized data suitable for
13 feeding to learning algorithms.
14
15 The .qdat file, written by _DecTreeGui_, is structured as follows:
16
17 1) Any number of lines which are ignored.
18
19 2) A line containing the string 'Variable Table'
20
21 any number of variable definitions in the format:
22
23 '# Variable_name [quant_bounds]'
24
25 where '[quant_bounds]' is a list of the boundaries used for quantizing
26 that variable. If the variable is inherently integral (i.e. not
27 quantized), this can be an empty list.
28
29 3) A line beginning with '# ----' which signals the end of the variable list
30
31 4) Any number of lines containing data points, in the format:
32
33 'Name_of_point var1 var2 var3 .... varN'
34
35 all variable values should be integers
36
37 Throughout, it is assumed that varN is the result
38
39 - *.dat files* contain the same information as .qdat files, but the variable
40 values can be anything (floats, ints, strings). **These files should
41 still contain quant_bounds!**
42
43 - *.qdat.pkl files* contain a pickled (binary) representation of
44 the data read in. They store, in order:
45
46 1) A python list of the variable names
47
48 2) A python list of lists with the quantization bounds
49
50 3) A python list of the point names
51
52 4) A python list of lists with the data points
53
54 """
55 from rdkit import RDConfig
56 from rdkit.utils import fileutils
57 from rdkit.ML.Data import MLData
58 from rdkit.Dbase.DbConnection import DbConnect
59 from rdkit.DataStructs import BitUtils
60 import string
61 import re,csv
62 import cPickle
63 import random
64
def permutation(nToDo):
    """ returns the integers 0..nToDo-1 in random order

    **Arguments**

      - nToDo: the number of indices to generate

    **Returns**

      a list containing every integer in range(nToDo) exactly once, shuffled
      with the standard Python _random_ module

    """
    # list() is required wherever range() is lazy and cannot be shuffled in place
    res = list(range(nToDo))
    random.shuffle(res)
    return res
69
def WriteData(outFile, varNames, qBounds, examples):
    """ writes out a .qdat file

    **Arguments**

      - outFile: a file object (anything with a write() method)

      - varNames: a list of variable names

      - qBounds: the list of quantization bounds (should be the same length
         as _varNames_)

      - examples: the data to be written, one sequence per data point; each
         element is converted with str() and the elements are separated by
         single spaces

    """
    outFile.write('# Quantized data from DataUtils\n')
    outFile.write('# ----------\n')
    outFile.write('# Variable Table\n')
    # index into qBounds (rather than zip) so that a too-short qBounds still
    # fails loudly instead of silently truncating the variable table
    for i, name in enumerate(varNames):
        outFile.write('# %s %s\n' % (name, str(qBounds[i])))
    outFile.write('# ----------\n')
    for example in examples:
        outFile.write(' '.join([str(x) for x in example]) + '\n')
93
94
def ReadVars(inFile):
    """ reads the variables and quantization bounds from a .qdat or .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qbounds: the list of quantization bounds for each variable
           (a list of floats per variable, empty when no bounds are given)

    **Raises**

      ValueError if a line of the variable table has no '[' (this includes
      hitting the end of the file before the '# ----' terminator)

    """
    varNames = []
    boundTexts = []
    # presumably positions inFile just past the 'Variable Table' line;
    # the helper lives in rdkit.utils.fileutils -- confirm there
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')
    inLine = inFile.readline()
    while '# ----' not in inLine:
        # skip the leading '# ' and separate the name from its bounds list
        name, sep, boundText = inLine[2:].partition('[')
        if not sep:
            raise ValueError('malformed variable definition: %r' % (inLine,))
        varNames.append(name.strip())
        # strip the line ending first so '\r\n' or a missing final newline
        # cannot leave a stray ']' in the bounds text
        boundText = boundText.strip()
        if boundText.endswith(']'):
            boundText = boundText[:-1]
        boundTexts.append(boundText.strip())
        inLine = inFile.readline()

    qBounds = []
    for text in boundTexts:
        if text:
            qBounds.append([float(item) for item in text.split(',')])
        else:
            qBounds.append([])
    return varNames, qBounds
130
def ReadQuantExamples(inFile):
    """ reads the examples from a .qdat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      - because this is reading a .qdat file, it is assumed that all variable
        values are integers (a ValueError from int() propagates otherwise)

      - lines starting with '#' and lines with fewer than two fields are
        skipped

      - fields may be separated by any run of spaces and/or tabs

    """
    names = []
    examples = []
    inLine = inFile.readline()
    while inLine:
        if not inLine.startswith('#'):
            # str.split() handles spaces and tabs and discards the newline
            fields = inLine.split()
            if len(fields) > 1:
                examples.append([int(x) for x in fields[1:]])
                names.append(fields[0])
        inLine = inFile.readline()
    return names, examples
165
def ReadGeneralExamples(inFile):
    """ reads the examples from a .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      - this attempts to convert variable values to ints, then floats.
        if those both fail, they are left as strings

      - lines starting with '#' and lines with fewer than two fields are
        skipped

      - fields may be separated by any run of spaces and/or tabs; the last
        value on a line is kept whether or not the line has trailing
        whitespace

    """
    names = []
    examples = []
    inLine = inFile.readline()
    while inLine:
        if not inLine.startswith('#'):
            # str.split() discards the line ending itself, so no field has to
            # be sacrificed to get rid of the trailing newline
            fields = inLine.split()
            if len(fields) > 1:
                values = []
                for text in fields[1:]:
                    try:
                        value = int(text)
                    except ValueError:
                        try:
                            value = float(text)
                        except ValueError:
                            value = text
                    values.append(value)
                examples.append(values)
                names.append(fields[0])
        inLine = inFile.readline()
    return names, examples
209
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def BuildQuantDataSet(fileName):
    """ builds a data set from a .qdat file

    **Arguments**

      - fileName: the name of the .qdat file

    **Returns**

      an _MLData.MLQuantDataSet_

    """
    inFile = open(fileName, 'r')
    try:
        # the variable table comes first, the examples follow it
        varNames, qBounds = ReadVars(inFile)
        ptNames, examples = ReadQuantExamples(inFile)
    finally:
        # close the file even when parsing fails
        inFile.close()
    data = MLData.MLQuantDataSet(examples, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
    return data
229
230
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def BuildDataSet(fileName):
    """ builds a data set from a .dat file

    **Arguments**

      - fileName: the name of the .dat file

    **Returns**

      an _MLData.MLDataSet_

    """
    inFile = open(fileName, 'r')
    try:
        # the variable table comes first, the examples follow it
        varNames, qBounds = ReadVars(inFile)
        ptNames, examples = ReadGeneralExamples(inFile)
    finally:
        # close the file even when parsing fails
        inFile.close()
    data = MLData.MLDataSet(examples, qBounds=qBounds, varNames=varNames,
                            ptNames=ptNames)
    return data
250
251
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None):
    """ calculates the number of possible values for each variable in a data set

    **Arguments**

      - data: a list of examples

      - order: the ordering map between the variables in _data_ and _qBounds_

      - qBounds: the quantization bounds for the variables

      - nQBounds: (optional) a sequence parallel to _order_; variables with a
        nonzero entry here are not examined and get a result of 0

    **Returns**

      a list with the number of possible values each variable takes on in the
      data set

    **Notes**

      - variables present in _qBounds_ will have their _nPossible_ number read
        from _qBounds_ (the number of bounds plus one)

      - _nPossible_ for other numeric variables will be calculated as one more
        than the largest integral value found in the data

      - a variable that contains a non-numeric or non-integral value gets a
        result of 0

    """
    # exact type matches are intentional: bool values must not count as numeric
    numericTypes = [type(1), type(1.0)]
    try:
        numericTypes.append(long)  # only exists on Python 2
    except NameError:
        pass
    print('order: %s %s' % (order, len(order)))
    print('qB: %s' % (qBounds,))

    assert (qBounds and len(order) == len(qBounds)) or \
           (nQBounds and len(order) == len(nQBounds)), 'order/qBounds mismatch'
    nVars = len(order)
    nPossible = [-1] * nVars
    # cols holds the variables that still have to be sized from the data
    cols = list(range(nVars))
    for i in range(nVars):
        if nQBounds and nQBounds[i] != 0:
            nPossible[i] = -1
            cols.remove(i)
        elif qBounds and len(qBounds[i]) > 0:
            # qBounds may be empty/None when only nQBounds was supplied
            nPossible[i] = len(qBounds[i])
            cols.remove(i)

    for pt in data:
        # iterate over a copy: columns are dropped from cols as they fail
        for col in list(cols):
            d = pt[order[col]]
            if type(d) in numericTypes:
                if int(d) == d:
                    nPossible[col] = max(int(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
            else:
                print('bye bye col %d: %s' % (col, repr(d)))
                nPossible[col] = -1
                cols.remove(col)

    return [int(x) + 1 for x in nPossible]
308
309
310
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def WritePickledData(outName, data):
    """ writes either a .qdat.pkl or a .dat.pkl file

    **Arguments**

      - outName: the name of the file to be used

      - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

    **Notes**

      - four objects are pickled, in order: the variable names, the
        quantization bounds, the point names and the data points

    """
    # gather everything first so a failing accessor does not leave an empty
    # file behind
    varNames = data.GetVarNames()
    qBounds = data.GetQuantBounds()
    ptNames = data.GetPtNames()
    examples = data.GetAllData()

    outFile = open(outName, 'wb+')
    try:
        for item in (varNames, qBounds, ptNames, examples):
            cPickle.dump(item, outFile)
    finally:
        # close the file even if one of the dumps fails
        outFile.close()
332
def QuantDataToDB(qData,dbName,baseTableName,user='sysdba',password='masterkey'):
    """ fires an _MLData.MLQuantDataSet_ into a database

    **Arguments**

      - qData: an _MLData.MLQuantDataSet_

      - dbName: the name of the database to be opened

      - baseTableName: the table name to contain the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

    **Notes**

      - at the moment this is specific to using *Interbase* with the *gvib*
        adaptor, but that would be straightforward to change.

      - two tables are actually created:

         1) _baseTableName_ contains the quantized data

         2) _baseTableName_QBounds_ contains the quantization bounds

      - any existing tables with those names are dropped first

      - the SQL is assembled with string formatting: table and variable names
        are inserted verbatim and must come from a trusted source.  Single
        quotes in string values are doubled so they cannot end the literal.

    """
    import gvib
    cn = gvib.connect(dbName, user, password)
    c = cn.cursor()
    varNames = qData.varNames
    maxPtNameLen = max([len(x) for x in qData.GetPtNames()])

    # the first column holds the point name, the rest hold integer values
    colDefs = ['%s varchar(%d)' % (varNames[0], maxPtNameLen)]
    colDefs.extend(['%s int' % name for name in varNames[1:]])
    valStr = ', '.join(colDefs)

    try:
        c.execute('drop table %s' % baseTableName)
    except Exception:
        # best effort: the table most likely just does not exist yet
        pass
    createStr = 'create table %s (%s)' % (baseTableName, valStr)
    print('create: %s' % createStr)
    c.execute(createStr)
    for pt in qData.GetNamedData():
        vals = [str(x) for x in pt]
        vals[0] = "'%s'" % vals[0].replace("'", "''")
        valStr = ','.join(vals)
        c.execute('insert into %s values (%s)' % (baseTableName, valStr))

    quantName = '%s_QBounds' % (baseTableName)
    maxVarNameLen = max([len(x) for x in varNames])
    strBounds = [str(x) for x in qData.GetQuantBounds()]
    maxStrBoundLen = max([len(x) for x in strBounds])
    try:
        c.execute('drop table %s' % quantName)
    except Exception:
        # best effort, as above
        pass
    createStr = 'create table %s (variable_name varchar(%d), quant_bounds varchar(%d))' % (
        quantName, maxVarNameLen, maxStrBoundLen)
    c.execute(createStr)
    for i in range(len(varNames)):
        c.execute("insert into %s values ('%s','%s')" % (
            quantName, varNames[i].replace("'", "''"), strBounds[i].replace("'", "''")))

    cn.commit()
398
def DBToQuantData(dbName,baseTableName,quantName='',user='sysdba',password='masterkey'):
    """ constructs an _MLData.MLQuantDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - baseTableName: the table name containing the data in the database

      - quantName: the table name containing the quantization bounds in the
        database; defaults to _baseTableName_QBounds_ when empty

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

    **Returns**

      an _MLData.MLQuantDataSet_, or None (after printing a message) when the
      connection or one of the queries fails

    **Notes**

      - at the moment this is specific to using *Interbase* with the *gvib*
        adaptor, but that would be straightforward to change.

      - two tables are actually required:

         1) _baseTableName_ contains the quantized data

         2) _baseTableName_QBounds_ contains the quantization bounds

      - SECURITY: the stored quantization bounds are passed to eval(); only
        use this with databases whose contents are trusted.

    """
    import gvib
    try:
        cn = gvib.connect(dbName, user, password)
    except Exception:
        print('cannot connect to database %s' % (dbName))
        return None
    c = cn.cursor()

    if quantName == '':
        quantName = '%s_QBounds' % (baseTableName)
    try:
        c.execute('select * from %s' % quantName)
    except Exception:
        print('cannot query table %s in database %s' % (quantName, dbName))
        return None

    res = c.fetchall()
    # NOTE(review): the last character of each stored variable name is
    # dropped -- presumably padding added by the database; confirm
    varNames = [row[0][:-1] for row in res]
    # SECURITY: eval() on database content, see the docstring
    qBounds = [eval(row[1]) for row in res]

    try:
        c.execute('select * from %s' % baseTableName)
    except Exception:
        print('cannot query table %s in database %s' % (baseTableName, dbName))
        return None
    res = c.fetchall()
    # the first column is the point name, the remaining ones are the values
    vals = [[int(y) for y in row[1:]] for row in res]
    ptNames = [row[0] for row in res]

    data = MLData.MLQuantDataSet(vals, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
    return data
462
def TakeEnsemble(vect, ensembleIds, isDataVect=False):
    """ picks the elements of a vector selected by a set of indices

    **Arguments**

      - vect: the sequence to pick from

      - ensembleIds: the indices to keep

      - isDataVect: if set, the first and last elements of _vect_ are always
        kept and _ensembleIds_ is interpreted relative to the element after
        the first one (i.e. every index is shifted up by one)

    **Returns**

      a new list with the selected elements

    >>> v = [10,20,30,40,50]
    >>> TakeEnsemble(v,(1,2,3))
    [20, 30, 40]
    >>> v = ['foo',10,20,30,40,50,1]
    >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
    ['foo', 20, 30, 40, 1]

    """
    if isDataVect:
        ensembleIds = [x + 1 for x in ensembleIds]
        vect = [vect[0]] + [vect[x] for x in ensembleIds] + [vect[-1]]
    else:
        vect = [vect[x] for x in ensembleIds]
    return vect
482
483
484
def DBToData(dbName,tableName,user='sysdba',password='masterkey',dupCol=-1,
             what='*',where='',join='',pickleCol=-1,pickleClass=None,
             ensembleIds=None):
    """ constructs an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: passed to the connection as _removeDups_; specifies which
        column should be used to recognize duplicates.

      - what, where, join: passed to the connection as the fields to fetch
        and the where/join clauses of the query

      - pickleCol: if >= 0, the index (counted after the point name has been
        removed from the row) of a column holding serialized objects

      - pickleClass: (optional) a callable tried first to rebuild the object
        in _pickleCol_ from its string form; after its first failure
        cPickle.loads() is used for that row and all remaining rows

      - ensembleIds: (optional) if _pickleCol_ is set, the rebuilt object is
        reduced with _BitUtils.ConstructEnsembleBV_; otherwise the row is
        reduced with _TakeEnsemble_

    **Returns**

      an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

      - SECURITY: values in _pickleCol_ are unpickled; only use this with
        databases whose contents are trusted.

    """
    conn = DbConnect(dbName, tableName, user, password)
    res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol,
                       forceList=1)
    nPts = len(res)
    vals = [None] * nPts
    ptNames = [None] * nPts
    # becomes False the first time pickleClass fails to build an object
    classWorks = True
    for i in range(nPts):
        tmp = list(res[i])
        # the first column is always the point name
        ptNames[i] = tmp.pop(0)
        if pickleCol >= 0:
            if not pickleClass or not classWorks:
                tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
            else:
                try:
                    tmp[pickleCol] = pickleClass(str(tmp[pickleCol]))
                except Exception:
                    tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
                    classWorks = False
            if ensembleIds:
                tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
        else:
            if ensembleIds:
                # NOTE(review): the name was already popped above, so with
                # isDataVect=True the first *variable* is kept unconditionally
                # -- confirm this is intended
                tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
        vals[i] = tmp
    varNames = conn.GetColumnNames(join=join, what=what)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
540
def TextToData(reader, ignoreCols=(), onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a bunch of text

    **Arguments**

      - reader: needs to be iterable and return lists of elements
        (like a csv.reader); the first row holds the column names

      - ignoreCols: names of columns to leave out (only used when
        _onlyCols_ is not provided)

      - onlyCols: (optional) names of the columns to keep, in the order they
        should appear in the result

    **Returns**

      an _MLData.MLDataSet_

    **Raises**

      ValueError if a non-empty row does not have as many elements as the
      header, or if a name in _onlyCols_ is not a column of the header

    **Notes**

      - the first kept column provides the point names

      - the other values are converted to ints, then floats; if both fail
        they are kept as strings

      - empty rows are skipped

    """
    varNames = next(reader)
    if not onlyCols:
        keepCols = [i for i, name in enumerate(varNames) if name not in ignoreCols]
    else:
        keepCols = [-1] * len(onlyCols)
        for i, name in enumerate(varNames):
            if name in onlyCols:
                keepCols[onlyCols.index(name)] = i
        # a leftover -1 would silently select the last column of every row
        missing = [str(name) for name, idx in zip(onlyCols, keepCols) if idx < 0]
        if missing:
            raise ValueError('columns not found in header: %s' % ', '.join(missing))

    nCols = len(varNames)
    varNames = tuple([varNames[x] for x in keepCols])
    nVars = len(varNames)
    vals = []
    ptNames = []
    for splitLine in reader:
        if len(splitLine):
            if len(splitLine) != nCols:
                raise ValueError('unequal line lengths')
            tmp = [splitLine[x] for x in keepCols]
            ptNames.append(tmp[0])
            pt = [None] * (nVars - 1)
            for j in range(nVars - 1):
                try:
                    val = int(tmp[j + 1])
                except (ValueError, TypeError):
                    try:
                        val = float(tmp[j + 1])
                    except (ValueError, TypeError):
                        val = str(tmp[j + 1])
                pt[j] = val
            vals.append(pt)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
590
def TextFileToData(fName, onlyCols=None):
    """ constructs a data set from a delimited text file

    **Arguments**

      - fName: the name of the file to read; files whose extension is 'csv'
        (any case) are read as comma separated, everything else as tab
        separated

      - onlyCols: (optional) passed on to _TextToData_

    **Returns**

      whatever _TextToData_ returns for the contents of the file

    """
    ext = fName.split('.')[-1]
    # NOTE(review): the 'U' mode flag is rejected by recent Python 3 releases;
    # kept because the rest of this module targets Python 2
    inFile = open(fName, 'rU')
    try:
        if ext.upper() == 'CSV':
            splitter = csv.reader(inFile)
        else:
            splitter = csv.reader(inFile, delimiter='\t')
        # TextToData consumes the reader completely before returning, so the
        # file can safely be closed afterwards
        return TextToData(splitter, onlyCols=onlyCols)
    finally:
        inFile.close()
603
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def InitRandomNumbers(seed):
    """ Seeds the random number generators

    **Arguments**

      - seed: a 2-tuple containing integers to be used as the random number seeds

    **Notes**

      - this seeds both the RDRandom generator and the one in the standard
        Python _random_ module

      - NOTE(review): both generators are seeded with seed[0]; seed[1] is
        never read.  Left unchanged so existing seeded runs stay
        reproducible -- confirm whether this is intended.

    """
    from rdkit import RDRandom
    RDRandom.seed(seed[0])
    import random
    random.seed(seed[0])
621
def FilterData(inData,val,frac,col=-1,indicesToUse=None,indicesOnly=0):
    """ randomly picks a subset of the data in which the points whose value in
    column _col_ equals _val_ make up (approximately) the fraction _frac_

    **Arguments**

      - inData: a sequence of data points (each one indexable)

      - val: the target value

      - frac: the desired fraction (between 0 and 1) of kept points that have
        the target value

      - col: the column holding the value to test

      - indicesToUse: (optional) indices into _inData_; when provided only
        those points are considered

      - indicesOnly: if set, indices into _inData_ are returned instead of
        the points themselves

    **Returns**

      a 2-tuple (kept, rejected); both lists are in random order

    **Raises**

      ValueError if _frac_ is outside [0,1], if _col_ is not a valid column
      of the first data point, or if _val_ does not occur in the data

    **Notes**

      - when the target value is under-represented all target points are
        candidates and other points are discarded; otherwise all other
        points are candidates and target points are discarded

      - for very small data sets it may be impossible to reach _frac_; in
        that case nothing is kept

    """
    if frac < 0 or frac > 1:
        raise ValueError('filter fraction out of bounds')
    try:
        inData[0][col]
    except IndexError:
        raise ValueError('target column index out of range')

    # collect the points to work with and sort them on the target column so
    # that all points with the target value are contiguous
    if indicesToUse:
        tmp = [inData[x] for x in indicesToUse]
    else:
        tmp = list(inData)
    nOrig = len(tmp)
    sortOrder = list(range(nOrig))
    sortOrder.sort(key=lambda x: tmp[x][col])
    tmp = [tmp[x] for x in sortOrder]

    # find the first point with the target value
    start = 0
    while start < nOrig and tmp[start][col] != val:
        start += 1
    if start >= nOrig:
        # %s (not %d) so that a non-numeric target still yields a ValueError
        raise ValueError('target value (%s) not found in data' % (val,))

    # find the end of the run of points with the target value
    finish = start + 1
    while finish < nOrig and tmp[finish][col] == val:
        finish += 1

    nWithVal = finish - start
    nOthers = len(tmp) - nWithVal

    currFrac = float(nWithVal) / nOrig
    if currFrac < frac:
        # too few target points: keep (most of) them and decide how many of
        # the others can stay
        nTgtFinal = nWithVal
        nFinal = int(round(nWithVal / frac))
        nOthersFinal = nFinal - nTgtFinal

        # rounding may overshoot the requested fraction; give up target
        # points until it no longer does (stopping before dividing by zero)
        while nFinal > 0 and float(nTgtFinal) / nFinal > frac:
            nTgtFinal -= 1
            nFinal -= 1
    else:
        # too many target points: keep (most of) the others and decide how
        # many target points can stay
        nOthersFinal = nOthers
        if frac < 1:
            nFinal = int(round(nOthers / (1 - frac)))
        else:
            # frac == 1 can only get here when every point has the target value
            nFinal = nWithVal
        nTgtFinal = nFinal - nOthersFinal

        # rounding may undershoot the requested fraction; give up other
        # points until it no longer does (stopping before dividing by zero)
        while nFinal > 0 and float(nTgtFinal) / nFinal < frac:
            nOthersFinal -= 1
            nFinal -= 1

    # random selections, expressed as positions in the sorted list
    others = list(range(start)) + list(range(finish, nOrig))
    othersTake = permutation(nOthers)
    others = [others[x] for x in othersTake[:nOthersFinal]]

    targets = list(range(start, finish))
    targetsTake = permutation(nWithVal)
    targets = [targets[x] for x in targetsTake[:nTgtFinal]]

    # a set keeps the membership tests below cheap
    indicesToKeep = set(targets + others)

    res = []
    rej = []
    # hand the points back in random order
    if not indicesOnly:
        for i in permutation(nOrig):
            if i in indicesToKeep:
                res.append(tmp[i])
            else:
                rej.append(tmp[i])
    else:
        for i in permutation(nOrig):
            # map the sorted position back to an index into inData
            if not indicesToUse:
                idx = sortOrder[i]
            else:
                idx = indicesToUse[sortOrder[i]]
            if i in indicesToKeep:
                res.append(idx)
            else:
                rej.append(idx)
    return res, rej
734
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def CountResults(inData, col=-1, bounds=None):
    """ counts how often each result occurs in a set of data points

    **Arguments**

      - inData: a sequence of data points (each one indexable)

      - col: the column holding the value to count

      - bounds: (optional) a sequence of boundaries; when provided each value
        is replaced by the index of the first bound it is smaller than (or
        by len(bounds) if there is no such bound) before counting

    **Returns**

      a dictionary mapping each value (or bin index) to its number of
      occurrences

    """
    counts = {}
    for p in inData:
        if not bounds:
            r = p[col]
        else:
            act = p[col]
            for idx, bound in enumerate(bounds):
                if act < bound:
                    r = idx
                    break
            else:
                # not below any bound: the value belongs in the last bin
                r = len(bounds)
        counts[r] = counts.get(r, 0) + 1
    return counts
757
758
# NOTE(review): the def line was not visible for this block; the name and
# signature are restored from the rdkit.ML.Data.DataUtils API -- confirm.
def RandomizeActivities(dataSet, shuffle=0, runDetails=None):
    """ randomizes the activity values of a dataset

    **Arguments**

      - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized

      - shuffle: an optional toggle. If this is set, the activity values
        will be shuffled (so the number in each class remains constant)

      - runDetails: an optional CompositeRun object; its _shuffled_ or
        _randomized_ attribute is set to 1 to record what was done

    **Note**

      - _examples_ are randomized in place

    """
    nPossible = dataSet.GetNPossibleVals()[-1]
    nPts = dataSet.GetNPts()
    if shuffle:
        if runDetails:
            runDetails.shuffled = 1
        acts = dataSet.GetResults()[:]
        random.shuffle(acts)
    else:
        if runDetails:
            runDetails.randomized = 1
        # NOTE(review): randint() includes its upper bound, so nPossible
        # itself can be drawn; confirm whether nPossible-1 was intended
        acts = [random.randint(0, nPossible) for _ in range(nPts)]
    for i in range(nPts):
        # fetch, modify and store back: indexing may hand out a copy
        tmp = dataSet[i]
        tmp[-1] = acts[i]
        dataSet[i] = tmp
790
791
792
793
794
795
796
797
798
800 import doctest,sys
801 return doctest.testmod(sys.modules["__main__"])
802
803 if __name__ == '__main__':
804 import sys
805 failed,tried = _test()
806 sys.exit(failed)
807