1
2
3
4
5
6 """ For the moment this uses Francois Fleuret's CMIM library
7 to do the feature selection.
8
9 Reference: F. Fleuret, "Fast Binary Feature Selection with Conditional
10 Mutual Information", J. Machine Learn. Res. 5, 1531-1555
11 (2004)
12
13 """
14 from rdkit import RDConfig
15 from rdkit import DataStructs
16 import tempfile
17 import os
18 import rdFeatSelect
19
# NOTE(review): the enclosing `def` line is not present in this listing and the
# indentation has been flattened; `examples` and `nFeatsToPick` are presumably
# parameters of that function -- confirm against the original file.
#
# Ask the imported rdFeatSelect module to pick the features.
21 res = rdFeatSelect.selectCMIM(examples,nFeatsToPick)
# If the result contains -1, keep only the entries that come before the first
# -1 and hand them back as a tuple.  Both assignments below presumably belong
# to the `if` body (res.index(-1) would raise ValueError otherwise).
# NOTE(review): -1 looks like a "no more features" marker -- confirm.
22 if -1 in res:
23 res = list(res)
24 res = tuple(res[:res.index(-1)])
# Without a -1 the value from selectCMIM is returned unchanged.
25 return res
26
# NOTE(review): the enclosing `def` line is not present in this listing and the
# indentation has been flattened; `examples`, `nFeatsToPick` and `bvCol` are
# presumably parameters of that function -- confirm against the original file.
#
# Overall flow: write the examples to a temporary text file, run the external
# cmim executable on it, parse the selected feature indices from the output
# file, delete both files and return the indices as a list of ints.
#
# Number of examples, and number of bits taken from the first example's
# bit vector (the object stored in column bvCol).
28 nPts = len(examples)
29 nFeats = examples[0][bvCol].GetNumBits()
30
# The executable is looked up under <RDBaseDir>/External/cmim-1.0/.
# Raises ValueError when it is not there.
31 exe = os.path.join(RDConfig.RDBaseDir,'External','cmim-1.0','cmim.exe')
32 if not os.path.exists(exe):
33 raise ValueError,'could not find cmim executable %s'%exe
34
# NOTE(review): tempfile.mktemp only returns a file name; the name can be
# claimed by another process before it is opened here.  tempfile.mkstemp is
# the documented replacement.
35 inFname = tempfile.mktemp('.dat')
36 outFname = inFname + '.out'
# Input file layout: one header line "nPts nFeats", then for every example a
# line with its bit string followed by a line with the row's last column
# (presumably the class label -- confirm).  Both per-row prints presumably sit
# inside the loop body.
37 inF = open(inFname,'w+')
38 print >>inF,nPts,nFeats
39 for row in examples:
40 print >>inF,row[bvCol].ToBitString()
41 print >>inF,row[-1]
42 inF.close()
43 inF = None
44
# Run cmim and block until it exits (os.P_WAIT).  The exit status returned by
# spawnlp is discarded, so a failed run is only noticed when the output file
# cannot be read below.
# NOTE(review): os.spawnlp is documented as unavailable on Windows even though
# the executable is named cmim.exe.
45 os.spawnlp(os.P_WAIT,exe,exe,'--nb-features',str(nFeatsToPick),'--train',
46 inFname,outFname)
47
# Output file: the first line is parsed as an int (nCreated, not used again in
# the visible code); the second line holds space-separated feature indices.
48 inD = open(outFname,'r')
49 inL = inD.readline()
50 nCreated = int(inL)
51 inL = inD.readline()
52 res = []
53 splitL = inL.split(' ')
# NOTE(review): raises IndexError if the line has fewer than nFeatsToPick
# entries.
54 for i in range(nFeatsToPick):
55 res.append(int(splitL[i]))
56 inD.close()
57 inD = None
58
# Remove both temporary files.
# NOTE(review): no try/finally is visible, so the files are left behind when
# any earlier step raises.
59 os.unlink(inFname)
60 os.unlink(outFname)
61
62 return res
63