1
2
3
4
5
6
7
8 """
9
10 """
11
12 import numpy
13 from rdkit.ML.DecTree import SigTree
14 from rdkit.ML import InfoTheory
15 from rdkit.ML.FeatureSelect import CMIM
16 from rdkit.DataStructs.VectCollection import VectCollection
17 import copy
18 import random
20 """ Generates a random subset of a group of indices
21
22 **Arguments**
23
24 - nToInclude: the size of the desired set
25
26 - nBits: the maximum index to be included in the set
27
28 **Returns**
29
30 a list of indices
31
32 """
33
34
35 res = random.sample(range(nBits),nToInclude)
36 return res
37
38 -def BuildSigTree(examples,nPossibleRes,ensemble=None,random=0,
39 metric=InfoTheory.InfoType.BIASENTROPY,
40 biasList=[1],
41 depth=0,maxDepth=-1,
42 useCMIM=0,allowCollections=False,
43 verbose=0,**kwargs):
44 """
45 **Arguments**
46
47 - examples: the examples to be classified. Each example
48 should be a sequence at least three entries long, with
49 entry 0 being a label, entry 1 a BitVector and entry -1
50 an activity value
51
52 - nPossibleRes: the number of result codes possible
53
54 - ensemble: (optional) if this argument is provided, it
55 should be a sequence which is used to limit the bits
56 which are actually considered as potential descriptors.
57 The default is None (use all bits).
58
59 - random: (optional) If this argument is nonzero, it
60 specifies the number of bits to be randomly selected
61 for consideration at this node (i.e. this toggles the
62 growth of Random Trees).
63 The default is 0 (no random descriptor selection)
64
65 - metric: (optional) This is an _InfoTheory.InfoType_ and
66 sets the metric used to rank the bits.
67 The default is _InfoTheory.InfoType.BIASENTROPY_
68
69 - biasList: (optional) If provided, this provides a bias
70 list for the bit ranker.
71 See the _InfoTheory.InfoBitRanker_ docs for an explanation
72 of bias.
73 The default value is [1], which biases towards actives.
74
75 - maxDepth: (optional) the maximum depth to which the tree
76 will be grown
77 The default is -1 (no depth limit).
78
79 - useCMIM: (optional) if this is >0, the CMIM algorithm
80 (conditional mutual information maximization) will be
81 used to select the descriptors used to build the trees.
82 The value of the variable should be set to the number
83 of descriptors to be used. This option and the
84 ensemble option are mutually exclusive (CMIM will not be
85 used if the ensemble is set), but it happily coexsts
86 with the random argument (to only consider random subsets
87 of the top N CMIM bits)
88 The default is 0 (do not use CMIM)
89
90 - depth: (optional) the current depth in the tree
91 This is used in the recursion and should not be set
92 by the client.
93
94 **Returns**
95
96 a SigTree.SigTreeNode with the root of the decision tree
97
98 """
99 if verbose: print ' '*depth,'Build'
100 tree=SigTree.SigTreeNode(None,'node',level=depth)
101 tree.SetData(-666)
102
103
104
105
106 resCodes = [int(x[-1]) for x in examples]
107
108 counts = [0]*nPossibleRes
109 for res in resCodes:
110 counts[res] += 1
111
112
113 nzCounts = numpy.nonzero(counts)[0]
114 if verbose: print ' '*depth,'\tcounts:',counts
115 if len(nzCounts) == 1:
116
117
118
119 res = nzCounts[0]
120 tree.SetLabel(res)
121 tree.SetName(str(res))
122 tree.SetTerminal(1)
123 elif maxDepth>=0 and depth>maxDepth:
124
125
126
127
128 v = numpy.argmax(counts)
129 tree.SetLabel(v)
130 tree.SetName('%d?'%v)
131 tree.SetTerminal(1)
132 else:
133
134
135 fp = examples[0][1]
136 nBits = fp.GetNumBits()
137 ranker = InfoTheory.InfoBitRanker(nBits,nPossibleRes,metric)
138 if biasList: ranker.SetBiasList(biasList)
139 if useCMIM > 0 and not ensemble:
140 ensemble = CMIM.SelectFeatures(examples,useCMIM,bvCol=1)
141 if random:
142 if ensemble:
143 if len(ensemble)>random:
144 picks = _GenerateRandomEnsemble(random,len(ensemble))
145 availBits = list(take(ensemble,picks))
146 else:
147 availBits = range(len(ensemble))
148 else:
149 availBits = _GenerateRandomEnsemble(random,nBits)
150 else:
151 availBits=None
152 if availBits:
153 ranker.SetMaskBits(availBits)
154
155
156 useCollections=isinstance(examples[0][1],VectCollection)
157 for example in examples:
158
159 if not useCollections:
160 ranker.AccumulateVotes(example[1],example[-1])
161 else:
162 example[1].Reset()
163 ranker.AccumulateVotes(example[1].orVect,example[-1])
164
165 try:
166 bitInfo = ranker.GetTopN(1)[0]
167 best = int(bitInfo[0])
168 gain = bitInfo[1]
169 except:
170 import traceback
171 traceback.print_exc()
172 print 'get top n failed'
173 gain = -1.0
174 if gain <= 0.0:
175 v = numpy.argmax(counts)
176 tree.SetLabel(v)
177 tree.SetName('?%d?'%v)
178 tree.SetTerminal(1)
179 return tree
180 best = int(bitInfo[0])
181
182 if verbose: print ' '*depth,'\tbest:',bitInfo
183
184 tree.SetName('Bit-%d'%(best))
185 tree.SetLabel(best)
186
187 tree.SetTerminal(0)
188
189
190
191 onExamples = []
192 offExamples = []
193 for example in examples:
194 if example[1][best]:
195 if allowCollections and useCollections:
196 sig = copy.copy(example[1])
197 sig.DetachVectsNotMatchingBit(best)
198 ex = [example[0],sig]
199 if len(example)>2:
200 ex.extend(example[2:])
201 example = ex
202 onExamples.append(example)
203 else:
204 offExamples.append(example)
205
206 for ex in (offExamples,onExamples):
207 if len(ex) == 0:
208 v = numpy.argmax(counts)
209 tree.AddChild('%d??'%v,label=v,data=0.0,isTerminal=1)
210 else:
211 child = BuildSigTree(ex,nPossibleRes,random=random,
212 ensemble=ensemble,
213 metric=metric,biasList=biasList,
214 depth=depth+1,maxDepth=maxDepth,
215 verbose=verbose)
216 if child is None:
217 v = numpy.argmax(counts)
218 tree.AddChild('%d???'%v,label=v,data=0.0,isTerminal=1)
219 else:
220 tree.AddChildNode(child)
221 return tree
222
223
224 -def SigTreeBuilder(examples,attrs,nPossibleVals,initialVar=None,ensemble=None,
225 randomDescriptors=0,
226 **kwargs):
227 nRes = nPossibleVals[-1]
228 return BuildSigTree(examples,nRes,random=randomDescriptors,**kwargs)
229