Package rdkit :: Package ML :: Package DecTree :: Module ID3
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.DecTree.ID3

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  # 
  4  """ ID3 Decision Trees 
  5   
  6    contains an implementation of the ID3 decision tree algorithm 
  7    as described in Tom Mitchell's book "Machine Learning" 
  8   
  9    It relies upon the _Tree.TreeNode_ data structure (or something 
 10      with the same API) defined locally to represent the trees  
 11   
 12  """ 
 13   
 14  import numpy 
 15  from rdkit.ML.DecTree import DecTree 
 16  from rdkit.ML.InfoTheory import entropy 
 17   
18 -def CalcTotalEntropy(examples,nPossibleVals):
19 """ Calculates the total entropy of the data set (w.r.t. the results) 20 21 **Arguments** 22 23 - examples: a list (nInstances long) of lists of variable values + instance 24 values 25 - nPossibleVals: a list (nVars long) of the number of possible values each variable 26 can adopt. 27 28 **Returns** 29 30 a float containing the informational entropy of the data set. 31 32 """ 33 nRes = nPossibleVals[-1] 34 resList = numpy.zeros(nRes,'i') 35 for example in examples: 36 res = example[-1] 37 resList[res] = resList[res] + 1 38 return entropy.InfoEntropy(resList)
39
40 -def GenVarTable(examples,nPossibleVals,vars):
41 """Generates a list of variable tables for the examples passed in. 42 43 The table for a given variable records the number of times each possible value 44 of that variable appears for each possible result of the function. 45 46 **Arguments** 47 48 - examples: a list (nInstances long) of lists of variable values + instance 49 values 50 51 - nPossibleVals: a list containing the number of possible values of 52 each variable + the number of values of the function. 53 54 - vars: a list of the variables to include in the var table 55 56 57 **Returns** 58 59 a list of variable result tables. Each table is a Numeric array 60 which is varValues x nResults 61 """ 62 nVars = len(vars) 63 res = [None]*nVars 64 nFuncVals = nPossibleVals[-1] 65 66 for i in xrange(nVars): 67 res[i] = numpy.zeros((nPossibleVals[vars[i]],nFuncVals),'i') 68 for example in examples: 69 val = int(example[-1]) 70 for i in xrange(nVars): 71 res[i][int(example[vars[i]]),val] += 1 72 73 return res
74
75 -def ID3(examples,target,attrs,nPossibleVals,depth=0,maxDepth=-1, 76 **kwargs):
77 """ Implements the ID3 algorithm for constructing decision trees. 78 79 From Mitchell's book, page 56 80 81 This is *slightly* modified from Mitchell's book because it supports 82 multivalued (non-binary) results. 83 84 **Arguments** 85 86 - examples: a list (nInstances long) of lists of variable values + instance 87 values 88 89 - target: an int 90 91 - attrs: a list of ints indicating which variables can be used in the tree 92 93 - nPossibleVals: a list containing the number of possible values of 94 every variable. 95 96 - depth: (optional) the current depth in the tree 97 98 - maxDepth: (optional) the maximum depth to which the tree 99 will be grown 100 101 **Returns** 102 103 a DecTree.DecTreeNode with the decision tree 104 105 **NOTE:** This code cannot bootstrap (start from nothing...) 106 use _ID3Boot_ (below) for that. 107 """ 108 varTable = GenVarTable(examples,nPossibleVals,attrs) 109 tree=DecTree.DecTreeNode(None,'node') 110 111 # store the total entropy... in case that is interesting 112 totEntropy = CalcTotalEntropy(examples,nPossibleVals) 113 tree.SetData(totEntropy) 114 #tree.SetExamples(examples) 115 116 # the matrix of results for this target: 117 tMat = GenVarTable(examples,nPossibleVals,[target])[0] 118 # counts of each result code: 119 counts = sum(tMat) 120 nzCounts = numpy.nonzero(counts)[0] 121 122 if len(nzCounts) == 1: 123 # bottomed out because there is only one result code left 124 # with any counts (i.e. there's only one type of example 125 # left... this is GOOD!). 126 res = nzCounts[0] 127 tree.SetLabel(res) 128 tree.SetName(str(res)) 129 tree.SetTerminal(1) 130 elif len(attrs) == 0 or (maxDepth>=0 and depth>=maxDepth): 131 # Bottomed out: no variables left or max depth hit 132 # We don't really know what to do here, so 133 # use the heuristic of picking the most prevalent 134 # result 135 v = numpy.argmax(counts) 136 tree.SetLabel(v) 137 tree.SetName('%d?'%v) 138 tree.SetTerminal(1) 139 else: 140 # find the variable which gives us the largest information gain 141 142 gains = [entropy.InfoGain(x) for x in varTable] 143 best = attrs[numpy.argmax(gains)] 144 145 146 # remove that variable from the lists of possible variables 147 nextAttrs = attrs[:] 148 if not kwargs.get('recycleVars',0): 149 nextAttrs.remove(best) 150 151 # set some info at this node 152 tree.SetName('Var: %d'%best) 153 tree.SetLabel(best) 154 #tree.SetExamples(examples) 155 tree.SetTerminal(0) 156 157 # loop over possible values of the new variable and 158 # build a subtree for each one 159 for val in xrange(nPossibleVals[best]): 160 nextExamples = [] 161 for example in examples: 162 if example[best] == val: 163 nextExamples.append(example) 164 if len(nextExamples) == 0: 165 # this particular value of the variable has no examples, 166 # so there's not much sense in recursing. 167 # This can (and does) happen. 168 v = numpy.argmax(counts) 169 tree.AddChild('%d'%v,label=v,data=0.0,isTerminal=1) 170 else: 171 # recurse 172 tree.AddChildNode(ID3(nextExamples,best,nextAttrs,nPossibleVals,depth+1,maxDepth, 173 **kwargs)) 174 return tree
175
176 -def ID3Boot(examples,attrs,nPossibleVals,initialVar=None,depth=0,maxDepth=-1, 177 **kwargs):
178 """ Bootstrapping code for the ID3 algorithm 179 180 see ID3 for descriptions of the arguments 181 182 If _initialVar_ is not set, the algorithm will automatically 183 choose the first variable in the tree (the standard greedy 184 approach). Otherwise, _initialVar_ will be used as the first 185 split. 186 187 """ 188 totEntropy = CalcTotalEntropy(examples,nPossibleVals) 189 varTable = GenVarTable(examples,nPossibleVals,attrs) 190 191 tree=DecTree.DecTreeNode(None,'node') 192 #tree.SetExamples(examples) 193 tree._nResultCodes = nPossibleVals[-1] 194 195 # <perl>you've got to love any language which will let you 196 # do this much work in a single line :-)</perl> 197 if initialVar is None: 198 best = attrs[numpy.argmax([entropy.InfoGain(x) for x in varTable])] 199 else: 200 best = initialVar 201 202 tree.SetName('Var: %d'%best) 203 tree.SetData(totEntropy) 204 tree.SetLabel(best) 205 tree.SetTerminal(0) 206 nextAttrs = attrs[:] 207 if not kwargs.get('recycleVars',0): 208 nextAttrs.remove(best) 209 210 for val in xrange(nPossibleVals[best]): 211 nextExamples = [] 212 for example in examples: 213 if example[best] == val: 214 nextExamples.append(example) 215 216 tree.AddChildNode(ID3(nextExamples,best,nextAttrs,nPossibleVals,depth,maxDepth, 217 **kwargs)) 218 return tree
219