1
2
3
4
5 """ Informational Entropy functions
6
7 The definitions used are the same as those in Tom Mitchell's
8 book "Machine Learning"
9
10 """
11 from math import *
12 from Numeric import *
13
14 try:
15 import rdInfoTheory as cEntropy
16 except:
17 hascEntropy=0
18 else:
19 hascEntropy=1
20
21
22
23 _log2 = log(2)
24
26 """ Calculates the informational entropy of a set of results.
27
28 **Arguments**
29
30 results is a 1D Numeric array containing the number of times a
31 given set hits each possible result.
32 For example, if a function has 3 possible results, and the
33 variable in question hits them 5, 6 and 1 times each,
34 results would be [5,6,1]
35
36 **Returns**
37
38 the informational entropy
39
40 """
41 nInstances = float(sum(results))
42 if nInstances == 0:
43
44 return 0
45 probs = results/nInstances
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61 t = choose(greater(probs,0.0),(1,probs))
62 return sum(-probs*log(t)/_log2)
63
64
66 """ calculates the information gain for a variable
67
68 **Arguments**
69
70 varMat is a Numeric array with the number of possible occurances
71 of each result for reach possible value of the given variable.
72
73 So, for a variable which adopts 4 possible values and a result which
74 has 3 possible values, varMat would be 4x3
75
76 **Returns**
77
78 The expected information gain
79 """
80 variableRes = sum(varMat,1)
81 overallRes = sum(varMat)
82
83 term2 = 0
84 for i in xrange(len(variableRes)):
85 term2 = term2 + variableRes[i] * InfoEntropy(varMat[i])
86 tSum = sum(overallRes)
87 if tSum != 0.0:
88 term2 = 1./tSum * term2
89 gain = InfoEntropy(overallRes) - term2
90 else:
91 gain = 0
92 return gain
93
94
95 if hascEntropy:
96 InfoEntropy = cEntropy.InfoEntropy
97 InfoGain = cEntropy.InfoGain
98 else:
99 InfoEntropy = PyInfoEntropy
100 InfoGain = PyInfoGain
101
102
103 if __name__ == '__main__':
104 arr = array([9,5])
105 print arr, InfoEntropy(arr)
106 arr = array([9,9])
107 print arr, InfoEntropy(arr)
108 arr = array([5,5])
109 print arr, InfoEntropy(arr)
110 arr = array([5,0])
111 print arr, InfoEntropy(arr)
112 arr = array([5,5,5])
113 print arr, InfoEntropy(arr)
114 arr = array([2,5,5])
115 print arr, InfoEntropy(arr)
116
117 mat2 = array([[6,2],[3,3]])
118 print 'gain: ', InfoGain(mat2)
119
120
121 mat3 = array([[1,1],[2,1]])
122 print 'gain3: ', InfoGain(mat3)
123
124 mat4 = array([[2,0],[1,2]])
125 print 'gain4: ', InfoGain(mat4)
126