Package ML :: Package InfoTheory :: Module entropy
[hide private]
[frames] | no frames]

Source Code for Module ML.InfoTheory.entropy

  1  # 
  2  #  Copyright (C) 2000,2003  greg Landrum and Rational Discovery LLC 
  3  # 
  4   
  5  """ Informational Entropy functions 
  6   
  7    The definitions used are the same as those in Tom Mitchell's 
  8    book "Machine Learning" 
  9     
 10  """ 
 11  from math import * 
 12  from Numeric import * 
 13  # try to get the C versions of these routines 
 14  try: 
 15    import rdInfoTheory as cEntropy 
 16  except: 
 17    hascEntropy=0 
 18  else: 
 19    hascEntropy=1 
 20   
 21   
 22  # it's pretty obvious what this is for ;-) 
 23  _log2 = log(2) 
 24   
25 -def PyInfoEntropy(results):
26 """ Calculates the informational entropy of a set of results. 27 28 **Arguments** 29 30 results is a 1D Numeric array containing the number of times a 31 given set hits each possible result. 32 For example, if a function has 3 possible results, and the 33 variable in question hits them 5, 6 and 1 times each, 34 results would be [5,6,1] 35 36 **Returns** 37 38 the informational entropy 39 40 """ 41 nInstances = float(sum(results)) 42 if nInstances == 0: 43 # to return zero or one... that is the question 44 return 0 45 probs = results/nInstances 46 47 #------- 48 # NOTE: this is a little hack to allow the use of Numeric 49 # functionality to calculate the informational entropy. 50 # The problem is that the system log function pitches a fit 51 # when you call log(0.0). We are perfectly happy with that 52 # returning *anything* because we're gonna mutiply by 0 anyway. 53 54 # Here's the risky (but marginally faster way to do it: 55 # add a small number to probs and hope it doesn't screw 56 # things up too much. 57 #t = probs+1e-10 58 59 # Here's a perfectly safe approach that's a little bit more obfuscated 60 # and a tiny bit slower 61 t = choose(greater(probs,0.0),(1,probs)) 62 return sum(-probs*log(t)/_log2)
63 64
65 -def PyInfoGain(varMat):
66 """ calculates the information gain for a variable 67 68 **Arguments** 69 70 varMat is a Numeric array with the number of possible occurances 71 of each result for reach possible value of the given variable. 72 73 So, for a variable which adopts 4 possible values and a result which 74 has 3 possible values, varMat would be 4x3 75 76 **Returns** 77 78 The expected information gain 79 """ 80 variableRes = sum(varMat,1) # indexed by variable, Sv in Mitchell's notation 81 overallRes = sum(varMat) # indexed by result, S in Mitchell's notation 82 83 term2 = 0 84 for i in xrange(len(variableRes)): 85 term2 = term2 + variableRes[i] * InfoEntropy(varMat[i]) 86 tSum = sum(overallRes) 87 if tSum != 0.0: 88 term2 = 1./tSum * term2 89 gain = InfoEntropy(overallRes) - term2 90 else: 91 gain = 0 92 return gain
93 94 # if we have the C versions, use them, otherwise use the python stuff 95 if hascEntropy: 96 InfoEntropy = cEntropy.InfoEntropy 97 InfoGain = cEntropy.InfoGain 98 else: 99 InfoEntropy = PyInfoEntropy 100 InfoGain = PyInfoGain 101 102 103 if __name__ == '__main__': 104 arr = array([9,5]) 105 print arr, InfoEntropy(arr) 106 arr = array([9,9]) 107 print arr, InfoEntropy(arr) 108 arr = array([5,5]) 109 print arr, InfoEntropy(arr) 110 arr = array([5,0]) 111 print arr, InfoEntropy(arr) 112 arr = array([5,5,5]) 113 print arr, InfoEntropy(arr) 114 arr = array([2,5,5]) 115 print arr, InfoEntropy(arr) 116 117 mat2 = array([[6,2],[3,3]]) 118 print 'gain: ', InfoGain(mat2) 119 120 121 mat3 = array([[1,1],[2,1]]) 122 print 'gain3: ', InfoGain(mat3) 123 124 mat4 = array([[2,0],[1,2]]) 125 print 'gain4: ', InfoGain(mat4) 126