Package rdkit :: Package ML :: Package InfoTheory :: Module entropy
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.InfoTheory.entropy

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  # 
  4   
  5  """ Informational Entropy functions 
  6   
  7    The definitions used are the same as those in Tom Mitchell's 
  8    book "Machine Learning" 
  9     
 10  """ 
 11  import numpy 
 12  import math 
 13   
 14  # try to get the C versions of these routines 
 15  try: 
 16    import rdInfoTheory as cEntropy 
 17  except: 
 18    hascEntropy=0 
 19  else: 
 20    hascEntropy=1 
 21   
 22   
 23  # it's pretty obvious what this is for ;-) 
 24  _log2 = math.log(2) 
 25   
def PyInfoEntropy(results):
  """ Calculates the informational entropy of a set of results.

  **Arguments**

    results is a 1D sequence (numpy array, list or tuple) containing the
      number of times a given set hits each possible result.
      For example, if a function has 3 possible results, and the
      variable in question hits them 5, 6 and 1 times each,
      results would be [5,6,1]

  **Returns**

    the informational entropy (base-2 logarithm), or 0.0 when the counts
    sum to zero

  """
  # Coerce to a float array so plain Python sequences are accepted and the
  # division below is always a true (floating point) division.
  counts = numpy.asarray(results, dtype=float)
  nInstances = counts.sum()
  if nInstances == 0:
    # to return zero or one... that is the question
    return 0.0
  probs = counts / nInstances

  # log(0.0) is undefined, but a zero probability is multiplied by its own
  # logarithm, so whatever the log returns is discarded.  Substitute 1.0 for
  # the non-positive entries: log2(1.0) == 0.0 and nothing invalid is ever
  # passed to the logarithm.
  safeProbs = numpy.where(probs > 0.0, probs, 1.0)
  return numpy.sum(-probs * numpy.log2(safeProbs))
64 65
def PyInfoGain(varMat):
  """ calculates the information gain for a variable

  **Arguments**

    varMat is a Numeric array with the number of possible occurrences
      of each result for each possible value of the given variable.

    So, for a variable which adopts 4 possible values and a result which
      has 3 possible values, varMat would be 4x3

  **Returns**

    The expected information gain (0 when varMat sums to zero)
  """
  variableRes = numpy.sum(varMat, 1)  # indexed by variable, Sv in Mitchell's notation
  overallRes = numpy.sum(varMat, 0)  # indexed by result, S in Mitchell's notation

  # Sum of each row's entropy weighted by that row's total count; it is
  # normalised by the grand total below.
  term2 = 0
  for rowTotal, row in zip(variableRes, varMat):
    term2 = term2 + rowTotal * InfoEntropy(row)

  tSum = sum(overallRes)
  if tSum != 0.0:
    term2 = 1. / tSum * term2
    gain = InfoEntropy(overallRes) - term2
  else:
    # no observations at all: nothing can be gained
    gain = 0
  return gain
# Bind the public names: the pure-Python implementations are used only when
# the C versions could not be imported.
if not hascEntropy:
  InfoEntropy = PyInfoEntropy
  InfoGain = PyInfoGain
else:
  InfoEntropy = cEntropy.InfoEntropy
  InfoGain = cEntropy.InfoGain