RDKit
Open-source cheminformatics and machine learning.
InfoGainFuncs.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003 Rational Discovery LLC
4 //
5 
6 #ifndef INFOGAINFUNC_H
7 #define INFOGAINFUNC_H
8 
#include <RDGeneral/types.h>

#include <cmath>
#include <cstddef>
#include <vector>
10 
11 namespace RDInfoTheory {
12 
// Computes the chi-square statistic of a contingency table.
//
// The table is stored row-major in dMat: each of the dim1 rows corresponds to
// one state of the descriptor (variable) and each of the dim2 columns to one
// class. For a 3x3 problem the table looks like:
//
//                1     2     3    Totals
//        1    | N11   N12   N13     R1
//        2    | N21   N22   N23     R2
//        3    | N31   N32   N33     R3
//      Totals |  C1    C2    C3     N
//
// The chi-square formula used is
//      chi = sum_i( (N / Ri) * sum_j( Nij^2 / Cj ) ) - N
//
// Parameters:
//   dMat - pointer to dim1 * dim2 entries, row-major; entries are expected to
//          be non-negative counts
//   dim1 - number of rows (variable states)
//   dim2 - number of columns (classes)
//
// Returns the chi-square value; 0.0 if either dimension is not positive.
//
// Notes:
//   - Rows or columns whose total is zero are skipped. Every cell in such a
//     row/column is zero for count data, so it contributes nothing to the
//     statistic; evaluating the formula literally would produce 0/0 = NaN.
//   - N is accumulated from the row totals truncated to int, so for a
//     non-integral T any fractional part of a row total is dropped from N.
template <class T>
double ChiSquare(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }
  const std::size_t nRows = static_cast<std::size_t>(dim1);
  const std::size_t nCols = static_cast<std::size_t>(dim2);

  // Row (Ri) and column (Cj) totals, gathered in a single sweep. Each total is
  // still accumulated in ascending index order.
  std::vector<T> rowSums(nRows, static_cast<T>(0));
  std::vector<T> colSums(nCols, static_cast<T>(0));
  for (std::size_t r = 0; r < nRows; ++r) {
    const T *row = dMat + r * nCols;
    for (std::size_t c = 0; c < nCols; ++c) {
      rowSums[r] += row[c];
      colSums[c] += row[c];
    }
  }

  // Grand total N.
  int tSum = 0;
  for (std::size_t r = 0; r < nRows; ++r) {
    tSum += static_cast<int>(rowSums[r]);
  }

  double chi = 0.0;
  for (std::size_t r = 0; r < nRows; ++r) {
    if (rowSums[r] == static_cast<T>(0)) {
      continue;  // empty row: no contribution, avoids N / 0
    }
    const T *row = dMat + r * nCols;
    double rowTerm = 0.0;
    for (std::size_t c = 0; c < nCols; ++c) {
      if (colSums[c] == static_cast<T>(0)) {
        continue;  // empty column: no contribution, avoids 0 / 0
      }
      const double cell = static_cast<double>(row[c]);
      rowTerm += (cell * cell) / colSums[c];
    }
    chi += (static_cast<double>(tSum) / rowSums[r]) * rowTerm;
  }
  chi -= tSum;

  return chi;
}
65 
// Computes the Shannon entropy, in bits, of a vector of counts.
//
// Each entry is divided by the sum of all entries to obtain a fraction d, and
// the result is  -sum(d * ln(d)) / ln(2)  taken over the non-zero fractions.
//
// Parameters:
//   tPtr - pointer to dim entries
//   dim  - number of entries
//
// Returns the entropy; 0.0 when the entries sum to zero (including dim <= 0).
template <class T>
double InfoEntropy(T *tPtr, long int dim) {
  // The index type matches the long int bound so large vectors are not
  // walked with a narrower int counter.
  T nInstances = 0;
  for (long int i = 0; i < dim; ++i) {
    nInstances += tPtr[i];
  }
  if (nInstances == 0) {
    return 0.0;
  }

  double accum = 0.0;
  for (long int i = 0; i < dim; ++i) {
    const double d = static_cast<double>(tPtr[i]) / nInstances;
    if (d != 0) {  // a zero fraction contributes nothing; skips log(0)
      accum += -d * std::log(d);
    }
  }
  // Convert from nats to bits.
  return accum / std::log(2.0);
}
86 
// Computes the information entropy gain of a contingency table.
//
// The table is stored row-major in dMat with dim1 rows and dim2 columns. The
// gain is
//      InfoEntropy(column totals) - sum_i( Ri * InfoEntropy(row i) ) / N
// where Ri is the total of row i and N is the sum of the column totals.
//
// Parameters:
//   dMat - pointer to dim1 * dim2 entries, row-major
//   dim1 - number of rows
//   dim2 - number of columns
//
// Returns the gain; 0.0 when N is zero or either dimension is not positive.
//
// Note: N is accumulated from the column totals truncated to int, so for a
// non-integral T any fractional part of a column total is dropped from N.
template <class T>
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }
  const std::size_t nRows = static_cast<std::size_t>(dim1);
  const std::size_t nCols = static_cast<std::size_t>(dim2);

  // Row totals (variableRes) and column totals (overallRes), gathered in one
  // sweep. The vectors release their storage on every exit path, including
  // when an allocation throws.
  std::vector<T> variableRes(nRows, static_cast<T>(0));
  std::vector<T> overallRes(nCols, static_cast<T>(0));
  for (std::size_t r = 0; r < nRows; ++r) {
    const T *row = dMat + r * nCols;
    for (std::size_t c = 0; c < nCols; ++c) {
      variableRes[r] += row[c];
      overallRes[c] += row[c];
    }
  }

  // Sum of the per-row entropies, each weighted by its row total.
  double term2 = 0.0;
  for (std::size_t r = 0; r < nRows; ++r) {
    T *rowPtr = dMat + r * nCols;
    term2 += variableRes[r] * InfoEntropy(rowPtr, dim2);
  }

  int tSum = 0;
  for (std::size_t c = 0; c < nCols; ++c) {
    tSum += static_cast<int>(overallRes[c]);
  }
  if (tSum == 0) {
    return 0.0;  // empty table: nothing to gain
  }

  term2 /= tSum;
  return InfoEntropy(&overallRes[0], dim2) - term2;
}
137 }
138 #endif
Class used to rank bits based on a specified measure of information.
double InfoEntropyGain(T *dMat, long int dim1, long int dim2)
Definition: InfoGainFuncs.h:88
double ChiSquare(T *dMat, long int dim1, long int dim2)
Definition: InfoGainFuncs.h:14
double InfoEntropy(T *tPtr, long int dim)
Definition: InfoGainFuncs.h:67