InfoGainFuncs.h

Go to the documentation of this file.
00001 // $Id: InfoGainFuncs.h 2 2006-05-06 22:54:39Z glandrum $
00002 //
00003 //  Copyright (C) 2003 Rational Discovery LLC
00004 //
00005 
00006 #ifndef INFOGAINFUNC_H
00007 #define INFOGAINFUNC_H
00008 
#include <RDGeneral/types.h>

#include <cmath>
#include <vector>
00010 
00011 namespace RDInfoTheory {
00012 
00013   template<class T> double ChiSquare(T *dMat, long int dim1,long int dim2) {
00014     // For a contingency matrix with each column corresponding to a class and each row to a 
00015     // the descriptor (or variable) state, the matrix looks something like for 3x3 problem
00016     // 
00017     //            1    2    3   Totals
00018     //      1 |  N11  N12  N13    R1
00019     //      2 |  N21  N22  N23    R2
00020     //      3 |  N31  N32  N33    R3
00021     // Totals |   C1   C2   C3    N
00022     //
00023     //  Th chi squere formula is 
00024     //  chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
00025     T *rowSums, *colSums;
00026     int i, j, tSum;
00027     // find the row sum
00028     tSum = 0;
00029     rowSums = new T[dim1];
00030     for (i = 0; i < dim1; i++) {
00031       int idx1 = i*dim2;
00032       rowSums[i] = (T)0.0;
00033       for (j = 0; j < dim2; j++) {
00034         rowSums[i] += dMat[idx1 + j];
00035       }
00036       tSum += (int)rowSums[i];
00037     }
00038 
00039     // find the column sums
00040     colSums = new T[dim2];
00041     for (i = 0; i < dim2; i++) {
00042       colSums[i] = (T)0.0;
00043       for (j = 0; j < dim1; j++) {
00044         colSums[i] += dMat[j*dim2 + i];
00045       }
00046     }
00047     
00048     double chi = 0.0;
00049     for ( i = 0; i < dim1; i++) {
00050       double rchi = 0.0;
00051       for (j = 0; j < dim2; j++) {
00052         rchi += (pow((double)dMat[i*dim2 + j], 2)/colSums[j]);
00053       }
00054       chi += ( ((double)tSum/rowSums[i])*rchi );
00055     }
00056     chi -= tSum;
00057     delete [] rowSums;
00058     delete [] colSums;
00059 
00060     return chi;
00061   }
00062 
00063   template<class T> double InfoEntropy(T *tPtr, long int dim) {
00064     int i;
00065     T nInstances = 0;
00066     double accum=0.0,d;
00067     
00068     for(i=0;i<dim;i++){
00069       nInstances += tPtr[i];
00070     }
00071   
00072     if(nInstances != 0){
00073       for(i=0;i<dim;i++){
00074         d = (double)tPtr[i]/nInstances;
00075         if(d != 0){
00076           accum += -d*log(d);
00077         }
00078       }
00079     }
00080     return accum/log(2.0);
00081   }
00082 
00083   template<class T> double InfoEntropyGain(T *dMat, long int dim1,long int dim2) {
00084     int i,j;
00085     T *variableRes, *overallRes;
00086     double gain,term2;
00087     int tSum;
00088 
00089     variableRes = new T[dim1];
00090     for(i=0;i<dim1;i++){
00091       int idx1 = i*dim2;
00092       variableRes[i] = (T)0.0;
00093       for(j=0;j<dim2;j++){
00094         variableRes[i] += dMat[idx1+j];
00095       }
00096     }
00097 
00098     overallRes = new T[dim2];
00099     // do the col sums
00100     for(i=0;i<dim2;i++){
00101       overallRes[i] = (T)0.0;
00102       for(j=0;j<dim1;j++){
00103         overallRes[i] += dMat[j*dim2+i];
00104       }
00105     }
00106 
00107     term2 = 0.0;
00108     for(i=0;i<dim1;i++) {
00109       T *tPtr;
00110       tPtr = dMat + i*dim2;
00111       term2 += variableRes[i] * InfoEntropy(tPtr,dim2);
00112     }
00113     tSum = 0;
00114     for(i=0;i<dim2;i++){
00115       tSum += static_cast<int>(overallRes[i]);
00116     }
00117     
00118     if(tSum != 0){
00119       term2 /= tSum;
00120       gain = InfoEntropy(overallRes,dim2) - term2;
00121     }
00122     else{
00123       gain = 0.0;
00124     }
00125     
00126     delete [] overallRes;
00127     delete [] variableRes;
00128     return gain;
00129   }
00130    
00131   
00132 }
00133 #endif
00134 
00135 

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3