InfoGainFuncs.h

Go to the documentation of this file.
00001 // $Id: InfoGainFuncs.h 869 2008-10-27 10:04:34Z glandrum $
00002 //
00003 //  Copyright (C) 2003 Rational Discovery LLC
00004 //
00005 
00006 #ifndef INFOGAINFUNC_H
00007 #define INFOGAINFUNC_H
00008 
#include <cmath>
#include <vector>

#include <RDGeneral/types.h>
00010 
00011 namespace RDInfoTheory {
00012 
00013   template<class T> double ChiSquare(T *dMat, long int dim1,long int dim2) {
00014     // For a contingency matrix with each column corresponding to a class and each row to a 
00015     // the descriptor (or variable) state, the matrix looks something like for 3x3 problem
00016     // 
00017     //            1    2    3   Totals
00018     //      1 |  N11  N12  N13    R1
00019     //      2 |  N21  N22  N23    R2
00020     //      3 |  N31  N32  N33    R3
00021     // Totals |   C1   C2   C3    N
00022     //
00023     //  Th chi squere formula is 
00024     //  chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
00025     T *rowSums, *colSums;
00026     int i, j, tSum;
00027     // find the row sum
00028     tSum = 0;
00029     rowSums = new T[dim1];
00030     for (i = 0; i < dim1; i++) {
00031       int idx1 = i*dim2;
00032       rowSums[i] = (T)0.0;
00033       for (j = 0; j < dim2; j++) {
00034         rowSums[i] += dMat[idx1 + j];
00035       }
00036       tSum += (int)rowSums[i];
00037     }
00038 
00039     // find the column sums
00040     colSums = new T[dim2];
00041     for (i = 0; i < dim2; i++) {
00042       colSums[i] = (T)0.0;
00043       for (j = 0; j < dim1; j++) {
00044         colSums[i] += dMat[j*dim2 + i];
00045       }
00046     }
00047     
00048     double chi = 0.0;
00049     for ( i = 0; i < dim1; i++) {
00050       double rchi = 0.0;
00051       for (j = 0; j < dim2; j++) {
00052         rchi += (pow((double)dMat[i*dim2 + j], 2)/colSums[j]);
00053       }
00054       chi += ( ((double)tSum/rowSums[i])*rchi );
00055     }
00056     chi -= tSum;
00057     delete [] rowSums;
00058     delete [] colSums;
00059 
00060     return chi;
00061   }
00062 
00063   template<class T> double InfoEntropy(T *tPtr, long int dim) {
00064     int i;
00065     T nInstances = 0;
00066     double accum=0.0,d;
00067     
00068     for(i=0;i<dim;i++){
00069       nInstances += tPtr[i];
00070     }
00071   
00072     if(nInstances != 0){
00073       for(i=0;i<dim;i++){
00074         d = (double)tPtr[i]/nInstances;
00075         if(d != 0){
00076           accum += -d*log(d);
00077         }
00078       }
00079     }
00080     return accum/log(2.0);
00081   }
00082 
00083   template<class T> double InfoEntropyGain(T *dMat, long int dim1,long int dim2) {
00084     T *variableRes, *overallRes;
00085     double gain,term2;
00086     int tSum;
00087 
00088     //std::cerr<<" --------\n    ieg: "<<dim1<<" "<<dim2<<std::endl;
00089     variableRes = new T[dim1];
00090     for(long int i=0;i<dim1;i++){
00091       long int idx1 = i*dim2;
00092       variableRes[i] = (T)0.0;
00093       for(long int j=0;j<dim2;j++){
00094         variableRes[i] += dMat[idx1+j];
00095         //std::cerr<<"  "<<i<<" "<<j<<" "<<dMat[idx1+j]<<std::endl;
00096       }
00097     }
00098 
00099     overallRes = new T[dim2];
00100     // do the col sums
00101     for(long int i=0;i<dim2;i++){
00102       overallRes[i] = (T)0.0;
00103       for(long int j=0;j<dim1;j++){
00104         overallRes[i] += dMat[j*dim2+i];
00105         //std::cerr<<"  "<<i<<" "<<j<<" "<<dMat[j*dim2+i]<<std::endl;
00106       }
00107     }
00108 
00109     term2 = 0.0;
00110     for(long int i=0;i<dim1;i++) {
00111       T *tPtr;
00112       tPtr = dMat + i*dim2;
00113       term2 += variableRes[i] * InfoEntropy(tPtr,dim2);
00114     }
00115     tSum = 0;
00116     for(long int i=0;i<dim2;i++){
00117       tSum += static_cast<int>(overallRes[i]);
00118     }
00119     
00120     if(tSum != 0){
00121       term2 /= tSum;
00122       gain = InfoEntropy(overallRes,dim2) - term2;
00123     }
00124     else{
00125       gain = 0.0;
00126     }
00127     //std::cerr<<"  >gain> "<<gain<<std::endl;
00128     
00129     delete [] overallRes;
00130     delete [] variableRes;
00131     return gain;
00132   }
00133    
00134   
00135 }
00136 #endif
00137 
00138 

Generated on Fri Apr 3 06:03:02 2009 for RDCode by  doxygen 1.5.6