InfoBitRanker.h

Go to the documentation of this file.
00001 // $Id: InfoBitRanker.h 416 2007-11-10 05:51:36Z glandrum $
00002 //
00003 //  Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
00004 //   @All Rights Reserved@
00005 //
00006 
00007 #ifndef _RD_INFORANKER_H_
00008 #define _RD_INFORANKER_H_
00009 
00010 #include <RDGeneral/types.h>
00011 #include <DataStructs/BitVects.h>
00012 #include <iostream>
00013 
00014 
00015 /*! \brief Class used to rank bits based on a specified measure of infomation
00016  *
00017  * Basically a primitive mimic of the CombiChem "signal" functionality
00018  * To use:
00019  *  - create an instance of this class 
00020  *  - loop over the fingerprints in the dataset by calling accumulateVotes method
00021  *  - call getTopN to get the top n ranked bits
00022  *
00023  * Sample usage and results from the python wrapper:
00024  * Here's a small set of vectors:
00025  * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
00026  * ... 
00027  * 0001 0
00028  * 0101 0
00029  * 0010 1
00030  * 1110 1
00031  * 
00032  * Default ranker, using infogain:
00033  * >>> ranker = InfoBitRanker(4,2)  
00034  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00035  * ... 
00036  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00037  * ... 
00038  * 3 1.000 2 0
00039  * 2 1.000 0 2
00040  * 0 0.311 0 1
00041  * 
00042  * Using the biased infogain:
00043  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
00044  * >>> ranker.SetBiasList((1,))
00045  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00046  * ... 
00047  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00048  * ... 
00049  * 2 1.000 0 2
00050  * 0 0.311 0 1
00051  * 1 0.000 1 1
00052  * 
00053  * A chi squared ranker is also available:
00054  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
00055  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00056  * ... 
00057  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00058  * ... 
00059  * 3 4.000 2 0
00060  * 2 4.000 0 2
00061  * 0 1.333 0 1
00062  * 
00063  * As is a biased chi squared:
00064  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
00065  * >>> ranker.SetBiasList((1,))
00066  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00067  * ... 
00068  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00069  * ... 
00070  * 2 4.000 0 2
00071  * 0 1.333 0 1
00072  * 1 0.000 1 1
00073  */
00074 namespace RDInfoTheory {
00075   typedef std::vector<RDKit::USHORT> USHORT_VECT;
00076   typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
00077 
00078   class InfoBitRanker {
00079   public:
00080     
00081     /*! \brief the type of measure for information
00082      * 
00083      */
00084     typedef enum {
00085       ENTROPY=1,
00086       BIASENTROPY=2,
00087       CHISQUARE=3,
00088       BIASCHISQUARE=4
00089     } InfoType;
00090     
00091     /*! \brief Constructor 
00092      *
00093      * ARGUMENTS:
00094      *
00095      *   - nBits: the dimension of the bit vectors or the fingerprint length
00096      *   - nClasses: the number of classes used in the classification problem (e.g. active,
00097      *              moderately active, inactive etc.). It is assumed that the classes are 
00098      *              numbered from 0 to (nClasses - 1) 
00099      *   - infoType: the type of information metric
00100      */
00101     InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) :
00102     d_dims(nBits), d_classes(nClasses), d_type(infoType) {
00103       d_counts.resize(0);
00104       for (unsigned int i = 0; i < nClasses; i++) {
00105         USHORT_VECT cCount;
00106         cCount.resize(d_dims, 0);
00107         d_counts.push_back(cCount);
00108       }
00109       d_clsCount.resize(d_classes, 0);
00110       d_nInst = 0;
00111       d_top = 0;
00112       dp_topBits=0;
00113       d_biasList.resize(0);
00114       dp_maskBits=0;
00115     }
00116     
00117     ~InfoBitRanker() {
00118       if(dp_topBits)
00119         delete [] dp_topBits;
00120       if(dp_maskBits)
00121         delete dp_maskBits;
00122     }
00123 
00124     /*! \brief Accumulate the votes for all the bits turned on in a bit vector
00125      *  
00126      *  ARGUMENTS:
00127      *
00128      *   - bv : bit vector that supports [] operator
00129      *   - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses
00130      */
00131     void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
00132     void accumulateVotes(const SparseBitVect &bv, unsigned int label);
00133     
00134     /*! \brief Returns the top n bits ranked by the information metric
00135      *
00136      * This is actually the function where most of the work of ranking is happening
00137      * 
00138      *  \param num the number of top ranked bits that are required
00139      *
00140      *  \return a pointer to an information array. The client should *not*
00141      *          delete this
00142      */
00143     double *getTopN(unsigned int num);
00144     
00145     /*! \brief return the number of labelled instances(examples) or fingerprints seen so far
00146      *
00147      */
00148     unsigned int getNumInstances() const {
00149       return d_nInst;
00150     }
00151     
00152     /*! \brief return the number of classes 
00153      *
00154      */
00155     unsigned int getNumClasses() const {
00156       return d_classes;
00157     }
00158 
00159     /*! \brief Set the classes to which the entropy calculation should be biased
00160      *
00161      * This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits. 
00162      * In this mode, a bit must be correllated higher with one of the biased classes than all the 
00163      * other classes. For example, in a two class problem with actives and inactives, the fraction of 
00164      * actives that hit the bit has to be greater than the fraction of inactives that hit the bit
00165      *
00166      * ARGUMENTS:
00167      *   classList - list of class ids that we want a bias towards
00168      */
00169     void setBiasList(RDKit::INT_VECT &classList);
00170 
00171 
00172     /*! \brief Set the bits to be used as a mask
00173      *
00174      * If this function is called, only the bits which are present in the
00175      *   maskBits list will be used.
00176      *
00177      * ARGUMENTS:
00178      *   maskBits - the bits to be considered
00179      */
00180     void setMaskBits(RDKit::INT_VECT &maskBits);
00181 
00182     /*! \brief Write the top N bits to a stream
00183      *
00184      */
00185     void writeTopBitsToStream(std::ostream *outStream) const;
00186     
00187     /*! \brief Write the top bits to a file
00188      *
00189      */
00190     void writeTopBitsToFile(std::string fileName) const;
00191 
00192   private:
00193     /*! \brief check if we want to compute the info content for a bit based on the bias list
00194      *
00195      * This what happens here:
00196      *    - the fraction of items in each class that hit a particular bit are computed
00197      *    - the maximum of these fractions for classes that are not in the biasList are computed
00198      *    - If this maximum is less than the fraction for atleast one of classes in the biaslist
00199      *      the bit is considered good
00200      * ARGUMENTS:
00201      *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
00202      *              a 2D structure is assumed with the first row containing number of items of each class 
00203      *              with the bit set and the second row to entires of each class with the bit turned off
00204      */
00205     bool BiasCheckBit(RDKit::USHORT *resMat) const;
00206 
00207     /*! \brief Compute the biased info entropy gain based on the bias list
00208      *
00209      *  This what happens here:
00210      *    - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
00211      *    - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 
00212      *
00213      * ARGUMENTS:
00214      *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
00215      *              a 2D structure is assumed with the first row containing number of items of each class 
00216      *              with the bit set and the second row to entires of each class with the bit turned off
00217      */
00218     double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
00219 
00220     /*! \brief Compute the biased chi qsure value based on the bias list
00221      *
00222      *  This what happens here:
00223      *    - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
00224      *    - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 
00225      *
00226      * ARGUMENTS:
00227      *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
00228      *              a 2D structure is assumed with the first row containing number of items of each class 
00229      *              with the bit set and the second row to entires of each class with the bit turned off
00230      */
00231     double BiasChiSquareGain(RDKit::USHORT *resMat) const;
00232 
00233     unsigned int d_dims; // the number of bits in the fingerprints
00234     unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.)
00235     InfoType d_type; // the type of information meassure - currently we support only entropy
00236     VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class
00237     USHORT_VECT d_clsCount; // counter for the number of instances of each class 
00238     double *dp_topBits; // storage for the top ranked bits and the corresponding statistics
00239     unsigned int d_top; // the number of bits that have been ranked
00240     unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes
00241     RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits
00242     ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
00243     
00244   };
00245 }
00246 #endif

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3