00001 // $Id: InfoBitRanker.h 416 2007-11-10 05:51:36Z glandrum $ 00002 // 00003 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC 00004 // @All Rights Reserved@ 00005 // 00006 00007 #ifndef _RD_INFORANKER_H_ 00008 #define _RD_INFORANKER_H_ 00009 00010 #include <RDGeneral/types.h> 00011 #include <DataStructs/BitVects.h> 00012 #include <iostream> 00013 00014 00015 /*! \brief Class used to rank bits based on a specified measure of infomation 00016 * 00017 * Basically a primitive mimic of the CombiChem "signal" functionality 00018 * To use: 00019 * - create an instance of this class 00020 * - loop over the fingerprints in the dataset by calling accumulateVotes method 00021 * - call getTopN to get the top n ranked bits 00022 * 00023 * Sample usage and results from the python wrapper: 00024 * Here's a small set of vectors: 00025 * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i] 00026 * ... 00027 * 0001 0 00028 * 0101 0 00029 * 0010 1 00030 * 1110 1 00031 * 00032 * Default ranker, using infogain: 00033 * >>> ranker = InfoBitRanker(4,2) 00034 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00035 * ... 00036 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00037 * ... 00038 * 3 1.000 2 0 00039 * 2 1.000 0 2 00040 * 0 0.311 0 1 00041 * 00042 * Using the biased infogain: 00043 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY) 00044 * >>> ranker.SetBiasList((1,)) 00045 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00046 * ... 00047 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00048 * ... 00049 * 2 1.000 0 2 00050 * 0 0.311 0 1 00051 * 1 0.000 1 1 00052 * 00053 * A chi squared ranker is also available: 00054 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE) 00055 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00056 * ... 00057 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00058 * ... 00059 * 3 4.000 2 0 00060 * 2 4.000 0 2 00061 * 0 1.333 0 1 00062 * 00063 * As is a biased chi squared: 00064 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE) 00065 * >>> ranker.SetBiasList((1,)) 00066 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00067 * ... 00068 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00069 * ... 00070 * 2 4.000 0 2 00071 * 0 1.333 0 1 00072 * 1 0.000 1 1 00073 */ 00074 namespace RDInfoTheory { 00075 typedef std::vector<RDKit::USHORT> USHORT_VECT; 00076 typedef std::vector<USHORT_VECT> VECT_USHORT_VECT; 00077 00078 class InfoBitRanker { 00079 public: 00080 00081 /*! \brief the type of measure for information 00082 * 00083 */ 00084 typedef enum { 00085 ENTROPY=1, 00086 BIASENTROPY=2, 00087 CHISQUARE=3, 00088 BIASCHISQUARE=4 00089 } InfoType; 00090 00091 /*! \brief Constructor 00092 * 00093 * ARGUMENTS: 00094 * 00095 * - nBits: the dimension of the bit vectors or the fingerprint length 00096 * - nClasses: the number of classes used in the classification problem (e.g. active, 00097 * moderately active, inactive etc.). It is assumed that the classes are 00098 * numbered from 0 to (nClasses - 1) 00099 * - infoType: the type of information metric 00100 */ 00101 InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) : 00102 d_dims(nBits), d_classes(nClasses), d_type(infoType) { 00103 d_counts.resize(0); 00104 for (unsigned int i = 0; i < nClasses; i++) { 00105 USHORT_VECT cCount; 00106 cCount.resize(d_dims, 0); 00107 d_counts.push_back(cCount); 00108 } 00109 d_clsCount.resize(d_classes, 0); 00110 d_nInst = 0; 00111 d_top = 0; 00112 dp_topBits=0; 00113 d_biasList.resize(0); 00114 dp_maskBits=0; 00115 } 00116 00117 ~InfoBitRanker() { 00118 if(dp_topBits) 00119 delete [] dp_topBits; 00120 if(dp_maskBits) 00121 delete dp_maskBits; 00122 } 00123 00124 /*! \brief Accumulate the votes for all the bits turned on in a bit vector 00125 * 00126 * ARGUMENTS: 00127 * 00128 * - bv : bit vector that supports [] operator 00129 * - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses 00130 */ 00131 void accumulateVotes(const ExplicitBitVect &bv, unsigned int label); 00132 void accumulateVotes(const SparseBitVect &bv, unsigned int label); 00133 00134 /*! \brief Returns the top n bits ranked by the information metric 00135 * 00136 * This is actually the function where most of the work of ranking is happening 00137 * 00138 * \param num the number of top ranked bits that are required 00139 * 00140 * \return a pointer to an information array. The client should *not* 00141 * delete this 00142 */ 00143 double *getTopN(unsigned int num); 00144 00145 /*! \brief return the number of labelled instances(examples) or fingerprints seen so far 00146 * 00147 */ 00148 unsigned int getNumInstances() const { 00149 return d_nInst; 00150 } 00151 00152 /*! \brief return the number of classes 00153 * 00154 */ 00155 unsigned int getNumClasses() const { 00156 return d_classes; 00157 } 00158 00159 /*! \brief Set the classes to which the entropy calculation should be biased 00160 * 00161 * This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits. 00162 * In this mode, a bit must be correllated higher with one of the biased classes than all the 00163 * other classes. For example, in a two class problem with actives and inactives, the fraction of 00164 * actives that hit the bit has to be greater than the fraction of inactives that hit the bit 00165 * 00166 * ARGUMENTS: 00167 * classList - list of class ids that we want a bias towards 00168 */ 00169 void setBiasList(RDKit::INT_VECT &classList); 00170 00171 00172 /*! \brief Set the bits to be used as a mask 00173 * 00174 * If this function is called, only the bits which are present in the 00175 * maskBits list will be used. 00176 * 00177 * ARGUMENTS: 00178 * maskBits - the bits to be considered 00179 */ 00180 void setMaskBits(RDKit::INT_VECT &maskBits); 00181 00182 /*! \brief Write the top N bits to a stream 00183 * 00184 */ 00185 void writeTopBitsToStream(std::ostream *outStream) const; 00186 00187 /*! \brief Write the top bits to a file 00188 * 00189 */ 00190 void writeTopBitsToFile(std::string fileName) const; 00191 00192 private: 00193 /*! \brief check if we want to compute the info content for a bit based on the bias list 00194 * 00195 * This what happens here: 00196 * - the fraction of items in each class that hit a particular bit are computed 00197 * - the maximum of these fractions for classes that are not in the biasList are computed 00198 * - If this maximum is less than the fraction for atleast one of classes in the biaslist 00199 * the bit is considered good 00200 * ARGUMENTS: 00201 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes)) 00202 * a 2D structure is assumed with the first row containing number of items of each class 00203 * with the bit set and the second row to entires of each class with the bit turned off 00204 */ 00205 bool BiasCheckBit(RDKit::USHORT *resMat) const; 00206 00207 /*! \brief Compute the biased info entropy gain based on the bias list 00208 * 00209 * This what happens here: 00210 * - we call BiasCheckBit to see if the bit qualifies to compute the infocontent 00211 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 00212 * 00213 * ARGUMENTS: 00214 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes)) 00215 * a 2D structure is assumed with the first row containing number of items of each class 00216 * with the bit set and the second row to entires of each class with the bit turned off 00217 */ 00218 double BiasInfoEntropyGain(RDKit::USHORT *resMat) const; 00219 00220 /*! \brief Compute the biased chi qsure value based on the bias list 00221 * 00222 * This what happens here: 00223 * - we call BiasCheckBit to see if the bit qualifies to compute the infocontent 00224 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 00225 * 00226 * ARGUMENTS: 00227 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes)) 00228 * a 2D structure is assumed with the first row containing number of items of each class 00229 * with the bit set and the second row to entires of each class with the bit turned off 00230 */ 00231 double BiasChiSquareGain(RDKit::USHORT *resMat) const; 00232 00233 unsigned int d_dims; // the number of bits in the fingerprints 00234 unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.) 00235 InfoType d_type; // the type of information meassure - currently we support only entropy 00236 VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class 00237 USHORT_VECT d_clsCount; // counter for the number of instances of each class 00238 double *dp_topBits; // storage for the top ranked bits and the corresponding statistics 00239 unsigned int d_top; // the number of bits that have been ranked 00240 unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes 00241 RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits 00242 ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered 00243 00244 }; 00245 } 00246 #endif
1.5.6