RDKit
Open-source cheminformatics and machine learning.
InfoBitRanker.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #ifndef _RD_INFORANKER_H_
12 #define _RD_INFORANKER_H_
13 
14 #include <RDGeneral/types.h>
15 #include <DataStructs/BitVects.h>
16 #include <iostream>
17 
18 /*! \brief Class used to rank bits based on a specified measure of infomation
19  *
20  * Basically a primitive mimic of the CombiChem "signal" functionality
21  * To use:
22  * - create an instance of this class
23  * - loop over the fingerprints in the dataset by calling accumulateVotes
24  *method
25  * - call getTopN to get the top n ranked bits
26  *
27  * Sample usage and results from the python wrapper:
28  * Here's a small set of vectors:
29  * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
30  * ...
31  * 0001 0
32  * 0101 0
33  * 0010 1
34  * 1110 1
35  *
36  * Default ranker, using infogain:
37  * >>> ranker = InfoBitRanker(4,2)
38  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
39  * ...
40  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
41  *int(bit),'%.3f'%gain,int(n0),int(n1)
42  * ...
43  * 3 1.000 2 0
44  * 2 1.000 0 2
45  * 0 0.311 0 1
46  *
47  * Using the biased infogain:
48  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
49  * >>> ranker.SetBiasList((1,))
50  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
51  * ...
52  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
53  *int(bit),'%.3f'%gain,int(n0),int(n1)
54  * ...
55  * 2 1.000 0 2
56  * 0 0.311 0 1
57  * 1 0.000 1 1
58  *
59  * A chi squared ranker is also available:
60  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
61  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
62  * ...
63  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
64  *int(bit),'%.3f'%gain,int(n0),int(n1)
65  * ...
66  * 3 4.000 2 0
67  * 2 4.000 0 2
68  * 0 1.333 0 1
69  *
70  * As is a biased chi squared:
71  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
72  * >>> ranker.SetBiasList((1,))
73  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
74  * ...
75  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
76  *int(bit),'%.3f'%gain,int(n0),int(n1)
77  * ...
78  * 2 4.000 0 2
79  * 0 1.333 0 1
80  * 1 0.000 1 1
81  */
82 namespace RDInfoTheory {
83 typedef std::vector<RDKit::USHORT> USHORT_VECT;
84 typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
85 
87  public:
88  /*! \brief the type of measure for information
89  *
90  */
91  typedef enum {
92  ENTROPY = 1,
94  CHISQUARE = 3,
96  } InfoType;
97 
98  /*! \brief Constructor
99  *
100  * ARGUMENTS:
101  *
102  * - nBits: the dimension of the bit vectors or the fingerprint length
103  * - nClasses: the number of classes used in the classification problem
104  *(e.g. active,
105  * moderately active, inactive etc.). It is assumed that the
106  *classes are
107  * numbered from 0 to (nClasses - 1)
108  * - infoType: the type of information metric
109  */
110  InfoBitRanker(unsigned int nBits, unsigned int nClasses,
111  InfoType infoType = InfoBitRanker::ENTROPY)
112  : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
113  d_counts.resize(0);
114  for (unsigned int i = 0; i < nClasses; i++) {
115  USHORT_VECT cCount;
116  cCount.resize(d_dims, 0);
117  d_counts.push_back(cCount);
118  }
119  d_clsCount.resize(d_classes, 0);
120  d_nInst = 0;
121  d_top = 0;
122  dp_topBits = 0;
123  d_biasList.resize(0);
124  dp_maskBits = 0;
125  }
126 
128  if (dp_topBits) delete[] dp_topBits;
129  if (dp_maskBits) delete dp_maskBits;
130  }
131 
132  /*! \brief Accumulate the votes for all the bits turned on in a bit vector
133  *
134  * ARGUMENTS:
135  *
136  * - bv : bit vector that supports [] operator
137  * - label : the class label for the bit vector. It is assumed that 0 <=
138  *class < nClasses
139  */
140  void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
141  void accumulateVotes(const SparseBitVect &bv, unsigned int label);
142 
143  /*! \brief Returns the top n bits ranked by the information metric
144  *
145  * This is actually the function where most of the work of ranking is
146  *happening
147  *
148  * \param num the number of top ranked bits that are required
149  *
150  * \return a pointer to an information array. The client should *not*
151  * delete this
152  */
153  double *getTopN(unsigned int num);
154 
155  /*! \brief return the number of labelled instances(examples) or fingerprints
156  *seen so far
157  *
158  */
159  unsigned int getNumInstances() const { return d_nInst; }
160 
161  /*! \brief return the number of classes
162  *
163  */
164  unsigned int getNumClasses() const { return d_classes; }
165 
166  /*! \brief Set the classes to which the entropy calculation should be biased
167  *
168  * This list contains a set of class ids used when in the BIASENTROPY mode of
169  *ranking bits.
170  * In this mode, a bit must be correllated higher with one of the biased
171  *classes than all the
172  * other classes. For example, in a two class problem with actives and
173  *inactives, the fraction of
174  * actives that hit the bit has to be greater than the fraction of inactives
175  *that hit the bit
176  *
177  * ARGUMENTS:
178  * classList - list of class ids that we want a bias towards
179  */
180  void setBiasList(RDKit::INT_VECT &classList);
181 
182  /*! \brief Set the bits to be used as a mask
183  *
184  * If this function is called, only the bits which are present in the
185  * maskBits list will be used.
186  *
187  * ARGUMENTS:
188  * maskBits - the bits to be considered
189  */
190  void setMaskBits(RDKit::INT_VECT &maskBits);
191 
192  /*! \brief Write the top N bits to a stream
193  *
194  */
195  void writeTopBitsToStream(std::ostream *outStream) const;
196 
197  /*! \brief Write the top bits to a file
198  *
199  */
200  void writeTopBitsToFile(const std::string &fileName) const;
201 
202  private:
203  /*! \brief check if we want to compute the info content for a bit based on the
204  *bias list
205  *
206  * This what happens here:
207  * - the fraction of items in each class that hit a particular bit are
208  *computed
209  * - the maximum of these fractions for classes that are not in the
210  *biasList are computed
211  * - If this maximum is less than the fraction for atleast one of classes
212  *in the biaslist
213  * the bit is considered good
214  * ARGUMENTS:
215  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
216  *of classes))
217  * a 2D structure is assumed with the first row containing number
218  *of items of each class
219  * with the bit set and the second row to entires of each class
220  *with the bit turned off
221  */
222  bool BiasCheckBit(RDKit::USHORT *resMat) const;
223 
224  /*! \brief Compute the biased info entropy gain based on the bias list
225  *
226  * This what happens here:
227  * - we call BiasCheckBit to see if the bit qualifies to compute the
228  *infocontent
229  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
230  *
231  * ARGUMENTS:
232  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
233  *of classes))
234  * a 2D structure is assumed with the first row containing number
235  *of items of each class
236  * with the bit set and the second row to entires of each class
237  *with the bit turned off
238  */
239  double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
240 
241  /*! \brief Compute the biased chi qsure value based on the bias list
242  *
243  * This what happens here:
244  * - we call BiasCheckBit to see if the bit qualifies to compute the
245  *infocontent
246  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
247  *
248  * ARGUMENTS:
249  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
250  *of classes))
251  * a 2D structure is assumed with the first row containing number
252  *of items of each class
253  * with the bit set and the second row to entires of each class
254  *with the bit turned off
255  */
256  double BiasChiSquareGain(RDKit::USHORT *resMat) const;
257 
258  unsigned int d_dims; // the number of bits in the fingerprints
259  unsigned int d_classes; // the number of classes (active, inactive,
260  // moderately active etc.)
261  InfoType d_type; // the type of information meassure - currently we support
262  // only entropy
263  VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
264  // each bit for each class
265  USHORT_VECT d_clsCount; // counter for the number of instances of each class
266  double *dp_topBits; // storage for the top ranked bits and the corresponding
267  // statistics
268  unsigned int d_top; // the number of bits that have been ranked
269  unsigned int d_nInst; // total number of instances or fingerprints used
270  // accumulate votes
272  d_biasList; // if we want a bias towards certain classes in ranking bits
273  ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
274 };
275 }
276 #endif
unsigned int getNumClasses() const
return the number of classes
unsigned short USHORT
Definition: types.h:185
Pulls in all the BitVect classes.
void setMaskBits(RDKit::INT_VECT &maskBits)
Set the bits to be used as a mask.
unsigned int getNumInstances() const
return the number of labelled instances(examples) or fingerprints seen so far
void writeTopBitsToStream(std::ostream *outStream) const
Write the top N bits to a stream.
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:33
Class used to rank bits based on a specified measure of infomation.
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label)
Accumulate the votes for all the bits turned on in a bit vector.
std::vector< int > INT_VECT
Definition: types.h:188
std::vector< USHORT_VECT > VECT_USHORT_VECT
Definition: InfoBitRanker.h:84
InfoType
the type of measure for information
Definition: InfoBitRanker.h:91
double * getTopN(unsigned int num)
Returns the top n bits ranked by the information metric.
void writeTopBitsToFile(const std::string &fileName) const
Write the top bits to a file.
a class for bit vectors that are densely occupied
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY)
Constructor.
void setBiasList(RDKit::INT_VECT &classList)
Set the classes to which the entropy calculation should be biased.
std::vector< RDKit::USHORT > USHORT_VECT
Definition: InfoBitRanker.h:83