RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
InfoBitRanker.h
Go to the documentation of this file.
1// $Id$
2//
3// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10
11#include <RDGeneral/export.h>
12#ifndef _RD_INFORANKER_H_
13#define _RD_INFORANKER_H_
14
15#include <RDGeneral/types.h>
17#include <iostream>
18
19/*! \brief Class used to rank bits based on a specified measure of information
20 *
21 * Basically a primitive mimic of the CombiChem "signal" functionality
22 * To use:
23 * - create an instance of this class
24 * - loop over the fingerprints in the dataset by calling accumulateVotes
25 *method
26 * - call getTopN to get the top n ranked bits
27 *
28 * Sample usage and results from the python wrapper:
29 * Here's a small set of vectors:
30 * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
31 * ...
32 * 0001 0
33 * 0101 0
34 * 0010 1
35 * 1110 1
36 *
37 * Default ranker, using infogain:
38 * >>> ranker = InfoBitRanker(4,2)
39 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
40 * ...
41 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
42 *int(bit),'%.3f'%gain,int(n0),int(n1)
43 * ...
44 * 3 1.000 2 0
45 * 2 1.000 0 2
46 * 0 0.311 0 1
47 *
48 * Using the biased infogain:
49 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
50 * >>> ranker.SetBiasList((1,))
51 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
52 * ...
53 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
54 *int(bit),'%.3f'%gain,int(n0),int(n1)
55 * ...
56 * 2 1.000 0 2
57 * 0 0.311 0 1
58 * 1 0.000 1 1
59 *
60 * A chi squared ranker is also available:
61 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
62 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
63 * ...
64 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
65 *int(bit),'%.3f'%gain,int(n0),int(n1)
66 * ...
67 * 3 4.000 2 0
68 * 2 4.000 0 2
69 * 0 1.333 0 1
70 *
71 * As is a biased chi squared:
72 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
73 * >>> ranker.SetBiasList((1,))
74 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
75 * ...
76 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
77 *int(bit),'%.3f'%gain,int(n0),int(n1)
78 * ...
79 * 2 4.000 0 2
80 * 0 1.333 0 1
81 * 1 0.000 1 1
82 */
83namespace RDInfoTheory {
84typedef std::vector<RDKit::USHORT> USHORT_VECT;
85typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
86
88 public:
89 /*! \brief the type of measure for information
90 *
91 */
92 typedef enum {
93 ENTROPY = 1,  // information gain; the constructor's default metric
94 BIASENTROPY = 2,  // information gain biased towards the classes given to setBiasList
95 CHISQUARE = 3,  // chi squared
96 BIASCHISQUARE = 4  // chi squared, used together with a bias list
97 } InfoType;
98
99 /*! \brief Constructor
100 *
101 * ARGUMENTS:
102 *
103 * - nBits: the dimension of the bit vectors or the fingerprint length
104 * - nClasses: the number of classes used in the classification problem
105 *(e.g. active,
106 * moderately active, inactive etc.). It is assumed that the
107 *classes are
108 * numbered from 0 to (nClasses - 1)
109 * - infoType: the type of information metric
110 */
111 InfoBitRanker(unsigned int nBits, unsigned int nClasses,
112 InfoType infoType = InfoBitRanker::ENTROPY)
113 : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
114 d_counts.resize(0);  // start from an empty per-class hit table
115 for (unsigned int i = 0; i < nClasses; i++) {  // add one zeroed row of d_dims counters per class
116 USHORT_VECT cCount;
117 cCount.resize(d_dims, 0);
118 d_counts.push_back(cCount);
119 }
120 d_clsCount.resize(d_classes, 0);  // per-class instance counters, all zero
121 d_nInst = 0;  // no fingerprints accumulated yet
122 d_top = 0;  // no bits ranked yet
123 dp_topBits = nullptr;  // no storage for ranked bits has been allocated
124 d_biasList.resize(0);  // bias list starts empty
125 dp_maskBits = nullptr;  // no bit mask set
126 }
127
129 if (dp_topBits) {
130 delete[] dp_topBits;
131 }
132 if (dp_maskBits) {
133 delete dp_maskBits;
134 }
135 }
136
137 /*! \brief Accumulate the votes for all the bits turned on in a bit vector
138 *
139 * ARGUMENTS:
140 *
141 * - bv : bit vector that supports [] operator
142 * - label : the class label for the bit vector. It is assumed that 0 <=
143 *class < nClasses
144 */
145 void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
146 void accumulateVotes(const SparseBitVect &bv, unsigned int label);
147
148 /*! \brief Returns the top n bits ranked by the information metric
149 *
150 * This is actually the function where most of the work of ranking is
151 *happening
152 *
153 * \param num the number of top ranked bits that are required
154 *
155 * \return a pointer to an information array. The client should *not*
156 * delete this
157 */
158 double *getTopN(unsigned int num);
159
160 /*! \brief return the number of labelled instances(examples) or fingerprints
161 *seen so far
162 *
163 */
164 unsigned int getNumInstances() const { return d_nInst; }
165
166 /*! \brief return the number of classes
167 *
168 */
169 unsigned int getNumClasses() const { return d_classes; }
170
171 /*! \brief Set the classes to which the entropy calculation should be biased
172 *
173 * This list contains a set of class ids used when in the BIASENTROPY mode of
174 *ranking bits.
175 * In this mode, a bit must be correlated higher with one of the biased
176 *classes than all the
177 * other classes. For example, in a two class problem with actives and
178 *inactives, the fraction of
179 * actives that hit the bit has to be greater than the fraction of inactives
180 *that hit the bit
181 *
182 * ARGUMENTS:
183 * classList - list of class ids that we want a bias towards
184 */
185 void setBiasList(RDKit::INT_VECT &classList);
186
187 /*! \brief Set the bits to be used as a mask
188 *
189 * If this function is called, only the bits which are present in the
190 * maskBits list will be used.
191 *
192 * ARGUMENTS:
193 * maskBits - the bits to be considered
194 */
196
197 /*! \brief Write the top N bits to a stream
198 *
199 */
200 void writeTopBitsToStream(std::ostream *outStream) const;
201
202 /*! \brief Write the top bits to a file
203 *
204 */
205 void writeTopBitsToFile(const std::string &fileName) const;
206
207 private:
208 /*! \brief check if we want to compute the info content for a bit based on the
209 *bias list
210 *
211 * This is what happens here:
212 * - the fraction of items in each class that hit a particular bit are
213 *computed
214 * - the maximum of these fractions for classes that are not in the
215 *biasList are computed
216 * - If this maximum is less than the fraction for at least one of the
217 * classes in the biaslist, the bit is considered good
218 * ARGUMENTS:
219 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
220 *of classes))
221 * a 2D structure is assumed with the first row containing number
222 *of items of each class
223 * with the bit set and the second row to entries of each class
224 *with the bit turned off
225 */
226 bool BiasCheckBit(RDKit::USHORT *resMat) const;
227
228 /*! \brief Compute the biased info entropy gain based on the bias list
229 *
230 * This is what happens here:
231 * - we call BiasCheckBit to see if the bit qualifies to compute the
232 *infocontent
233 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
234 *
235 * ARGUMENTS:
236 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
237 *of classes))
238 * a 2D structure is assumed with the first row containing number
239 *of items of each class
240 * with the bit set and the second row to entries of each class
241 *with the bit turned off
242 */
243 double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
244
245 /*! \brief Compute the biased chi square value based on the bias list
246 *
247 * This is what happens here:
248 * - we call BiasCheckBit to see if the bit qualifies to compute the
249 *infocontent
250 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
251 *
252 * ARGUMENTS:
253 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
254 *of classes))
255 * a 2D structure is assumed with the first row containing number
256 *of items of each class
257 * with the bit set and the second row to entries of each class
258 *with the bit turned off
259 */
260 double BiasChiSquareGain(RDKit::USHORT *resMat) const;
261
262 unsigned int d_dims; // the number of bits in the fingerprints
263 unsigned int d_classes; // the number of classes (active, inactive,
264 // moderately active etc.)
265 InfoType d_type; // the type of information measure - one of the InfoType
266 // values (ENTROPY, BIASENTROPY, CHISQUARE, BIASCHISQUARE)
267 VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
268 // each bit for each class
269 USHORT_VECT d_clsCount; // counter for the number of instances of each class
270 double *dp_topBits; // storage for the top ranked bits and the corresponding
271 // statistics
272 unsigned int d_top; // the number of bits that have been ranked
273 unsigned int d_nInst; // total number of instances or fingerprints used
274 // accumulate votes
276 d_biasList; // if we want a bias towards certain classes in ranking bits
277 ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
278};
279} // namespace RDInfoTheory
280#endif
Pulls in all the BitVect classes.
a class for bit vectors that are densely occupied
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label)
Accumulate the votes for all the bits turned on in a bit vector.
InfoType
the type of measure for information
void setMaskBits(RDKit::INT_VECT &maskBits)
Set the bits to be used as a mask.
void writeTopBitsToFile(const std::string &fileName) const
Write the top bits to a file.
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY)
Constructor.
unsigned int getNumClasses() const
return the number of classes
void accumulateVotes(const SparseBitVect &bv, unsigned int label)
double * getTopN(unsigned int num)
Returns the top n bits ranked by the information metric.
unsigned int getNumInstances() const
return the number of labelled instances(examples) or fingerprints seen so far
void writeTopBitsToStream(std::ostream *outStream) const
Write the top N bits to a stream.
void setBiasList(RDKit::INT_VECT &classList)
Set the classes to which the entropy calculation should be biased.
a class for bit vectors that are sparsely occupied.
#define RDKIT_INFOTHEORY_EXPORT
Definition export.h:249
Class used to rank bits based on a specified measure of information.
std::vector< RDKit::USHORT > USHORT_VECT
std::vector< USHORT_VECT > VECT_USHORT_VECT
std::vector< int > INT_VECT
Definition types.h:291
unsigned short USHORT
Definition types.h:288