RDKit
Open-source cheminformatics and machine learning.
MaxMinPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef __RD_MAXMINPICKER_H__
11 #define __RD_MAXMINPICKER_H__
12 
13 #include <RDGeneral/types.h>
14 #include <RDGeneral/utils.h>
15 #include <RDGeneral/Invariant.h>
16 #include <RDGeneral/RDLog.h>
17 #include <RDGeneral/Exceptions.h>
18 #include <cstdlib>
19 #include "DistPicker.h"
20 #include <boost/random.hpp>
21 
22 namespace RDPickers {
23 
24 namespace {
25 class distmatFunctor {
26  public:
27  distmatFunctor(const double *distMat) : dp_distMat(distMat){};
28  double operator()(unsigned int i, unsigned int j) {
29  return getDistFromLTM(this->dp_distMat, i, j);
30  }
31 
32  private:
33  const double *dp_distMat;
34 };
35 }
36 
37 /*! \brief Implements the MaxMin algorithm for picking a subset of item from a
38  *pool
39  *
40  * This class inherits from the DistPicker and implements a specific picking
41  *strategy
42  * aimed at diversity. See documentation for "pick()" member function for the
43  *algorithm details
44  */
45 class MaxMinPicker : public DistPicker {
46  public:
47  /*! \brief Default Constructor
48  *
49  */
51 
52  /*! \brief Contains the implementation for a lazy MaxMin diversity picker
53  *
54  * See the documentation for the pick() method for details about the algorithm
55  *
56  * \param func - a function (or functor) taking two unsigned ints as
57  *arguments
58  * and returning the distance (as a double) between those two
59  *elements.
60  * \param poolSize - the size of the pool to pick the items from. It is
61  *assumed that the
62  * distance matrix above contains the right number of elements;
63  *i.e.
64  * poolSize*(poolSize-1)
65  * \param pickSize - the number items to pick from pool (<= poolSize)
66  * \param firstPicks - (optional)the first items in the pick list
67  * \param seed - (optional) seed for the random number generator
68  */
69  template <typename T>
70  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
71  unsigned int pickSize,
72  RDKit::INT_VECT firstPicks = RDKit::INT_VECT(),
73  int seed = -1) const;
74 
75  /*! \brief Contains the implementation for the MaxMin diversity picker
76  *
77  * Here is how the picking algorithm works, refer to
78  * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
79  * for more detail:
80  *
81  * A subset of k items is to be selected from a pool containing N molecules.
82  * Then the MaxMin method is as follows:
83  * -# Initialise Subset with some appropriately chosen seed
84  * compound and set x = 1.
85  * -# For each of the N-x remaining compounds in Dataset
86  * calculate its dissimilarity with each of the x compounds in
87  * Subset and retain the smallest of these x dissimilarities for
88  * each compound in Dataset.
89  * -# Select the molecule from Dataset with the largest value
90  * for the smallest dissimilarity calculated in Step 2 and
91  * transfer it to Subset.
92  * -# Set x = x + 1 and return to Step 2 if x < k.
93  *
94  *
95  *
96  * \param distMat - distance matrix - a vector of double. It is assumed that
97  *only the
98  * lower triangle element of the matrix are supplied in a 1D
99  *array\n
100  * \param poolSize - the size of the pool to pick the items from. It is
101  *assumed that the
102  * distance matrix above contains the right number of elements;
103  *i.e.
104  * poolSize*(poolSize-1) \n
105  * \param pickSize - the number items to pick from pool (<= poolSize)
106  * \param firstPicks - indices of the items used to seed the pick set.
107  * \param seed - (optional) seed for the random number generator
108  */
109  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
110  unsigned int pickSize, RDKit::INT_VECT firstPicks,
111  int seed = -1) const {
112  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
113  if (poolSize < pickSize)
114  throw ValueErrorException("pickSize cannot be larger than the poolSize");
115  distmatFunctor functor(distMat);
116  return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
117  }
118 
119  /*! \overload */
120  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
121  unsigned int pickSize) const {
122  RDKit::INT_VECT iv;
123  return pick(distMat, poolSize, pickSize, iv);
124  }
125 };
126 // we implement this here in order to allow arbitrary functors without link
127 // errors
128 template <typename T>
129 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
130  unsigned int pickSize,
131  RDKit::INT_VECT firstPicks,
132  int seed) const {
133  if (poolSize < pickSize)
134  throw ValueErrorException("pickSize cannot be larger than the poolSize");
135 
136  RDKit::INT_LIST pool;
137 
138  RDKit::INT_VECT picks;
139  picks.reserve(pickSize);
140  unsigned int pick = 0;
141 
142  // enter the pool into a list so that we can pick out of it easily
143  for (unsigned int i = 0; i < poolSize; i++) {
144  pool.push_back(i);
145  }
146 
147  // get a seeded random number generator:
148  typedef boost::mt19937 rng_type;
149  typedef boost::uniform_int<> distrib_type;
150  typedef boost::variate_generator<rng_type &, distrib_type> source_type;
151  rng_type generator(42u);
152  distrib_type dist(0, poolSize);
153  source_type randomSource(generator, dist);
154  if (seed > 0) generator.seed(static_cast<rng_type::result_type>(seed));
155 
156  // pick the first entry
157  if (!firstPicks.size()) {
158  pick = randomSource();
159  // add the pick to the picks
160  picks.push_back(pick);
161  // and remove it from the pool
162  pool.remove(pick);
163  } else {
164  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
165  pIdx != firstPicks.end(); ++pIdx) {
166  pick = static_cast<unsigned int>(*pIdx);
167  if (pick >= poolSize)
168  throw ValueErrorException("pick index was larger than the poolSize");
169  picks.push_back(pick);
170  pool.remove(pick);
171  }
172  }
173  // now pick 1 compound at a time
174  while (picks.size() < pickSize) {
175  double maxOFmin = -1.0;
176  RDKit::INT_LIST_I plri = pool.end();
177  for (RDKit::INT_LIST_I pli = pool.begin(); pli != pool.end(); ++pli) {
178  unsigned int poolIdx = (*pli);
179  double minTOi = RDKit::MAX_DOUBLE;
180  for (RDKit::INT_VECT_CI pi = picks.begin(); pi != picks.end(); ++pi) {
181  unsigned int pickIdx = (*pi);
182  CHECK_INVARIANT(poolIdx != pickIdx, "");
183  double dist = func(poolIdx, pickIdx);
184  if (dist <= minTOi) {
185  minTOi = dist;
186  }
187  }
188  if (minTOi > maxOFmin ||
189  (RDKit::feq(minTOi, maxOFmin) && poolIdx < pick)) {
190  maxOFmin = minTOi;
191  pick = poolIdx;
192  plri = pli;
193  }
194  }
195 
196  // now add the new pick to picks and remove it from the pool
197  picks.push_back(pick);
198  CHECK_INVARIANT(plri != pool.end(), "");
199  pool.erase(plri);
200  }
201  return picks;
202 }
203 };
204 
205 #endif
std::list< int > INT_LIST
Definition: types.h:194
boost::minstd_rand rng_type
Definition: utils.h:35
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:99
const double MAX_DOUBLE
INT_LIST::iterator INT_LIST_I
Definition: types.h:195
double getDistFromLTM(const double *distMat, unsigned int i, unsigned int j)
function to lookup distance from 1D lower triangular distance matrix
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
Definition: MaxMinPicker.h:109
std::vector< int > INT_VECT
Definition: types.h:188
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: MaxMinPicker.h:120
Implements the MaxMin algorithm for picking a subset of item from a pool.
Definition: MaxMinPicker.h:45
INT_VECT::const_iterator INT_VECT_CI
Definition: types.h:190
MaxMinPicker()
Default Constructor.
Definition: MaxMinPicker.h:50
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:32
Abstract base class to do perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:43
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks=RDKit::INT_VECT(), int seed=-1) const
Contains the implementation for a lazy MaxMin diversity picker.
Definition: MaxMinPicker.h:129
bool feq(double v1, double v2, double tol=1e-4)
floating point comparison with a tolerance