RDKit
Open-source cheminformatics and machine learning.
MaxMinPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #ifndef RD_MAXMINPICKER_H
12 #define RD_MAXMINPICKER_H
13 
14 #include <RDGeneral/types.h>
15 #include <RDGeneral/utils.h>
16 #include <RDGeneral/Invariant.h>
17 #include <RDGeneral/RDLog.h>
18 #include <RDGeneral/Exceptions.h>
19 #include <cstdlib>
20 #include "DistPicker.h"
21 #include <boost/random.hpp>
22 
23 namespace RDPickers {
24 
25 namespace {
26 class distmatFunctor {
27  public:
28  distmatFunctor(const double *distMat) : dp_distMat(distMat){};
29  double operator()(unsigned int i, unsigned int j) {
30  return getDistFromLTM(this->dp_distMat, i, j);
31  }
32 
33  private:
34  const double *dp_distMat;
35 };
36 }
37 
38 /*! \brief Implements the MaxMin algorithm for picking a subset of item from a
39  *pool
40  *
41  * This class inherits from the DistPicker and implements a specific picking
42  *strategy
43  * aimed at diversity. See documentation for "pick()" member function for the
44  *algorithm details
45  */
46 class MaxMinPicker : public DistPicker {
47  public:
48  /*! \brief Default Constructor
49  *
50  */
52 
53  /*! \brief Contains the implementation for a lazy MaxMin diversity picker
54  *
55  * See the documentation for the pick() method for details about the algorithm
56  *
57  * \param func - a function (or functor) taking two unsigned ints as
58  *arguments
59  * and returning the distance (as a double) between those two
60  *elements.
61  * \param poolSize - the size of the pool to pick the items from. It is
62  *assumed that the
63  * distance matrix above contains the right number of elements;
64  *i.e.
65  * poolSize*(poolSize-1)
66  * \param pickSize - the number items to pick from pool (<= poolSize)
67  * \param firstPicks - (optional)the first items in the pick list
68  * \param seed - (optional) seed for the random number generator
69  */
70  template <typename T>
71  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
72  unsigned int pickSize) const;
73 
74  template <typename T>
75  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
76  unsigned int pickSize,
77  const RDKit::INT_VECT &firstPicks,
78  int seed = -1) const;
79 
80  template <typename T>
81  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
82  unsigned int pickSize,
83  const RDKit::INT_VECT &firstPicks, int seed,
84  double &threshold) const;
85 
86  /*! \brief Contains the implementation for the MaxMin diversity picker
87  *
88  * Here is how the picking algorithm works, refer to
89  * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
90  * for more detail:
91  *
92  * A subset of k items is to be selected from a pool containing N molecules.
93  * Then the MaxMin method is as follows:
94  * -# Initialise Subset with some appropriately chosen seed
95  * compound and set x = 1.
96  * -# For each of the N-x remaining compounds in Dataset
97  * calculate its dissimilarity with each of the x compounds in
98  * Subset and retain the smallest of these x dissimilarities for
99  * each compound in Dataset.
100  * -# Select the molecule from Dataset with the largest value
101  * for the smallest dissimilarity calculated in Step 2 and
102  * transfer it to Subset.
103  * -# Set x = x + 1 and return to Step 2 if x < k.
104  *
105  *
106  *
107  * \param distMat - distance matrix - a vector of double. It is assumed that
108  *only the
109  * lower triangle element of the matrix are supplied in a 1D
110  *array\n
111  * \param poolSize - the size of the pool to pick the items from. It is
112  *assumed that the
113  * distance matrix above contains the right number of elements;
114  *i.e.
115  * poolSize*(poolSize-1) \n
116  * \param pickSize - the number items to pick from pool (<= poolSize)
117  * \param firstPicks - indices of the items used to seed the pick set.
118  * \param seed - (optional) seed for the random number generator
119  */
120  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
121  unsigned int pickSize, RDKit::INT_VECT firstPicks,
122  int seed = -1) const {
123  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
124  if (!poolSize) throw ValueErrorException("empty pool to pick from");
125  if (poolSize < pickSize)
126  throw ValueErrorException("pickSize cannot be larger than the poolSize");
127  distmatFunctor functor(distMat);
128  return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
129  }
130 
131  /*! \overload */
132  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
133  unsigned int pickSize) const {
134  RDKit::INT_VECT iv;
135  return pick(distMat, poolSize, pickSize, iv);
136  }
137 };
138 
140  double dist_bound; // distance to closest reference
141  unsigned int picks; // number of references considered
142  unsigned int next; // singly linked list of candidates
143 };
144 
145 // we implement this here in order to allow arbitrary functors without link
146 // errors
147 template <typename T>
148 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
149  unsigned int pickSize,
150  const RDKit::INT_VECT &firstPicks,
151  int seed, double &threshold) const {
152  if (!poolSize) throw ValueErrorException("empty pool to pick from");
153 
154  if (poolSize < pickSize)
155  throw ValueErrorException("pickSize cannot be larger than the poolSize");
156 
157  RDKit::INT_VECT picks;
158 
159  unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
160  MaxMinPickInfo *pinfo = new MaxMinPickInfo[memsize];
161  if (!pinfo) {
162  threshold = -1.0;
163  return picks;
164  }
165  memset(pinfo, 0, memsize);
166 
167  picks.reserve(pickSize);
168  unsigned int picked = 0; // picks.size()
169  unsigned int pick;
170 
171  // pick the first entry
172  if (firstPicks.empty()) {
173  // get a seeded random number generator:
174  typedef boost::mt19937 rng_type;
175  typedef boost::uniform_int<> distrib_type;
176  typedef boost::variate_generator<rng_type &, distrib_type> source_type;
177  rng_type generator(42u);
178  distrib_type dist(0, poolSize - 1);
179  source_type randomSource(generator, dist);
180  if (seed > 0) generator.seed(static_cast<rng_type::result_type>(seed));
181 
182  pick = randomSource();
183  // add the pick to the picks
184  picks.push_back(pick);
185  // and remove it from the pool
186  pinfo[pick].picks = 1;
187  picked = 1;
188 
189  } else {
190  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
191  pIdx != firstPicks.end(); ++pIdx) {
192  pick = static_cast<unsigned int>(*pIdx);
193  if (pick >= poolSize) {
194  delete[] pinfo;
195  throw ValueErrorException("pick index was larger than the poolSize");
196  }
197  picks.push_back(pick);
198  pinfo[pick].picks = 1;
199  picked++;
200  }
201  }
202 
203  if (picked >= pickSize) {
204  threshold = -1.0;
205  delete[] pinfo;
206  return picks;
207  }
208 
209  unsigned int pool_list = 0;
210  unsigned int *prev = &pool_list;
211  // enter the pool into a list so that we can pick out of it easily
212  for (unsigned int i = 0; i < poolSize; i++)
213  if (pinfo[i].picks == 0) {
214  *prev = i;
215  prev = &pinfo[i].next;
216  }
217  *prev = 0;
218 
219  unsigned int poolIdx;
220  unsigned int pickIdx;
221 
222  // First pass initialize dist_bound
223  prev = &pool_list;
224  pickIdx = picks[0];
225  do {
226  poolIdx = *prev;
227  pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
228  pinfo[poolIdx].picks = 1;
229  prev = &pinfo[poolIdx].next;
230  } while (*prev != 0);
231 
232  // now pick 1 compound at a time
233  double maxOFmin = -1.0;
234  double tmpThreshold = -1.0;
235  while (picked < pickSize) {
236  unsigned int *pick_prev = 0;
237  maxOFmin = -1.0;
238  prev = &pool_list;
239  do {
240  poolIdx = *prev;
241  double minTOi = pinfo[poolIdx].dist_bound;
242  if (minTOi > maxOFmin) {
243  unsigned int pi = pinfo[poolIdx].picks;
244  while (pi < picked) {
245  unsigned int picki = picks[pi];
246  CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
247  double dist = func(poolIdx, picki);
248  pi++;
249  if (dist <= minTOi) {
250  minTOi = dist;
251  if (minTOi <= maxOFmin) break;
252  }
253  }
254  pinfo[poolIdx].dist_bound = minTOi;
255  pinfo[poolIdx].picks = pi;
256  if (minTOi > maxOFmin) {
257  maxOFmin = minTOi;
258  pick_prev = prev;
259  pick = poolIdx;
260  }
261  }
262  prev = &pinfo[poolIdx].next;
263  } while (*prev != 0);
264 
265  // if the current distance is closer then threshold, we're done
266  if (threshold >= 0.0 && maxOFmin < threshold) break;
267  tmpThreshold = maxOFmin;
268  // now add the new pick to picks and remove it from the pool
269  *pick_prev = pinfo[pick].next;
270  picks.push_back(pick);
271  picked++;
272  }
273 
274  threshold = tmpThreshold;
275  delete[] pinfo;
276  return picks;
277 }
278 
279 template <typename T>
280 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
281  unsigned int pickSize,
282  const RDKit::INT_VECT &firstPicks,
283  int seed) const {
284  double threshold = -1.0;
285  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
286  threshold);
287 }
288 
289 template <typename T>
290 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
291  unsigned int pickSize) const {
292  RDKit::INT_LIST firstPicks;
293  double threshold = -1.0;
294  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, -1,
295  threshold);
296 }
297 };
298 
299 #endif
std::list< int > INT_LIST
Definition: types.h:197
boost::minstd_rand rng_type
Definition: utils.h:35
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:99
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy MaxMin diversity picker.
Definition: MaxMinPicker.h:290
double getDistFromLTM(const double *distMat, unsigned int i, unsigned int j)
function to lookup distance from 1D lower triangular distance matrix
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
Definition: MaxMinPicker.h:120
std::vector< int > INT_VECT
Definition: types.h:191
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: MaxMinPicker.h:132
Implements the MaxMin algorithm for picking a subset of item from a pool.
Definition: MaxMinPicker.h:46
MaxMinPicker()
Default Constructor.
Definition: MaxMinPicker.h:51
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:32
Abstract base class to do perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:43