RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MaxMinPicker.h
Go to the documentation of this file.
1//
2// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3// Copyright (C) 2017 Greg Landrum and NextMove Software
4//
5// @@ All Rights Reserved @@
6// This file is part of the RDKit.
7// The contents are covered by the terms of the BSD license
8// which is included in the file license.txt, found at the root
9// of the RDKit source tree.
10//
11#include <RDGeneral/export.h>
12#ifndef RD_MAXMINPICKER_H
13#define RD_MAXMINPICKER_H
14
15#include <RDGeneral/types.h>
16#include <RDGeneral/utils.h>
17#include <RDGeneral/Invariant.h>
18#include <RDGeneral/RDLog.h>
20#include <cstdlib>
21#include "DistPicker.h"
22#include <boost/random.hpp>
23#include <random>
24
25namespace RDPickers {
26
27/*! \brief Implements the MaxMin algorithm for picking a subset of item from a
28 *pool
29 *
30 * This class inherits from the DistPicker and implements a specific picking
31 *strategy
32 * aimed at diversity. See documentation for "pick()" member function for the
33 *algorithm details
34 */
36 public:
37 /*! \brief Default Constructor
38 *
39 */
41
42 /*! \brief Contains the implementation for a lazy MaxMin diversity picker
43 *
44 * See the documentation for the pick() method for details about the algorithm
45 *
46 * \param func - a function (or functor) taking two unsigned ints as
47 *arguments
48 * and returning the distance (as a double) between those two
49 *elements.
50 * \param poolSize - the size of the pool to pick the items from. It is
51 *assumed that the
52 * distance matrix above contains the right number of elements;
53 *i.e.
54 * poolSize*(poolSize-1)
55 * \param pickSize - the number items to pick from pool (<= poolSize)
56 * \param firstPicks - (optional)the first items in the pick list
57 * \param seed - (optional) seed for the random number generator.
58 * If this is <0 the generator will be seeded with a
59 * random number.
60 */
61 template <typename T>
62 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
63 unsigned int pickSize) const;
64
65 template <typename T>
66 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
67 unsigned int pickSize,
68 const RDKit::INT_VECT &firstPicks,
69 int seed = -1) const;
70
71 template <typename T>
72 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
73 unsigned int pickSize,
74 const RDKit::INT_VECT &firstPicks, int seed,
75 double &threshold) const;
76
77 /*! \brief Contains the implementation for the MaxMin diversity picker
78 *
79 * Here is how the picking algorithm works, refer to
80 * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
81 * for more detail:
82 *
83 * A subset of k items is to be selected from a pool containing N molecules.
84 * Then the MaxMin method is as follows:
85 * -# Initialise Subset with some appropriately chosen seed
86 * compound and set x = 1.
87 * -# For each of the N-x remaining compounds in Dataset
88 * calculate its dissimilarity with each of the x compounds in
89 * Subset and retain the smallest of these x dissimilarities for
90 * each compound in Dataset.
91 * -# Select the molecule from Dataset with the largest value
92 * for the smallest dissimilarity calculated in Step 2 and
93 * transfer it to Subset.
94 * -# Set x = x + 1 and return to Step 2 if x < k.
95 *
96 *
97 *
98 * \param distMat - distance matrix - a vector of double. It is assumed that
99 *only the
100 * lower triangle element of the matrix are supplied in a 1D
101 *array\n
102 * \param poolSize - the size of the pool to pick the items from. It is
103 *assumed that the
104 * distance matrix above contains the right number of elements;
105 *i.e.
106 * poolSize*(poolSize-1) \n
107 * \param pickSize - the number items to pick from pool (<= poolSize)
108 * \param firstPicks - indices of the items used to seed the pick set.
109 * \param seed - (optional) seed for the random number generator
110 * If this is <0 the generator will be seeded with a
111 * random number.
112 */
113 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
114 unsigned int pickSize, RDKit::INT_VECT firstPicks,
115 int seed = -1) const {
116 CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
117 if (!poolSize) {
118 throw ValueErrorException("empty pool to pick from");
119 }
120 if (poolSize < pickSize) {
121 throw ValueErrorException("pickSize cannot be larger than the poolSize");
122 }
123 distmatFunctor functor(distMat);
124 return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
125 }
126
127 /*! \overload */
128 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
129 unsigned int pickSize) const override {
131 return pick(distMat, poolSize, pickSize, iv);
132 }
133};
134
136 double dist_bound; // distance to closest reference
137 unsigned int picks; // number of references considered
138 unsigned int next; // singly linked list of candidates
139};
140
141// we implement this here in order to allow arbitrary functors without link
142// errors
143template <typename T>
144RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
145 unsigned int pickSize,
146 const RDKit::INT_VECT &firstPicks,
147 int seed, double &threshold) const {
148 if (!poolSize) {
149 throw ValueErrorException("empty pool to pick from");
150 }
151
152 if (poolSize < pickSize) {
153 throw ValueErrorException("pickSize cannot be larger than the poolSize");
154 }
155
156 RDKit::INT_VECT picks;
157
158 unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
159 std::unique_ptr<MaxMinPickInfo[]> pinfo(new MaxMinPickInfo[memsize]);
160 memset(pinfo.get(), 0, memsize);
161
162 picks.reserve(pickSize);
163 unsigned int picked = 0; // picks.size()
164 unsigned int pick = 0;
165
166 // pick the first entry
167 if (firstPicks.empty()) {
168 // get a seeded random number generator:
169 typedef boost::mt19937 rng_type;
170 typedef boost::uniform_int<> distrib_type;
171 typedef boost::variate_generator<rng_type &, distrib_type> source_type;
172 rng_type generator;
173 distrib_type dist(0, poolSize - 1);
174 if (seed >= 0) {
175 generator.seed(static_cast<rng_type::result_type>(seed));
176 } else {
177 generator.seed(std::random_device()());
178 }
179 source_type randomSource(generator, dist);
180 pick = randomSource();
181 // add the pick to the picks
182 picks.push_back(pick);
183 // and remove it from the pool
184 pinfo[pick].picks = 1;
185 picked = 1;
186
187 } else {
188 for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
189 pIdx != firstPicks.end(); ++pIdx) {
190 pick = static_cast<unsigned int>(*pIdx);
191 if (pick >= poolSize) {
192 throw ValueErrorException("pick index was larger than the poolSize");
193 }
194 picks.push_back(pick);
195 pinfo[pick].picks = 1;
196 picked++;
197 }
198 }
199
200 if (picked >= pickSize) {
201 threshold = -1.0;
202 return picks;
203 }
204
205 unsigned int pool_list = 0;
206 unsigned int *prev = &pool_list;
207 // enter the pool into a list so that we can pick out of it easily
208 for (unsigned int i = 0; i < poolSize; i++) {
209 if (pinfo[i].picks == 0) {
210 *prev = i;
211 prev = &pinfo[i].next;
212 }
213 }
214 *prev = 0;
215
216 unsigned int poolIdx;
217 unsigned int pickIdx;
218
219 // First pass initialize dist_bound
220 prev = &pool_list;
221 pickIdx = picks[0];
222 do {
223 poolIdx = *prev;
224 pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
225 pinfo[poolIdx].picks = 1;
226 prev = &pinfo[poolIdx].next;
227 } while (*prev != 0);
228
229 // now pick 1 compound at a time
230 double maxOFmin = -1.0;
231 double tmpThreshold = -1.0;
232 while (picked < pickSize) {
233 unsigned int *pick_prev = nullptr;
234 maxOFmin = -1.0;
235 prev = &pool_list;
236 do {
237 poolIdx = *prev;
238 double minTOi = pinfo[poolIdx].dist_bound;
239 if (minTOi > maxOFmin) {
240 unsigned int pi = pinfo[poolIdx].picks;
241 while (pi < picked) {
242 unsigned int picki = picks[pi];
243 CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
244 double dist = func(poolIdx, picki);
245 pi++;
246 if (dist <= minTOi) {
247 minTOi = dist;
248 if (minTOi <= maxOFmin) {
249 break;
250 }
251 }
252 }
253 pinfo[poolIdx].dist_bound = minTOi;
254 pinfo[poolIdx].picks = pi;
255 if (minTOi > maxOFmin) {
256 maxOFmin = minTOi;
257 pick_prev = prev;
258 pick = poolIdx;
259 }
260 }
261 prev = &pinfo[poolIdx].next;
262 } while (*prev != 0);
263
264 // if the current distance is closer then threshold, we're done
265 if (maxOFmin <= threshold && threshold >= 0.0) {
266 break;
267 }
268 tmpThreshold = maxOFmin;
269 // now add the new pick to picks and remove it from the pool
270 *pick_prev = pinfo[pick].next;
271 picks.push_back(pick);
272 ++picked;
273 }
274
275 threshold = tmpThreshold;
276 return picks;
277}
278
279template <typename T>
280RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
281 unsigned int pickSize,
282 const RDKit::INT_VECT &firstPicks,
283 int seed) const {
284 double threshold = -1.0;
285 return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
286 threshold);
287}
288
289template <typename T>
290RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
291 unsigned int pickSize) const {
292 RDKit::INT_VECT firstPicks;
293 double threshold = -1.0;
294 int seed = -1;
295 return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
296 threshold);
297}
298}; // namespace RDPickers
299
300#endif
#define CHECK_INVARIANT(expr, mess)
Definition Invariant.h:101
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition DistPicker.h:46
Implements the MaxMin algorithm for picking a subset of item from a pool.
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy MaxMin diversity picker.
MaxMinPicker()
Default Constructor.
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const override
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition export.h:497
std::vector< int > INT_VECT
Definition types.h:291